In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Silence all Python warnings so they do not clutter the cell outputs.
# NOTE(review): this is a blanket filter, so deprecation and pandas
# copy-related warnings raised by later cells are hidden as well.
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Load the cleaned news dataset from a path relative to this notebook into `df`.
df = pd.read_csv('../../data/full_cleaned.csv')

In [13]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,title,news_stopwords,news_no_stopwords,category
0,निखिल उप्रेतीको भैरव फिल्मले अमेरिकामा रहेका न...,अमेरिकाका नेपालीको प्रतिक्रिया लिने इच्छा व्यक...,अमेरिकाका नेपालीको प्रतिक्रिया लिने इच्छा व्यक...,मनोरञ्जन
1,सुशील कोइरालाको निधनपछि चौरासी बाले खोले यस्ता...,झण्डै बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीको ...,झण्डै बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीको ...,मनोरञ्जन
2,लिटल प्रिन्स एण्ड प्रिन्सेसको ग्रान्ड फिनाले,ग्ल्यामरस नेपालले सुरुङ्गामा लिटल प्रिन्स एन्ड...,ग्ल्यामरस नेपालले सुरुङ्गामा लिटल प्रिन्स एन्ड...,मनोरञ्जन
3,अमेरिकामा सबैभन्दा धेरै कमाउने सिईओ बने पिचाई ...,गूगलका सिईओ सुन्दर पिचाई अमेरिकामा सबैभन्दा धे...,गूगलका सिईओ सुन्दर पिचाई अमेरिकामा सबैभन्दा कम...,मनोरञ्जन
4,ज्योती मगरको धमाका दोहोरीमा र्याप,दोहोरीमा र्याप बोलको गीतको भिडियो सार्वजनिक भए...,दोहोरीमा र्याप बोलको गीतको भिडियो सार्वजनिक चर...,मनोरञ्जन


In [15]:
# (rows, columns) of the raw dataset.
df.shape

(369800, 4)

In [16]:
# Only the stop-word-free article text and its category are used from here on,
# so remove the other two text columns from `df` in place.
unused_columns = ["title", "news_stopwords"]
df.drop(unused_columns, axis="columns", inplace=True)

In [23]:
# Remove the sikshya and desh/pradesh categories with a single membership mask.
excluded_categories = ["शिक्षा", "देश/प्रदेश"]
df = df[~df["category"].isin(excluded_categories)]

In [24]:
# Map every category name to an integer id; `le` is kept so the ids can be
# translated back to category names later.
le = LabelEncoder()
le.fit(df['category'])
df['label'] = le.transform(df['category'])

In [25]:
def random_undersampling(data, random_state, max_samples=25000):
    """Cap a DataFrame at ``max_samples`` rows by random sampling.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to (possibly) downsample, e.g. all rows belonging to one class.
    random_state : int
        Seed forwarded to ``DataFrame.sample`` so the draw is repeatable.
    max_samples : int, default 25000
        Maximum number of rows to keep. The default matches the cap that was
        previously hard-coded, so existing two-argument calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself (not a copy) when it has at most ``max_samples`` rows;
        otherwise a random sample of exactly ``max_samples`` rows drawn
        without replacement.
    """
    if len(data) > max_samples:
        return data.sample(max_samples, random_state=random_state)
    return data

In [59]:
# Every tunable value of the experiment lives in this one mapping; it is also
# written to disk at the end of the notebook.
parameters = dict(
    MAX_NEWS_LENGTH=256,                      # tokens kept per article / padded length
    TOTAL_CATEGORIES=df['label'].nunique(),   # number of distinct labels in `df`
    VOCAB_SIZE=52000,                         # tokenizer `num_words` and embedding rows
    EMBEDDING_DIMENSION=50,
    FIRST_HIDDEN_DIM=16,                      # second hidden layer uses twice this
    DROPOUT=0.2,
    BATCH_SIZE=512,
    EPOCHS=16,
    EARLY_STOPPING=3,                         # early-stopping patience, in epochs
)

In [26]:
# Keep one row per distinct article text, then discard rows with missing values.
df = df.drop_duplicates(subset=["news_no_stopwords"]).dropna(axis=0)
# Word count of each article (whitespace-separated tokens).
df["length"] = df["news_no_stopwords"].map(lambda text: len(text.split()))
# Keep only articles with at least 30 words.
min_words = 30
df = df[df["length"] >= min_words]

So, we have 361955 unique news articles spread across 10 different categories.<br>
There seem to be duplicate news articles, so let's remove them.

In [28]:
# Truncate every article to at most MAX_NEWS_LENGTH whitespace-separated tokens.
df["news_no_stopwords"] = df["news_no_stopwords"].apply(
    lambda x: " ".join(x.split()[:parameters["MAX_NEWS_LENGTH"]])
)

# Perform random undersampling, one class at a time.
# Each class gets a fixed, distinct seed so that the balanced dataset is the
# same on every run; seeds drawn from the unseeded global NumPy generator
# would make the result change on every run.
UNDERSAMPLING_SEED = 42
per_class_frames = [
    random_undersampling(df[df["label"] == i], UNDERSAMPLING_SEED + i)
    for i in range(parameters["TOTAL_CATEGORIES"])
]
# Concatenate once instead of growing the frame inside the loop.
df_balanced = pd.concat(per_class_frames, ignore_index=True)

# Shuffle the dataset: a single seeded permutation is already uniformly random,
# so repeating the shuffle adds nothing.
df_balanced = df_balanced.sample(frac=1, random_state=UNDERSAMPLING_SEED)

In [29]:
# Rows per category after undersampling.
df_balanced['category'].value_counts()

category
विश्व                25000
मनोरञ्जन             25000
अर्थ / वाणिज्य       25000
राजनीति              25000
समाज                 25000
खेलकुद               25000
विज्ञान र प्रविधि    23027
स्वास्थ्य            21490
Name: count, dtype: int64

In [34]:
# Stratified 80/20 split: 80% for training, 20% held out for validation + test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    df_balanced["news_no_stopwords"],
    df_balanced["label"],
    stratify=df_balanced["label"],
    test_size=0.20,
    random_state=34,
)

In [35]:
# Halve the held-out part into equally sized, stratified validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    stratify=y_val_test,
    shuffle=True,
    test_size=0.5,
    random_state=28,
)

In [36]:
# Number of articles in the train / validation / test splits.
X_train.shape, X_val.shape, X_test.shape 

((155613,), (19452,), (19452,))

Now we will create our tokenizer

In [40]:
# Build the vocabulary from the training articles only, so that nothing from
# the validation or test text influences it.
vocabulary_limit = parameters['VOCAB_SIZE']
tokenizer = Tokenizer(num_words=vocabulary_limit)
tokenizer.fit_on_texts(X_train.tolist())

Now we convert our words to integers

In [41]:
# Encode every split with the tokenizer fitted on the training text.
X_train_seq, X_test_seq, X_val_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test, X_val)
)

Finally, we pad our sequences so that all inputs have the same length, `MAX_NEWS_LENGTH` (256).

In [43]:
# Pad at the end of each sequence ('post') up to the fixed article length.
padded_length = parameters["MAX_NEWS_LENGTH"]
X_train_pad_seq, X_test_pad_seq, X_val_pad_seq = (
    pad_sequences(sequences, maxlen=padded_length, padding='post')
    for sequences in (X_train_seq, X_test_seq, X_val_seq)
)

In [44]:
# Each split should now be (n_articles, MAX_NEWS_LENGTH).
X_train_pad_seq.shape, X_val_pad_seq.shape, X_test_pad_seq.shape

((155613, 256), (19452, 256), (19452, 256))

In [45]:
# One-hot encode the integer labels, as required by the categorical
# cross-entropy loss used below.
# NOTE: this rebinds y_train / y_val / y_test, so run the cell once per split.
to_one_hot = tf.keras.utils.to_categorical
class_count = parameters["TOTAL_CATEGORIES"]
y_train = to_one_hot(y_train, class_count)
y_val = to_one_hot(y_val, class_count)
y_test = to_one_hot(y_test, class_count)

In [46]:
# Each label array should now be (n_articles, TOTAL_CATEGORIES).
y_train.shape, y_val.shape, y_test.shape

((155613, 8), (19452, 8), (19452, 8))

Now let's create our model for news classification: an MLP over averaged word embeddings (Embedding → global average pooling → dense layers).

In [66]:
# Bag-of-embeddings MLP: embed every token, average the embeddings over the
# sequence, then classify with two small dense layers and a softmax output.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(parameters['MAX_NEWS_LENGTH'],)),
    # The sequence length is already fixed by the Input layer above, so the
    # deprecated `input_length` argument is not passed to Embedding.
    tf.keras.layers.Embedding(
        parameters["VOCAB_SIZE"],
        parameters["EMBEDDING_DIMENSION"],
        name="Embedding_Layer",
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(parameters['FIRST_HIDDEN_DIM'], activation='relu'),
    tf.keras.layers.Dropout(parameters['DROPOUT']),
    # The second hidden layer is twice as wide as the first.
    tf.keras.layers.Dense(parameters['FIRST_HIDDEN_DIM'] * 2, activation='relu'),
    tf.keras.layers.Dropout(parameters['DROPOUT']),
    tf.keras.layers.Dense(parameters["TOTAL_CATEGORIES"], activation='softmax', name="Softmax_Layer"),
])
model.summary()

In [67]:
# Metrics reported alongside the loss. F1Score() is used with its defaults,
# which in the evaluation output below yields one value per class.
tracked_metrics = [
    'accuracy',
    tf.keras.metrics.F1Score(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]
model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=tracked_metrics,
)

In [68]:
# Stop once validation loss has not improved for EARLY_STOPPING epochs, and
# roll the in-memory model back to its best weights so that evaluating `model`
# afterwards measures the same weights the checkpoint below stores, rather
# than the weights of the last (worse) epoch.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=parameters["EARLY_STOPPING"],
    restore_best_weights=True,
)
# Save the model to disk only when validation loss improves.
cp = tf.keras.callbacks.ModelCheckpoint(
    "../../outputs/mlp/MLP_nepali_news_classifier_model.keras",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
)

In [69]:
# Train with early stopping and best-model checkpointing; `history` keeps the
# per-epoch metrics and is pickled at the end of the notebook.
training_callbacks = [es, cp]
history = model.fit(
    x=X_train_pad_seq,
    y=y_train,
    validation_data=(X_val_pad_seq, y_val),
    epochs=parameters["EPOCHS"],
    batch_size=parameters["BATCH_SIZE"],
    callbacks=training_callbacks,
)

Epoch 1/16
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.4002 - f1_score: 0.3742 - loss: 1.7241 - precision_3: 0.6755 - recall_3: 0.0995 - val_accuracy: 0.8317 - val_f1_score: 0.8297 - val_loss: 0.6127 - val_precision_3: 0.9104 - val_recall_3: 0.6792
Epoch 2/16
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.7882 - f1_score: 0.7869 - loss: 0.6742 - precision_3: 0.8752 - recall_3: 0.6721 - val_accuracy: 0.8630 - val_f1_score: 0.8614 - val_loss: 0.4631 - val_precision_3: 0.9000 - val_recall_3: 0.8155
Epoch 3/16
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.8384 - f1_score: 0.8378 - loss: 0.5317 - precision_3: 0.8871 - recall_3: 0.7732 - val_accuracy: 0.8728 - val_f1_score: 0.8721 - val_loss: 0.4249 - val_precision_3: 0.9018 - val_recall_3: 0.8411
Epoch 4/16
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.8592 - f1_scor

In [70]:
# Score the model on the held-out test split. The returned list follows the
# compile() order: loss, accuracy, per-class F1, precision, recall.
model.evaluate(X_test_pad_seq, y_test)

[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 553us/step - accuracy: 0.8754 - f1_score: 0.8745 - loss: 0.4308 - precision_3: 0.8888 - recall_3: 0.8666


[0.43674236536026,
 0.874254584312439,
 <tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([0.8072855 , 0.97948617, 0.9542352 , 0.87502474, 0.8890389 ,
        0.9099756 , 0.7238019 , 0.8572086 ], dtype=float32)>,
 0.8866115212440491,
 0.8650524616241455]

Finally, let's save the training history, the parameters, the tokenizer, and the label encoder.

In [73]:
import io, json, pickle

In [74]:
# Persist the per-epoch training metrics so they can be plotted without retraining.
history_path = '../../outputs/mlp/train_history.pkl'
with open(history_path, mode='wb') as history_file:
    pickle.dump(history.history, history_file)

In [75]:
# Store the experiment parameters next to the model as JSON.
parameters_path = "../../outputs/mlp/parameters.json"
with open(parameters_path, mode="w") as parameters_file:
    json.dump(parameters, parameters_file)

In [76]:
# Persist the fitted tokenizer as UTF-8 text; ensure_ascii=False keeps the
# non-ASCII (Nepali) characters unescaped in the file.
# NOTE(review): tokenizer.to_json() presumably already returns a JSON string,
# so json.dumps() wraps it a second time; a loader would then have to
# json.load() the file before rebuilding the tokenizer — confirm against the
# code that reads this file before changing the format.
with io.open('../../outputs/mlp//tokenizer.json', 'w', encoding='utf-8') as tok:
    tok.write(json.dumps(tokenizer.to_json(), ensure_ascii=False))

In [77]:
# Keep the fitted label encoder so predicted ids can be mapped back to
# category names at inference time.
encoder_path = '../../outputs/mlp/label_encoder.pkl'
with open(encoder_path, mode='wb') as encoder_file:
    pickle.dump(le, encoder_file)