In [24]:
# To ignore warinings
import warnings
warnings.filterwarnings('ignore')

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, classification_report

In [26]:
# Reading the dataset
df = pd.read_csv('../../data/full_cleaned.csv')

In [27]:
df.head()

Unnamed: 0,title,news_stopwords,news_no_stopwords,category
0,निखिल उप्रेतीको भैरव फिल्मले अमेरिकामा रहेका न...,अमेरिकाका नेपालीको प्रतिक्रिया लिने इच्छा व्यक...,अमेरिकाका नेपालीको प्रतिक्रिया लिने इच्छा व्यक...,मनोरञ्जन
1,सुशील कोइरालाको निधनपछि चौरासी बाले खोले यस्ता...,झण्डै बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीको ...,झण्डै बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीको ...,मनोरञ्जन
2,लिटल प्रिन्स एण्ड प्रिन्सेसको ग्रान्ड फिनाले,ग्ल्यामरस नेपालले सुरुङ्गामा लिटल प्रिन्स एन्ड...,ग्ल्यामरस नेपालले सुरुङ्गामा लिटल प्रिन्स एन्ड...,मनोरञ्जन
3,अमेरिकामा सबैभन्दा धेरै कमाउने सिईओ बने पिचाई ...,गूगलका सिईओ सुन्दर पिचाई अमेरिकामा सबैभन्दा धे...,गूगलका सिईओ सुन्दर पिचाई अमेरिकामा सबैभन्दा कम...,मनोरञ्जन
4,ज्योती मगरको धमाका दोहोरीमा र्याप,दोहोरीमा र्याप बोलको गीतको भिडियो सार्वजनिक भए...,दोहोरीमा र्याप बोलको गीतको भिडियो सार्वजनिक चर...,मनोरञ्जन


In [28]:
df.drop(columns=["title", "news_stopwords"], inplace=True)

In [29]:
# Remove sikshya category
df = df[df["category"] != "शिक्षा"]
# Remove desh pradesh category
df = df[df["category"] != "देश/प्रदेश"]

In [30]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['category'])

In [31]:
df['category'].value_counts()

category
राजनीति              59975
समाज                 50205
खेलकुद               42367
अर्थ / वाणिज्य       41231
विश्व                37816
मनोरञ्जन             34169
विज्ञान र प्रविधि    23095
स्वास्थ्य            22389
Name: count, dtype: int64

In [32]:
def random_undersampling(data, random_state):
    if len(data) > 25000:
        return data.sample(25000, random_state=random_state)
    return data

In [33]:
# Drop Duplicates
df.drop_duplicates(["news_no_stopwords"], inplace=True)
# Remove rows with null values
df.dropna(inplace=True, axis=0)
# Compute length of each news articles
df["length"] = df["news_no_stopwords"].apply(lambda x: len(x.split()))
# Remove all news with <30 words
df = df[df["length"] >= 30]

In [34]:
df.head(2)

Unnamed: 0,news_no_stopwords,category,label,length
0,अमेरिकाका नेपालीको प्रतिक्रिया लिने इच्छा व्यक...,मनोरञ्जन,2,206
1,झण्डै बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीको ...,मनोरञ्जन,2,316


In [35]:
parameters = {
    'TOTAL_CATEGORIES': df['label'].nunique(),
    'MAX_NEWS_LENGTH': 256,
}

In [36]:
df['label'].value_counts()

label
3    57301
6    49745
1    42073
0    40592
5    37160
2    33949
4    23033
7    22074
Name: count, dtype: int64

In [37]:
df["news_no_stopwords"] = df["news_no_stopwords"].apply(lambda x: " ".join(x.split()[:parameters["MAX_NEWS_LENGTH"]]))
# Perform random undersampling
df_balanced = pd.DataFrame(columns=['news_no_stopwords', 'label'])
for i in range(parameters["TOTAL_CATEGORIES"]):
    res = random_undersampling(df[df["label"] == i], i * np.random.randint(100))
    if df_balanced.empty == True:
        df_balanced = res
    else:
        df_balanced = pd.concat([df_balanced, res], ignore_index=True)

# Shuffle the dataset
for i in range(20):
    df_balanced = df_balanced.sample(frac=1)

In [38]:
df_balanced.head()

Unnamed: 0,news_no_stopwords,category,label,length
99109,प्रधानमन्त्री पुष्पकमल दाहाल प्रचण्ड सरकारको ए...,राजनीति,3,79
13714,बैंकका कर्मचारीलाई दु:ख दिएको भन्दै नेपाल बैंक...,अर्थ / वाणिज्य,0,127
90982,नेपाल कम्युनिस्ट पार्टीका महासचिव नेत्र विक्रम...,राजनीति,3,123
177086,अछाममा सय जना कोरोना संक्रमित निको भएर घर फर्क...,स्वास्थ्य,7,112
88068,सञ्चार सूचना प्रविधि मन्त्री ज्ञानेन्द्रबहादुर...,राजनीति,3,114


In [39]:
df_balanced['category'].value_counts()

category
राजनीति              25000
अर्थ / वाणिज्य       25000
मनोरञ्जन             25000
खेलकुद               25000
विश्व                25000
समाज                 25000
विज्ञान र प्रविधि    23033
स्वास्थ्य            22074
Name: count, dtype: int64

In [40]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(df_balanced["news_no_stopwords"], df_balanced["label"], test_size=0.10, random_state=34, stratify=df_balanced["label"])

In [41]:
len(X_train), len(X_test)

(175596, 19511)

In [42]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=52000)

In [43]:
X_train_tfidf = tfidf.fit_transform(X_train)

In [44]:
X_test_tfidf = tfidf.transform(X_test)

In [45]:
models = [('Naive Bayes Classifier', MultinomialNB()), ("Gradient Boosting Classifier", GradientBoostingClassifier()), ("XG Boost Classifier", xgb.XGBClassifier())]

In [46]:
y_train

32861     1
73466     2
160887    6
151982    6
59184     2
         ..
20276     0
183666    7
120807    4
100549    4
106631    4
Name: label, Length: 175596, dtype: int64

In [47]:
results = dict()

for model_name, model in models:
    print(f'Training {model_name}...')
    model.fit(X_train_tfidf, y_train)
    model_preds = model.predict(X_test_tfidf)
    results[model_name] = {
        'model': model,
        'classification_report': classification_report(y_test, model_preds)
    }

Training Naive Bayes Classifier...
Training Gradient Boosting Classifier...
Training XG Boost Classifier...


In [48]:
from joblib import dump, load
import pickle

In [49]:
with open('../../outputs/ml/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [50]:
for key, value in results.items():
    print(key)
    dump(value['model'], f'../../outputs/ml/{"_".join(key.split())}.joblib')
    with open(f'../../outputs/ml/{"_".join(key.split())}_Classification_Report.pkl', 'wb') as f:
        pickle.dump(value['classification_report'], f)
    print(value['classification_report'])

Naive Bayes Classifier
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2500
           1       0.98      0.91      0.94      2500
           2       0.92      0.93      0.92      2500
           3       0.77      0.88      0.82      2500
           4       0.84      0.83      0.84      2303
           5       0.86      0.83      0.85      2500
           6       0.70      0.63      0.66      2500
           7       0.79      0.83      0.81      2208

    accuracy                           0.83     19511
   macro avg       0.83      0.83      0.83     19511
weighted avg       0.83      0.83      0.83     19511

Gradient Boosting Classifier
              precision    recall  f1-score   support

           0       0.73      0.74      0.74      2500
           1       0.95      0.93      0.94      2500
           2       0.90      0.88      0.89      2500
           3       0.80      0.84      0.82      2500
           4       0.83   

In [52]:
import json
with open("../../outputs/ml/parameters.json", "w") as const: 
    json.dump(parameters, const)

with open('../../outputs/ml/label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)