# **Topic Modelling Using Multiclass Classification**

---



# **Preparing dataset**

In [None]:
import pandas as pd
df = pd.read_csv('drive/My Drive/news_data.csv')
df.replace({'category': 
             {'WEIRD NEWS': 'COMEDY', 
              'WOMEN': 'VOICE',
             'QUEER VOICES': 'VOICE',
             'LATINO VOICES': 'VOICE',
             'BLACK VOICES': 'VOICE',
             'COLLEGE': 'EDUCATION',
             'GREEN': 'ENVIRONMENT',
              'PARENTS': 'FAMILY', 
              'PARENTING': 'FAMILY',
             'DIVORCE': 'FAMILY',
             'ARTS': 'ARTS & CULTURE',
             'CULTURE & ARTS': 'ARTS & CULTURE',
              'STYLE': 'LIFESTYLE',
              'HEALTHY LIVING': 'LIFESTYLE',
              'WELLNESS': 'LIFESTYLE',
              'HOME & LIVING': 'LIFESTYLE',
              'STYLE & BEAUTY': 'LIFESTYLE',
              'FOOD & DRINK': 'LIFESTYLE',
             'TASTE': 'LIFESTYLE'}}, 
            inplace= True)
df = df.set_index("category")
df = df.drop(["MEDIA","FIFTY","THE WORLDPOST","WORLDPOST","WEDDINGS", "GOOD NEWS"], axis=0)
df.reset_index(inplace = True) 

# **Preprocessing Data**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing

data = df['text']
labels = df['category']
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(data)
le = preprocessing.LabelEncoder()
y = le.fit_transform(labels)

# **Apply Model**

In [None]:
#train-test-split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

In [None]:
#As data is imbalanced

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', np.unique(y), y)
dct = {idx : val for idx, val in enumerate(class_weights)}

In [None]:
#Naive Bayes

from sklearn.naive_bayes import MultinomialNB

nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
#Naive Bayes Report

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

predictions = nb_clf.predict(X_test)

print('accuracy %s' % accuracy_score(predictions, y_test))
print(classification_report(y_test, predictions, target_names=le.classes_))

accuracy 0.5777888080799362
                precision    recall  f1-score   support

ARTS & CULTURE       1.00      0.01      0.03       963
      BUSINESS       0.85      0.05      0.09      1534
        COMEDY       0.86      0.09      0.16      2008
         CRIME       0.85      0.08      0.14       845
     EDUCATION       0.67      0.00      0.01       520
 ENTERTAINMENT       0.69      0.61      0.65      3998
   ENVIRONMENT       0.95      0.04      0.07      1021
        FAMILY       0.78      0.45      0.57      3955
        IMPACT       0.00      0.00      0.00       858
     LIFESTYLE       0.51      0.96      0.67     12479
         MONEY       0.00      0.00      0.00       427
      POLITICS       0.57      0.94      0.71      8062
      RELIGION       0.97      0.05      0.10       608
       SCIENCE       1.00      0.02      0.04       533
        SPORTS       0.92      0.20      0.33      1189
          TECH       1.00      0.05      0.10       506
        TRAVEL     

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Linear SVM

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(loss='log', class_weight=dct)
sgd_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False,
              class_weight={0: 2.5154718927282103, 1: 1.6430857335354556,
                            2: 1.2434671765455705, 3: 2.8649045521292216,
                            4: 4.541433891992551, 5: 0.6074853655498816,
                            6: 2.472750316856781, 7: 0.6074853655498816,
                            8: 2.820179242555652, 9: 0.19932163216934676,
                            10: 5.714704159343878, 11: 0.29796267448608693,
                            12: 3.816510172143975, 13: 4.4788797061524335,
                            14: 1.9973382473382473, 15: 4.685398655139289,
                            16: 0.9866491352280773, 17: 0.6309423711273527,
                            18: 4.480937069361507},
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
 

In [None]:
#Linear SVM Report
predictions = sgd_clf.predict(X_test)

print('accuracy %s' % accuracy_score(predictions, y_test))
print(classification_report(y_test, predictions, target_names=le.classes_))

accuracy 0.6388631115523232
                precision    recall  f1-score   support

ARTS & CULTURE       0.47      0.49      0.48       963
      BUSINESS       0.51      0.42      0.46      1534
        COMEDY       0.56      0.29      0.39      2008
         CRIME       0.37      0.73      0.49       845
     EDUCATION       0.32      0.65      0.43       520
 ENTERTAINMENT       0.66      0.59      0.62      3998
   ENVIRONMENT       0.46      0.54      0.49      1021
        FAMILY       0.68      0.69      0.69      3955
        IMPACT       0.31      0.38      0.34       858
     LIFESTYLE       0.75      0.77      0.76     12479
         MONEY       0.26      0.57      0.36       427
      POLITICS       0.80      0.71      0.75      8062
      RELIGION       0.48      0.59      0.53       608
       SCIENCE       0.38      0.59      0.46       533
        SPORTS       0.60      0.75      0.66      1189
          TECH       0.37      0.62      0.46       506
        TRAVEL     

In [None]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression

logreg_clf = LogisticRegression(random_state = 0, class_weight = dct, solver = 'newton-cg', multi_class = 'multinomial')
logreg_clf.fit(X_train, y_train)

LogisticRegression(C=1.0,
                   class_weight={0: 2.5154718927282103, 1: 1.6430857335354556,
                                 2: 1.2434671765455705, 3: 2.8649045521292216,
                                 4: 4.541433891992551, 5: 0.6074853655498816,
                                 6: 2.472750316856781, 7: 0.6074853655498816,
                                 8: 2.820179242555652, 9: 0.19932163216934676,
                                 10: 5.714704159343878, 11: 0.29796267448608693,
                                 12: 3.816510172143975, 13: 4.4788797061524335,
                                 14: 1.9973382473382473, 15: 4.685398655139289,
                                 16: 0.9866491352280773, 17: 0.6309423711273527,
                                 18: 4.480937069361507},
                   dual=False, fit_intercept=True, intercept_scaling=1,
                   l1_ratio=None, max_iter=100, multi_class='multinomial',
                   n_jobs=None, penalty='l2', random_st

In [None]:
#Logistic Regression Report

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

predictions = logreg_clf.predict(X_test)

print('accuracy %s' % accuracy_score(predictions, y_test))
print(classification_report(y_test, predictions, target_names=le.classes_))

accuracy 0.6707814489500831
                precision    recall  f1-score   support

ARTS & CULTURE       0.44      0.59      0.50       963
      BUSINESS       0.46      0.55      0.50      1534
        COMEDY       0.45      0.50      0.47      2008
         CRIME       0.44      0.71      0.55       845
     EDUCATION       0.39      0.66      0.49       520
 ENTERTAINMENT       0.68      0.68      0.68      3998
   ENVIRONMENT       0.48      0.64      0.55      1021
        FAMILY       0.72      0.74      0.73      3955
        IMPACT       0.30      0.50      0.37       858
     LIFESTYLE       0.88      0.70      0.78     12479
         MONEY       0.34      0.58      0.43       427
      POLITICS       0.86      0.70      0.78      8062
      RELIGION       0.47      0.66      0.55       608
       SCIENCE       0.40      0.62      0.49       533
        SPORTS       0.65      0.80      0.72      1189
          TECH       0.40      0.63      0.49       506
        TRAVEL     

# **Predicting Topics for New Data**

In [None]:
new_text = ["Planning to celebrate Holi? Make sure you protect your eyes first"]
vectorizer_new = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2), stop_words='english', vocabulary = vectorizer.vocabulary_)
X_new= vectorizer_new.fit_transform(new_text)
categories = logreg_clf.predict_proba(X_new)

In [None]:
res = []
res.append(dict(zip(list(categories[0]), list(le.classes_))))
res

[{0.006532598230786581: 'WORLD NEWS',
  0.00980879648870517: 'EDUCATION',
  0.011697322376439135: 'SPORTS',
  0.02185187061298773: 'TECH',
  0.024319862032169064: 'ENTERTAINMENT',
  0.024561486567263858: 'CRIME',
  0.02743093216791502: 'POLITICS',
  0.03352130578840888: 'COMEDY',
  0.03455831985087993: 'BUSINESS',
  0.04007904836401336: 'RELIGION',
  0.04284170240408259: 'MONEY',
  0.048498685246924945: 'SCIENCE',
  0.05039271075546483: 'IMPACT',
  0.05388009805345149: 'VOICE',
  0.05542906332810841: 'ARTS & CULTURE',
  0.09062196467246121: 'ENVIRONMENT',
  0.10706972495412476: 'FAMILY',
  0.10801646517409848: 'TRAVEL',
  0.20888804293171465: 'LIFESTYLE'}]

# **Predicting Topic Distribution for Scraped Data**

In [None]:
import pandas as pd
dataframe = pd.read_csv('drive/My Drive/scraped_data.csv')
dataframe['news_text'] = dataframe['Headlines'] + " " + dataframe['Short Descsriptions']
text = dataframe['news_text'].tolist() 

In [None]:
vectorizer_new = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2), stop_words='english', vocabulary = vectorizer.vocabulary_)

In [None]:
X_text = vectorizer_new.fit_transform(text)
categories = logreg_clf.predict_proba(X_text)

In [None]:
result = []
c = 0
for i in categories:
  entry = dict(zip(list(le.classes_), list(i)))
  entry['news_id'] = c
  c += 1
  result.append(entry)

In [None]:
categories_df = pd.DataFrame(result)
categories_df.to_csv('topics.csv', index=True, encoding='utf-8')