In [56]:
!python -m spacy download en_core_web_lg -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [57]:
import pandas as pd
import numpy as np
  
import spacy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

## Initializing the NLP Object

In [58]:
nlp = spacy.load("en_core_web_lg")

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## Loading the Dataset

In [59]:
from pathlib import Path
import zipfile


zip_path = Path("/content/news_categories.zip")
dest_dir = Path("/content")

if not dest_dir.is_file():
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        print(f"[INFO] Unzipping dataset `{zip_path}` to `{dest_dir}`...")
        zip_ref.extractall(dest_dir)

print(f"[INFO] Dataset succesfully downloaded to `{dest_dir}`..")

[INFO] Unzipping dataset `/content/news_categories.zip` to `/content`...
[INFO] Dataset succesfully downloaded to `/content`..


In [60]:
df = pd.read_json("/content/News_Category_Dataset_v3.json", lines=True)
df = df[df["short_description"] != '']

print(df.shape)
df.head(3)

(189815, 6)


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23


In [61]:
df.value_counts("category")

category
POLITICS          32441
WELLNESS          17943
ENTERTAINMENT     14774
STYLE & BEAUTY     9802
TRAVEL             9421
PARENTING          8790
FOOD & DRINK       6331
QUEER VOICES       5603
HEALTHY LIVING     5265
BUSINESS           5132
COMEDY             4642
SPORTS             4414
HOME & LIVING      4317
BLACK VOICES       4177
THE WORLDPOST      3664
WEDDINGS           3653
PARENTS            3556
DIVORCE            3426
WORLD NEWS         3297
WOMEN              3184
IMPACT             3086
CRIME              2832
MEDIA              2404
WEIRD NEWS         2316
TECH               2103
GREEN              2046
TASTE              1940
RELIGION           1879
SCIENCE            1803
MONEY              1755
STYLE              1567
ENVIRONMENT        1442
U.S. NEWS          1377
ARTS & CULTURE     1339
WORLDPOST          1242
CULTURE & ARTS     1063
FIFTY              1042
GOOD NEWS          1039
LATINO VOICES      1022
COLLEGE             921
EDUCATION           902
ARTS   

## Preprocessing the Dataset

In [62]:
# g = df.groupby("category", group_keys=False)
# df = g.apply(lambda x: x.sample(1600, replace=True))

# print(df.shape)
# df.head(3)

In [63]:
# df.value_counts("category")

In [64]:
category_l = list(set(category for category in df["category"]))

df["category_num"] = df["category"].map(
    {category_l[i]: i for i in range(len(category_l))}
)

In [65]:
df.head(3)

Unnamed: 0,link,headline,category,short_description,authors,date,category_num
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,6
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,6
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,21


In [66]:
# Creating a new field that cointains the preprocessed text
df["preprocessed_text"] = df["short_description"].apply(lambda x: " ".join(token.lemma_ for token in nlp(x) if (not token.is_stop) and (not token.is_punct)))

In [67]:
# Creaing a new field that contains the vector representation of each test
df["vectored_text"] = df["short_description"].apply(lambda x: nlp(x).vector)

In [77]:
# Getting the final Dataset
df = df[["vectored_text", "category_num"]]
df.rename(columns={"vectored_text": "Text", "category_num": "Category"}, inplace=True)

print(df.shape)
df.head(3)

(189815, 2)


Unnamed: 0,Text,Category
0,"[-2.2348204, 1.4418973, -1.6081636, 1.2546554,...",6
1,"[-2.8690436, 1.383921, -2.131473, 0.79687876, ...",6
2,"[-0.9373005, 4.8455877, -3.5144181, -1.0967755...",21


## Splitting the Dataset into Training and Testing Sets

In [79]:
test_prop = 0.2

x_train, x_test, y_train, y_test = train_test_split(
    df["Text"].values,
    df["Category"].values,
    test_size = test_prop,
)

print(len(x_train), len(y_train), len(x_test), len(y_test))

151852 151852 37963 37963


In [80]:
# Reshaping the input arrays into a more compatable format
x_train = np.stack(x_train)
x_test = np.stack(x_test)

## Decision Tree

In [71]:
clf_dt = DecisionTreeClassifier()

clf_dt.fit(x_train, y_train)

print(classification_report(y_test, clf_dt.predict(x_test)))

              precision    recall  f1-score   support

           0       0.01      0.01      0.01       268
           1       0.06      0.05      0.06       421
           2       0.02      0.02      0.02       180
           3       0.28      0.28      0.28      3589
           4       0.06      0.06      0.06       731
           5       0.01      0.01      0.01       204
           6       0.02      0.02      0.02       275
           7       0.15      0.15      0.15       863
           8       0.41      0.39      0.40      6488
           9       0.04      0.04      0.04       313
          10       0.06      0.06      0.06      1027
          11       0.01      0.01      0.01       184
          12       0.07      0.07      0.07      1053
          13       0.06      0.06      0.06       685
          14       0.04      0.04      0.04       637
          15       0.07      0.08      0.08       388
          16       0.02      0.02      0.02       361
          17       0.02    

## Naive Bayes

In [72]:
# Applying MinMaxScaler to the inputs
scaler = MinMaxScaler()

x_train_transformed = scaler.fit_transform(x_train)
x_test_transformed = scaler.transform(x_test)

clf_nb = MultinomialNB()

clf_nb.fit(x_train_transformed, y_train)

print(classification_report(y_test, clf_nb.predict(x_test_transformed)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       268
           1       0.00      0.00      0.00       421
           2       0.00      0.00      0.00       180
           3       0.23      0.08      0.12      3589
           4       0.00      0.00      0.00       731
           5       0.00      0.00      0.00       204
           6       0.00      0.00      0.00       275
           7       0.00      0.00      0.00       863
           8       0.18      0.98      0.30      6488
           9       0.00      0.00      0.00       313
          10       0.00      0.00      0.00      1027
          11       0.00      0.00      0.00       184
          12       0.00      0.00      0.00      1053
          13       0.00      0.00      0.00       685
          14       0.00      0.00      0.00       637
          15       0.00      0.00      0.00       388
          16       0.00      0.00      0.00       361
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## KNN

In [81]:
clf_knn = KNeighborsClassifier(n_neighbors=10, metric="euclidean")

clf_knn.fit(x_train, y_train)

print(classification_report(y_test, clf_knn.predict(x_test)))

              precision    recall  f1-score   support

           0       0.09      0.04      0.05       264
           1       0.15      0.07      0.10       392
           2       0.08      0.07      0.08       180
           3       0.25      0.69      0.37      3511
           4       0.19      0.15      0.17       708
           5       0.00      0.00      0.00       201
           6       0.11      0.06      0.08       273
           7       0.34      0.15      0.21       889
           8       0.41      0.70      0.52      6608
           9       0.11      0.04      0.06       296
          10       0.17      0.12      0.14      1039
          11       0.14      0.02      0.03       202
          12       0.09      0.05      0.07      1097
          13       0.18      0.07      0.10       692
          14       0.10      0.02      0.04       697
          15       0.21      0.08      0.11       397
          16       0.23      0.05      0.08       351
          17       0.17    

## Random Forest

In [74]:
clf_rf = RandomForestClassifier()

clf_rf.fit(x_train, y_train)

print(classification_report(y_test, clf_rf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       268
           1       0.61      0.03      0.06       421
           2       0.00      0.00      0.00       180
           3       0.30      0.74      0.43      3589
           4       0.63      0.05      0.09       731
           5       0.00      0.00      0.00       204
           6       0.00      0.00      0.00       275
           7       0.56      0.15      0.24       863
           8       0.35      0.87      0.50      6488
           9       0.39      0.02      0.04       313
          10       0.40      0.03      0.06      1027
          11       0.00      0.00      0.00       184
          12       0.35      0.02      0.04      1053
          13       0.71      0.02      0.03       685
          14       0.63      0.02      0.04       637
          15       0.23      0.01      0.01       388
          16       1.00      0.01      0.01       361
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Confusion Matrix of the Best Model

In [75]:
print(confusion_matrix(y_test, clf_knn.predict(x_test)))

[[  7   3   1 ...   3   7   2]
 [  2  39   2 ...   2   5   1]
 [  0   1   5 ...   0  10   0]
 ...
 [  7   3   5 ...  80  45   1]
 [  3   6  12 ...  19 262   0]
 [  0   4   3 ...   6   7  46]]
