In [103]:
!python -m spacy download en_core_web_sm -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [104]:
import pandas as pd

import spacy

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [106]:
# Creating the NLP object
nlp = spacy.load("en_core_web_sm")

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## Loading the Database

In [52]:
from pathlib import Path
import zipfile


zip_path = Path("/content/news_categories.zip")
dest_dir = Path("/content")

if not dest_dir.is_file():
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        print(f"[INFO] Unzipping dataset `{zip_path}` to `{dest_dir}/News_Category_Dataset_v3.json`...")
        zip_ref.extractall(dest_dir)

print(f"[INFO] Dataset succesfully downloaded to `{dest_dir}/News_Category_Dataset_v3.json`..")

[INFO] Unzipping dataset `/content/news_categories.zip` to `/content/News_Category_Dataset_v3.json`...
[INFO] Dataset succesfully downloaded to `/content/News_Category_Dataset_v3.json`..


## Understanding the Dataset

In [53]:
df = pd.read_json(dest_dir / "News_Category_Dataset_v3.json", lines=True)

df.head(3)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23


In [54]:
df.shape

(209527, 6)

In [55]:
df.value_counts("category")

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

We see that the Dataset is imbalance.

## Preprocessing the Dataset

In [56]:
df = df[["short_description", "category"]]
df.rename(columns={"short_description": "Text"}, inplace=True)

df.head(3)

Unnamed: 0,Text,category
0,Health experts said it is too early to predict...,U.S. NEWS
1,He was subdued by passengers and crew when he ...,U.S. NEWS
2,"""Until you have a dog you don't understand wha...",COMEDY


In [57]:
# Creating the Categories Directory
categories_l = list(set(df["category"].tolist()))
categories_d = {categories_l[i]: i for i in range(len(categories_l))}

len(categories_d)

42

In [58]:
df["Category"] = df["category"].apply(lambda x: categories_d[x])
df = df[["Text", "Category"]]

df.head(3)

Unnamed: 0,Text,Category
0,Health experts said it is too early to predict...,30
1,He was subdued by passengers and crew when he ...,30
2,"""Until you have a dog you don't understand wha...",26


In [59]:
# Handling the imbalance of the Dataset, using undersampling
g = df.groupby("Category", group_keys=False)
df_balanced = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))

df_balanced.head(3)

Unnamed: 0,Text,Category
0,Not everyone was immediately so gung-ho about ...,0
1,"Well, you know what they say ... older sibling...",0
2,"In the children's interviews, researchers read...",0


In [60]:
df_balanced.value_counts("Category")

Category
0     1014
31    1014
23    1014
24    1014
25    1014
26    1014
27    1014
28    1014
29    1014
30    1014
32    1014
1     1014
33    1014
34    1014
35    1014
36    1014
37    1014
38    1014
39    1014
40    1014
22    1014
21    1014
20    1014
19    1014
2     1014
3     1014
4     1014
5     1014
6     1014
7     1014
8     1014
9     1014
10    1014
11    1014
12    1014
13    1014
14    1014
15    1014
16    1014
17    1014
18    1014
41    1014
dtype: int64

## Creating a Preprocessing Function

In [109]:
def preprocess(text: str) -> str:
    return " ".join(token.lemma_ for token in nlp(text) if (not token.is_stop) and (not token.is_punct))

## Splitting the Dataset into Training and Testing Sets

In [113]:
test_prop = 0.2

x_train, x_test, y_train, y_test = train_test_split(
    df["Text"].values,
    df["Category"].values,
    test_size=test_prop
)
x_train_bal, x_test_bal, y_train_bal, y_test_bal = train_test_split(
    df_balanced["Text"].values,
    df_balanced["Category"].values,
    test_size=test_prop,
    stratify=df_balanced["Category"] # gives us same number of elements per category on balanced datasets
)
x_train_bal_pre, x_test_bal_pre, y_train_bal_pre, y_test_bal_pre = train_test_split(
    df_balanced["Text"].apply(preprocess).values,
    df_balanced["Category"].values,
    test_size=test_prop,
    stratify=df_balanced["Category"]
)

print(len(x_train), len(y_train), len(x_test), len(y_test))
print(len(x_train_bal), len(y_train_bal), len(x_test_bal), len(y_test_bal))
print(len(x_train_bal_pre), len(y_train_bal_pre), len(x_test_bal_pre), len(y_test_bal_pre))

167621 167621 41906 41906
34070 34070 8518 8518
34070 34070 8518 8518


## BOW Model

### Training and Evaluating using the Imbalanced Dataset

In [115]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer()),
    ("Multi NB", MultinomialNB())
])

clf.fit(x_train, y_train)

print("BOW: Imbalanced Data:")
print(classification_report(y_test, clf.predict(x_test), zero_division=0))

BOW: Imbalanced Data:
              precision    recall  f1-score   support

           0       0.31      0.52      0.39      1701
           1       0.84      0.04      0.08       510
           2       0.51      0.03      0.05       689
           3       0.88      0.02      0.05       281
           4       0.57      0.40      0.47      1316
           5       0.33      0.00      0.01       331
           6       0.51      0.07      0.12       649
           7       0.42      0.16      0.23      1149
           8       0.00      0.00      0.00       447
           9       0.58      0.64      0.60      1983
          10       0.29      0.85      0.44      3557
          11       0.68      0.15      0.25      1247
          12       0.80      0.16      0.27       887
          13       0.82      0.29      0.43       724
          14       1.00      0.01      0.01       177
          15       0.36      0.01      0.02       466
          16       0.56      0.01      0.03       721
     

### Training and Evaluating using the Balanced Dataset

In [116]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer()),
    ("Multi NB", MultinomialNB())
])

clf.fit(x_train_bal, y_train_bal)

print("BOW: Balanced Data:")
print(classification_report(y_test_bal, clf.predict(x_test_bal), zero_division=0))

BOW: Balanced Data:
              precision    recall  f1-score   support

           0       0.05      0.40      0.09       202
           1       0.64      0.16      0.25       203
           2       0.19      0.22      0.20       203
           3       0.26      0.35      0.30       203
           4       0.44      0.29      0.35       203
           5       0.28      0.12      0.17       203
           6       0.40      0.23      0.29       203
           7       0.22      0.09      0.13       203
           8       0.25      0.00      0.01       203
           9       0.48      0.47      0.47       202
          10       0.13      0.47      0.20       203
          11       0.46      0.13      0.21       203
          12       0.45      0.24      0.32       203
          13       0.40      0.61      0.48       203
          14       0.26      0.53      0.35       203
          15       0.26      0.22      0.24       203
          16       0.33      0.11      0.16       203
       

### Training and Evaluating using the Preprocessed and Balanced Dataset

In [117]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer()),
    ("Multi NB", MultinomialNB())
])

clf.fit(x_train_bal_pre, y_train_bal_pre)

print("BOW: Preprocessed Data:")
print(classification_report(y_test_bal_pre, clf.predict(x_test_bal_pre), zero_division=0))

BOW: Preprocessed Data:
              precision    recall  f1-score   support

           0       0.16      0.33      0.21       203
           1       0.42      0.21      0.28       203
           2       0.06      0.37      0.10       202
           3       0.30      0.36      0.33       202
           4       0.43      0.39      0.41       203
           5       0.18      0.12      0.15       203
           6       0.35      0.28      0.31       203
           7       0.29      0.19      0.23       203
           8       0.31      0.02      0.05       203
           9       0.40      0.53      0.46       203
          10       0.21      0.45      0.29       203
          11       0.39      0.20      0.27       203
          12       0.39      0.37      0.38       203
          13       0.43      0.62      0.51       203
          14       0.33      0.56      0.42       203
          15       0.29      0.22      0.25       203
          16       0.25      0.11      0.16       203
   

## BO-ngrams Model

### Training and Evaluating using the Imbalanced Dataset

In [119]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
    ("Multi NB", MultinomialNB())
])

clf.fit(x_train, y_train)

print("BOW: Imbalanced Data:")
print(classification_report(y_test, clf.predict(x_test), zero_division=0))

BOW: Imbalanced Data:
              precision    recall  f1-score   support

           0       0.37      0.18      0.24      1701
           1       1.00      0.02      0.05       510
           2       0.50      0.00      0.00       689
           3       1.00      0.01      0.02       281
           4       0.60      0.09      0.16      1316
           5       0.00      0.00      0.00       331
           6       1.00      0.00      0.00       649
           7       0.70      0.01      0.02      1149
           8       0.00      0.00      0.00       447
           9       0.70      0.48      0.56      1983
          10       0.21      0.92      0.34      3557
          11       0.86      0.05      0.10      1247
          12       0.98      0.09      0.17       887
          13       0.89      0.15      0.26       724
          14       0.00      0.00      0.00       177
          15       0.00      0.00      0.00       466
          16       1.00      0.01      0.01       721
     

### Training and Evaluating using the Balanced Dataset

In [120]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
    ("Multi NB", MultinomialNB())
])

clf.fit(x_train_bal, y_train_bal)

print("BOW: Balanced Data:")
print(classification_report(y_test_bal, clf.predict(x_test_bal), zero_division=0))

BOW: Balanced Data:
              precision    recall  f1-score   support

           0       0.04      0.35      0.08       202
           1       0.58      0.11      0.18       203
           2       0.16      0.18      0.17       203
           3       0.25      0.33      0.29       203
           4       0.41      0.15      0.22       203
           5       0.25      0.06      0.10       203
           6       0.53      0.25      0.34       203
           7       0.18      0.06      0.09       203
           8       0.50      0.00      0.01       203
           9       0.49      0.39      0.43       202
          10       0.12      0.48      0.19       203
          11       0.44      0.11      0.17       203
          12       0.52      0.22      0.31       203
          13       0.41      0.59      0.49       203
          14       0.23      0.50      0.32       203
          15       0.28      0.21      0.24       203
          16       0.40      0.08      0.14       203
       

### Training and Evaluating using the Preprocessed and Balanced Dataset

In [121]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
    ("Multi NB", MultinomialNB())
])

clf.fit(x_train_bal_pre, y_train_bal_pre)

print("BOW: Preprocessed Data:")
print(classification_report(y_test_bal_pre, clf.predict(x_test_bal_pre), zero_division=0))

BOW: Preprocessed Data:
              precision    recall  f1-score   support

           0       0.16      0.36      0.22       203
           1       0.45      0.20      0.28       203
           2       0.05      0.35      0.09       202
           3       0.28      0.39      0.33       202
           4       0.44      0.36      0.40       203
           5       0.16      0.10      0.12       203
           6       0.41      0.28      0.33       203
           7       0.31      0.18      0.23       203
           8       0.30      0.01      0.03       203
           9       0.43      0.53      0.47       203
          10       0.20      0.47      0.28       203
          11       0.43      0.22      0.29       203
          12       0.43      0.34      0.38       203
          13       0.43      0.62      0.51       203
          14       0.33      0.62      0.43       203
          15       0.30      0.22      0.25       203
          16       0.27      0.11      0.15       203
   