In [1]:
import spacy

In [2]:
import pandas as pd
import numpy as np

In [3]:
nlp = spacy.load("en_core_web_lg")

In [4]:
df = pd.read_json("news_dataset.json")

In [5]:
df.head()

Unnamed: 0,text,category
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS
3,This Richard Sherman Interception Literally Sh...,SPORTS
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS


In [6]:
df.category.value_counts()

category
CRIME       2500
SPORTS      2500
BUSINESS    2500
Name: count, dtype: int64

In [7]:
df.shape

(7500, 2)

In [8]:
df['category_num'] = df['category'].map({'CRIME':0, 'SPORTS':1, 'BUSINESS':2})

In [9]:
df.head()

Unnamed: 0,text,category,category_num
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,0
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,0
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1
3,This Richard Sherman Interception Literally Sh...,SPORTS,1
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,2


In [10]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return ' '.join(filtered_tokens)

In [11]:
df['preprocessed_txt'] = df['text'].apply(lambda x: preprocess(x))

In [12]:
 df.head()

Unnamed: 0,text,category,category_num,preprocessed_txt
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,0,Larry Nassar blame victim say victimize newly ...
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,0,woman Beats Cancer die fall horse
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1,vegas taxpayer spend Record $ 750 million New ...
3,This Richard Sherman Interception Literally Sh...,SPORTS,1,Richard Sherman Interception literally shake W...
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,2,7 thing totally kill Weed Legalization Buzz


In [13]:
df['vector'] = df['preprocessed_txt'].apply(lambda x: nlp(x).vector)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df.vector.values, df.category_num, test_size = 0.2, random_state = 2022)

In [16]:
X_train_2d = np.stack(X_train)

In [17]:
X_test_2d = np.stack(X_test)

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
clf = DecisionTreeClassifier()

In [20]:
clf.fit(X_train_2d, y_train)

In [21]:
y_pred = clf.predict(X_test_2d)

In [22]:
from sklearn.metrics import classification_report

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.75      0.73       486
           1       0.70      0.72      0.71       479
           2       0.78      0.73      0.75       535

    accuracy                           0.73      1500
   macro avg       0.73      0.73      0.73      1500
weighted avg       0.73      0.73      0.73      1500



In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
from sklearn.preprocessing import MinMaxScaler

In [26]:
scaler = MinMaxScaler()

In [27]:
X_train_embed = scaler.fit_transform(X_train_2d)
X_test_embed = scaler.fit_transform(X_test_2d)

In [28]:
clf = MultinomialNB()
clf.fit(X_train_embed, y_train)
y_pred = clf.predict(X_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85       486
           1       0.85      0.81      0.83       479
           2       0.83      0.83      0.83       535

    accuracy                           0.84      1500
   macro avg       0.84      0.84      0.84      1500
weighted avg       0.84      0.84      0.84      1500



In [29]:
from sklearn.neighbors import KNeighborsClassifier

In [30]:
clf = KNeighborsClassifier(n_neighbors = 5, metric ='euclidean')
clf.fit(X_train_2d, y_train)
y_pred = clf.predict(X_test_2d)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       486
           1       0.87      0.80      0.83       479
           2       0.88      0.83      0.86       535

    accuracy                           0.84      1500
   macro avg       0.84      0.84      0.84      1500
weighted avg       0.85      0.84      0.84      1500



In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
clf = RandomForestClassifier()
clf.fit(X_train_2d, y_train)
y_pred = clf.predict(X_test_2d)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88       486
           1       0.88      0.89      0.89       479
           2       0.90      0.86      0.88       535

    accuracy                           0.88      1500
   macro avg       0.88      0.88      0.88      1500
weighted avg       0.88      0.88      0.88      1500



In [35]:
from sklearn.ensemble import GradientBoostingClassifier

In [36]:
clf = GradientBoostingClassifier()
clf.fit(X_train_2d, y_train)
y_pred = clf.predict(X_test_2d)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       486
           1       0.90      0.90      0.90       479
           2       0.92      0.90      0.91       535

    accuracy                           0.90      1500
   macro avg       0.90      0.90      0.90      1500
weighted avg       0.90      0.90      0.90      1500

