In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
v = CountVectorizer(ngram_range=(2,2))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor hathodawala': 4,
 'hathodawala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

In [9]:
v = CountVectorizer(ngram_range=(1,3))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [10]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [12]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [14]:
def preprocess_text(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if not token.is_stop and not token.is_punct:
            filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [15]:
preprocess_text(corpus[2])

'Loki eat pizza'

In [17]:
corpus_processed = [preprocess_text(text) for text in corpus]
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [18]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [23]:
import pandas as pd

df = pd.read_json("news_dataset.json")
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [24]:
df["category"].value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [28]:
min_samples = 1381

df_business = df[df.category=="BUSINESS"].sample(min_samples, random_state=42)
df_sports = df[df.category=="SPORTS"].sample(min_samples, random_state=42)
df_crime = df[df.category=="CRIME"].sample(min_samples, random_state=42)
df_science = df[df.category=="SCIENCE"].sample(min_samples, random_state=42)

In [30]:
df_balanced = pd.concat([df_business, df_sports, df_crime, df_science], axis=0)
df_balanced

Unnamed: 0,text,category
594,How to Develop the Next Generation of Innovato...,BUSINESS
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS
...,...,...
9924,A Call for Data Literacy If we care about youn...,SCIENCE
10219,Here's What Happens When Someone Sneezes On An...,SCIENCE
11884,Most People Don't See How Climate Change Is Af...,SCIENCE
7854,"Watch Octopuses Meet for Blind Date, Tricky Se...",SCIENCE


In [31]:
df_balanced["category"].value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [32]:
df_balanced['category_num'] = df_balanced['category'].map(
    {
        "BUSINESS":0,
        "SPORTS":1,
        "CRIME":2,
        "SCIENCE":3
    }
)

In [34]:
df_balanced

Unnamed: 0,text,category,category_num
594,How to Develop the Next Generation of Innovato...,BUSINESS,0
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS,0
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS,0
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS,0
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS,0
...,...,...,...
9924,A Call for Data Literacy If we care about youn...,SCIENCE,3
10219,Here's What Happens When Someone Sneezes On An...,SCIENCE,3
11884,Most People Don't See How Climate Change Is Af...,SCIENCE,3
7854,"Watch Octopuses Meet for Blind Date, Tricky Se...",SCIENCE,3


In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["category_num"],
    test_size=0.2,
    stratify=df_balanced["category_num"]    
)

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [45]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer()),
    ('multi nb', MultinomialNB())
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.90      0.82       276
           1       0.93      0.85      0.89       277
           2       0.87      0.88      0.88       276
           3       0.91      0.81      0.86       276

    accuracy                           0.86      1105
   macro avg       0.87      0.86      0.86      1105
weighted avg       0.87      0.86      0.86      1105



In [46]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range=(1,2))),
    ('multi nb', MultinomialNB())
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.93      0.80       276
           1       0.94      0.81      0.87       277
           2       0.89      0.87      0.88       276
           3       0.92      0.76      0.83       276

    accuracy                           0.84      1105
   macro avg       0.86      0.84      0.84      1105
weighted avg       0.86      0.84      0.84      1105



In [47]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range=(1,3))),
    ('multi nb', MultinomialNB())
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.93      0.78       276
           1       0.95      0.79      0.86       277
           2       0.88      0.86      0.87       276
           3       0.91      0.74      0.82       276

    accuracy                           0.83      1105
   macro avg       0.85      0.83      0.83      1105
weighted avg       0.85      0.83      0.83      1105



In [49]:
df_balanced['preprocessed_text'] = df_balanced["text"].apply(preprocess_text)

In [50]:
df_balanced.head()

Unnamed: 0,text,category,category_num,preprocessed_text
594,How to Develop the Next Generation of Innovato...,BUSINESS,0,develop Generation Innovators stop treat way g...
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS,0,Madoff Victims Payout near $ 7.2 billion Trust...
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS,0,Bay Area Floats Sanctuary Transit Policy prote...
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS,0,Microsoft agree acquire linkedin $ 26.2 billio...
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS,0,inside Legal Multibillion Dollar Weed Market


In [51]:
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["preprocessed_text"],
    df_balanced["category_num"],
    test_size=0.2,
    stratify=df_balanced["category_num"]    
)

In [52]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range=(1,2))),
    ('multi nb', MultinomialNB())
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85       276
           1       0.92      0.84      0.88       276
           2       0.86      0.92      0.89       276
           3       0.93      0.84      0.88       277

    accuracy                           0.88      1105
   macro avg       0.88      0.88      0.88      1105
weighted avg       0.88      0.88      0.88      1105

