In [2]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer(ngram_range=(2,2))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_


{'thor hathodawala': 4,
 'hathodawala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

In [3]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [5]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []

    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)


In [6]:
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [7]:
corpus_processed = [preprocess(text) for text in corpus]
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [9]:
v = CountVectorizer(ngram_range=(1,2))

v.fit(corpus_processed)

v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [11]:
v.transform(["Thor ate pizza"]).toarray()

array([[0, 0, 0, 0, 0, 1, 0, 1, 0]])

In [13]:
import pandas as pd

df = pd.read_json('news_dataset.json')

print(df.shape)

df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [14]:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [15]:
min_samples = 1381

df_business = df[df.category == 'BUSINESS'].sample(min_samples, random_state=2022)
df_business

Unnamed: 0,text,category
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS
2912,From the Other Side; an Honest Review from Emp...,BUSINESS
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS
502,How to Market Your Business While Traveling th...,BUSINESS
5279,How to Leverage Intuition in Decision-making I...,BUSINESS
...,...,...
4963,"Interview With Amanda Barbara of Pubslush, Cro...",BUSINESS
589,Welcome to the Age of Context-Driven Sales and...,BUSINESS
687,Crude Oil Train Derails In Montana An oil trai...,BUSINESS
2961,Helping a Person Live Like They Were Dying Lor...,BUSINESS


In [16]:
df_sports = df[df.category == 'SPORTS'].sample(min_samples, random_state=2022)
df_crime = df[df.category == 'CRIME'].sample(min_samples, random_state=2022)
df_cience = df[df.category == 'SCIENCE'].sample(min_samples, random_state=2022)


In [17]:
df_balanced = pd.concat([df_business, df_sports, df_crime, df_cience], axis=0)
df_balanced.category.value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [18]:
target = {'BUSINESS' : 0, 'SPORTS' : 1, 'CRIME' : 2, 'SCIENCE' : 3}

df_balanced['category_no'] = df_balanced.category.map(target)

In [19]:
df_balanced.head()

Unnamed: 0,text,category,category_no
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [20]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_balanced.text, df_balanced.category_no, test_size=0.2, random_state=2022, stratify=df_balanced.category_no)

In [21]:
x_train.shape


(4419,)

In [22]:
y_train.shape

(4419,)

In [23]:
y_train.value_counts()

category_no
3    1105
2    1105
0    1105
1    1104
Name: count, dtype: int64

In [25]:
y_test.value_counts()

category_no
1    277
0    276
3    276
2    276
Name: count, dtype: int64

In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    {'vectorizer' : CountVectorizer()},
    {'Multi_NB' : MultinomialNB()}
])

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_test, y_pred))

ValueError: not enough values to unpack (expected 2, got 1)