In [13]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer(ngram_range=(2,2))
v.fit(["Thor Hathodwala is looking of a job"])
v.vocabulary_


corpus = [
    "Thor at pizza" ,
    "Loki is tall",
    "Loki is eating pizza"
]


In [14]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocessor(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop or token.is_punct])


print(preprocessor("Thor ate pizza"))


thor eat pizza


In [15]:
corpus_processed = [preprocessor(text) for text in corpus ]
corpus_processed

['thor pizza', 'Loki tall', 'Loki eat pizza']

In [16]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'pizza': 5,
 'thor pizza': 8,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'eat': 0,
 'loki eat': 3,
 'eat pizza': 1}

In [18]:
v.transform(["Thor eat pizza"]).toarray()



array([[1, 1, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [19]:
import pandas as pd

df = pd.read_json("news_dataset.json")

print(df)

df.shape

                                                    text  category
0      Watching Schrödinger's Cat Die University of C...   SCIENCE
1         WATCH: Freaky Vortex Opens Up In Flooded Lake    SCIENCE
2      Entrepreneurs Today Don't Need a Big Budget to...  BUSINESS
3      These Roads Could Recharge Your Electric Car A...  BUSINESS
4      Civilian 'Guard' Fires Gun While 'Protecting' ...     CRIME
...                                                  ...       ...
12690  Coach Shakes Hands Of Imaginary Players After ...    SPORTS
12691  This Minivan-Sized Sea Sponge Is Thought To Be...   SCIENCE
12692  RECAP: Dramatic Eclipse Photos Don't miss the ...   SCIENCE
12693  Richard Sherman Wants To Talk About Police Sho...    SPORTS
12694  Your Customers Ignore Your Emails -- How Will ...  BUSINESS

[12695 rows x 2 columns]


(12695, 2)

In [20]:
df.category.value_counts()


category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [26]:
min_sample = 1381

df_business = df[df.category == "BUSINESS"].sample(min_sample , random_state=1030)
df_sports = df[df.category == "SPORTS"].sample(min_sample , random_state=1030)
df_crime = df[df.category == "CRIME"].sample(min_sample , random_state=1030)
df_science = df[df.category == "SCIENCE"].sample(min_sample , random_state=1030)

df_balanced = pd.concat( [df_business , df_crime , df_sports , df_science ] )
df_balanced.category.value_counts()



category
BUSINESS    1381
CRIME       1381
SPORTS      1381
SCIENCE     1381
Name: count, dtype: int64

In [52]:
df_balanced["category_num"] = df_balanced.category.map( {
    "BUSINESS" : 0,
    "SPORTS" : 1,
    "CRIME" : 2,
    "SCIENCE" : 3
} )

df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [29]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(
    df_balanced.text , 
    df_balanced.category_num ,
    test_size= 0.2,
    random_state=2022,
    stratify=df_balanced.category_num
)



In [31]:
X_train.shape

(4419,)

In [34]:
y_test.value_counts()

category_num
1    277
0    276
3    276
2    276
Name: count, dtype: int64

In [50]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


clf = Pipeline(
    [
        ("vectorizer_bow" , CountVectorizer()),
        ("Multi NB" , MultinomialNB())
    ]
)

# train
clf.fit(X_train, y_train)

# predict
y_pred = clf.predict(X_test)


print(classification_report(y_test , y_pred) )


              precision    recall  f1-score   support

           0       0.81      0.87      0.84       276
           1       0.90      0.86      0.88       277
           2       0.86      0.90      0.88       276
           3       0.88      0.82      0.85       276

    accuracy                           0.86      1105
   macro avg       0.86      0.86      0.86      1105
weighted avg       0.86      0.86      0.86      1105



In [38]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


clf = Pipeline(
    [
        ("vectorizer_bow" , CountVectorizer(ngram_range=(1,2))),
        ("Multi NB" , MultinomialNB())
    ]
)

# train
clf.fit(X_train, y_train)

# predict
y_pred = clf.predict(X_test)


print(classification_report(y_test , y_pred) )


              precision    recall  f1-score   support

           0       0.72      0.93      0.81       276
           1       0.94      0.81      0.87       277
           2       0.91      0.89      0.90       276
           3       0.90      0.78      0.83       276

    accuracy                           0.85      1105
   macro avg       0.87      0.85      0.85      1105
weighted avg       0.87      0.85      0.85      1105



In [40]:
#check accuracy

X_test[:4]

8971    AT&T's Fiber Optic Construction Last Six Years...
4634    The Crisis of Confidence in Medical Research I...
2064    Think Weather Forecasts Are Bad? Try Forecasti...
7619    How to Find Your Business Leadership Blind Spo...
Name: text, dtype: object

In [41]:
y_test[:4]

8971    0
4634    3
2064    3
7619    0
Name: category_num, dtype: int64

In [42]:
y_pred[:4]

array([0, 3, 3, 0], dtype=int64)

# after preprocess

In [44]:
df_balanced["preprecess_text"] = df_balanced.text.apply(preprocessor)

df_balanced.head()

Unnamed: 0,text,category,category_num,preprecess_text
3940,"Economists Are Wrong To boil things down, ther...",BUSINESS,0,"economist wrong boil thing , road follow envir..."
2971,Hidden-Camera Video Reveals Chicken McNuggets'...,BUSINESS,0,Hidden - Camera Video reveal Chicken McNuggets...
8442,5 Things That Made Our Start-Up Work Our missi...,BUSINESS,0,"5 thing start - work mission ground , strategy..."
2214,Chevron's Ecuador Plan B The big news this wee...,BUSINESS,0,Chevron Ecuador Plan B big news week Chevron -...
9536,Deutsche Bank Won't Expand In North Carolina B...,BUSINESS,0,Deutsche Bank will expand North Carolina Anti ...


In [45]:
X_train , X_test , y_train , y_test = train_test_split(
    df_balanced.preprecess_text , 
    df_balanced.category_num ,
    test_size= 0.2,
    random_state=2022,
    stratify=df_balanced.category_num
)

In [46]:
clf = Pipeline(
    [
        ("vectorizer_bow" , CountVectorizer()),
        ("Multi NB" , MultinomialNB())
    ]
)

# train
clf.fit(X_train, y_train)

# predict
y_pred = clf.predict(X_test)


print(classification_report(y_test , y_pred) )

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       276
           1       0.90      0.86      0.88       277
           2       0.86      0.90      0.88       276
           3       0.88      0.82      0.85       276

    accuracy                           0.86      1105
   macro avg       0.86      0.86      0.86      1105
weighted avg       0.86      0.86      0.86      1105

