In [86]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [2]:
cv=CountVectorizer(ngram_range=(1,2))

In [5]:
cv.fit(['How can i help you'])

In [6]:
cv.vocabulary_

{'how': 4,
 'can': 0,
 'help': 2,
 'you': 6,
 'how can': 5,
 'can help': 1,
 'help you': 3}

In [7]:
corpus=[
    'Thore ate pizza',
    'Loki is tall',
    'Loki is eating pizza'
]

In [8]:
corpus

['Thore ate pizza', 'Loki is tall', 'Loki is eating pizza']

# Preprocessing

In [10]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [111]:
def preprocessing(text):
    doc=nlp(text)
    
    filtered_token=[]
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_token.append(token.lemma_)
        
    return " ".join(filtered_token)
    

In [12]:
preprocessing("our skill is very important than cgpa")

'skill important cgpa'

In [13]:
corpus_processed=[preprocessing(text) for text in corpus]

In [14]:
corpus_processed

['thore eat pizza', 'Loki tall', 'Loki eat pizza']

In [15]:
v=CountVectorizer(ngram_range=(1,2))

In [23]:
v_transfrom=v.fit_transform(corpus_processed)

In [26]:
v_transform_cv=v_transfrom.toarray()

In [30]:
v.vocabulary_

{'thore': 7,
 'eat': 0,
 'pizza': 5,
 'thore eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [18]:
v.transform(["Thore eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

what happern if we give new word which is out of corpus

In [20]:
v.transform(["Ironman eat pizza"]).toarray() # oov problem

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [21]:
v.get_feature_names_out()

array(['eat', 'eat pizza', 'loki', 'loki eat', 'loki tall', 'pizza',
       'tall', 'thore', 'thore eat'], dtype=object)

In [35]:
import pandas as pd

In [36]:
df=pd.read_json("D:/Data/news_dataset.json")

In [42]:
print(df.shape)
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [38]:
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

# Look at above catdgory has  some unbalace data

In [44]:
min_sample=1381

In [55]:
df_business=df[df.category=='BUSINESS'].sample(min_sample,random_state=2022)
df_sports=df[df.category=='SPORTS'].sample(min_sample,random_state=2022)
df_crime=df[df.category=='CRIME'].sample(min_sample,random_state=2022)
df_science=df[df.category=='SCIENCE'].sample(min_sample,random_state=2022)

In [56]:
df_business.category.value_counts()

BUSINESS    1381
Name: category, dtype: int64

In [57]:
df_science.category.value_counts()

SCIENCE    1381
Name: category, dtype: int64

In [59]:
df_balanced=pd.concat([df_business,df_crime,df_science,df_sports],axis=0)

In [62]:
df_balanced.sample(5)

Unnamed: 0,text,category
5173,'Warm Neptune' Exoplanet Atmosphere Consists O...,SCIENCE
12620,Holiday Blues: 5 Ways to Beat Holiday Overspen...,BUSINESS
11393,"How Battles Over God, Guns And Gays Infiltrate...",BUSINESS
6503,Young and Entrepreneurial: Lessons Tip'd Off C...,BUSINESS
9715,Chicago Fan's Obituary Takes Passing Shot At J...,SPORTS


In [63]:
df_balanced.category.value_counts()

BUSINESS    1381
CRIME       1381
SCIENCE     1381
SPORTS      1381
Name: category, dtype: int64

# Now see in category data is balance

In [65]:
target={'BUSINESS':0,'SPORTS':1,'CRIME':2,'SCIENCE':3}

In [67]:
df_balanced['category_num']=df_balanced.category.map(target)

In [68]:
df_balanced

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0
...,...,...,...
4052,The Problem With Asking Football Players To Ac...,SPORTS,1
2292,NFL Clubs Seek to Feed at the Public Trough On...,SPORTS,1
10994,Boxer Wearing 'America 1st' Shorts Gets Pounde...,SPORTS,1
9359,Garbine Muguruza Blows Away Venus Williams To ...,SPORTS,1


In [69]:
from sklearn.model_selection import train_test_split

In [73]:
X_train,X_test,y_train,y_test=train_test_split(
    df_balanced.text,df_balanced.category_num,test_size=0.2,random_state=2022,stratify=df_balanced.category_num)

In [75]:
X_train[7589]

'Ovulating Women Prefer Images of Penetration Over Images of Oral Sex Canadian researchers found that during ovulation the female genitalia is more responsive to images of penetration than to images of oral sex. The difference is significantly reduced during non-fertile phases of the menstrual cycle.'

In [76]:
X_train.shape

(4419,)

In [77]:
y_train.shape

(4419,)

In [78]:
X_test.shape

(1105,)

In [79]:
y_test.shape

(1105,)

In [84]:
y_test.value_counts()

1    277
0    276
3    276
2    276
Name: category_num, dtype: int64

In [89]:
clf=Pipeline([
    ('vectrizor_bow',CountVectorizer()),
    ('Multi_NB',MultinomialNB())
])

In [90]:
clf.fit(X_train,y_train)

In [92]:
y_predict=clf.predict(X_test)

In [93]:
print(classification_report(y_predict,y_test))

              precision    recall  f1-score   support

           0       0.87      0.75      0.81       321
           1       0.80      0.93      0.86       240
           2       0.90      0.83      0.86       300
           3       0.80      0.90      0.85       244

    accuracy                           0.84      1105
   macro avg       0.84      0.85      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [94]:
clf=Pipeline([
    ('vectrizor_bow',CountVectorizer(ngram_range=(1,2))),
    ('Multi_NB',MultinomialNB())
])
clf.fit(X_train,y_train)
y_predict=clf.predict(X_test)
print(classification_report(y_predict,y_test))

              precision    recall  f1-score   support

           0       0.90      0.69      0.78       361
           1       0.74      0.95      0.83       217
           2       0.88      0.82      0.85       295
           3       0.78      0.92      0.84       232

    accuracy                           0.82      1105
   macro avg       0.82      0.85      0.83      1105
weighted avg       0.84      0.82      0.82      1105



# As compared to bow is better than bow(1,2)

In [106]:
X_train[:5]

7589     Ovulating Women Prefer Images of Penetration O...
10442    Scientists Discover Spooky Influence On Baby N...
8792     Olympic Race Walker Steps Up To Propose To His...
1733     Beloved Bipedal Bear Named Pedals Believed Kil...
2526     Elizabeth Smart Gave Birth To Baby Girl, Fathe...
Name: text, dtype: object

In [97]:
y_test[:5]

3716     0
608      3
11172    3
1346     0
1356     2
Name: category_num, dtype: int64

In [98]:
y_predict[:5]

array([0, 0, 3, 0, 2], dtype=int64)

- BUSINESS :0
- SPORTS: 1
- CRIME:2
- SCIENCE:3

In [104]:
clf.score(X_test,y_test)

0.8244343891402715

# Now Apply in dataframe  preprocess step and predict it

In [112]:
df_balanced['processed_txt']=df_balanced.text.apply(preprocessing)

In [116]:
df_balanced.head()

Unnamed: 0,text,category,category_num,processed_txt
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0,GCC Business Leaders remain Confident Face Reg...
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0,Honest Review employee wake morning love impor...
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0,Mike McDerment ceo FreshBooks Talks give build...
502,How to Market Your Business While Traveling th...,BUSINESS,0,market business travel World recently amazing ...
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0,Leverage intuition decision making feel safe r...


In [117]:
X_train,X_test,y_train,y_test=train_test_split(
    df_balanced.processed_txt,df_balanced.category_num,test_size=0.2,random_state=2022,stratify=df_balanced.category_num)

In [118]:
clf=Pipeline([
    ('vectrizor_bow',CountVectorizer(ngram_range=(1,2))),
    ('Multi_NB',MultinomialNB())
])
clf.fit(X_train,y_train)
y_predict=clf.predict(X_test)
print(classification_report(y_predict,y_test))

              precision    recall  f1-score   support

           0       0.88      0.80      0.84       304
           1       0.82      0.92      0.87       247
           2       0.92      0.83      0.87       306
           3       0.81      0.90      0.85       248

    accuracy                           0.86      1105
   macro avg       0.86      0.86      0.86      1105
weighted avg       0.86      0.86      0.86      1105



In [119]:
clf.score(X_test,y_test)

0.857918552036199