In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import spacy 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix 

In [2]:
v = CountVectorizer(ngram_range = (1,3))
v.fit(['Thor Hatwala is looking for a job'])

In [3]:
v.vocabulary_

{'thor': 12,
 'hatwala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hatwala': 13,
 'hatwala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hatwala is': 14,
 'hatwala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [4]:
nlp = spacy.load('en_core_web_sm') # load an english model library 

In [5]:
def preprocess(text):
    doc = nlp(text)
    filterd_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filterd_tokens.append(token.lemma_)
        
    return ' '.join(filterd_tokens)

In [6]:
corpus = ['Thor ate pizza','Loki is tall','Loki eating pizza']

In [7]:
corpus_processed = [ preprocess(text) for text in corpus ]
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [8]:
v = CountVectorizer(ngram_range = (1,2))

In [9]:
v.fit(corpus_processed)

In [10]:
data_fram = pd.read_json('News_Category_Dataset_v3.json',lines=True)

In [11]:
data_fram.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [12]:
category_list = list(data_fram.category.value_counts().index) # Imbalance model is there So we have to handle the class imbalance 
category_list

['POLITICS',
 'WELLNESS',
 'ENTERTAINMENT',
 'TRAVEL',
 'STYLE & BEAUTY',
 'PARENTING',
 'HEALTHY LIVING',
 'QUEER VOICES',
 'FOOD & DRINK',
 'BUSINESS',
 'COMEDY',
 'SPORTS',
 'BLACK VOICES',
 'HOME & LIVING',
 'PARENTS',
 'THE WORLDPOST',
 'WEDDINGS',
 'WOMEN',
 'CRIME',
 'IMPACT',
 'DIVORCE',
 'WORLD NEWS',
 'MEDIA',
 'WEIRD NEWS',
 'GREEN',
 'WORLDPOST',
 'RELIGION',
 'STYLE',
 'SCIENCE',
 'TECH',
 'TASTE',
 'MONEY',
 'ARTS',
 'ENVIRONMENT',
 'FIFTY',
 'GOOD NEWS',
 'U.S. NEWS',
 'ARTS & CULTURE',
 'COLLEGE',
 'LATINO VOICES',
 'CULTURE & ARTS',
 'EDUCATION']

In [13]:
min_splict = 1014 
df_POLITICS		       = data_fram[data_fram.category == 'POLITICS'      ].sample(min_splict,random_state = 1)                          
df_WELLNESS            = data_fram[data_fram.category ==  'WELLNESS'     ].sample(min_splict,random_state = 1)                          
df_ENTERTAINMENT       = data_fram[data_fram.category ==  'ENTERTAINMENT'].sample(min_splict,random_state = 1)                          
df_TRAVEL              = data_fram[data_fram.category ==  'TRAVEL'       ].sample(min_splict,random_state = 1)                          
df_STYLE__BEAUTY       = data_fram[data_fram.category ==  'STYLE & BEAUTY' ].sample(min_splict,random_state = 1)                        
df_PARENTING           = data_fram[data_fram.category ==  'PARENTING'    ].sample(min_splict,random_state = 1)                          
df_HEALTHY_LIVING      = data_fram[data_fram.category ==  'HEALTHY LIVING'].sample(min_splict,random_state = 1)                         
df_QUEER_VOICES        = data_fram[data_fram.category ==  'QUEER VOICES' ].sample(min_splict,random_state = 1)                          
df_FOOD_DRINK          = data_fram[data_fram.category =='FOOD & DRINK'   ].sample(min_splict,random_state = 1)                          
df_BUSINESS            = data_fram[data_fram.category ==  'BUSINESS'     ].sample(min_splict,random_state = 1)                          
df_COMEDY              = data_fram[data_fram.category ==  'COMEDY'       ].sample(min_splict,random_state = 1)                          
df_SPORTS              = data_fram[data_fram.category ==  'SPORTS'       ].sample(min_splict,random_state = 1)                          
df_BLACK_VOICES        = data_fram[data_fram.category ==  'BLACK VOICES' ].sample(min_splict,random_state = 1)                          
df_HOME_LIVING         =   data_fram[data_fram.category =='HOME & LIVING'].sample(min_splict,random_state = 1)                            
df_PARENTS             = data_fram[data_fram.category ==  'PARENTS'      ].sample(min_splict,random_state = 1)                          
df_THE_WORLDPOST       = data_fram[data_fram.category ==  'THE WORLDPOST'].sample(min_splict,random_state = 1)                          
df_WEDDINGS            = data_fram[data_fram.category ==  'WEDDINGS'     ].sample(min_splict,random_state = 1)                          
df_WOMEN               = data_fram[data_fram.category ==  'WOMEN'        ].sample(min_splict,random_state = 1)                          
df_CRIME               = data_fram[data_fram.category ==  'CRIME'        ].sample(min_splict,random_state = 1)                          
df_IMPACT              = data_fram[data_fram.category ==  'IMPACT'       ].sample(min_splict,random_state = 1)                          
df_DIVORCE             = data_fram[data_fram.category ==  'DIVORCE'      ].sample(min_splict,random_state = 1)                          
df_WORLD_NEWS          = data_fram[data_fram.category ==  'WORLD NEWS'   ].sample(min_splict,random_state = 1)                          
df_MEDIA               = data_fram[data_fram.category ==  'MEDIA'        ].sample(min_splict,random_state = 1)                          
df_WEIRD_NEWS          = data_fram[data_fram.category ==  'WEIRD NEWS'   ].sample(min_splict,random_state = 1)                          
df_GREEN               = data_fram[data_fram.category ==  'GREEN'        ].sample(min_splict,random_state = 1)                          
df_WORLDPOST           = data_fram[data_fram.category ==  'WORLDPOST'    ].sample(min_splict,random_state = 1)                          
df_RELIGION            = data_fram[data_fram.category ==  'RELIGION'     ].sample(min_splict,random_state = 1)                          
df_STYLE               = data_fram[data_fram.category ==  'STYLE'        ].sample(min_splict,random_state = 1)                          
df_SCIENCE             = data_fram[data_fram.category ==  'SCIENCE'      ].sample(min_splict,random_state = 1)                          
df_TECH                = data_fram[data_fram.category ==  'TECH'         ].sample(min_splict,random_state = 1)                          
df_TASTE               = data_fram[data_fram.category ==  'TASTE'        ].sample(min_splict,random_state = 1)                          
df_MONEY               = data_fram[data_fram.category ==   'MONEY'        ].sample(min_splict,random_state = 1)                          
df_ARTS                = data_fram[data_fram.category ==   'ARTS'         ].sample(min_splict,random_state = 1)                          
df_ENVIRONMENT         = data_fram[data_fram.category ==   'ENVIRONMENT'  ].sample(min_splict,random_state = 1)                          
df_FIFTY               = data_fram[data_fram.category ==   'FIFTY'        ].sample(min_splict,random_state = 1)                          
df_GOOD_NEWS           = data_fram[data_fram.category ==   'GOOD NEWS'    ].sample(min_splict,random_state = 1)                          
df_U_S_NEWS            = data_fram[data_fram.category ==   'U.S. NEWS'    ].sample(min_splict,random_state = 1)                          
df_ARTS_CULTURE        = data_fram[data_fram.category ==   'ARTS & CULTURE'].sample(min_splict,random_state = 1)                         
df_COLLEGE             = data_fram[data_fram.category ==   'COLLEGE'      ].sample(min_splict,random_state = 1)                          
df_LATINO_VOICES       = data_fram[data_fram.category ==   'LATINO VOICES'].sample(min_splict,random_state = 1)                          
df_CULTURE_ARTS        = data_fram[data_fram.category ==   'CULTURE & ARTS'].sample(min_splict,random_state = 1)                          
df_EDUCATION           = data_fram[data_fram.category ==   'EDUCATION'    ] .sample(min_splict,random_state = 1)                         

    

In [14]:
balance_df = pd.concat([
    df_POLITICS		  ,
    df_WELLNESS       ,
    df_ENTERTAINMENT  ,
    df_TRAVEL         ,
    df_STYLE__BEAUTY  ,
    df_PARENTING      ,
    df_HEALTHY_LIVING ,
    df_QUEER_VOICES   ,
    df_FOOD_DRINK     ,
    df_BUSINESS       ,
    df_COMEDY         ,
    df_SPORTS         ,
    df_BLACK_VOICES   ,
    df_HOME_LIVING    ,
    df_PARENTS        ,
    df_THE_WORLDPOST  ,
    df_WEDDINGS       ,
    df_WOMEN          ,
    df_CRIME          ,
    df_IMPACT         ,
    df_DIVORCE        ,
    df_WORLD_NEWS     ,
    df_MEDIA          ,
    df_WEIRD_NEWS     ,
    df_GREEN          ,
    df_WORLDPOST      ,
    df_RELIGION       ,
    df_STYLE          ,
    df_SCIENCE        ,
    df_TECH           ,
    df_TASTE          ,
    df_MONEY          ,
    df_ARTS           ,
    df_ENVIRONMENT    ,
    df_FIFTY          ,
    df_GOOD_NEWS      ,
    df_U_S_NEWS       ,
    df_ARTS_CULTURE   ,
    df_COLLEGE        ,
    df_LATINO_VOICES  ,
    df_CULTURE_ARTS   ,
    df_EDUCATION ],axis = 0 )

In [15]:
category_into_integer = { }
count = 1
for cat in list(balance_df.category.value_counts().index):   
    category_into_integer[cat] = count
    count += 1 

In [16]:
balance_df['category_num'] = balance_df.category.map(category_into_integer)

In [17]:
data_fram['category_num'] = data_fram.category.map(category_into_integer)

In [18]:
balance_df

Unnamed: 0,link,headline,category,short_description,authors,date,category_num
46904,https://www.huffingtonpost.com/entry/house-gop...,House GOP Faceplant On Ethics Coup Shows Publi...,POLITICS,"Public outrage blocked a secret, midnight effo...",Ryan Grim and Matt Fuller,2017-01-03,1
90109,https://www.huffingtonpost.com/entry/patrick-m...,Rep. Patrick Murphy Comes Out In Favor Of Iran...,POLITICS,Many other Florida Democrats remain undecided.,Amanda Terkel,2015-08-31,1
74422,https://www.huffingtonpost.com/entry/senators-...,Senators Strike Funding Deal To Help Flint Rep...,POLITICS,The measure was fast-tracked so a vote could c...,Laura Barrón-López,2016-02-24,1
13027,https://www.huffingtonpost.com/entry/wednesday...,Wednesday's Morning Email: Stormy Daniels Suin...,POLITICS,A porn star is suing the president; let that s...,Eliot Nelson,2018-03-07,1
57851,https://www.huffingtonpost.com/entry/lenore-an...,Crime Survivors Are Organizing. They Want Crim...,POLITICS,A troubled teen turned prosecutor is bringing ...,Nico Pitney,2016-08-31,1
...,...,...,...,...,...,...,...
116356,https://www.huffingtonpost.com/entry/where-all...,Where All the Teachers Are Above Aveage,EDUCATION,Unless we rethink administrative and teacher t...,"Marc F. Bernstein, ContributorAdjunct faculty ...",2014-11-03,42
21257,https://www.huffingtonpost.com/entry/universit...,University Of Wisconsin-Inferior,EDUCATION,The University of Wisconsin-Superior became th...,"John A. Tures, ContributorProfessor of Politic...",2017-11-03,42
126826,https://www.huffingtonpost.com/entry/new-sat_b...,What Do We Know About the New SAT?,EDUCATION,There is a perception that we don't yet know m...,"Alex Mallory, ContributorFounder & President, ...",2014-07-07,42
39251,https://www.huffingtonpost.com/entry/heartland...,Climate Change-Deniers 'Spam' Thousands Of Tea...,EDUCATION,"""After teaching about #climatechange all day, ...",Nick Visser,2017-03-30,42


In [19]:
X_train,X_test,y_train,y_test = train_test_split(balance_df.short_description,balance_df.category_num,
                                                test_size=0.2,random_state=11,stratify=balance_df.category_num)

In [20]:
clf = Pipeline([
    ('vectorizer_bow',CountVectorizer()),
    ('multi NB', MultinomialNB())
])


In [21]:
clf.fit(X_train,y_train)

In [22]:
y_pred = clf.predict(X_test)

In [23]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.31      0.16      0.21       203
           2       0.29      0.58      0.39       203
           3       0.75      0.01      0.03       203
           4       0.31      0.11      0.17       203
           5       0.27      0.31      0.29       203
           6       0.48      0.14      0.21       203
           7       0.31      0.02      0.04       203
           8       0.63      0.13      0.21       203
           9       0.05      0.26      0.08       202
          10       0.40      0.09      0.15       202
          11       0.22      0.10      0.14       203
          12       0.16      0.57      0.25       203
          13       0.32      0.34      0.33       203
          14       0.07      0.50      0.12       203
          15       0.43      0.01      0.03       203
          16       0.27      0.41      0.33       203
          17       0.80      0.04      0.08       203
          18       0.33    

In [24]:
y_test.head()

94479      9
84583     38
130285    29
75239     37
57604     30
Name: category_num, dtype: int64

In [25]:
y_pred[:5] 

array([32, 38,  2, 28, 36], dtype=int64)

In [None]:
data_fram_remove_stop = data_fram['short_description'].apply(preprocess)

In [26]:
X_train,X_test,y_train,y_test = train_test_split(data_fram.short_description,data_fram.category_num,test_size=0.2,random_state=13)

In [27]:
clf.fit(X_train,y_train)

In [28]:
y_pred = clf.predict(X_test)

In [29]:
print(classification_report(y_test,y_pred)) # it was giving the 40 % Accuracy Bez of the Class imbalance and lots of data 

              precision    recall  f1-score   support

           1       0.37      0.87      0.52      7062
           2       0.25      0.00      0.01       302
           3       0.67      0.00      0.01       573
           4       0.31      0.02      0.03       533
           5       0.16      0.01      0.01       495
           6       0.75      0.04      0.07       507
           7       0.25      0.00      0.01       471
           8       0.80      0.04      0.08       464
           9       0.73      0.02      0.04       427
          10       0.00      0.00      0.00       407
          11       0.00      0.00      0.00       307
          12       0.30      0.85      0.44      3634
          13       0.75      0.01      0.02       273
          14       0.00      0.00      0.00       279
          15       0.00      0.00      0.00       262
          16       0.00      0.00      0.00       262
          17       0.00      0.00      0.00       264
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
y_pred[0:5]

array([12,  1,  1,  1, 25], dtype=int64)

In [31]:
y_test.head()

161307    12
9308       1
34846     33
98126     29
159214    27
Name: category_num, dtype: int64