In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json


In [5]:
import pandas as pd

# Read the line-delimited JSON dataset and keep just the two columns used below.
df = pd.read_json(
    '/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json',
    lines=True,
).loc[:, ['headline', 'category']]

print(df.shape)

df.head()

(209527, 2)


Unnamed: 0,headline,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [7]:

# Number of headlines per category across the whole dataset.
df['category'].value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [8]:
# Keep only the headlines whose category is one of the four we want to classify.
desired_categories = ['HEALTHY LIVING', 'ENTERTAINMENT', 'QUEER VOICES', 'PARENTING']
in_desired_category = df['category'].isin(desired_categories)
df_new = df.loc[in_desired_category]
df_new.head()

Unnamed: 0,headline,category
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
20,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT
28,James Cameron Says He 'Clashed' With Studio Be...,ENTERTAINMENT
39,Amazon Greenlights 'Blade Runner 2099' Limited...,ENTERTAINMENT
43,'The Phantom Of The Opera' To Close On Broadwa...,ENTERTAINMENT


In [None]:
# Number of headlines per category after filtering.
df_new['category'].value_counts()

In [9]:
# Balance the classes by down-sampling every category to the size of the
# smallest one. In this dataset that is QUEER VOICES with 6347 headlines;
# deriving the number from the data keeps the cell correct if the category
# list or the dataset version changes.
min_samples = int(df_new['category'].value_counts().min())


def _sample_category(category_name):
    """Return `min_samples` randomly chosen rows of `df_new` for one category.

    The fixed `random_state` makes the sample reproducible across runs.
    """
    category_rows = df_new[df_new['category'] == category_name]
    return category_rows.sample(min_samples, random_state=2022)


df_ENTERTAINMENT = _sample_category("ENTERTAINMENT")
df_PARENTING = _sample_category("PARENTING")
df_HEALTHYLIVING = _sample_category("HEALTHY LIVING")
# NOTE: despite its name this frame holds the QUEER VOICES rows; the name is
# kept because the concat cell that follows refers to `df_EDUCATION`.
df_EDUCATION = _sample_category("QUEER VOICES")

In [11]:
# Stack the four per-category samples into one frame and confirm the class counts.
balanced_parts = [df_ENTERTAINMENT, df_PARENTING, df_HEALTHYLIVING, df_EDUCATION]
df_balanced = pd.concat(balanced_parts)
df_balanced['category'].value_counts()

category
ENTERTAINMENT     6347
PARENTING         6347
HEALTHY LIVING    6347
QUEER VOICES      6347
Name: count, dtype: int64

In [12]:
# Integer label for each category. This single dict is the source of truth for
# the numeric target column (previously the same literal was repeated in .map()).
target = {'ENTERTAINMENT': 0, 'PARENTING': 1, 'HEALTHY LIVING': 2, 'QUEER VOICES': 3}

df_balanced['category_num'] = df_balanced['category'].map(target)

# .map() silently yields NaN for any category missing from `target`; fail loudly instead.
assert df_balanced['category_num'].notna().all(), "unmapped category in df_balanced"

In [13]:
# Peek at the last five rows to check the new category_num column.
df_balanced.tail(5)


Unnamed: 0,headline,category,category_num
174448,Vietnamese Lunar New Year: Time for Racism and...,QUEER VOICES,3
125613,This Is What CeCe McDonald Thinks We Should Do...,QUEER VOICES,3
11561,Jay-Z: 'I Cried Because I Was So Happy' For My...,QUEER VOICES,3
32072,"No Space, No Matter How Progressive, Is Imperm...",QUEER VOICES,3
132744,"I Want You, Gentle Reader, to Lighten Up!",QUEER VOICES,3


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced headlines for testing; stratifying on the label
# keeps the class proportions the same in both splits.
raw_headlines = df_balanced.headline
labels = df_balanced.category_num
X_train, X_test, y_train, y_test = train_test_split(
    raw_headlines, labels, test_size=0.2, random_state=2023, stratify=labels
)

In [15]:
# Size of the training split, followed by a preview of its first headlines.
train_shape = X_train.shape
print(train_shape)
X_train.head()

(20310,)


103784           What's It Like to Be Deaf and Gay? (VIDEO)
114583           How To Plan The Perfect Paleo Thanksgiving
36503     Watch Jamie Lynn Spears Surprise Britney At Th...
120268    Watch The First Trailer For Tim Burton's New M...
46642     Veteran Indian Character Actor Om Puri Dead At 66
Name: headline, dtype: object

In [16]:
# Class distribution of the training labels.
y_train.value_counts()

category_num
2    5078
0    5078
3    5077
1    5077
Name: count, dtype: int64

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# 1. Bag-of-words features (unigrams + bigrams) feeding a multinomial Naive Bayes classifier
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# 2-3. Train on the training split, then predict the held-out split
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1269
           1       0.79      0.77      0.78      1270
           2       0.80      0.82      0.81      1269
           3       0.82      0.84      0.83      1270

    accuracy                           0.82      5078
   macro avg       0.82      0.82      0.82      5078
weighted avg       0.82      0.82      0.82      5078



In [20]:
# Imported here so the cell runs on a fresh kernel; no earlier visible cell imports joblib.
import joblib

# Persist the fitted pipeline so it can be reloaded later without retraining.
model_filename = 'multinomial_nb_text_model.joblib'
joblib.dump(clf, model_filename)
print(f"Model saved to {model_filename}")

Model saved to multinomial_nb_text_model.joblib


In [14]:
import spacy

# Load the "en_core_web_sm" English pipeline once; the resulting `nlp` object is
# what preprocess() calls on every headline.
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`; the lemmas of
    the tokens that remain are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [16]:
# Run every headline through preprocess() and store the result in a new column.
df_balanced['preprocessed_txt'] = df_balanced['headline'].map(preprocess)


In [17]:
# Display the frame to compare each headline with its preprocessed_txt value.
df_balanced

Unnamed: 0,headline,category,category_num,preprocessed_txt
11882,"R. Kelly Trained 14-Year-Old Girl As Sex ‘Pet,...",ENTERTAINMENT,0,R. Kelly train 14 Year Old Girl Sex pet Ex Gir...
109872,Des Doyle and Ryan Patrick McGuffey Learn the ...,ENTERTAINMENT,0,Des Doyle Ryan Patrick McGuffey learn Ropes sh...
85971,Ryan Seacrest Sells 'Squad Goals' Series To CBS,ENTERTAINMENT,0,Ryan Seacrest Sells Squad Goals series CBS
88565,Jessica Simpson's HSN Appearance Has Some Scra...,ENTERTAINMENT,0,Jessica Simpson HSN Appearance scratch head
42149,Second Accuser May Testify Against Bill Cosby ...,ENTERTAINMENT,0,Second Accuser testify Bill Cosby Criminal Case
...,...,...,...,...
174448,Vietnamese Lunar New Year: Time for Racism and...,QUEER VOICES,3,Vietnamese Lunar New Year time Racism Homophobia
125613,This Is What CeCe McDonald Thinks We Should Do...,QUEER VOICES,3,CeCe McDonald think Queer People safe
11561,Jay-Z: 'I Cried Because I Was So Happy' For My...,QUEER VOICES,3,Jay Z cry happy mom come
32072,"No Space, No Matter How Progressive, Is Imperm...",QUEER VOICES,3,Space matter progressive impermeable Bigotry


In [23]:
from sklearn.model_selection import train_test_split

# Same 80/20 stratified split as before (same seed), but on the preprocessed text.
preprocessed_headlines = df_balanced.preprocessed_txt
labels = df_balanced.category_num
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_headlines, labels, test_size=0.2, random_state=2023, stratify=labels
)

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# 1. Bag-of-words features (unigrams + bigrams) feeding a multinomial Naive Bayes classifier
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# 2-3. Train on the training split, then predict the held-out split
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.86      0.84      1269
           1       0.79      0.79      0.79      1270
           2       0.84      0.77      0.81      1269
           3       0.82      0.84      0.83      1270

    accuracy                           0.81      5078
   macro avg       0.82      0.81      0.81      5078
weighted avg       0.82      0.81      0.81      5078



In [31]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 1. TF-IDF features (unigrams + bigrams, English stop words removed) into multinomial Naive Bayes
clf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('multi_nb', MultinomialNB()),
])

# 2. Five shuffled, stratified folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)

# 3. Macro-F1 per fold, computed on the training split only
scores = cross_val_score(
    estimator=clf, X=X_train, y=y_train, scoring='f1_macro', cv=cv, n_jobs=-1
)

print("F1 macro scores for each fold:", scores)
print("Mean F1 macro score:", scores.mean())


F1 macro scores for each fold: [0.81406709 0.80769428 0.82286038 0.81039396 0.81191834]
Mean F1 macro score: 0.8133868099898743
