In [31]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import re

In [32]:
# Load the news-category dataset. lines=True tells pandas the file holds one
# JSON object per line rather than a single JSON document.
DATASET_PATH = 'News_Category_Dataset_v3.json'
df = pd.read_json(DATASET_PATH, lines=True)

df.head(50)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
5,https://www.huffpost.com/entry/belk-worker-fou...,Cleaner Was Dead In Belk Bathroom For 4 Days B...,U.S. NEWS,The 63-year-old woman was seen working at the ...,,2022-09-22
6,https://www.huffpost.com/entry/reporter-gets-a...,Reporter Gets Adorable Surprise From Her Boyfr...,U.S. NEWS,"""Who's that behind you?"" an anchor for New Yor...",Elyse Wanshel,2022-09-22
7,https://www.huffpost.com/entry/puerto-rico-wat...,Puerto Ricans Desperate For Water After Hurric...,WORLD NEWS,More than half a million people remained witho...,"DÁNICA COTO, AP",2022-09-22
8,https://www.huffpost.com/entry/mija-documentar...,How A New Documentary Captures The Complexity ...,CULTURE & ARTS,"In ""Mija,"" director Isabel Castro combined mus...",Marina Fang,2022-09-22
9,https://www.huffpost.com/entry/biden-un-russia...,Biden At UN To Call Russian War An Affront To ...,WORLD NEWS,White House officials say the crux of the pres...,"Aamer Madhani, AP",2022-09-21


In [33]:
# Several labels in the raw data are near-duplicates of each other. Each
# canonical label below lists the alternative spellings that are folded into it.
_CANONICAL_TO_ALIASES = {
    'ARTS & CULTURE': ('ARTS', 'CULTURE & ARTS'),
    'PARENTING': ('PARENTS',),
    'WORLDPOST': ('THE WORLDPOST',),
    'ENVIRONMENT': ('GREEN',),
    'FOOD & DRINK': ('TASTE',),
    'WELLNESS': ('HEALTHY LIVING',),
    'STYLE & BEAUTY': ('STYLE',),
    'BUSINESS': ('MONEY',),
}

# Invert into the alias -> canonical lookup that Series.replace expects.
category_map = {
    alias: canonical
    for canonical, aliases in _CANONICAL_TO_ALIASES.items()
    for alias in aliases
}

# Rewrite the label column; labels that are not keys of the map are left as-is.
df['category'] = df['category'].replace(category_map)

# Show the class distribution after consolidation.
print(df['category'].value_counts())

category
POLITICS          35602
WELLNESS          24639
ENTERTAINMENT     17362
PARENTING         12746
STYLE & BEAUTY    12068
TRAVEL             9900
FOOD & DRINK       8436
BUSINESS           7748
QUEER VOICES       6347
WORLDPOST          6243
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
ENVIRONMENT        4066
ARTS & CULTURE     3922
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
RELIGION           2577
SCIENCE            2206
TECH               2104
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
COLLEGE            1144
LATINO VOICES      1130
EDUCATION          1014
Name: count, dtype: int64


In [34]:
def clean_text(text):
    """Normalize raw text before it is handed to the vectorizer.

    The input is lower-cased, every character that is neither an ASCII
    letter nor whitespace is deleted (digits and punctuation are removed,
    not replaced by a space, so "bird-watcher" becomes "birdwatcher"), and
    whitespace runs are collapsed to single spaces with the ends trimmed.

    Args:
        text: Raw string to clean. Non-string values such as None or NaN
            are treated as missing text.

    Returns:
        The cleaned string, or '' when the input is missing or not a string.
    """
    # Missing values reach this function as None/NaN (a float); calling
    # .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''
    # After lower-casing no ASCII upper-case letter remains, so a-z suffices.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    # Deleting digits/punctuation leaves doubled spaces behind; normalize them.
    return ' '.join(letters_only.split())

# Build the model input from headline + short description.
# Missing pieces are replaced with '' BEFORE concatenating: NaN + str yields
# NaN, which would throw away the half that is present and hand a float to
# clean_text. Filling first keeps whatever text the row does have.
combined_text = df['headline'].fillna('') + " " + df['short_description'].fillna('')
df['text'] = combined_text.apply(clean_text)

# Keep only the modelling columns and discard rows without a label.
# NOTE: df is overwritten here, so re-running this cell without re-running
# the load cell fails because 'headline' is no longer a column.
df = df[['text', 'category']].dropna()

# Rows whose text is blank after cleaning are empty strings, not NaN, so
# dropna() does not catch them; remove them explicitly.
df = df[df['text'].str.strip() != '']

print("Number of samples:", len(df))
df.head(50)

Number of samples: 209527


Unnamed: 0,text,category
0,over million americans roll up sleeves for om...,U.S. NEWS
1,american airlines flyer charged banned for lif...,U.S. NEWS
2,of the funniest tweets about cats and dogs th...,COMEDY
3,the funniest tweets from parents this week sep...,PARENTING
4,woman who called cops on black birdwatcher los...,U.S. NEWS
5,cleaner was dead in belk bathroom for days be...,U.S. NEWS
6,reporter gets adorable surprise from her boyfr...,U.S. NEWS
7,puerto ricans desperate for water after hurric...,WORLD NEWS
8,how a new documentary captures the complexity ...,ARTS & CULTURE
9,biden at un to call russian war an affront to ...,WORLD NEWS


In [35]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label column and uses a fixed seed so it is repeatable.
features = df['text']
labels = df['category']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [36]:
# TF-IDF features over unigrams and bigrams with English stop words removed.
# min_df / max_df prune terms by document frequency (absolute count of 5 at
# the low end, a 0.8 proportion of documents at the high end).
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
)

# The vocabulary is learned from the training split only; the test split is
# transformed with that already-fitted vocabulary.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [37]:
# Logistic regression on the TF-IDF features. class_weight='balanced' is
# passed because the category counts are heavily skewed; max_iter is raised
# to 300 iterations.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=300,
)
# Left as the last expression so the fitted estimator is displayed.
model.fit(x_train_vec, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,300


In [38]:
# Predict on the held-out split, then report overall accuracy and the
# per-class precision / recall / F1 table.
y_pred = model.predict(x_test_vec)

print("Accuracy", accuracy_score(y_test, y_pred))
# classification_report returns pre-aligned text. With print()'s default
# separator a space is inserted in front of the report's first line, pushing
# the header row one column out of line with the values; sep="" avoids that.
print("\nClassification Report:\n", classification_report(y_test, y_pred), sep="")

Accuracy 0.621080513530282

Classification Report:
                 precision    recall  f1-score   support

ARTS & CULTURE       0.47      0.59      0.52       784
  BLACK VOICES       0.46      0.51      0.48       917
      BUSINESS       0.54      0.59      0.56      1550
       COLLEGE       0.36      0.59      0.45       229
        COMEDY       0.48      0.52      0.50      1080
         CRIME       0.46      0.64      0.54       712
       DIVORCE       0.71      0.77      0.74       685
     EDUCATION       0.32      0.58      0.41       203
 ENTERTAINMENT       0.74      0.58      0.65      3473
   ENVIRONMENT       0.46      0.57      0.51       813
         FIFTY       0.16      0.34      0.21       280
  FOOD & DRINK       0.73      0.80      0.76      1687
     GOOD NEWS       0.20      0.36      0.26       280
 HOME & LIVING       0.68      0.80      0.74       864
        IMPACT       0.30      0.40      0.34       697
 LATINO VOICES       0.34      0.52      0.41      

In [41]:
# Persist the fitted classifier together with the vectorizer it was trained
# with, so both can be reloaded as a pair.
# NOTE(review): the extension is '.pk1' (digit one), probably meant to be
# '.pkl'. Left unchanged because code that loads these files may rely on
# the current names - confirm before renaming.
for artifact, filename in (
    (model, 'topic_model.pk1'),
    (vectorizer, 'topic_vectorizer.pk1'),
):
    joblib.dump(artifact, filename)

print("Model and vectorizer saved.")

Model and vectorizer saved.
