In [31]:
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import nltk
import string
from nltk.corpus import stopwords

In [32]:
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [35]:
df = pd.read_json('/content/gdrive/MyDrive/News_Category_Dataset_v3.json', lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [37]:
df['category'].value_counts()

POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       

In [38]:
selected_cat=['POLITICS', 'ENTERTAINMENT', 'U.S. NEWS', 'WORLD NEWS']

In [61]:
data=df[['category','short_description']][df['category'].isin(selected_cat)].reset_index(drop=True)

In [62]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

data['category']= label_encoder.fit_transform(data['category'])

In [63]:
data

Unnamed: 0,category,short_description
0,2,Health experts said it is too early to predict...
1,2,He was subdued by passengers and crew when he ...
2,2,Amy Cooper accused investment firm Franklin Te...
3,2,The 63-year-old woman was seen working at the ...
4,2,"""Who's that behind you?"" an anchor for New Yor..."
...,...,...
57635,0,Bow Wow needs to hire himself a new accountant...
57636,0,Fox and American Idol snagged the exclusive wo...
57637,0,Nick Stahl found himself a little short on cas...
57638,0,Representation of the collective diaspora has ...


In [64]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [66]:
## clean_text
nltk.download('stopwords')

import string
from nltk.corpus import stopwords

def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

train['clean_text'] = train['short_description'].apply(clean_text)
test['clean_text'] = test['short_description'].apply(clean_text)

## remove_stopwords

stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    words = text.split(' ')
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)
    return text

train['clean_text'] = train['clean_text'].apply(remove_stopwords)
test['clean_text'] = test['clean_text'].apply(remove_stopwords)

##  stemm_text

import nltk

stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

train['clean_text'] = train['clean_text'].apply(stemm_text)
test['clean_text'] = test['clean_text'].apply(stemm_text)

## preprocess_data

def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

train['clean_text'] = train['clean_text'].apply(preprocess_data)
test['clean_text'] = test['clean_text'].apply(preprocess_data)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [68]:
train

Unnamed: 0,category,short_description,clean_text
34931,1,ATLANTIC CITY — The Trump Plaza Casino and Hot...,atlant citi — trump plaza casino hotel close w...
42887,0,Ron Weasley gets a pilot at NBC.,ron weasley get pilot nbc
55600,1,"This week, NBA commissioner Adam Silver brough...",week nba commiss adam silver brought hammer do...
49846,1,,
4707,3,Rescue teams were still trying to reach some B...,rescu team still tri reach bahamian communiti ...
...,...,...,...
54343,0,,
38158,1,"""They are paying the price for their own extre...",pay price extrem
860,2,The 28th Screen Actors Guild Awards will kick ...,screen actor guild award kick “hamilton” reun...
15795,3,The continuing campaign to exterminate the Roh...,continu campaign extermin rohingya peopl myanm...


In [69]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer()
#fit_transform for train data
X_train = vectorizer.fit_transform(train['clean_text'])
y_train = train['category']
#transform for test data
X_test = vectorizer.transform(test['clean_text'])
y_test = test['category']

In [70]:
X_train

<46112x24433 sparse matrix of type '<class 'numpy.float64'>'
	with 393650 stored elements in Compressed Sparse Row format>

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [72]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [73]:

model = LogisticRegression(random_state = 42)
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f'acc:{acc}')

acc:0.7800065054754418


In [75]:
#Predictions on test set
test_pred = model.predict(X_test)
test_true= y_test

In [76]:
print(classification_report(test_true,test_pred,target_names=class_names,digits=4))

               precision    recall  f1-score   support

ENTERTAINMENT     0.8107    0.6228    0.7044      3417
     POLITICS     0.7721    0.9376    0.8469      7169
    U.S. NEWS     0.5000    0.0217    0.0415       277
   WORLD NEWS     0.6973    0.1940    0.3035       665

     accuracy                         0.7794     11528
    macro avg     0.6950    0.4440    0.4741     11528
 weighted avg     0.7727    0.7794    0.7539     11528

