In [38]:
import pandas as pd
import re
import spacy
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# model building imports
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding, Flatten, Dense, Dropout
from keras.layers import Conv1D, SimpleRNN, Bidirectional, MaxPooling1D, GlobalMaxPool1D, LSTM, GRU
from keras.models import Sequential
from keras.regularizers import L1L2
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split, cross_val_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


In [2]:
en = spacy.load('en_core_web_md')
STOPWORDS = en.Defaults.stop_words

df = pd.read_json('../model/News_Category_Dataset_v3.json', lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
new_df = df.drop(columns=['authors','link','date'])
new_df.head()

Unnamed: 0,headline,category,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...


In [4]:
final_df = new_df.copy()
final_df['length_of_news'] = final_df['headline'] + final_df['short_description']
final_df.drop(['headline','short_description'], inplace=True, axis=1)
final_df['len_news'] = final_df['length_of_news'].map(lambda x: len(x))
final_df.head()

Unnamed: 0,category,length_of_news,len_news
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,230
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",248
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,133
3,PARENTING,The Funniest Tweets From Parents This Week (Se...,215
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,233


In [5]:
def datacleaning(text):
    whitespace = re.compile(r"\s+")
    user = re.compile(r"(?i)@[a-z0-9_]+")
    text = whitespace.sub(' ', text)
    text = user.sub('', text)
    text = re.sub(r"\[[^()]*\]","", text)
    text = re.sub("\d+", "", text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r"(?:@\S*|#\S*|http(?=.*://)\S*)", "", text)
    text = text.lower()
    
    text = [word for word in text.split() if word not in list(STOPWORDS)]

    return ' '.join(text)

In [6]:
ndf = final_df.copy()
ndf.drop('len_news', inplace=True, axis=1)
ndf['length_of_news'] = ndf['length_of_news'].apply(datacleaning)
ndf.head()

Unnamed: 0,category,length_of_news
0,U.S. NEWS,million americans roll sleeves omicrontargeted...
1,U.S. NEWS,american airlines flyer charged banned life pu...
2,COMEDY,funniest tweets cats dogs week sept dog dont u...
3,PARENTING,funniest tweets parents week sept accidentally...
4,U.S. NEWS,woman called cops black birdwatcher loses laws...


In [30]:
ndf['vector'] = ndf['length_of_news'].apply(lambda text: en(text).vector)
ndf.head()

Unnamed: 0,category,length_of_news,vector
0,U.S. NEWS,million americans roll sleeves omicrontargeted...,"[-1.3539314, 0.5398995, -1.3941106, 1.7469765,..."
1,U.S. NEWS,american airlines flyer charged banned life pu...,"[-0.6059083, 0.037347153, -2.1189568, 0.673374..."
2,COMEDY,funniest tweets cats dogs week sept dog dont u...,"[0.56933457, 2.5814993, -3.8139722, -2.4902532..."
3,PARENTING,funniest tweets parents week sept accidentally...,"[-0.23393469, 0.6016026, -2.1117344, -0.011048..."
4,U.S. NEWS,woman called cops black birdwatcher loses laws...,"[-0.96607876, 0.16777916, -2.1811275, -0.02558..."


In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    ndf.vector.values,
    ndf.category,
    test_size=0.2,
    random_state=2022
)

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train)

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

          ARTS       0.00      0.00      0.00       312
ARTS & CULTURE       0.00      0.00      0.00       273
  BLACK VOICES       0.00      0.00      0.00       894
      BUSINESS       0.00      0.00      0.00      1125
       COLLEGE       0.00      0.00      0.00       247
        COMEDY       0.00      0.00      0.00      1128
         CRIME       0.00      0.00      0.00       713
CULTURE & ARTS       0.00      0.00      0.00       209
       DIVORCE       0.00      0.00      0.00       670
     EDUCATION       0.00      0.00      0.00       199
 ENTERTAINMENT       0.37      0.01      0.02      3438
   ENVIRONMENT       0.00      0.00      0.00       299
         FIFTY       0.00      0.00      0.00       271
  FOOD & DRINK       0.45      0.00      0.01      1237
     GOOD NEWS       0.00      0.00      0.00       290
         GREEN       0.00      0.00      0.00       516
HEALTHY LIVING       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
clf = KNeighborsClassifier(n_neighbors = 5, metric= 'euclidean')
clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

print(classification_report(y_test, y_pred))

                precision    recall  f1-score   support

          ARTS       0.09      0.22      0.13       312
ARTS & CULTURE       0.07      0.08      0.08       273
  BLACK VOICES       0.15      0.20      0.17       894
      BUSINESS       0.22      0.34      0.27      1125
       COLLEGE       0.11      0.16      0.13       247
        COMEDY       0.18      0.19      0.19      1128
         CRIME       0.36      0.44      0.40       713
CULTURE & ARTS       0.11      0.15      0.13       209
       DIVORCE       0.28      0.41      0.34       670
     EDUCATION       0.19      0.24      0.21       199
 ENTERTAINMENT       0.41      0.51      0.45      3438
   ENVIRONMENT       0.22      0.23      0.23       299
         FIFTY       0.05      0.06      0.06       271
  FOOD & DRINK       0.53      0.62      0.57      1237
     GOOD NEWS       0.17      0.05      0.08       290
         GREEN       0.28      0.22      0.24       516
HEALTHY LIVING       0.20      0.13      0.16  

In [47]:
X = ndf['length_of_news']
encoder = LabelEncoder()
y = encoder.fit_transform(ndf['category'])
print("shape of input data: ", X.shape)
print("shape of target variable: ", y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

tokenizer = Tokenizer(num_words=100000, oov_token='<00V>') 
tokenizer.fit_on_texts(X_train) # build the word index
# padding X_train text input data
train_seq = tokenizer.texts_to_sequences(X_train) # converts strinfs into integer lists
train_padseq = pad_sequences(train_seq, maxlen=130) # pads the integer lists to 2D integer tensor 

# padding X_test text input data
test_seq = tokenizer.texts_to_sequences(X_test)
test_padseq = pad_sequences(test_seq, maxlen=130)

word_index = tokenizer.word_index
max_words = 150000  # total number of words to consider in embedding layer
total_words = len(word_index)
maxlen = 130 # max length of sequence 
y_train = to_categorical(y_train, num_classes=42)
y_test = to_categorical(y_test, num_classes=42)
print("Length of word index:", total_words)

shape of input data:  (209527,)
shape of target variable:  (209527,)
Length of word index: 202192


In [50]:
model2 = Sequential()
model2.add(Embedding(total_words, 100, input_length=maxlen))
model2.add(Bidirectional(LSTM(64, dropout=0.1, recurrent_dropout=0.10, activation='tanh', return_sequences=True)))
model2.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.20, activation='tanh', return_sequences=True)))
model2.add(Bidirectional(SimpleRNN(64, dropout=0.2, recurrent_dropout=0.20, activation='tanh', return_sequences=True)))
model2.add(Conv1D(72, 3, activation='relu'))
model2.add(MaxPooling1D(2))
model2.add(SimpleRNN(64, activation='tanh', dropout=0.2, recurrent_dropout=0.20, return_sequences=True))
model2.add(GRU(64, recurrent_dropout=0.20, recurrent_regularizer='l1_l2'))
model2.add(Dropout(0.2))
model2.add(Dense(42, activation='softmax'))
model2.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 130, 100)          20219200  
                                                                 
 bidirectional_9 (Bidirecti  (None, 130, 128)          84480     
 onal)                                                           
                                                                 
 bidirectional_10 (Bidirect  (None, 130, 128)          98816     
 ional)                                                          
                                                                 
 bidirectional_11 (Bidirect  (None, 130, 128)          24704     
 ional)                                                          
                                                                 
 conv1d_3 (Conv1D)           (None, 128, 72)           27720     
                                                      

In [51]:
model2.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy']
              )
# SETUP A EARLY STOPPING CALL and model check point API
earlystopping = keras.callbacks.EarlyStopping(monitor='accuracy',
                                              patience=5,
                                              verbose=1,
                                              mode='min'
                                              )
checkpointer = ModelCheckpoint(filepath='bestvalue1',moniter='val_loss', verbose=0, save_best_only=True)
callback_list = [checkpointer, earlystopping]

# fit model to the data
history2 = model2.fit(train_padseq, y_train, 
                     batch_size=128, 
                     epochs=15, 
                     validation_split=0.2,
                     shuffle=True
                    )

# evalute the model
test_loss2, test_acc2 = model2.evaluate(test_padseq, y_test, verbose=0)
print("test loss and accuracy:", test_loss2, test_acc2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss and accuracy: 2.3606834411621094 0.5219538807868958
