In [31]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, AveragePooling1D, GlobalAvgPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.models import  Sequential
from keras.layers import Dense,Flatten,Embedding,LSTM,Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score,classification_report
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelEncoder

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re


import warnings
warnings.filterwarnings('ignore')

In [32]:
train_df = pd.read_csv("../Data/training.csv")
test_df = pd.read_csv("../Data/testing.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1435, 4)
Test shape :  (354, 4)


In [33]:
train_df = train_df[['Category','Article']]
test_df = test_df[['Category','Article']]

In [34]:
train_df.head(2)

Unnamed: 0,Category,Article
0,Entertainment,"A dash of stand up comedy, lots of improvisat..."
1,Entertainment,Actor Tamannaah Bhatia is excited to have sig...


In [35]:
le = LabelEncoder()
le.fit(train_df.Category)

LabelEncoder()

In [36]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

In [37]:
stop_words=set(stopwords.words('english'))

def TokenizeText(text):
    ''' 
     Tokenizes text by removing various stopwords and lemmatizing them
    '''
    text=str(text)
    text=text.lower()
    text=re.sub('[^A-Za-z0-9\s]+', '', text)
    word_list=word_tokenize(text)
    word_list_final=[]
    for word in word_list:
        if word not in stop_words:
            word_list_final.append((word))
    return " ".join(word_list_final)

In [38]:
train_df.Article =  train_df.Article.apply(TokenizeText)
test_df.Article = test_df.Article.apply(TokenizeText)

In [39]:
## split to train and val
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2020)

## some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 200 # max number of words in a article to use

# ## fill up the missing values
train_X = train_df["Article"].values
val_X = val_df["Article"].values
test_X = test_df["Article"].values


## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features,)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['Category'].values
val_y = val_df['Category'].values

In [40]:
train_X.shape

(1287, 200)

In [41]:
EMBEDDING_FILE = 'glove.6B.300d.txt'

In [42]:
embeddings_index = {}
f = open(EMBEDDING_FILE)
for count,line in enumerate(f):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        pass
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [43]:
embedding_dim = 300
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [44]:
num_categories = train_df.Category.nunique()
num_categories

9

In [45]:
train_y=to_categorical(le.transform(train_y))
val_y= to_categorical(le.transform(val_y))
test_y= to_categorical(le.transform(test_df.Category))

In [46]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = LSTM(64, return_sequences=True)(x)
# x = LSTM(64, return_sequences=True)(x)

x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(num_categories, activation="softmax")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


model.layers[1].set_weights([embedding_matrix])
model.layers[1].trainable = False



print(model.summary())


Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
lstm_3 (LSTM)                (None, 200, 64)           93440     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                1040      
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 9)                 153 

In [47]:
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]


In [48]:
model.fit(train_X, train_y, batch_size=32, epochs=500, callbacks=callbacks,validation_data=(val_X, val_y))


Train on 1287 samples, validate on 143 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500


<keras.callbacks.History at 0x7f9e2b8b5390>

In [49]:
model.load_weights('best_model.h5')

In [50]:
accuracy_score(np.argmax(val_y,axis=1),np.argmax(model.predict(val_X),axis=1))

0.7552447552447552

In [51]:
actual_test_target  = test_df.Category.values

In [52]:
predictions = le.inverse_transform(np.argmax(model.predict(test_X),axis=1))

In [53]:
#Accuracy 
print ("Accuracy of random forest is {}".format(str(accuracy_score(actual_test_target,predictions))))

Accuracy of random forest is 0.7507082152974505


In [54]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = LSTM(64, return_sequences=True)(x)
# x = LSTM(64, return_sequences=True)(x)

x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(num_categories, activation="softmax")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


model.layers[1].set_weights([embedding_matrix])
model.layers[1].trainable = False



print(model.summary())


Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
lstm_4 (LSTM)                (None, 200, 64)           93440     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 16)                1040      
_________________________________________________________________
dropout_4 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 9)                 153 

In [55]:
model.fit(np.vstack([train_X,val_X]), np.vstack([train_y,val_y]), batch_size=32, epochs=7)


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f9e024cbbe0>

In [56]:
actual_test_target  = test_df.Category.values
predictions = le.inverse_transform(np.argmax(model.predict(test_X),axis=1))

In [57]:
#Accuracy 
print ("Accuracy of LSTM is {}".format(str(accuracy_score(actual_test_target,predictions))))

Accuracy of LSTM is 0.7563739376770539


In [58]:
### confusion matrix
pd.crosstab(actual_test_target, predictions, rownames=['Actual'], colnames=['Predicted'])

Predicted,Business & Economy,Education & Career,Entertainment,Food & Health,International,Others,Politics & Governance,Science & Technology,Sports
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Business & Economy,35,1,0,0,0,0,1,3,0
Education & Career,0,33,2,1,0,1,0,3,0
Entertainment,0,2,35,0,0,2,0,0,0
Food & Health,0,2,1,27,1,3,0,4,2
International,0,1,0,0,31,6,2,0,0
Others,3,4,3,0,3,16,7,4,0
Politics & Governance,0,1,0,0,7,6,20,0,0
Science & Technology,0,2,3,2,0,2,1,30,0
Sports,0,0,0,0,0,0,0,0,40


In [59]:
report = classification_report(actual_test_target, predictions, output_dict=True)


In [60]:
pd.DataFrame(report).T


Unnamed: 0,precision,recall,f1-score,support
Business & Economy,0.921053,0.875,0.897436,40.0
Education & Career,0.717391,0.825,0.767442,40.0
Entertainment,0.795455,0.897436,0.843373,39.0
Food & Health,0.9,0.675,0.771429,40.0
International,0.738095,0.775,0.756098,40.0
Others,0.444444,0.4,0.421053,40.0
Politics & Governance,0.645161,0.588235,0.615385,34.0
Science & Technology,0.681818,0.75,0.714286,40.0
Sports,0.952381,1.0,0.97561,40.0
accuracy,0.756374,0.756374,0.756374,0.756374
