### Importing Libraries

In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential

### Handling Pre-processed data

In [3]:
data = pd.read_csv('../preprocess_data.csv')
data.drop(['task_2','Unnamed: 0','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,Unnamed: 0.1,_id,task_1,text_clean
0,0,60c5d6bf5659ea5e55defa2c,HOF,made amp amp onli abl start make money sustain...
1,1,60c5d6bf5659ea5e55def461,HOF,technic still turn back clock dick head
2,2,60c5d6bf5659ea5e55defaad,NOT,govt stop think world media liber gang ani opt...
3,3,60c5d6bf5659ea5e55def419,HOF,soldier japan dick head
4,4,60c5d6bf5659ea5e55def7fa,HOF,would better ask think sleazi shitbag lmao


In [4]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [5]:

max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_1']

print(X)

No of unique words :  8255
[[   0    0    0 ...  170    3  210]
 [   0    0    0 ...   72   54   73]
 [   0    0    0 ...    3   52   13]
 ...
 [   0    0    0 ...  817   45  156]
 [   0    0    0 ...  213   99   38]
 [   0    0    0 ... 1166  236   57]]


In [6]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [8]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = 2500))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         2113280   
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 2,195,586
Trainable params: 2,195,586
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint("hasoc_a_2.h5", monitor='val_loss', verbose=1, save_best_only=True,
save_weights_only=False, mode='auto')

In [10]:
model.fit(X_train,Y_train ,batch_size = 32, epochs = 5 ,validation_data=(X_test,Y_test) , callbacks=[checkpoint])

Epoch 1/5
Epoch 1: val_loss improved from inf to 0.49795, saving model to hasoc_a_2.h5
Epoch 2/5
Epoch 2: val_loss improved from 0.49795 to 0.49533, saving model to hasoc_a_2.h5
Epoch 3/5
Epoch 3: val_loss did not improve from 0.49533
Epoch 4/5
Epoch 4: val_loss did not improve from 0.49533
Epoch 5/5
Epoch 5: val_loss did not improve from 0.49533


<keras.callbacks.History at 0x1de390593d0>

In [11]:
model.load_weights('hasoc_a_2.h5')
model.evaluate(X_test,Y_test)



[0.49533092975616455, 0.7590987682342529]

In [12]:
Y_pred = model.predict(X_test)



In [13]:
y_actual = []
for i in Y_true:
    if i =='NOT':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [14]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.80      0.85      0.82       380
           1       0.67      0.58      0.62       197

    accuracy                           0.76       577
   macro avg       0.73      0.72      0.72       577
weighted avg       0.75      0.76      0.75       577

