In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional,SpatialDropout1D
from keras.models import Model
from keras.models import Sequential

In [2]:
data = pd.read_csv('../preprocess_data.csv')
data.drop(['task_1','Unnamed: 0','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,Unnamed: 0.1,_id,task_2,text_clean
0,0,60c5d6bf5659ea5e55defa2c,PRFN,made amp amp onli abl start make money sustain...
1,1,60c5d6bf5659ea5e55def461,OFFN,technic still turn back clock dick head
2,2,60c5d6bf5659ea5e55defaad,NONE,govt stop think world media liber gang ani opt...
3,3,60c5d6bf5659ea5e55def419,OFFN,soldier japan dick head
4,4,60c5d6bf5659ea5e55def7fa,OFFN,would better ask think sleazi shitbag lmao


In [3]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [4]:
max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_2']

print(X)

No of unique words :  8255
[[   0    0    0 ...  170    3  210]
 [   0    0    0 ...   72   54   73]
 [   0    0    0 ...    3   52   13]
 ...
 [   0    0    0 ...  817   45  156]
 [   0    0    0 ...  213   99   38]
 [   0    0    0 ... 1166  236   57]]


In [5]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [7]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = 2500))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         2113280   
                                                                 
 spatial_dropout1d (Spatial  (None, 2500, 256)         0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 dense (Dense)               (None, 4)                 260       
                                                                 
Total params: 2195716 (8.38 MB)
Trainable params: 2195716 (8.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [8]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint("hasoc_b.h5", monitor='val_loss', verbose=1, save_best_only=True,
save_weights_only=False, mode='auto')

In [9]:
print(Y_true)

1358    NONE
2200    HATE
2337    PRFN
3640    NONE
2928    PRFN
        ... 
472     PRFN
15      PRFN
1813    HATE
1721    OFFN
3690    PRFN
Name: task_2, Length: 577, dtype: object


In [10]:
print(Y_test[:5])
classes = ['HATE','NONE','PRFN','OFFN']

[[False  True False False]
 [ True False False False]
 [False False False  True]
 [False  True False False]
 [False False False  True]]


In [11]:
model.fit(X_train,Y_train ,batch_size = 32, epochs = 1 ,validation_data=(X_test,Y_test) , callbacks=[checkpoint])

Epoch 1: val_loss improved from inf to 0.42816, saving model to hasoc_b.h5


  saving_api.save_model(


<keras.src.callbacks.History at 0x1cf5879b460>

In [12]:
Y_pred = model.predict(X_test)



In [13]:
print(Y_pred)

[[0.06943967 0.8139189  0.0669575  0.04968397]
 [0.14326045 0.7377507  0.09122909 0.02775968]
 [0.00333433 0.02223874 0.01973699 0.95468986]
 ...
 [0.5987306  0.1771212  0.21319255 0.01095558]
 [0.457735   0.27838898 0.2265558  0.03732024]
 [0.00678977 0.05411986 0.03166655 0.90742373]]


In [14]:
pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))
print(pred_class)

pred_class = pd.get_dummies(pred_class).values
print(pred_class)

[1, 1, 3, 1, 3, 3, 3, 1, 3, 0, 1, 3, 1, 3, 3, 3, 1, 3, 3, 3, 1, 0, 1, 0, 1, 3, 0, 0, 3, 3, 3, 3, 1, 3, 0, 3, 0, 3, 3, 0, 1, 1, 3, 3, 1, 1, 3, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 3, 3, 1, 3, 3, 0, 1, 3, 3, 1, 1, 1, 3, 3, 3, 0, 3, 3, 0, 3, 1, 3, 3, 3, 3, 3, 1, 1, 0, 3, 3, 1, 1, 3, 0, 0, 1, 3, 3, 3, 3, 0, 1, 0, 3, 3, 3, 3, 1, 2, 3, 0, 3, 1, 3, 1, 3, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 3, 3, 0, 1, 0, 3, 1, 3, 3, 3, 3, 0, 3, 1, 1, 3, 1, 3, 3, 1, 1, 0, 1, 3, 1, 3, 3, 1, 1, 3, 1, 3, 1, 1, 0, 1, 3, 1, 3, 1, 1, 3, 1, 3, 1, 0, 3, 0, 3, 1, 1, 3, 3, 3, 1, 0, 1, 3, 1, 1, 3, 1, 3, 0, 1, 1, 3, 3, 1, 3, 3, 1, 2, 0, 3, 3, 0, 1, 1, 3, 3, 1, 1, 3, 0, 3, 1, 3, 3, 3, 3, 1, 1, 0, 0, 3, 3, 3, 1, 3, 1, 3, 1, 1, 1, 0, 1, 3, 3, 3, 3, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 3, 0, 3, 1, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 0, 3, 3, 1, 1, 1, 1, 3, 0, 3, 1, 0, 3, 1, 0, 1, 1, 3, 1, 1, 3, 3, 1, 1, 0, 0, 0, 3, 3, 1, 1, 3, 3, 3, 3, 1, 3, 0, 3, 0, 0, 1, 0, 1, 3, 3, 1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 0, 3, 1, 3, 

In [15]:
print(classification_report(Y_test , pred_class))

              precision    recall  f1-score   support

           0       0.43      0.41      0.42        96
           1       0.59      0.64      0.62       197
           2       1.00      0.03      0.06       104
           3       0.64      0.96      0.77       180

   micro avg       0.59      0.59      0.59       577
   macro avg       0.67      0.51      0.46       577
weighted avg       0.65      0.59      0.53       577
 samples avg       0.59      0.59      0.59       577

