In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Input, LSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Embedding, Dropout
from keras.models import Model
from keras.layers import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras import optimizers

In [None]:
# Load the labelled comments and hold out 15% of them for evaluation.
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# fillna(' ') replaces every NaN in the frame, comment_text included. The former
# follow-up `data['comment_text'].fillna("missing", inplace=True)` therefore never
# had anything left to fill; it was removed instead of being kept as a misleading
# no-op (it was also an in-place call on a column selection, which pandas warns about).
data = pd.read_csv('train.csv').fillna(' ')
X_train, X_test, y_train, y_test = train_test_split(
    data["comment_text"],
    data[classes],
    test_size=0.15,
    random_state=76,  # fixed seed so the split is the same on every run
)
# Train and test comments stacked back into one Series (used to fit the tokenizer).
data_text = pd.concat([X_train, X_test])
print(type(X_train))

In [None]:
# Tokenization: build the word index from every comment (train + test), then
# encode each split as sequences of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_text.tolist())
X_traint, X_testt = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [None]:
# Padding: bring every encoded comment to a fixed length of 250 entries.
X_trainp, X_testp = (
    pad_sequences(encoded, maxlen=250) for encoded in (X_traint, X_testt)
)

In [None]:
# GloVe embedding: parse the pretrained vectors into a dict, then copy the vector
# of every word the tokenizer knows into a matrix addressed by tokenizer index.
GLOVE_6B_50D_PATH = "glove.6B.50d.txt"
index_len = len(tokenizer.word_index)
glove = {}
dimension = 50
# One row per tokenizer index plus one extra, so word_index values can be used
# directly as row numbers. Rows that never get a vector stay all-zero.
glove_matrix = np.zeros((index_len + 1, dimension))
# Open explicitly as UTF-8 instead of relying on the platform default encoding,
# and use `with` so the handle is closed even if a line fails to parse.
# (`glove_path` is the file handle, not a path; the name is kept as-is.)
with open(GLOVE_6B_50D_PATH, encoding="utf-8") as glove_path:
    for line in tqdm(glove_path):
        line_arr = line.split()
        if not line_arr:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        word = line_arr[0]
        glove[word] = np.asarray(line_arr[1:], dtype='float32')
for word, idx in tqdm(tokenizer.word_index.items()):
    if word in glove:
        glove_matrix[idx] = glove[word]

In [None]:
# LSTM model: frozen GloVe embedding -> bidirectional LSTM -> max-pool over the
# sequence -> small dense stack -> six sigmoid outputs trained with binary
# cross-entropy.
# Take the input length from the padded data itself so the model cannot drift
# out of sync with the maxlen used when padding.
model_len = X_trainp.shape[1]
model = Sequential()
# Embedding weights come from glove_matrix and are not updated during training.
model.add(Embedding(index_len + 1, dimension, weights = [glove_matrix], input_length = model_len, trainable = False))
# return_sequences=True keeps the per-step outputs that GlobalMaxPool1D reduces.
model.add(Bidirectional(LSTM(100, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(BatchNormalization())
model.add(Dropout(0.15))
model.add(Dense(100, activation = "relu"))
model.add(Dropout(0.15))
model.add(Dense(70, activation = "relu"))
model.add(Dropout(0.15))
model.add(Dense(35, activation = "relu"))
model.add(Dropout(0.15))
model.add(Dense(6, activation = 'sigmoid'))
model.summary()
# `learning_rate` is the documented argument name; `lr` is a deprecated alias.
model.compile(loss = 'binary_crossentropy', optimizer = optimizers.Adam(learning_rate = 0.01), metrics = ['accuracy'])
# validation_split holds out a quarter of the training rows for validation.
lstm = model.fit(X_trainp, y_train, epochs = 2, batch_size = 150, validation_split = 0.25)
y_pred = model.predict(X_testp, verbose = 3, batch_size = 150)