**Classify insults using GloVe embeddings and attention**

In [None]:
# Mount Google Drive under /content/gdrive so the dataset files can be read.
# force_remount = True asks Colab to redo the mount even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)


In [None]:
import pandas as pd
import numpy as np

# Folder that holds the CSV files (and, later, the GloVe vectors and checkpoints).
# The path is relative, so it is resolved against the current working directory.
dataset_path = 'gdrive/My Drive/Colab/insult/'

# Both files are decoded as latin-1.
train_df = pd.read_csv(dataset_path + "train_text.csv", encoding='latin-1')
test_df = pd.read_csv(dataset_path + "test_text.csv", encoding='latin-1')

# Training split: row id, raw comment text and the target label.
train_id = train_df['id']
X_train = train_df['Comment']
y_train = train_df['Target']

# Test split: row id and raw comment text only (no labels).
test_id = test_df['id']
X_test = test_df['Comment']

**Standard preprocessing applied** (some steps have been commented out based on results and observations)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
nltk.download('punkt')

# Translation table that turns every ASCII punctuation character into a space.
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))


def _clean_tokens(review):
    """Tokenize one comment and normalise its tokens.

    Each token has its punctuation characters replaced by spaces and is
    lower-cased. Other steps tried earlier (alphabetic filter, stop-word
    removal, lemmatisation, de-duplication) are intentionally not applied.

    Args:
        review: The raw comment. Anything that is not a ``str`` (for example
            the float NaN pandas produces for an empty cell) is rejected.

    Returns:
        A list of cleaned tokens, or ``None`` when ``review`` is not text.

    Any error raised by the tokenizer itself (such as a missing NLTK
    resource) is deliberately left to propagate instead of being swallowed,
    because hiding it would silently empty the whole dataset.
    """
    if not isinstance(review, str):
        return None
    return [w.translate(table).lower() for w in word_tokenize(review)]


X_train_mod, y_train_mod = [], []
X_test_mod = []
words = []

# Training rows without usable text are dropped together with their label,
# so X_train_mod and y_train_mod always stay the same length.
for review, label in zip(X_train, y_train):
    tokens = _clean_tokens(review)
    if tokens is None:
        continue
    X_train_mod.append(' '.join(tokens))
    y_train_mod.append(label)
    words += tokens

# Test rows are never dropped: a row without usable text becomes an empty
# string, so X_test_mod keeps exactly one entry per row of X_test and the
# predictions stay aligned with test_id.
for review in X_test:
    tokens = _clean_tokens(review)
    if tokens is None:
        tokens = []
    X_test_mod.append(' '.join(tokens))
    words += tokens

X_total = X_train_mod + X_test_mod

In [None]:
# Sanity check: number of preprocessed test comments (compare against len(test_id)).
print(len(X_test_mod))

**Using Glove 300d**

In [None]:
# Number of coefficients per word in glove.840B.300d.txt.
EMBEDDING_DIM = 300

# Maps each GloVe token to its float32 coefficient vector.
embedding = {}

# `with` closes the file even if parsing raises. The encoding is given
# explicitly so the result does not depend on the platform default.
with open(dataset_path + "glove.840B.300d.txt", encoding='utf-8') as f:
    for line in f:
        # Split from the right: the last EMBEDDING_DIM fields are the
        # coefficients and everything before them is the token, which may
        # itself contain spaces.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Blank or truncated line: no complete vector to read.
            continue
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A coefficient field is not numeric; skip this malformed line.
            continue
        embedding[values[0]] = coefs

**Keras Processing and glove vectorization**
The most notable point here is that we pad the sequences in two ways: pre-padding and post-padding. Both padded versions are used as inputs to our model.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fixed sequence length used for padding/truncation.
# (Alternative that was tried: max(len(l.split()) for l in X_total).)
max_len = 200

# The vocabulary is fitted on the combined train + test text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_total)
word_index = tokenizer.word_index
print(len(word_index))

# Integer-encode the training comments only.
sequences = tokenizer.texts_to_sequences(X_train_mod)

# Two views of the same sequences: padded/truncated at the front
# (the pad_sequences default, spelled out here) and at the back.
review_pad = pad_sequences(sequences, maxlen=max_len,
                           padding='pre', truncating='pre')
review_pad_post = pad_sequences(sequences, maxlen=max_len,
                                padding='post', truncating='post')
print(review_pad.shape)
print(review_pad_post.shape)

In [None]:
# Turn the label list into a column vector of shape (n_samples, 1).
y_train_mod = np.asarray(y_train_mod).reshape(-1, 1)

In [None]:
# One row per tokenizer index plus row 0, which no word uses: Keras word
# indices start at 1, and 0 is the value pad_sequences pads with by default.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 300))

# Every index in word_index is at most len(word_index) == num_words - 1, so
# no range check is needed. Words without a GloVe vector keep an all-zero row.
for word, i in word_index.items():
    embedding_vector = embedding.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print(num_words)

**The Central Model**

Based on the pre- and post-padding, we now have two branches. In each branch, the input is embedded and passed through a bidirectional LSTM layer with dropout before being sent to the attention block. After doing this for both branches, we concatenate their outputs, flatten the result, add a dense layer, and finally produce the output layer.

In [None]:
from keras.models import *
from keras.layers import *
from keras.initializers import Constant

# Both constants equal the padded sequence length (max_len).
# TIME_STEPS sizes the Dense layers inside attention_3d_block; INPUT_DIM is the
# per-sample length of each model Input (a sequence length, despite its name).
TIME_STEPS = max_len
INPUT_DIM = max_len

def attention_3d_block(inputs):
    """Re-weight a sequence tensor with learned attention over the time axis.

    Args:
        inputs: 3-D Keras tensor laid out as (batch_size, time_steps,
            input_dim). Its time_steps dimension has to equal TIME_STEPS so
            that the attention weights line up with ``inputs`` in the final
            element-wise product.

    Returns:
        A tensor with the same shape as ``inputs``: the element-wise product
        of ``inputs`` and the attention weights.
    """
    # Swap to (batch_size, input_dim, time_steps) so the Dense layers, which
    # act on the last axis, operate along the time dimension.
    scores = Permute((2, 1))(inputs)
    scores = Dense(TIME_STEPS*2, activation='relu')(scores)
    # Softmax over the last axis: the weights of each feature sum to 1
    # across the time steps.
    scores = Dense(TIME_STEPS, activation='softmax')(scores)
    # Back to (batch_size, time_steps, input_dim) to match `inputs`.
    a_probs = Permute((2, 1))(scores)
    return Multiply()([inputs, a_probs])


def model_attention_applied_after_lstm():
    """Build and compile the two-branch BiLSTM + attention classifier.

    The model takes two integer-sequence inputs of length INPUT_DIM (the
    pre-padded and the post-padded encoding of the same comment). Both go
    through one shared, frozen embedding layer initialised from
    ``embedding_matrix``; each branch then has its own spatial dropout,
    bidirectional CuDNNLSTM, dropout and attention block. The branch outputs
    are concatenated, flattened and fed to a dense head ending in a single
    sigmoid unit.

    Returns:
        The Keras ``Model``, compiled with binary cross-entropy loss, the
        Adam optimizer and accuracy as metric.
    """
    lstm_units = 32

    input_pre = Input(shape=(INPUT_DIM,))
    input_post = Input(shape=(INPUT_DIM,))

    # A single embedding layer instance is reused by both branches.
    embedding_layer = Embedding(
        num_words,
        300,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=max_len,
        trainable=False,
    )

    def attended_branch(token_ids):
        """Embed one input and run it through its own BiLSTM + attention."""
        embedded = embedding_layer(token_ids)
        embedded = SpatialDropout1D(0.25)(embedded)
        encoded = Bidirectional(
            CuDNNLSTM(lstm_units, return_sequences=True)
        )(embedded)
        encoded = Dropout(0.5)(encoded)
        return attention_3d_block(encoded)

    x1 = attended_branch(input_pre)
    x2 = attended_branch(input_post)

    # Classification head on top of the merged branches.
    merged = concatenate([x1, x2])
    merged = Flatten()(merged)
    merged = Dropout(0.5)(merged)
    hidden = Dense(128, activation='relu')(merged)
    hidden = Dropout(0.5)(hidden)
    hidden = BatchNormalization()(hidden)
    preds = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[input_pre, input_post], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# The model comes back already compiled from model_attention_applied_after_lstm().
m = model_attention_applied_after_lstm()

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
m.summary()

In [None]:
from sklearn.model_selection import train_test_split
# Split the pre-padded inputs, the post-padded inputs and the labels in ONE
# call so all three are cut at exactly the same row indices. This replaces two
# separate calls that only stayed aligned because they repeated the same seed.
# 10% of the rows are held out for validation, stratified on the label.
(X_train, X_test,
 X_train_p, X_test_p,
 y_train, y_test) = train_test_split(review_pad, review_pad_post, y_train_mod,
                                     test_size=0.1,
                                     random_state=42,
                                     stratify=y_train_mod)

# Same labels under the names the two-call version defined.
y_train_p, y_test_p = y_train, y_test

print(y_train.shape)

In [None]:
from keras.callbacks import ModelCheckpoint
# Checkpoint file name template; the {epoch:02d} placeholder is filled in by
# Keras with the epoch number when a file is written.
checkpoint_path = dataset_path + "model-{epoch:02d}.h5"

# Save the model only when the validation loss improves.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Train on the (pre-padded, post-padded) input pair and validate on the held-out
# split. cp_callback has to be passed here: with an empty callback list no
# checkpoint file is ever written, and the later load_model(...) cell would
# have nothing from this run to load.
m.fit([X_train, X_train_p], y_train,
      batch_size=128, epochs=10,
      validation_data=([X_test, X_test_p], y_test),
      callbacks=[cp_callback], verbose=1)

In [None]:
# Integer-encode the test comments with the tokenizer fitted earlier.
sequences = tokenizer.texts_to_sequences(X_test_mod)

word_index = tokenizer.word_index
print(len(word_index))

# Same two padded views as for the training data: front-padded
# (the pad_sequences default, spelled out here) and back-padded.
test_pad = pad_sequences(sequences, maxlen=max_len,
                         padding='pre', truncating='pre')
test_pad_post = pad_sequences(sequences, maxlen=max_len,
                              padding='post', truncating='post')
print(test_pad.shape)

In [None]:
from keras.models import load_model
# Restore a saved checkpoint for inference.
# NOTE(review): both the folder and the epoch number ("07") are hard-coded here;
# confirm this file exists and is the checkpoint you intend to use.
model = load_model("gdrive/My Drive/Colab/insult/model-07.h5")

In [None]:
# Score the test set with both padded views, in the same input order used for training.
output = model.predict([test_pad, test_pad_post], batch_size=128, verbose=1)

In [None]:
# Keep the first value of each prediction row as a plain Python float.
# reshape(len(output), -1) makes this safe to re-run: an already-flattened
# list is viewed as one column again before the first column is taken.
output = np.asarray(output).reshape(len(output), -1)[:, 0].tolist()

# zip() would silently truncate to the shorter sequence and pair ids with the
# wrong predictions, so a length mismatch is reported explicitly instead.
if len(output) != len(test_id):
    raise ValueError(
        "Got %d predictions for %d test ids; rows were probably dropped "
        "during preprocessing, so predictions cannot be matched to ids."
        % (len(output), len(test_id))
    )

out_df = pd.DataFrame({'id': list(test_id), 'Prediction': output})

In [None]:
# Show the predictions, then write them to a CSV file in the working directory.
print(out_df)
# NOTE(review): to_csv is called without index=False, so the DataFrame index is
# written as an extra unnamed first column — confirm that is the wanted format.
out_df.to_csv("output_attn_1.csv")