In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import math

import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from collections import Counter
from sklearn import preprocessing

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix


import json
import collections
import operator
import random
import seaborn as sns
from sklearn.model_selection import train_test_split

import tensorflow as tf

import random

import gensim
import gensim.downloader
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the dataset from 'clean.csv' (relative path, resolved against the kernel's working directory).
train_file = pd.read_csv('clean.csv')

In [None]:
# Sanity check: column names and (rows, columns) of the loaded frame.
train_file.columns, train_file.shape

In [None]:
# Left-pad every label with '0' so that short labels become 6-character strings;
# labels that are already 6 characters or longer pass through unchanged.
max_label_length = 6
new_label = [str(raw).rjust(max_label_length, '0') for raw in train_file['label']]

In [None]:
# Replace the label column with the zero-padded string version built above.
train_file['label'] = new_label

In [None]:
# Drop the six per-category columns; only `label` is used from here on.
# NOTE(review): presumably `label` already encodes these six flags as a 6-character string — confirm.
train_file = train_file.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis = 1)

In [None]:
# A label of '000000' marks a non-toxic row; every other label counts as toxic.
is_clean = train_file.label == '000000'
all_toxic = train_file[~is_clean].reset_index(drop = True)
non_toxic = train_file[is_clean].reset_index(drop = True)

In [None]:
# Compare the class sizes (rows, columns) of the toxic and non-toxic frames.
all_toxic.shape, non_toxic.shape

In [None]:
# Release the full frame; everything downstream uses `all_toxic` / `non_toxic`.
# Re-running earlier cells after this point requires reloading the CSV first.
del train_file

In [None]:
# One constant binary target per row: 1 for toxic rows, 0 for non-toxic rows.
toxic_labels = [1] * len(all_toxic)
non_toxic_labels = [0] * len(non_toxic)

In [None]:
# Overwrite the 6-character string label with the binary target (1 = toxic, 0 = non-toxic).
all_toxic['label'] = toxic_labels
non_toxic['label'] = non_toxic_labels

In [None]:
# Preview the first rows of each class to confirm the binary labels were applied.
non_toxic.head(4), all_toxic.head(4)

In [None]:
# Stack both classes into one frame with a fresh 0..n-1 index (non-toxic rows first).
whole_frame = pd.concat([non_toxic, all_toxic], ignore_index=True)

In [None]:
# Stratified, seeded split: 90% train, 10% held out.
train, non_train = train_test_split(whole_frame, test_size=0.1, random_state=42, stratify = whole_frame['label'], shuffle = True)
# The held-out 10% is split again with test_size=0.1, i.e. roughly 9% of all rows become
# validation and roughly 1% become test.
# NOTE(review): confirm the very small test set is intended (test_size=0.5 would give an even val/test split).
val, test = train_test_split(non_train, test_size=0.1, random_state=42, stratify = non_train['label'], shuffle = True)
train.shape, val.shape, test.shape

In [None]:
# The per-class frames are no longer needed once the splits exist.
del non_toxic, all_toxic

# For each split, pull out the comment text (features) and the binary label (targets),
# re-indexed from 0 so positions line up with later list-based processing.
x_train, y_train = (train[col].reset_index(drop = True) for col in ('comment_text', 'label'))

x_val, y_val = (val[col].reset_index(drop = True) for col in ('comment_text', 'label'))

x_test, y_test = (test[col].reset_index(drop = True) for col in ('comment_text', 'label'))


# Only the x_* / y_* series are used from here on.
del train, val, test

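#### Split each comment on spaces and look up a vector for every word.
#### Out-of-vocabulary (OOV) words are skipped; if no word in the comment has a vector, return 0 as a placeholder.
#### For sentence vectors ('sent'), return the mean of the word vectors;
#### otherwise return all word vectors together with the comment's word count.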

In [None]:
def sentence_vectors(datapoint, vector_type, vectors = None):
    """Embed one comment with pre-trained word vectors.

    The comment is split on single spaces and every token is looked up in the
    embedding table. Out-of-vocabulary tokens are skipped.

    Parameters
    ----------
    datapoint : str
        The comment text.
    vector_type : str
        'sent' returns one averaged vector; any other value returns the
        individual word vectors.
    vectors : mapping, optional
        Token -> vector lookup that raises KeyError for unknown tokens.
        Defaults to the module-level ``fasttext_vectors``.

    Returns
    -------
    numpy.ndarray or tuple
        * 'sent': the mean of the found word vectors, or a zero-dimensional
          array holding 0.0 when no token has a vector.
        * otherwise: ``(word_vectors, n_tokens)`` where ``n_tokens`` counts
          every split token (found or not), or ``(zero-dim 0.0, 0)`` when no
          token has a vector.

    The "nothing found" placeholder is a zero-dimensional numpy value (shape
    ``()``) rather than a plain ``0`` so that callers can test ``.shape``.
    """
    if vectors is None:
        vectors = fasttext_vectors

    words = datapoint.split(' ')
    found = []
    for token in words:
        try:
            found.append(vectors[token])
        except KeyError:
            # Out-of-vocabulary token: skipped on purpose.
            continue

    # The original compared `shape[0] == []`, which is never true, so the
    # empty case silently fell through (mean of an empty array -> nan).
    if not found:
        placeholder = np.array(0.0)
        if vector_type == 'sent':
            return placeholder
        return placeholder, 0

    word_vectors = np.array(found)
    if vector_type == 'sent':
        return np.mean(word_vectors, axis = 0)
    return word_vectors, len(words)

In [None]:
# List the names of the pre-trained models available through gensim's downloader.
print(list(gensim.downloader.info()['models'].keys()))

In [None]:
# Load the pre-trained embeddings used by sentence_vectors().
# NOTE: despite the variable name, the model loaded here is 'word2vec-google-news-300', not fastText.
fasttext_vectors = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Embed every split in 'word' mode: each element is what sentence_vectors returns
# for one comment (per-word vectors plus the comment's token count).
train_word_vector, val_word_vector, test_word_vector = (
    split.apply(sentence_vectors, vector_type = 'word')
    for split in (x_train, x_val, x_test)
)


#### get max length

In [None]:
def get_vts_and_lgts(train_word_vector):
    """Unzip a Series of (vectors, length) pairs into two parallel lists.

    Parameters
    ----------
    train_word_vector : pandas.Series
        Each element is an indexable pair whose item 0 is the word-vector
        payload and item 1 is the associated length.

    Returns
    -------
    tuple of (list, list)
        ``(vectors, lengths)``, both in the Series' positional order.
    """
    vectors, lengths = [], []
    for pair in train_word_vector:
        vectors.append(pair[0])
        lengths.append(pair[1])
    return vectors, lengths


In [None]:
# Separate the per-comment word vectors from the per-comment token counts.
vectors_train, lengths_train = get_vts_and_lgts(train_word_vector)
vectors_val, lengths_val = get_vts_and_lgts(val_word_vector)
# Test lengths are not needed: max_len is derived from train and validation only.
vectors_test, _ = get_vts_and_lgts(test_word_vector)
# Drop the intermediate Series now that their contents live in plain lists.
del train_word_vector, val_word_vector, test_word_vector


In [None]:
# Longest comment (in tokens) across the train and validation splits; used as the padding length.
max_len = max(max(split_lengths) for split_lengths in (lengths_train, lengths_val))


#### Find the indices of comments for which 0 was returned (no word had a vector) and discard them

In [None]:
def get_index_zerovectors(train_sentence_vector):
    """Return the positions of comments that produced no usable vectors.

    An entry counts as "no vectors" when it is a scalar placeholder (a plain
    ``0``, a zero-dimensional numpy value, or nan from averaging nothing) or
    an empty array. The original check ``vector.shape == ()`` missed empty
    arrays of shape ``(0,)`` and raised AttributeError on a plain ``0``.

    Parameters
    ----------
    train_sentence_vector : iterable
        Per-comment vectors (arrays) or placeholders.

    Returns
    -------
    list of int
        Positions, in ascending order, of the entries to discard.
    """
    remove_indices = []
    for index, vector in enumerate(train_sentence_vector):
        candidate = np.asarray(vector)  # no copy for values that are already arrays
        if candidate.ndim == 0 or candidate.size == 0:
            remove_indices.append(index)
    return remove_indices

In [None]:
# For each split, collect the positions of comments that yielded no word vectors.
train_remove_indices, val_remove_indices, test_remove_indices = (
    get_index_zerovectors(split_vectors)
    for split_vectors in (vectors_train, vectors_val, vectors_test)
)


In [None]:
def _drop_positions(items, positions):
    """Return `items` as a list without the elements at the given positions."""
    # A set makes each membership test O(1); testing against the original list
    # made the whole filter quadratic in the number of removed rows.
    skip = set(positions)
    return [item for pos, item in enumerate(items) if pos not in skip]


# Remove the same positions from the vectors and from the targets so they stay aligned.
vectors_train = _drop_positions(vectors_train, train_remove_indices)
y_train = _drop_positions(y_train, train_remove_indices)

vectors_val = _drop_positions(vectors_val, val_remove_indices)
y_val = _drop_positions(y_val, val_remove_indices)

vectors_test = _drop_positions(vectors_test, test_remove_indices)
y_test = _drop_positions(y_test, test_remove_indices)


#### Pad every comment to max_len by prepending 0.0 values (comments longer than max_len are truncated from the start)

In [None]:
def _pad_to_max_len(sequences):
    """Pad or truncate every sequence of word vectors to `max_len` timesteps.

    Zeros are prepended to short sequences, and long sequences lose their
    earliest timesteps. The result is float32: the original dtype='int32'
    truncated the real-valued embedding components toward zero, which wiped
    out nearly all of the signal.
    """
    return tf.keras.utils.pad_sequences(
        sequences,
        maxlen=max_len,
        dtype='float32',
        padding='pre',
        truncating='pre',
        value=0.0
    )


vectors_train = _pad_to_max_len(vectors_train)
vectors_val = _pad_to_max_len(vectors_val)
vectors_test = _pad_to_max_len(vectors_test)


In [None]:
# Keras expects array targets, so convert each split's labels to a numpy array.
y_train, y_val, y_test = (np.array(targets) for targets in (y_train, y_val, y_test))


In [None]:
# Final shape check: features and targets of each split must agree on their first dimension.
vectors_train.shape, y_train.shape, vectors_val.shape, y_val.shape, vectors_test.shape, y_test.shape


In [None]:
def rnn(seq_len = 1250, embed_dim = 300):
    """Build the baseline classifier: SimpleRNN(8) followed by one sigmoid unit.

    Parameters
    ----------
    seq_len : int, optional
        Number of timesteps per padded comment (default 1250, the value that
        was previously hard-coded).
    embed_dim : int, optional
        Size of each word vector (default 300).

    Returns
    -------
    tf.keras.Model
        Uncompiled model mapping (batch, seq_len, embed_dim) to (batch, 1)
        probabilities.
    """
    inputs = tf.keras.Input(shape = (seq_len, embed_dim))
    x = tf.keras.layers.SimpleRNN(8, activation = 'tanh')(inputs)
    # The sigmoid keeps the output in [0, 1]: the notebook trains with
    # 'binary_crossentropy' (which expects probabilities by default) and
    # thresholds its accuracy metric on the output.
    outputs = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)
    return tf.keras.Model(inputs = inputs, outputs = outputs)

In [None]:
# Training-time metric: a prediction counts as positive only when the model output exceeds 0.8.
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.8),
      
]

# Build and compile the baseline RNN.
# NOTE(review): 'binary_crossentropy' expects probabilities by default (from_logits=False) —
# confirm the model's final layer applies a sigmoid.
model_rnn = rnn()
model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)
model_rnn.summary()


In [None]:
# Train the baseline RNN for 3 epochs, evaluating on the validation split after each epoch.
history_rnn = model_rnn.fit(x = vectors_train, y = y_train, validation_data=(vectors_val, y_val), epochs=3)


In [None]:
# Evaluate the baseline RNN on the test split.
# model.predict returns continuous scores of shape (n, 1), but sklearn's classification
# metrics require hard class labels, so flatten and threshold the scores first.
preds_rnn = model_rnn.predict(vectors_test)
decision_threshold = 0.8  # same cut-off as the BinaryAccuracy metric used at compile time
labels_rnn = (preds_rnn.ravel() > decision_threshold).astype(int)
conf_mat = confusion_matrix(y_test, labels_rnn)

print(conf_mat)
print('f1 score',f1_score(y_test, labels_rnn, average='weighted', zero_division = 1))
print('recall',recall_score(y_test, labels_rnn, average='weighted', zero_division = 1))
print('precision',precision_score(y_test, labels_rnn, average='weighted', zero_division =1))
print('accuracy',accuracy_score(y_test, labels_rnn))


#### Bahdanau Attention

In [None]:
class attention(tf.keras.layers.Layer):
    """Additive attention that pools a sequence into a single context vector.

    Parameters
    ----------
    inputs_0 : int
        Number of timesteps in the incoming sequence (sizes the bias).
    inputs_1 : int
        Number of features per timestep (sizes the scoring weight).
    **kwargs
        Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, inputs_0, inputs_1, **kwargs):
        super().__init__(**kwargs)
        # One score weight per feature and one bias per timestep.
        self.W = self.add_weight(name = 'attention_weight', shape = (inputs_1, 1),
                             initializer = 'random_normal', trainable = True)
        self.b = self.add_weight(name = 'attention_bias', shape = (inputs_0, 1),
                             initializer = 'zeros', trainable = True)

    def call(self, inputs):
        """Map (batch, timesteps, features) to a (batch, features) context vector."""
        # Unnormalised score per timestep: (batch, timesteps, 1) -> (batch, timesteps).
        e = tf.math.tanh(tf.matmul(inputs, self.W) + self.b)
        e = tf.squeeze(e, axis=-1, name='squeeze')
        # Normalise across timesteps so the weights of each sample sum to 1.
        alpha = tf.nn.softmax(e, axis=-1)
        alpha = tf.expand_dims(alpha, axis = -1)
        context = inputs * alpha
        # Sum over the TIME axis to get the weighted average of the hidden states.
        # The original summed over the feature axis (-1), which returned one
        # number per timestep instead of a context vector.
        return tf.math.reduce_sum(context, axis = 1)

In [None]:
def rnn_attention(seq_len = 1250, embed_dim = 300):
    """Build SimpleRNN(8) -> attention pooling -> one sigmoid unit.

    Parameters
    ----------
    seq_len : int, optional
        Number of timesteps per padded comment (default 1250, the value that
        was previously hard-coded).
    embed_dim : int, optional
        Size of each word vector (default 300).

    Returns
    -------
    tf.keras.Model
        Uncompiled model producing (batch, 1) probabilities.
    """
    inputs = tf.keras.Input(shape = (seq_len, embed_dim))
    # return_sequences=True keeps every timestep so the attention layer can weight them.
    x = tf.keras.layers.SimpleRNN(8, return_sequences = True, activation = 'tanh')(inputs)
    x = attention(x.shape[1], x.shape[2])(x)
    # The sigmoid keeps the output in [0, 1], as required by the
    # 'binary_crossentropy' loss this notebook compiles the model with.
    outputs = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)
    return tf.keras.Model(inputs = inputs, outputs = outputs)

In [None]:
# Training-time metric: a prediction counts as positive only when the model output exceeds 0.8.
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.8),
      
]

# Build and compile the RNN + attention model.
# NOTE(review): 'binary_crossentropy' expects probabilities by default (from_logits=False) —
# confirm the model's final layer applies a sigmoid.
model_attention = rnn_attention()
model_attention.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)
model_attention.summary()


In [None]:
# Train the RNN + attention model for 3 epochs, evaluating on the validation split after each epoch.
history_rnnatten = model_attention.fit(x = vectors_train, y = y_train, validation_data=(vectors_val, y_val), epochs=3)


In [None]:
# Evaluate the RNN + attention model on the test split.
# model.predict returns continuous scores of shape (n, 1), but sklearn's classification
# metrics require hard class labels, so flatten and threshold the scores first.
preds_attn = model_attention.predict(vectors_test)
decision_threshold = 0.8  # same cut-off as the BinaryAccuracy metric used at compile time
labels_attn = (preds_attn.ravel() > decision_threshold).astype(int)
# The confusion matrix must use THIS model's predictions (the original passed preds_rnn).
conf_mat = confusion_matrix(y_test, labels_attn)

print(conf_mat)
print('f1 score',f1_score(y_test, labels_attn, average='weighted', zero_division = 1))
print('recall',recall_score(y_test, labels_attn, average='weighted', zero_division = 1))
print('precision',precision_score(y_test, labels_attn, average='weighted', zero_division =1))
print('accuracy',accuracy_score(y_test, labels_attn))


#### weighted attention layer

In [None]:
class wtgd_attention(tf.keras.layers.Layer):
    """Additive attention that scores a feature sequence against a hidden state."""

    def __init__(self, units):
        super().__init__()
        # Separate projections for the sequence and the hidden state, plus a
        # single-unit layer that turns each projected timestep into a score.
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return (context_vector, attn_weights) for `features` given `hidden`.

        `hidden` gains a length-1 axis at position 1 so that it broadcasts
        against the projected `features`; the softmax and the weighted sum
        both run over axis 1.
        """
        projected_features = self.W1(features)
        projected_hidden = self.W2(tf.expand_dims(hidden, 1))
        score = tf.nn.tanh(projected_features + projected_hidden)
        attn_weights = tf.nn.softmax(self.V(score), axis = 1)
        weighted = attn_weights * features
        context_vector = tf.math.reduce_sum(weighted, axis = 1)
        return context_vector, attn_weights

In [None]:
def bilstm_attention(seq_len = 1250, embed_dim = 300):
    """Build bidirectional LSTM(8) -> weighted attention -> one sigmoid unit.

    Parameters
    ----------
    seq_len : int, optional
        Number of timesteps per padded comment (default 1250, the value that
        was previously hard-coded).
    embed_dim : int, optional
        Size of each word vector (default 300).

    Returns
    -------
    tf.keras.Model
        Uncompiled model producing (batch, 1) probabilities.
    """
    inputs = tf.keras.Input(shape = (seq_len, embed_dim))
    # return_state=True yields the per-timestep outputs followed by the forward
    # (h, c) and backward (h, c) final states; only the hidden states are used.
    lstm, f_h, _f_c, b_h, _b_c = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(8, return_sequences = True, return_state=True, activation = 'tanh')
    )(inputs)
    state_h = tf.keras.layers.Concatenate()([f_h, b_h])
    # Attention weights are not part of the model output, so they are discarded here.
    vector, _weights = wtgd_attention(10)(lstm, state_h)
    # The sigmoid keeps the output in [0, 1], as required by the
    # 'binary_crossentropy' loss this notebook compiles the model with.
    outputs = tf.keras.layers.Dense(1, activation = 'sigmoid')(vector)
    return tf.keras.Model(inputs = inputs, outputs = outputs)

In [None]:
# Training-time metric: a prediction counts as positive only when the model output exceeds 0.8.
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.8),
      
]

# NOTE(review): this rebinds the name `bilstm_attention` from the builder function to the
# model it returns. Re-running this cell then calls the model with no arguments and fails
# until the function's defining cell is run again. Later cells rely on this name
# referring to the model, so any rename must be applied to them as well.
bilstm_attention = bilstm_attention()
bilstm_attention.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)
bilstm_attention.summary()



In [None]:
# Train the BiLSTM + attention model for 3 epochs, evaluating on the validation split after each epoch.
history_bilstm = bilstm_attention.fit(x = vectors_train, y = y_train, validation_data=(vectors_val, y_val), epochs=3)


In [None]:
# Evaluate the BiLSTM + attention model on the test split.
# model.predict returns continuous scores of shape (n, 1), but sklearn's classification
# metrics require hard class labels, so flatten and threshold the scores first.
preds_biattn = bilstm_attention.predict(vectors_test)
decision_threshold = 0.8  # same cut-off as the BinaryAccuracy metric used at compile time
labels_biattn = (preds_biattn.ravel() > decision_threshold).astype(int)
# The confusion matrix must use THIS model's predictions (the original passed preds_rnn).
conf_mat = confusion_matrix(y_test, labels_biattn)

print(conf_mat)
print('f1 score',f1_score(y_test, labels_biattn, average='weighted', zero_division = 1))
print('recall',recall_score(y_test, labels_biattn, average='weighted', zero_division = 1))
print('precision',precision_score(y_test, labels_biattn, average='weighted', zero_division =1))
print('accuracy',accuracy_score(y_test, labels_biattn))
