In [4]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.engine.topology import Layer
from keras.models import Model

In [2]:
df = pd.read_csv('toxic_comment_data.csv')
df.head()

Unnamed: 0,is_offensive,text
0,0,Then go to the village pump and suggest they c...
1,1,ANTI GREEK NATIONALIS -WIKIPEDIA \n\nHi Alexik...
2,1,Dis hoe wasnt dis violent on Lottery Ticket 😂😂
3,0,It is better for Atabay not helping the banned...
4,0,"""is in CamelCase. """"SiCKO"""" is not CamelCase,..."


In [30]:
MAX_NB_WORDS = 100000

In [3]:
x = df['text'].astype(str)
y = df['is_offensive']

In [5]:
def cleanData(text):    
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [6]:
x = x.map(lambda a: cleanData(a))

In [10]:
def text_to_wordlist(text):    
    #Remove Special Characters
    text = re.sub(r'[^a-z\d ]', " ", text)
    text = re.sub(r'\d+', '_num_', text)    
    return(text)

In [11]:
x = x.map(lambda a: text_to_wordlist(a))

In [12]:
train, test, y_train, y_test = train_test_split(x,y,stratify=y,random_state=42)

In [16]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(x)

In [17]:
x_train = tokenizer.texts_to_sequences(train)

In [18]:
x_test = tokenizer.texts_to_sequences(test)
word_index = tokenizer.word_index

In [21]:
x = tokenizer.texts_to_sequences(x)

In [22]:
vocab_size = len(word_index)
print('Vocab size: {}'.format(vocab_size))
longest = max(len(seq) for seq in x)
print("Longest comment size: {}".format(longest))
average = np.mean([len(seq) for seq in x])
print("Average comment size: {}".format(average))
stdev = np.std([len(seq) for seq in x])
print("Stdev of comment size: {}".format(stdev))
max_len = int(average + stdev * 3)
print('Max comment size: {}'.format(max_len))

Vocab size: 175236
Longest comment size: 1401
Average comment size: 60.708669190795966
Stdev of comment size: 95.75519768824256
Max comment size: 347


In [23]:
processed_post_x_train = pad_sequences(x_train, maxlen=max_len, padding='post', truncating='post')
processed_post_x_test = pad_sequences(x_test, maxlen=max_len, padding='post', truncating='post')

In [24]:
processed_x_train = pad_sequences(x_train, maxlen=max_len)
processed_x_test = pad_sequences(x_test, maxlen=max_len)

In [25]:
embeddings_index = {}
f = open(os.path.join('glove.6B/', 'glove.6B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))    

Found 400000 word vectors.


In [31]:
nb_words = min(MAX_NB_WORDS,vocab_size)

100000


In [26]:
embedding_dim = 300
embedding_matrix = np.zeros((nb_words, embedding_dim))
for word, i in word_index.items():
    if i >= 100000:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [27]:
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        #self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
    #print weigthted_input.shape
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        #return input_shape[0], input_shape[-1]
        return input_shape[0],  self.features_dim

In [28]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, concatenate
from keras.layers.merge import concatenate
from keras.models import Model

In [36]:
import keras.backend
from keras.models import Sequential, load_model
from keras.layers import CuDNNGRU, Dense, Conv1D, MaxPooling1D
from keras.layers import Dropout, GlobalMaxPooling1D, BatchNormalization, LSTM
from keras.layers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Nadam
from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint

In [44]:
def get_model():
    comment_input = Input(shape=(max_len,), dtype='int32')
    comment_input_post = Input(shape=(max_len,), dtype='int32')

    x1 = Embedding(nb_words, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True)(comment_input)
    x1 = LSTM(60, return_sequences=True)(x1)
    x1 = Attention(max_len)(x1)

    x2 = Embedding(nb_words, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True)(comment_input_post)
    x2 = LSTM(60, return_sequences=True)(x2)
    x2 = Attention(max_len)(x2)

    x = concatenate([x1, x2])
    x = BatchNormalization()(x)
    x = Dense(50, activation='relu')(x)
    x = Dropout(0.3)(x)
    preds = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[comment_input, comment_input_post], outputs=preds)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [45]:
model = get_model()
filepath="attnbest.hd5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
model_hist = model.fit([processed_x_train,processed_post_x_train],y_train,validation_data=([processed_x_test,processed_post_x_test],y_test),epochs=5,batch_size=256,verbose=1,callbacks=callbacks_list)

Train on 138265 samples, validate on 46089 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.95723, saving model to attnbest.hd5
Epoch 2/5

Epoch 00002: val_acc improved from 0.95723 to 0.96216, saving model to attnbest.hd5
Epoch 3/5
 31744/138265 [=====>........................] - ETA: 1:57:29 - loss: 0.0531 - acc: 0.9802