In [1]:
#Dependencies
import pandas as pd
import numpy as np
import nltk
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Embedding, GlobalMaxPool1D, Bidirectional
from keras.layers import Dense, LSTM, Dropout, BatchNormalization, Activation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jharmse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## TF-IDF - Akshi

## Naive Bayes - Xinbin

## Logistic Regression - Akshi

## Word2vec - Xinbin

## Multilayer Perceptron - Matt

### Data Import

For this project, we are using Kaggle's toxic comment datasets. The data, and an overview of the data, can be found [here](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data).

There are two files worth taking note of:

* `train.csv`
* `test.csv`

We will *train* our models on the `train.csv` data. To ensure that our model isn't memorising the training data (a.k.a. overfitting), we will *test* our model on the independent `test.csv` data.

`test.csv` has the same format as `train.csv`, but contains never-seen-before comment. By testing our model on this dataset, will give us an indication of whether our model will work in a real-world application (will it be able to flag or delete new toxic comments?).

In [24]:
train = pd.read_csv('../additional/data/train.csv')
test = pd.read_csv('../additional/data/test.csv')

print("Training data examples:")
display(train.head())
print("Test data examples:")
display(test.head())

Training data examples:


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Test data examples:


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


## Bidirectional LSTM - Matt

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [10]:
def preproc_line(line):
    text = re.sub(r"[^a-zA-Z0-9]", " ", line.lower())
    words = text.split()
    words = [w for w in words if w not in stopwords.words('english')]
  
    return words

In [16]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = train[classes].values
X = train['comment_text']

In [17]:
print(len(X))
print(len(y))
print(X_temp[0])

159571
159571
Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27


In [18]:
print(X[0])

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27


In [19]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X))
X_tokenized = tokenizer.texts_to_sequences(X)

In [20]:
max_len = 200
X_pad = pad_sequences(X_tokenized, maxlen=max_len)

In [21]:
def create_network(input_dim, embed_size, units, layers, output_dim, prob=0.2):
    input_ = Input(name='input_', shape=(input_dim, ))
    embed = Embedding(input_dim, embed_size)(input_)

    def add_layer(input_layer, units, name):
        lstm = Bidirectional(LSTM(units, return_sequences=True, activation='relu',
                                 name=name))(input_layer)
        bn_layer = BatchNormalization()(lstm)
        return bn_layer

    for i in range(layers):
        if i == 0:
            last_layer = add_layer(embed, units, 'rnn0')
        else:
            last_layer = add_layer(last_layer, units, 'rnn'+str(i))

        x = Dropout(prob)(last_layer)
        x = GlobalMaxPool1D()(x)
        x = Dense(units, activation='relu')(x)
        x = Dropout(prob)(x)
        logits = Dense(output_dim, name='logits')(x)
        out = Activation('sigmoid', name='out')(logits)
        model = Model(inputs=input_, outputs=out)

        print(model.summary())

        return model

In [22]:
embed_size = 128
units = 128
layers = 2
batch_size = 32
epochs = 4

In [23]:
K.clear_session()
model = create_network(max_len, embed_size, units, layers, len(classes))
model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ (InputLayer)          (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 128)          25600     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 256)          263168    
_________________________________________________________________
batch_normalization_1 (Batch (None, 200, 256)          1024      
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 256)          0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
__________

In [None]:
# model.fit(X_pad, y, batch_size=batch_size, epochs=epochs, validation_split=0.1)

## Accuracy - Akshi

## AUC (ROC) - Akshi