In [8]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from imblearn.over_sampling import SMOTE, ADASYN

In [9]:
# Locations of the (stemmed) training CSV and the test CSV.
TRAIN_LOCATION = 'C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/train/stemmedtrain.csv'
TEST_LOCATION = 'C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/test1/test.csv'

# Name of the column that holds the comment text.
TEXT_COLUMN = 'comment_text'

# One constant per target label column, followed by the ordered list of all
# six; the order of CLASSES is the order in which models are trained and in
# which submission columns are written.
CLASS_TOXIC = "toxic"
CLASS_SEVER_TOXIC = "severe_toxic"
CLASS_OBSCENE = "obscene"
CLASS_THREAT = "threat"
CLASS_INSULT = "insult"
CLASS_IDENTITY_HATE = "identity_hate"
CLASSES = [
    CLASS_TOXIC,
    CLASS_SEVER_TOXIC,
    CLASS_OBSCENE,
    CLASS_THREAT,
    CLASS_INSULT,
    CLASS_IDENTITY_HATE,
]

# Text / model hyper-parameters.
MAX_SEQUENCE_LENGTH = 150  # average length of sentence from training set was 67
MAX_NUM_WORDS = 200000     # vocabulary cap handed to the Tokenizer
EMBEDDING_DIM = 100        # must match the dimensionality of the GloVe file
VALIDATION_SPLIT = 0.2     # fraction of training rows held out for validation

# Read both datasets (train first, then test).
dataDf_train, dataDf_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_LOCATION, TEST_LOCATION)
)

In [10]:
# Build a lookup table from each word in the pre-trained GloVe file
# to its embedding vector.

print('Indexing word vectors.')

GLOVE_FILE = 'C:/Users/sharm/Desktop/Dat5Melb/Bias Elimination/Data/glove.6B/glove.6B.100d.txt'

embeddings_index = {}
with open(GLOVE_FILE, encoding="utf8") as glove_file:
    for raw_line in glove_file:
        # Each line is "<word> <v1> <v2> ...", separated by whitespace.
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [11]:
print('Preparing embedding matrix.')

# Learn the vocabulary from the training comments, then encode every
# comment as a list of integer word ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(dataDf_train[TEXT_COLUMN])
sequences = tokenizer.texts_to_sequences(dataDf_train[TEXT_COLUMN])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Row r of the matrix holds the pre-trained vector of the word whose
# tokenizer id is r. Rows for words missing from the GloVe index (and
# row 0, which no word id maps to) stay all-zeros.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    if row < MAX_NUM_WORDS:
        pretrained_vector = embeddings_index.get(token)
        if pretrained_vector is not None:
            embedding_matrix[row] = pretrained_vector

# Wrap the matrix in an Embedding layer; trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

Preparing embedding matrix.
Found 204191 unique tokens.


In [12]:
print('Training models.')
# Background on convolutional networks for text:
# http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/

# Seed for the train/validation shuffle so that re-running the cell yields
# the same split. Keras weight initialisation is NOT seeded here.
SPLIT_SEED = 42


def build_cnn_classifier():
    """Build and compile one binary text classifier.

    Architecture: shared frozen embedding layer -> Conv1D(128 filters,
    kernel size 5, ReLU) -> global max pooling -> Dense(128, ReLU) ->
    Dense(2, softmax). The two output units correspond to label 0 and
    label 1 of a one-hot encoded binary target.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy, rmsprop,
        accuracy metric).
    """
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(2, activation='softmax')(x)  # set to 2 because we have 0-1

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model


models = []

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Shuffle the rows once with a dedicated, seeded generator (the global
# NumPy random state is left untouched).
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]

# Split boundary expressed from the front: slicing with a negative count
# (data[:-n]) silently yields an empty training set when n == 0.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
nb_train_samples = data.shape[0] - nb_validation_samples

# The feature split is identical for every label, so compute it once.
x_train = data[:nb_train_samples]
x_val = data[nb_train_samples:]

for klass in CLASSES:
    # Reorder the label column with the same permutation as `data`.
    # num_classes=2 guarantees a two-column one-hot matrix (matching the
    # Dense(2) output) even if a label column happened to contain one value.
    labels = to_categorical(dataDf_train[klass].values[indices], num_classes=2)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    y_train = labels[:nb_train_samples]
    y_val = labels[nb_train_samples:]

    # One independent 1D convnet with global max pooling per label.
    model = build_cnn_classifier()
    model.fit(x_train, y_train,
              batch_size=64,
              epochs=3, verbose=1,
              # Skip validation entirely when no rows were held out.
              validation_data=(x_val, y_val) if nb_validation_samples else None)
    models.append(model)

Training models.
Shape of data tensor: (159571, 150)
Shape of label tensor: (159571, 2)
Train on 127657 samples, validate on 31914 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Shape of data tensor: (159571, 150)
Shape of label tensor: (159571, 2)
Train on 127657 samples, validate on 31914 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Shape of data tensor: (159571, 150)
Shape of label tensor: (159571, 2)
Train on 127657 samples, validate on 31914 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Shape of data tensor: (159571, 150)
Shape of label tensor: (159571, 2)
Train on 127657 samples, validate on 31914 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Shape of data tensor: (159571, 150)
Shape of label tensor: (159571, 2)
Train on 127657 samples, validate on 31914 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Shape of data tensor: (159571, 150)
Shape of label tensor: (159571, 2)
Train on 127657 samples, validate on 31914 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [6]:
# Score the test set with each per-label model and write the submission file.

# zip() would silently drop labels if the training cell did not finish,
# so fail early with an explicit message instead.
if len(models) != len(CLASSES):
    raise ValueError('Expected %d trained models (one per class), found %d'
                     % (len(CLASSES), len(models)))

# Missing comments are replaced with an empty string so the tokenizer does
# not receive NaN; an empty comment encodes to an all-padding sequence.
X = pad_sequences(tokenizer.texts_to_sequences(dataDf_test[TEXT_COLUMN].fillna('')),
                  maxlen=MAX_SEQUENCE_LENGTH)

predicted = []
for model, klass in zip(models, CLASSES):
    print('>>> Processing %s' %klass)
    # Column 1 of the softmax output is the probability of the positive label.
    predicted.append(model.predict(X)[:, 1])

# Rows = test comments, columns = labels. Reusing the test frame's index makes
# the column-wise concat with the id column align row for row.
y_sub = pd.DataFrame(np.array(predicted).T, columns=CLASSES, index=dataDf_test.index)

subDf = pd.concat([dataDf_test['id'], y_sub], axis=1)

subDf.to_csv('C:/Users/sharm/Desktop/Dat5Melb/Final_Project/submission.csv', index=False)

>>> Processing toxic
>>> Processing severe_toxic
>>> Processing obscene
>>> Processing threat
>>> Processing insult
>>> Processing identity_hate


In [None]:
# MSE: sum of squared errors over the columns of each row, averaged over rows.
# NOTE(review): `predictions` and `evaluation_classes` are not defined in any
# visible cell of this notebook; they are assumed to be equally shaped
# DataFrames (rows = samples, columns = labels) -- confirm before running.
d = predictions - evaluation_classes
# Vectorised per-row squared error. This replaces a lazy `map` over
# `d.as_matrix()`: on Python 3 `np.sum` cannot reduce a map object, and
# `DataFrame.as_matrix` no longer exists in current pandas.
sq_difs = (np.asarray(d, dtype=float) ** 2).sum(axis=1)
print('MSE: %f' % (sq_difs.sum() / len(d)))

In [7]:
# Plain-text preview of the first five submission rows.
print(subDf.head(n=5))

                 id     toxic  severe_toxic   obscene        threat    insult  \
0  00001cee341fdb12  0.999963  6.575979e-02  0.953947  8.918338e-09  0.925405   
1  0000247867823ef7  0.000087  7.178182e-08  0.000022  1.434730e-09  0.000035   
2  00013b17ad220c46  0.000306  1.293571e-11  0.000528  9.877542e-08  0.000202   
3  00017563c3f7919a  0.000065  1.059091e-07  0.000009  2.349599e-10  0.002628   
4  00017695ad8997eb  0.001651  2.609980e-08  0.000175  1.364719e-09  0.001154   

   identity_hate  
0   4.823716e-03  
1   1.119048e-06  
2   7.886510e-06  
3   1.252631e-06  
4   9.864416e-09  
