In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model

from imblearn.over_sampling import SMOTE, ADASYN

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Absolute locations of the training and test CSV files.
TRAIN_LOCATION = 'C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/train/train.csv'
TEST_LOCATION = 'C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/test1/test.csv'

# Column that holds the raw comment text.
TEXT_COLUMN = 'comment_text'

# One named constant per label column, followed by the ordered list of all of
# them (this order is the column order used wherever CLASSES is iterated).
CLASS_TOXIC = "toxic"
CLASS_SEVER_TOXIC = "severe_toxic"
CLASS_OBSCENE = "obscene"
CLASS_THREAT = "threat"
CLASS_INSULT = "insult"
CLASS_IDENTITY_HATE = "identity_hate"
CLASSES = [
    CLASS_TOXIC,
    CLASS_SEVER_TOXIC,
    CLASS_OBSCENE,
    CLASS_THREAT,
    CLASS_INSULT,
    CLASS_IDENTITY_HATE,
]

# Preprocessing / model hyper-parameters.
MAX_SEQUENCE_LENGTH = 150  # average length of sentence from training set was 67
MAX_NUM_WORDS = 200000     # vocabulary cap handed to the tokenizer
EMBEDDING_DIM = 100        # width of one embedding vector
VALIDATION_SPLIT = 0.35    # fraction of the training rows held out for validation

# Read both CSV files into DataFrames (train first, then test).
dataDf_train, dataDf_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_LOCATION, TEST_LOCATION)
)

In [3]:
# Build a lookup table from every word in the pre-trained embeddings file
# to its embedding vector.

print('Indexing word vectors.')

embeddings_index = {}
with open('C:/Users/sharm/Desktop/Dat5Melb/Bias Elimination/Data/glove.6B/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for row in glove_file:
        # The first whitespace-separated field is the word; every remaining
        # field is one coefficient of its vector.
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [4]:
print('Preparing embedding matrix.')

# Fit the tokenizer on the training comments, then encode those same comments
# as lists of integer word indices.
train_texts = dataDf_train[TEXT_COLUMN]
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_texts)
sequences = tokenizer.texts_to_sequences(train_texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Embedding matrix: row `r` receives the pre-trained vector of the word whose
# tokenizer index is `r`. Rows are pre-filled with zeros, so a word missing
# from the embeddings index keeps an all-zero vector.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    # indices at or beyond the vocabulary cap have no row in the matrix
    if row < MAX_NUM_WORDS:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[row] = vector

# Wrap the matrix in an Embedding layer; trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

Preparing embedding matrix.
Found 210337 unique tokens.


In [None]:
print('Training models.')
# Background on convolutional networks for text:
# http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/

# Seed for the row shuffle and for SMOTE, so that re-running this cell
# reproduces the same train/validation split and the same resampled rows.
RANDOM_SEED = 42


def _oversample(features, targets, seed):
    """Balance the label distribution of ``(features, targets)`` with SMOTE.

    Calls ``fit_resample`` when the installed imbalanced-learn provides it and
    falls back to the older ``fit_sample`` name otherwise.

    Returns the resampled ``(features, targets)`` pair.
    """
    # NOTE(review): SMOTE creates synthetic rows by interpolating between
    # neighbouring samples. The features here are word indices, so synthetic
    # rows are numeric blends of token ids rather than real sentences.
    # Confirm this is intended; `class_weight` in `fit` is an alternative.
    sampler = SMOTE(random_state=seed)
    resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
    return resample(features, targets)


def _build_cnn():
    """Create and compile a 1D convnet with global max pooling.

    The network reads integer sequences of length MAX_SEQUENCE_LENGTH, passes
    them through the shared `embedding_layer`, and ends in a 2-unit softmax
    (one unit per label value, 0 and 1).
    """
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    hidden = Conv1D(128, 5, activation='relu')(embedded_sequences)
    hidden = GlobalMaxPooling1D()(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    preds = Dense(2, activation='softmax')(hidden)  # set to 2 because we have 0-1

    cnn = Model(sequence_input, preds)
    cnn.compile(loss='categorical_crossentropy',
                optimizer='rmsprop',
                metrics=['acc'])
    return cnn


models = []

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Shuffle the rows once; the same permutation is applied to the labels of
# every class below so features and labels stay aligned.
indices = np.arange(data.shape[0])
np.random.RandomState(RANDOM_SEED).shuffle(indices)
data = data[indices]

# Split the data into a training set and a validation set. An explicit split
# position is used instead of negative slices because `data[:-0]` would be
# empty if the validation count ever came out as zero.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
split_at = data.shape[0] - nb_validation_samples
x_train_raw, x_val = data[:split_at], data[split_at:]

for klass in CLASSES:
    print('Processing: ', klass)
    # Plain NumPy array in shuffled order, so the slices below are purely
    # positional and independent of the DataFrame index.
    labels = dataDf_train[klass].values[indices]
    y_train_raw, y_val_raw = labels[:split_at], labels[split_at:]

    # Oversample the training portion only; validation keeps its original
    # class balance.
    x_train, y_train = _oversample(x_train_raw, y_train_raw, RANDOM_SEED)
    # Synthetic rows may come back as floats; the model input is declared
    # int32, so make the cast explicit.
    x_train = np.asarray(x_train).astype('int32')
    # num_classes=2 keeps the target two columns wide even if a slice happens
    # to contain a single label value, matching the 2-unit softmax.
    y_train = to_categorical(y_train, num_classes=2)
    print('Training data tensor: ', x_train.shape)
    print('Labels data tensor: ', y_train.shape)

    y_val = to_categorical(y_val_raw, num_classes=2)

    # One independent binary classifier per class.
    model = _build_cnn()
    model.fit(x_train, y_train,
              batch_size=64,
              epochs=3, verbose=1,
              validation_data=(x_val, y_val))
    models.append(model)

Training models.
Processing:  toxic
Training data tensor:  (187442, 150)
Labels data tensor:  (187442, 2)
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 187442 samples, validate on 55849 samples
Epoch 1/3


In [None]:
# Score the test comments with every per-class model and write the
# submission CSV (one probability column per class, keyed by `id`).

# `zip` below would silently drop classes if training stopped early, so fail
# before running any prediction instead of after all of them.
if len(models) != len(CLASSES):
    raise ValueError('Expected %d trained models (one per class), found %d.'
                     % (len(CLASSES), len(models)))

# Encode the test comments with the tokenizer fitted on the training set.
# Missing comments are replaced by empty strings so tokenisation does not
# fail on NaN values, in case the test CSV contains blank comments.
test_texts = dataDf_test[TEXT_COLUMN].fillna('')
X = pad_sequences(tokenizer.texts_to_sequences(test_texts),
                  maxlen=MAX_SEQUENCE_LENGTH)

predicted = []
for model, klass in zip(models, CLASSES):
    print('>>> Processing %s' %klass)
    # column 1 of the softmax output is the probability of label 1
    predicted.append(model.predict(X)[:, 1])

# Rows = test comments, columns = classes in CLASSES order.
y_sub = pd.DataFrame(np.array(predicted).T, columns=CLASSES)

subDf = pd.concat([dataDf_test['id'], y_sub], axis=1)
subDf.to_csv('C:/Users/sharm/Desktop/Dat5Melb/Final_Project/submission.csv', index=False)

In [None]:
# Preview the first rows of the submission; as the last expression of the
# cell it is rendered with the notebook's rich DataFrame display.
subDf.head()