# Code Content



In [1]:
# Import base libraries for mathematical operations, dataframes, time and plotting
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
# Global matplotlib defaults: font settings applied to every figure in the notebook
font = {'family' : 'sans-serif',
        'style' : 'normal',
        'size'   : 15}
plt.rc('font', **font)
# Default figure size as (width, height), in inches
plt.rcParams['figure.figsize'] = 12, 8

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as prfs

from keras.wrappers.scikit_learn import KerasClassifier
import keras
from keras.utils import to_categorical
from keras import regularizers
from keras.constraints import maxnorm

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, LeakyReLU
from keras.callbacks import EarlyStopping

from keras.layers.core import Dropout

from joblib import dump, load

Using TensorFlow backend.


In [3]:
import py_plots
from py_plots import precisionmeasures as pm

In [4]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [5]:
# Start timestamp of the notebook run (seconds, as returned by time.time)
# NOTE(review): presumably used further down to report the total run time -- confirm
t0 = time()

In [6]:
def performance_metrics_table(test,pred,feature):
    '''Build a table of micro- and macro-averaged precision, recall and F1-score.

        Inputs:
            test = actual labels of the test set
            pred = model predictions for the test set
            feature = name given to the single value column of the table

        Output:
            Multi-index data frame (averaging method, performance measure)
            with one column, `feature`, holding the scores rounded to 4 decimals
    '''
    averaging_methods = ['micro','macro']
    # prfs returns (precision, recall, f-score, support); the support entry is not used
    scores = {method: np.round(prfs(test,pred,average = method)[:3],4)
              for method in averaging_methods}
    wide = pd.DataFrame({'Performance':['Precision','Recall','F1-Score'], **scores})
    # Reshape to long form: one row per (averaging method, measure) pair
    long = pd.melt(wide, id_vars=['Performance'], value_vars=averaging_methods,
                   var_name='Metric', value_name=feature)
    indexed = long.set_index(['Metric','Performance'])
    return indexed.rename_axis([None,'Performance Measures'])

# 1. Data upload

In [7]:
# Display names of the three tweet classes
# NOTE(review): order presumably matches the integer codes in the `labels` column (0, 1, 2) -- confirm
class_names = ['Hate','Offensive','Neutral']
# Relative path of the dataset CSV that is read with pd.read_csv
path = "datasets/balanced_dataset.csv"

In [8]:
# Read the dataset and discard every row that contains a null
# (nulls can be left over after preprocessing)
data = pd.read_csv(path).dropna()
# Preview the first 5 rows
data.head()

Unnamed: 0,labels,tweet,clean_tweet
0,0,"#sikh #temple vandalised in in #calgary, #wso ...",sikh temple vandalised in in calgary wso conde...
1,2,"@user @user @user on flipside of , praise @us...",on flipside of praise for reminder that reales...
2,2,RT @KatiePavlich: Charlie Crist doesn't have a...,charlie crist doesn t have any more political ...
3,0,@user you might be a libtard if... #libtard #...,you might be a libtard if libtard sjw liberal ...
4,0,RT @RihannaHasAids: aight game over. dykes had...,aight game over dykes had to ruin it


# 2. Split dataset into training-validation-test sets

In [9]:
# Split the dataset into training and test sets (2:1)
X_train, x_test, Y_train, y_test = train_test_split(data.clean_tweet, data.labels, test_size=0.33, random_state=42)

# Maximum word count (whitespace-separated tokens) of the tweets in the training set;
# this value is later used as the padded sequence length
max_length = np.max([len(tweet.split()) for tweet in X_train])

print('Maximum lenght (word-count) of tweets in the training set: {}\n'.format(max_length))


# Split the training dataset further into training and validation sets (2:1)
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

# One-hot encode the labels. The number of classes is passed explicitly:
# to_categorical otherwise infers it from the largest label present in each array,
# so a split that happens to lack the highest label would get fewer columns.
n_classes = len(class_names)
y_train_onehot = to_categorical(y_train, num_classes=n_classes)
y_val_onehot = to_categorical(y_val, num_classes=n_classes)
y_test_onehot = to_categorical(y_test, num_classes=n_classes)

# Print the size of each split
print('=='*15)
print('Training-Validation-Test Split')
print('=='*15)
for split_name, split_labels in (('training', y_train),
                                 ('validation', y_val),
                                 ('test', y_test)):
    print('Size of {} data: {}'.format(split_name, len(split_labels)))
    print('..'*15)

Maximum lenght (word-count) of tweets in the training set: 34

Training-Validation-Test Split
Size of training data: 8865
..............................
Size of validation data: 4367
..............................
Size of test data: 6518
..............................


# 2. Word vectorization

## 2.1 Tokenization

In [10]:
# Initialize the tokenizer and fit it on the training + validation tweets
tokenizer = Tokenizer()
# pd.concat is used instead of Series.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0
tokenizer.fit_on_texts(pd.concat([x_train, x_val]))
# +1 because the Keras Tokenizer never assigns index 0 to a word (it is reserved)
vocab_size = len(tokenizer.word_index)+1
print('Total size of training (including validation set) vocabulary: {} words'.format(vocab_size))

Total size of training (including validation set) vocabulary: 18602 words


## 2.2 Zero-padding

In [11]:
# Convert each split's tweets into sequences of integer word indices
sequence_train, sequence_val, sequence_test = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_val, x_test)
)

# Pad every sequence at the end ('post') up to max_length
padded_train, padded_val, padded_test = (
    pad_sequences(sequence, maxlen=max_length, padding='post')
    for sequence in (sequence_train, sequence_val, sequence_test)
)

In [12]:
# Load the pre-computed embedding matrix for the words in the vocabulary
# (presumably GloVe vectors, judging by the file name -- confirm)
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source
embed_dim = 300
embedding_matrix = pd.read_pickle("model/GloVe_matrix.pkl").values
print( 'Shape of embedding matrix is {} x {}'. format(*embedding_matrix.shape[:2]))

Shape of embedding matrix is 18602 x 300


# 4. Fine tuning best fit for batch sizes

After finalizing the number of layers and the number of neurons per layer, the batch size is tuned.

1. Batch size:
    - 512
    - 256
    - 128
    - 64
    - 8


In [17]:
def build_mlp_model():
    '''Create and compile the MLP classifier used for the batch-size comparison.

        Reads the notebook-level variables vocab_size, embed_dim,
        embedding_matrix and max_length.

        Output:
            Compiled Sequential model: embedding layer initialised with
            embedding_matrix (trainable), flattened, followed by dense ReLU
            layers of 64, 32, 16 and 8 units and a 3-unit softmax output
    '''
    mlp = Sequential()
    mlp.add(Embedding(vocab_size, embed_dim, weights=[embedding_matrix], input_length=max_length, trainable=True))
    mlp.add(Flatten())
    for units in (64, 32, 16, 8):
        mlp.add(Dense(units, activation='relu'))
    mlp.add(Dense(3, activation='softmax'))
    mlp.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])
    return mlp


# Train one freshly initialised model per batch size and collect its test-set
# metrics as a new column of `tbl`.
# NOTE(review): no random seed is set for the model weights, so the scores are
# expected to vary between runs.
tbl = None
for batchsize in [512,256,128,64,8]:
    print('Batch size: {}'.format(batchsize))
    model = build_mlp_model()

    # Stop once val_loss has not improved for 2 consecutive epochs
    es = EarlyStopping(monitor='val_loss',patience=2)
    history = model.fit(padded_train,y_train_onehot,
                           epochs=30,
                           batch_size=batchsize,
                           validation_data=(padded_val, y_val_onehot),
                           callbacks=[es],
                           verbose=2)
    pred = model.predict(padded_test)
    # Predicted class = index of the largest softmax output
    y_pred = np.argmax(pred,axis=1)
    # str.format replaces np.str, an alias deprecated in NumPy 1.20 and removed in 1.24
    batch_tbl = performance_metrics_table(y_test,y_pred,'Batch Size {}'.format(batchsize))
    tbl = batch_tbl if tbl is None else tbl.join(batch_tbl)

Batch size: 512
Train on 8865 samples, validate on 4367 samples
Epoch 1/30
 - 2s - loss: 0.9462 - acc: 0.5381 - val_loss: 0.7477 - val_acc: 0.7074
Epoch 2/30
 - 2s - loss: 0.5583 - acc: 0.7972 - val_loss: 0.5969 - val_acc: 0.7639
Epoch 3/30
 - 2s - loss: 0.3182 - acc: 0.8910 - val_loss: 0.5439 - val_acc: 0.7967
Epoch 4/30
 - 2s - loss: 0.1623 - acc: 0.9534 - val_loss: 0.6020 - val_acc: 0.7939
Epoch 5/30
 - 2s - loss: 0.0803 - acc: 0.9795 - val_loss: 0.6569 - val_acc: 0.7962
Batch size: 256
Train on 8865 samples, validate on 4367 samples
Epoch 1/30
 - 4s - loss: 0.9551 - acc: 0.5538 - val_loss: 0.7154 - val_acc: 0.7277
Epoch 2/30
 - 3s - loss: 0.5080 - acc: 0.8153 - val_loss: 0.5414 - val_acc: 0.7932
Epoch 3/30
 - 3s - loss: 0.2400 - acc: 0.9199 - val_loss: 0.5564 - val_acc: 0.8063
Epoch 4/30
 - 3s - loss: 0.1072 - acc: 0.9732 - val_loss: 0.6226 - val_acc: 0.8015
Batch size: 128
Train on 8865 samples, validate on 4367 samples
Epoch 1/30
 - 6s - loss: 0.9339 - acc: 0.5331 - val_loss: 0.7

In [20]:
# Banner, then the comparison table (rendered as the cell's last expression)
rule = '=='*10
print(rule, 'Best MLP Model fits based on Batch-Size', rule)
tbl



Unnamed: 0_level_0,Unnamed: 1_level_0,Batch Size 512,Batch Size 256,Batch Size 128,Batch Size 64,Batch Size 8
Unnamed: 0_level_1,Performance Measures,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
micro,Precision,0.7906,0.8015,0.7875,0.8059,0.8148
micro,Recall,0.7906,0.8015,0.7875,0.8059,0.8148
micro,F1-Score,0.7906,0.8015,0.7875,0.8059,0.8148
macro,Precision,0.7905,0.8015,0.7895,0.8055,0.8145
macro,Recall,0.7913,0.8021,0.7877,0.8064,0.8155
macro,F1-Score,0.7892,0.8,0.7884,0.8055,0.8134
