# Content

**<span style="color:green">Execution time = 38 minutes </span>**

Code below computes the model performance measures using the document embeddings.

Document embeddings are average of word embeddings per document (a single tweet).

Code uses **Random Forest** and **Gradient Boosting** computed after *grid-search.*

In [1]:
# Import base libraries for mathematical operations, dataframes, time and plotting
import numpy as np
import pandas as pd
from time import time
import re

import warnings
warnings.filterwarnings("ignore")

In [2]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [3]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as prfs

Using TensorFlow backend.


In [4]:
# Code Starting time
t0 = time()

## 1. Data upload

In [5]:
class_names = ['Hate','Offensive','Neutral']
path = "datasets/balanced_dataset.csv"

In [6]:
# upload the dataset
data = pd.read_csv(path)
# drop any rows with null (after preprocessing)
data = data.dropna()
# print first 5 rows of the data set
data.head()

Unnamed: 0,labels,tweet,clean_tweet
0,0,"#sikh #temple vandalised in in #calgary, #wso ...",sikh temple vandalised in in calgary wso conde...
1,2,"@user @user @user on flipside of , praise @us...",on flipside of praise for reminder that reales...
2,2,RT @KatiePavlich: Charlie Crist doesn't have a...,charlie crist doesn t have any more political ...
3,0,@user you might be a libtard if... #libtard #...,you might be a libtard if libtard sjw liberal ...
4,0,RT @RihannaHasAids: aight game over. dykes had...,aight game over dykes had to ruin it


In [7]:
# Split the dataset into training and test sets (2:1)
X_train, X_test, Y_train, Y_test = train_test_split(data.clean_tweet, data.labels, test_size=0.33, random_state=42)

In [8]:
embeddings_index = dict()
f = open('../wordemb/helper/glove/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coeff = np.asarray(values[1:], dtype = 'float32')
    embeddings_index[word] = coeff
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [9]:
def embedded_vec(text_data):
    ''' Input: Document (tweet) dataset
    
        Document embedding is computed per tweet as average of word embedding
        
        Output: Array containing document embedding of entire dataset
    '''
    # doc_embedding collects document vector for each tweet
    doc_embedding = []
    for text in text_data:
        # word counter for number of word per tweet
        word_count = 0
        # initialize an empty word embedding vector 
        word_embedding = np.zeros(300)
        for word in text.split():
            try:
                # extract word vector from GloVe dataset
                word_embedding += embeddings_index.get(word)
                # Increase word counter by one
                word_count += 1
            except:
                 pass
        if word_count:
            word_embedding /= word_count
        else:
            word_embedding = np.zeros(300)
        doc_embedding.append(word_embedding)
    # return document embeddings in array
    doc_embedding = np.array(doc_embedding)
    return doc_embedding

In [10]:
def performance_metrics_table(test,pred,feature):
    '''Inputs:
            test = actual labels of test set
            pred = model predictions for the the test set
            feature = feature name
            
            Computes macro- and micro- precision, recall and F1-score
        Output:
            Multi-index data frame with 3 precision measures 
    '''
    temp_dict = {'Performance':['Precision','Recall','F1-Score']}
    averages = ['micro','macro']
    for average in averages:
        p,r,f,_ = prfs(test,pred,average = average)
        temp_dict[average]= np.round((p,r,f),4)
    temp_df = pd.DataFrame(temp_dict)
    temp_df = pd.melt(temp_df, id_vars=['Performance'], value_vars=averages,
                        var_name='Metric', value_name=feature).set_index(['Metric','Performance'])
    temp_df = temp_df.rename_axis([None,'Performance Measures'])
    return temp_df

In [11]:
x_train = embedded_vec(X_train)
x_test = embedded_vec(X_test)

In [12]:
# Initiate Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=500, bootstrap=False, max_depth = 100)
    
print('=='*30)
print(rf_clf)
ta = time()
rf_clf.fit(x_train, Y_train)
y_pred_rf = rf_clf.predict(x_test)
tb = time()-ta
print('\nTotal Time for Random Forest fit on document embeddings is {} sec'.format(np.round(tb)))

tbl = performance_metrics_table(Y_test,y_pred_rf,'Random Forest')

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Total Time for Random Forest fit on document embeddings is 138.0 sec


In [13]:
# Initiate Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(n_estimators=800, max_depth = 5)
print('=='*30)
print(gb_clf)
ta = time()
gb_clf.fit(x_train, Y_train)
y_pred_gb = gb_clf.predict(x_test)
tb = time()-ta
print('\nTotal time for Gradient Boosting fit on document embeddings is {} sec '.format(np.round(tb)))

tbl = tbl.join(performance_metrics_table(Y_test,y_pred_gb,'Gradient Boosting'))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=800,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
Total time for Gradient Boosting fit on document embeddings is 2122.0 sec 


In [14]:
#print performance metric table for Random Forests for the feature list
print('=='*10,'Peformance Measures for Document Embedding','=='*10)
tbl



Unnamed: 0_level_0,Unnamed: 1_level_0,Random Forest,Gradient Boosting
Unnamed: 0_level_1,Performance Measures,Unnamed: 2_level_1,Unnamed: 3_level_1
micro,Precision,0.7884,0.8182
micro,Recall,0.7884,0.8182
micro,F1-Score,0.7884,0.8182
macro,Precision,0.7891,0.818
macro,Recall,0.7892,0.8187
macro,F1-Score,0.7867,0.8175


In [15]:
t1 = time()
code_time = t1 - t0
print("=="*30)
print('Total Code Execution Time: {} seconds'. format(np.round(code_time),4))
print("=="*30)

Total Code Execution Time: 2286.0 seconds


In [21]:
(0.84-0.44)/0.84

0.47619047619047616