# Content

**Warning: <span style="color:red">high execution time (several hours) </span>**

The code below computes model performance measures (micro- and macro-averaged precision, recall and F1-score) for each of the following input features:

    - Character Bi-grams
    - Character Tri-grams
    - Character 4-grams
    - Character 5-grams
    - Character 6-grams
    - Character 7-grams
    - Character 8-grams
    - Word Uni-grams
    - Word Bi-grams
    - Word Tri-grams

The code uses **Random Forest** and **Gradient Boosting** classifiers, with hyperparameters fixed from a prior *grid search*.

In [None]:
# Import base libraries for mathematical operations, dataframes, time and plotting
import numpy as np
import pandas as pd
from time import time
import re

import warnings
warnings.filterwarnings("ignore")

In [None]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as prfs

In [None]:
# Record the wall-clock start time of the notebook run; the last cell subtracts
# it from the end time to report the total execution time.
t0 = time()

## 1. Data upload

In [None]:
# Presumably the human-readable names of the three target classes.
# NOTE(review): class_names is not referenced in the visible cells - confirm it is still needed.
class_names = ['Hate','Offensive','Neutral']
# Relative path of the CSV file that is read into `data` by pd.read_csv
path = "datasets/balanced_dataset.csv"

In [None]:
# Read the CSV and discard every row containing a null value
# (nulls can be left behind by the preprocessing step)
data = pd.read_csv(path).dropna()
# Preview the first 5 rows of the data set
data.head()

In [None]:
# Hold out 33% of the rows as the test set (roughly a 2:1 train/test split);
# the fixed seed keeps the partition identical across runs
X_train, X_test, Y_train, Y_test = train_test_split(
    data['clean_tweet'],
    data['labels'],
    test_size=0.33,
    random_state=42,
)

In [None]:
# Character-level n-gram features, in increasing order of N
char_features = [
    'char bi-gram',
    'char tri-gram',
    'char 4-gram',
    'char 5-gram',
    'char 6-gram',
    'char 7-gram',
    'char 8-gram',
]
# Word-level n-gram features, in increasing order of N
word_features = [
    'word uni-gram',
    'word bi-gram',
    'word tri-gram',
]
# Full list of features to evaluate: character features first, then word features
feature_list = char_features + word_features

In [None]:
def feature_info(feature):
    '''
    Parse a feature name such as 'char 4-gram' or 'word bi-gram'.

    Input: feature = name of the form '<analyzer> <N>-gram', where <N> is
           either a number ('4') or one of the prefixes 'uni', 'bi', 'tri'
    Steps:
        Strip surrounding whitespace and convert the name to lower case
        Split the name at whitespaces and hyphens
        Set analyzer to 0th element of the split list 'char'/'word'
        Convert 1st element into numerical value
        Convert words like uni-1, bi-2, tri-3
    Output: analyzer = word/char
            N = size of the N-grams (int)
    Raises: IndexError if the name has fewer than two tokens
            ValueError if the second token is neither a known prefix nor a number
    '''
    # Normalise case so that e.g. 'Char Bi-gram' parses like 'char bi-gram'
    token = re.split(r'\s|-', feature.strip().lower())
    analyzer = token[0]

    prefix_to_n = {'uni': 1, 'bi': 2, 'tri': 3}

    if token[1] in prefix_to_n:
        N = prefix_to_n[token[1]]
    else:
        # Builtin int: the np.int alias was removed in NumPy 1.24
        N = int(token[1])
    return (analyzer, N)

In [None]:
def performance_metrics_table(test,pred,feature):
    '''Build a one-column table of performance measures for a single feature.

        Inputs:
            test = actual labels of test set
            pred = model predictions for the test set
            feature = feature name, used as the name of the value column

        Computes micro- and macro-averaged precision, recall and F1-score,
        each rounded to 4 decimals.

        Output:
            Data frame indexed by (average, performance measure) with a
            single column named after the feature
    '''
    averages = ['micro', 'macro']

    # One column per averaging scheme; prfs also returns the support as a
    # 4th element, which is dropped by the slice
    scores = {'Performance': ['Precision', 'Recall', 'F1-Score']}
    scores.update({
        avg: np.round(prfs(test, pred, average=avg)[:3], 4)
        for avg in averages
    })
    wide = pd.DataFrame(scores)

    # Reshape to long form: one row per (average, measure) pair
    long = pd.melt(
        wide,
        id_vars=['Performance'],
        value_vars=averages,
        var_name='Metric',
        value_name=feature,
    )
    table = long.set_index(['Metric', 'Performance'])
    return table.rename_axis([None, 'Performance Measures'])

In [None]:
# Seed shared by both classifiers so that the reported metrics are reproducible
# from run to run (the train/test split is seeded as well).
RANDOM_STATE = 42

# Performance tables (one column per feature); created on the first feature
rf_tbl = None
gb_tbl = None

# For each feature, vectorize the tweets and fit both tree-ensemble classifiers
# (Random Forest and Gradient Boosting).
for feature in feature_list:
    # Extract the analyzer ('char'/'word') and N-gram size from the feature name
    analyzer, N_range = feature_info(feature)
    # Vectorize the text data; the vocabulary is fitted on the training set only
    vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=(N_range, N_range))
    x_train = vectorizer.fit_transform(X_train)
    x_test = vectorizer.transform(X_test)
    #########################################
    # Initiate Random Forest Classifier
    rf_clf = RandomForestClassifier(n_estimators=500, bootstrap=False, max_depth=100,
                                    random_state=RANDOM_STATE)

    print('=='*30)
    # Time fit + predict together
    ta = time()
    rf_clf.fit(x_train, Y_train)
    y_pred_rf = rf_clf.predict(x_test)
    tb = time()-ta
    print('Total Time for Random Forest fit on {} is {} sec'.format(feature.lower(), np.round(tb)))
    #########################################
    # Initiate Gradient Boosting Classifier
    gb_clf = GradientBoostingClassifier(n_estimators=800, max_depth=5,
                                        random_state=RANDOM_STATE)
    print('..'*30)
    # Time fit + predict together
    ta = time()
    gb_clf.fit(x_train, Y_train)
    y_pred_gb = gb_clf.predict(x_test)
    tb = time()-ta
    print('Total time for Gradient Boosting fit on {} is {} sec '.format(feature.lower(), np.round(tb)))
    #########################################
    # Store the results of each classifier: the first feature creates the
    # table, every later feature is joined on as a new column
    rf_new = performance_metrics_table(Y_test, y_pred_rf, feature)
    gb_new = performance_metrics_table(Y_test, y_pred_gb, feature)
    rf_tbl = rf_new if rf_tbl is None else rf_tbl.join(rf_new)
    gb_tbl = gb_new if gb_tbl is None else gb_tbl.join(gb_new)

print('=='*30)

In [None]:
# Show a banner, then display the Random Forest performance table
# (one column per feature in the feature list)
rf_rule = '==' * 22
print(rf_rule, 'Random Forest Classifier', rf_rule)
rf_tbl

In [None]:
# Show a banner, then display the Gradient Boosting performance table
# (one column per feature in the feature list)
gb_rule = '==' * 21
print(gb_rule, 'Gradient Boosting Classifier', gb_rule)
gb_tbl

In [None]:
# Report the total wall-clock time elapsed since t0 was recorded
t1 = time()
code_time = t1 - t0
print("=="*30)
# The precision (4 decimals) is an argument of np.round, not of str.format,
# where an extra positional argument would be silently ignored
print('Total Code Execution Time: {} seconds'.format(np.round(code_time, 4)))
print("=="*30)