# Content

**Warning: <span style="color:red">high execution time (several hours) </span>**

The code below runs a grid search over random forest and gradient boosting classifiers using *word uni-gram* TF-IDF features.

The best-fit models for **Random Forest** and **Gradient Boosting** are saved in the *model* directory.

In [None]:
# Import base libraries for mathematical operations, dataframes, time and plotting
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
# Global matplotlib defaults: normal-style sans-serif text at size 15,
# and a default figure size of 12 x 8.
font = dict(family='sans-serif', style='normal', size=15)
plt.rc('font', **font)
plt.rcParams['figure.figsize'] = (12, 8)

import warnings
# Install a blanket "ignore" filter so no warnings are shown from here on.
warnings.filterwarnings(action="ignore")

In [None]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [None]:
import re
import joblib


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Import helper folder for plotting
import py_plots
from py_plots import precisionmeasures as pm

In [None]:
# Record the start time of the run (seconds since the epoch, from time.time).
# NOTE(review): presumably used later to report total execution time — confirm.
t0 = time()

## 1. Data upload

In [None]:
# Display names of the three target classes.
class_names = [
    'Hate',
    'Offensive',
    'Neutral',
]

# Location of the dataset CSV, relative to the working directory.
path = "datasets/balanced_dataset.csv"

In [None]:
# Read the CSV at `path` and discard every row that contains a null value
# (nulls can be left over after preprocessing).
data = pd.read_csv(path).dropna()

# Show the first 5 rows as the cell output.
data.head()

In [None]:
# Hold out 33% of the rows for testing (roughly a 2:1 train/test split).
# The fixed random_state makes the split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    data["clean_tweet"],
    data["labels"],
    test_size=0.33,
    random_state=42,
)

## 2. TFIDF Vectorizer

In [None]:
# Build TF-IDF features using TfidfVectorizer with its default settings.
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the training texts only and vectorize them.
x_train = vectorizer.fit_transform(X_train)
# Vectorize the test texts with the already-fitted vectorizer (no refit).
x_test = vectorizer.transform(X_test)

## 3. Grid Search

**Random Forest Grid Search Parameters**:
1. Bootstrap
    - <span style="color:blue">True, False</span>
2. Maximum depth of tree
    - <span style="color:blue">90, 100, 110</span>
3. Number of trees
    - <span style="color:blue">300, 500, 800</span>

**Gradient Boosting Grid Search Parameters**:
1. Learning rate set
    - <span style="color:blue">0.1, 0.01</span>
2. Maximum depth of tree
    - <span style="color:blue">3, 5</span>
3. Number of trees
    - <span style="color:blue">500, 800</span>


*With stratified, 5-fold, cross-validation sets.*

In [None]:
# Hyper-parameter grids for the grid search (per the original note, chosen
# based on the results of a random search). Each key names an estimator
# parameter; each list holds the candidate values to try.

# Random forest: 2 x 3 x 3 = 18 parameter combinations.
param_grid_rf = dict(
    bootstrap=[True, False],
    max_depth=[90, 100, 110],
    n_estimators=[300, 500, 800],
)

# Gradient boosting: 2 x 2 x 2 = 8 parameter combinations.
param_grid_gb = dict(
    learning_rate=[0.1, 0.01],
    max_depth=[3, 5],
    n_estimators=[500, 800],
)


# Standard-library helper used below to create the output directory.
import os

# For each estimator: run an exhaustive grid search with 5-fold
# cross-validation, print the score of every parameter combination, persist
# the fitted search object and evaluate it on the held-out test set.
for clf, name in (
    (RandomForestClassifier(), "Random Forest"),
    (GradientBoostingClassifier(), 'Gradient Boosting'),
):
    print('=' * 80)
    print(clf)

    # Select the parameter grid and output file that belong to this estimator.
    if name == 'Random Forest':
        param_grid = param_grid_rf
        filename = "model/rf_bestfit_model.pkl"
    else:
        param_grid = param_grid_gb
        filename = "model/gb_bestfit_model.pkl"

    # Create the output directory BEFORE the long-running search, so that a
    # missing folder cannot make joblib.dump fail after the fit has finished
    # and throw away the results.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Instantiate the grid search model
    grid_result = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, verbose=2)

    grid_result.fit(x_train, Y_train)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    # Report mean and standard deviation of the cross-validated test score
    # for every parameter combination that was tried.
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # Persist the whole fitted GridSearchCV object (not only the best estimator).
    joblib.dump(grid_result, filename)

    # Load from file; the reloaded object is what gets evaluated below, which
    # also confirms that the saved file can be read back.
    model = joblib.load(filename)

    # Calculate the accuracy and predictions
    score = model.score(x_test, Y_test)
    print("Test score: {0:.2f} %".format(100 * score))
    predict = model.predict(x_test)