**Import Libraries**

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix,f1_score

import xgboost as xgb

In [None]:
# Local-machine settings. Neither name is referenced by any other cell visible
# in this notebook.
# NOTE(review): hard-coded absolute Windows path; it will not resolve on the
# Colab runtime that the later Drive-mount cell implies — confirm whether these
# two settings are still needed.
local_path = "C:\\Greenwich\\MSc Project\\project_code\\"
store_local = False

**Load Data**

In [None]:
# Read the train and test CSVs directly from the project's GitHub repository.
# NOTE(review): both URLs point at the `develop` branch rather than a fixed
# commit, so the data may change between runs — consider pinning a commit SHA.
full_train_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/mixed/mixed_train_data.csv?raw=true")
test_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/mixed/mixed_test_data.csv?raw=true")

In [None]:
# Row counts of the two loaded frames: (train, test).
len(full_train_data), len(test_data)

(89979, 10000)

In [None]:
# Hold out 5,000 rows of the training frame for validation; the fixed seed
# keeps the split identical across runs.
train_data, val_data = train_test_split(
    full_train_data,
    test_size=5000,
    random_state=21,
)

In [None]:
# Sizes of the three partitions: (train, validation, test).
len(train_data), len(val_data), len(test_data)

(84979, 5000, 10000)

**Splitting into labels and features**

In [None]:
# Peek at the first rows to see which columns are available.
train_data.head()

Unnamed: 0,tweet_text,is_cyberbullying,cleaned
51990,"""\n Yes, it is. But a demo is not notable enou...",0,ye is demo notabl enough headlin alreadi writt...
5095,Has it occurred to you that I wouldn't tell so...,1,occur tell someon fuck first refer edit bullsh...
64041,Notice of Edit warring noticeboard discussion\...,0,notic edit war noticeboard discuss hello messa...
53391,""":It doesn't even seem that an amendment is su...",0,it even seem amend subject p5 vote perman five...
10,"""\n\nUNBLOCK ME OR I'LL GET MY LAWYERS ON TO Y...",1,unblock ill get lawyer block constitut right f...


In [None]:
def split_label_and_feature(data):
    """Pull the model input column and the target column out of ``data``.

    Args:
        data: Table indexed by column name that contains a ``'cleaned'``
            column and an ``'is_cyberbullying'`` column.

    Returns:
        A ``(features, labels)`` tuple: ``data['cleaned']`` followed by
        ``data['is_cyberbullying']``. Note that the order is features first,
        labels second — the reverse of what the function name suggests.
    """
    features = data['cleaned']
    labels = data['is_cyberbullying']
    return features, labels

In [None]:
# Separate each partition into its text column (X_*) and its label column (y_*).
# NOTE(review): X_test / y_test are not used by any later cell visible here —
# the held-out test set is never scored in this notebook.
X_train, y_train = split_label_and_feature(train_data)
X_val, y_val = split_label_and_feature(val_data)
X_test, y_test = split_label_and_feature(test_data)

**Model Training**

In [None]:
# Bag-of-words counts; the vocabulary is learned from the training text only.
# Despite its name, `clf` is the vectorizer, not a classifier.
clf = CountVectorizer()
X_train_cv = clf.fit_transform(X_train)

# Re-weight the raw counts with TF-IDF, fitted on the training counts only.
tf_transformer = TfidfTransformer(use_idf=True)
X_train_tf = tf_transformer.fit_transform(X_train_cv)

In [None]:
# Apply the already-fitted vectorizer and TF-IDF weights to the validation
# text (transform only — nothing is refitted on validation data).
X_val_cv = clf.transform(X_val)
X_val_tf = tf_transformer.transform(X_val_cv)

**Random Forest**

In [None]:
# 500-tree random forest on the TF-IDF features.
# random_state pins the bootstrap / feature sampling so the validation scores
# are reproducible on a fresh kernel (same seed as the train/validation split);
# n_jobs=-1 builds the trees on all cores and does not change the fitted model.
rf_model = RandomForestClassifier(n_estimators=500, random_state=21, n_jobs=-1)
rf_model.fit(X_train_tf, y_train)

RandomForestClassifier(n_estimators=500)

In [None]:
# Validation predictions from the random forest. The name `predictions` is
# reassigned by each later model's evaluation cell.
predictions = rf_model.predict(X_val_tf)

In [None]:
# Confusion matrix on the validation split (sklearn convention: rows are true
# labels, columns are predicted labels).
confusion_matrix(y_val,predictions)

array([[2352,  224],
       [ 323, 2101]])

In [None]:
# Validation F1 with f1_score's default settings.
f1_score(y_val,predictions)

0.8848178563908191

**Naive Bayes**

In [None]:
# Multinomial Naive Bayes with default settings, trained on the same TF-IDF
# features as the other models.
nb_model = MultinomialNB()
nb_model.fit(X_train_tf, y_train)

MultinomialNB()

In [None]:
# Validation predictions from Naive Bayes (overwrites the previous `predictions`).
predictions = nb_model.predict(X_val_tf)

In [None]:
# Validation confusion matrix for Naive Bayes.
confusion_matrix(y_val,predictions)

array([[2149,  427],
       [ 257, 2167]])

In [None]:
# Validation F1 for Naive Bayes.
f1_score(y_val,predictions)

0.8636907134316462

**XGBoost**

In [None]:
# XGBoost classifier with n_estimators=500 and every other hyper-parameter left
# at the library default, trained on the same TF-IDF features.
xgb_model = xgb.XGBClassifier(n_estimators=500)
xgb_model.fit(X_train_tf, y_train)

XGBClassifier(n_estimators=500)

In [None]:
# Validation predictions from the untuned XGBoost model (overwrites `predictions`).
predictions = xgb_model.predict(X_val_tf)

In [None]:
# Validation confusion matrix for the untuned XGBoost model.
confusion_matrix(y_val,predictions)

array([[2445,  131],
       [ 446, 1978]])

In [None]:
# Validation F1 for the untuned XGBoost model.
f1_score(y_val,predictions)

0.8727112287668211

**Hyperparameter Search on XGBoost**

In [None]:
# Candidate values for each XGBoost hyper-parameter explored by the search.
param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5, 10, 15],
    'n_estimators': [100, 300, 500, 800, 1000],
    'learning_rate': [0.01, 0.06, 0.1, 0.5],
}

In [None]:
# Initialize the hyper-parameter search.
# Only a randomised search is run: the exhaustive grid over `param_grid` is
# 13,500 combinations, so a small random sample of candidates is scored instead.
# NOTE(review): candidates are ranked by accuracy, while the rest of the
# notebook compares models by F1 — confirm this is intended.
xgb_base = xgb.XGBClassifier()
xgb_grid_model = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid,
    n_iter=10,        # number of sampled candidates (sklearn's default, made explicit)
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
    verbose=10,
    random_state=21,  # fixed so the same candidates are drawn on every run
)

# Fit the search (2-fold CV per candidate) and report the winner.
xgb_grid_model.fit(X_train_tf, y_train)
print("Best score: %0.3f" % xgb_grid_model.best_score_)
print("Best parameters set:")
best_parameters = xgb_grid_model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Keep the best candidate found by the search.
best_xgb_model = xgb_grid_model.best_estimator_

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best score: 0.887
Best parameters set:
	colsample_bytree: 1.0
	gamma: 1
	learning_rate: 0.06
	max_depth: 15
	min_child_weight: 5
	n_estimators: 500
	subsample: 1.0


**Retrain Best Model and save**

In [None]:
# Train the XGBoost model that is evaluated and pickled in the following cells,
# using hand-entered hyper-parameters. This rebinds `best_xgb_model`, discarding
# the `best_estimator_` taken from the search.
# NOTE(review): these values do not match the "Best parameters set" recorded in
# the search output of this notebook (e.g. max_depth 15 vs 10, n_estimators 500
# vs 800) — presumably copied from a different search run; confirm.
best_xgb_model = xgb.XGBClassifier(colsample_bytree=0.6,gamma=0.5,learning_rate=0.06,max_depth=10,min_child_weight=1,n_estimators=800,subsample=0.8)
best_xgb_model.fit(X_train_tf, y_train)

XGBClassifier(colsample_bytree=0.6, gamma=0.5, learning_rate=0.06, max_depth=10,
              n_estimators=800, subsample=0.8)

In [None]:
# Validation predictions and confusion matrix for the retrained XGBoost model.
predictions = best_xgb_model.predict(X_val_tf)
confusion_matrix(y_val,predictions)

array([[2418,  158],
       [ 347, 2077]])

In [None]:
# Validation F1 for the retrained XGBoost model.
f1_score(y_val,predictions)

0.8916076411247048

In [None]:
# Mount Google Drive at /content/drive (Colab helper).
# NOTE(review): this import sits outside the notebook's top import cell, and
# presumably only resolves on a Colab kernel — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/MSc\ Data\ Science/MSc\ Project/models

/content/drive/MyDrive/MSc Data Science/MSc Project/models


In [None]:
import pickle

# Output file, written relative to the current working directory.
file_name = "mixed_best_xgb.pkl"

# save — the context manager guarantees the handle is flushed and closed,
# even if pickling raises.
# NOTE(review): only the classifier is persisted; no visible cell saves the
# fitted CountVectorizer (`clf`) or `tf_transformer` that produced its input
# features — confirm they are stored elsewhere.
with open(file_name, "wb") as model_file:
    pickle.dump(best_xgb_model, model_file)

In [None]:
ls

best_xgb.pkl    distilbert--without-stem-94/  roberta/
distilbert-94/  mixed_best_xgb.pkl
