**Import Libraries**

In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix,f1_score

import xgboost as xgb

In [2]:
local_path = "C:\\Greenwich\\MSc Project\\project_code\\"
store_local = False

**Load Data**

In [None]:
# full_train_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/mixed/mixed_train_data.csv?raw=true")
# test_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/mixed/mixed_test_data.csv?raw=true")

In [3]:
full_train_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/uniform/uniform_train_data.csv?raw=true")
test_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/uniform/uniform_test_data.csv?raw=true")

In [4]:
len(full_train_data), len(test_data)

(91269, 9851)

In [None]:
# train_data, val_data = train_test_split(full_train_data,test_size=5000,random_state=21)

In [20]:
split = StratifiedShuffleSplit(n_splits=2,test_size=0.1, random_state=23)
for train_index, val_index in split.split(full_train_data[['tweet_text','cleaned']],full_train_data['is_cyberbullying']):
    train_data = full_train_data.loc[train_index]
    val_data = full_train_data.loc[val_index]

In [21]:
len(train_data), len(val_data), len(test_data)

(82142, 9127, 9851)

**Splitting into labels and features**

In [22]:
train_data.head()

Unnamed: 0,tweet_text,is_cyberbullying,cleaned
8163,@iamlabeng peace and order at the price of HR ...,1,peac order price hr violat airport lgu achiev ...
71649,"Foo Fighters edit \n\nIt wasn't disruptive, I ...",0,foo fighter edit disrupt ask valid question ex...
29745,and make a video with screenshots exposing my ...,1,make video screenshot expos bulli bulli high s...
10622,#Feminazi's gone wild! Smh!😒,1,feminazi gone wild smh
76699,"I agree. More than one overlapping articles, p...",0,agre one overlap articl practic theme neg effe...


In [23]:
val_data.head()

Unnamed: 0,tweet_text,is_cyberbullying,cleaned
32316,"@Rileyyz_69 stupid fuck, riley isn't allowed t...",1,stupid fuck riley allow use kind social networ...
28244,all the girls from high school who bullied me ...,1,girl high school bulli like 1d realli wan na p...
45409,"== Hey, you didnt tell me how fucking long thi...",1,hey didnt tell fuck long block for wtf long wait
774,RT @NoToFeminism: I don’t need femisnn i heard...,0,rt don ’ t need femisnn heard femist tri write...
83717,}}\n{{WikiProject University of Oxford|class=B...,0,wikiproject univers oxfordclassbimportancemid


In [24]:
test_data.head()

Unnamed: 0,tweet_text,is_cyberbullying,cleaned
0,This video could be terrible and my weave woul...,0,video could terribl weav would still snatch
1,or so I can direct parents there around xmas t...,0,direct parent around xma time p
2,Drasko trying to use his fork to eat the bread...,0,drasko tri use fork eat breadcrumb
3,@NikkiGobel hmm okay.,0,hmm okay
4,Women have been equal socially for quite awhil...,0,women equal social quit awhil lt said author t...


In [25]:
def split_label_and_feature(data):
    return data['cleaned'], data['is_cyberbullying']

In [26]:
X_train, y_train = split_label_and_feature(train_data)
X_val, y_val = split_label_and_feature(val_data)
X_test, y_test = split_label_and_feature(test_data)

**Model Training**

In [27]:
clf = CountVectorizer()
X_train_cv =  clf.fit_transform(X_train)

tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_cv)
X_train_tf = tf_transformer.transform(X_train_cv)

In [28]:
X_val_cv = clf.transform(X_val)
X_val_tf = tf_transformer.transform(X_val_cv)

**Random Forest**

In [29]:
rf_model = RandomForestClassifier(n_estimators=500)
rf_model.fit(X_train_tf,y_train)

RandomForestClassifier(n_estimators=500)

In [30]:
predictions = rf_model.predict(X_val_tf)

In [31]:
confusion_matrix(y_val,predictions)

array([[4108,  405],
       [ 581, 4033]])

In [32]:
f1_score(y_val,predictions)

0.8910737958462218

**Naive Baiyes**

In [33]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tf, y_train)

MultinomialNB()

In [34]:
predictions = nb_model.predict(X_val_tf)

In [35]:
confusion_matrix(y_val,predictions)

array([[3783,  730],
       [ 427, 4187]])

In [36]:
f1_score(y_val,predictions)

0.8786066519777568

**XGBoost**

In [37]:
xgb_model = xgb.XGBClassifier(n_estimators=500)
xgb_model.fit(X_train_tf, y_train)

XGBClassifier(n_estimators=500)

In [38]:
predictions = xgb_model.predict(X_val_tf)

In [39]:
confusion_matrix(y_val,predictions)

array([[4276,  237],
       [ 836, 3778]])

In [40]:
f1_score(y_val,predictions)

0.8756518715957816

**Grid Search on Best Model**

In [41]:
#setting grid of selected parameters for iteration
param_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 10, 15],
        'n_estimators': [100,300,500,800,1000],
        'learning_rate': [0.01, 0.06, 0.1, 0.5,],
        }

In [42]:
# Initialize Grid Search Model
xgb_base = xgb.XGBClassifier()
xgb_grid_model = GridSearchCV(estimator=xgb_base, param_grid=param_grid, scoring='accuracy',
                                 verbose=10, n_jobs=-1, cv=2)

xgb_grid_model = RandomizedSearchCV(estimator=xgb_base, param_distributions=param_grid, scoring='accuracy',
                                 verbose=10, n_jobs=-1, cv=2)

# Fit Grid Search Model
xgb_grid_model.fit(X_train_tf, y_train)
print("Best score: %0.3f" % xgb_grid_model.best_score_)
print("Best parameters set:")
best_parameters = xgb_grid_model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

best_xgb_model = xgb_grid_model.best_estimator_

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best score: 0.887
Best parameters set:
	colsample_bytree: 1.0
	gamma: 0.5
	learning_rate: 0.06
	max_depth: 15
	min_child_weight: 1
	n_estimators: 300
	subsample: 0.6


**Retrain Best Model and save**

In [43]:
# Mixed and balanced Data hyperparameters

# best_xgb_model = xgb.XGBClassifier(colsample_bytree=0.6,gamma=0.5,learning_rate=0.06,max_depth=10,min_child_weight=1,n_estimators=800,subsample=0.8)
# best_xgb_model.fit(X_train_tf, y_train)

In [44]:
# Uniform Data hyperparameters
best_xgb_model = xgb.XGBClassifier(colsample_bytree=1.0,gamma=0.5,learning_rate=0.06,max_depth=15,min_child_weight=1,n_estimators=300,subsample=0.6)
best_xgb_model.fit(X_train_tf, y_train)

XGBClassifier(colsample_bytree=1.0, gamma=0.5, learning_rate=0.06, max_depth=15,
              n_estimators=300, subsample=0.6)

In [45]:
predictions = best_xgb_model.predict(X_val_tf)
confusion_matrix(y_val,predictions)

array([[4264,  249],
       [ 731, 3883]])

In [46]:
f1_score(y_val,predictions)

0.8879487765835811

In [47]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [48]:
cd drive/MyDrive/MSc\ Data\ Science/MSc\ Project/models

/content/drive/MyDrive/MSc Data Science/MSc Project/models


In [49]:
import pickle
file_name = "uniform_best_xgb.pkl"

# save
pickle.dump(best_xgb_model, open(file_name, "wb"))

In [50]:
ls

best_xgb.pkl                  [0m[01;34mmixed-distilbert--without-stem-94[0m/
[01;34mdistilbert-94[0m/                [01;34mmixed-roberta[0m/
[01;34mdistilbert--without-stem-94[0m/  [01;34mroberta[0m/
mixed_best_xgb.pkl            uniform_best_xgb.pkl
