**Import Libraries**

In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit

from gensim import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument

import xgboost as xgb

In [None]:
# Notebook-level settings.
# presumably store_local toggles writing artefacts under local_path — confirm against the cells that read them.
store_local = False
# NOTE(review): machine-specific absolute Windows path; adjust before running elsewhere.
local_path = "C:\\Greenwich\\MSc Project\\project_code\\"

**Load Data**

In [None]:
# full_train_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/mixed/mixed_train_data.csv?raw=true")
# test_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/mixed/mixed_test_data.csv?raw=true")

In [None]:
# The "uniform" train/test CSVs are read straight from the project's GitHub repository.
train_csv_url = "https://github.com/Voldegin/hate_speech_detection/blob/develop/data/uniform/uniform_train_data.csv?raw=true"
test_csv_url = "https://github.com/Voldegin/hate_speech_detection/blob/develop/data/uniform/uniform_test_data.csv?raw=true"

full_train_data = pd.read_csv(train_csv_url)
test_data = pd.read_csv(test_csv_url)

In [None]:
# Row counts of the loaded training and test frames.
len(full_train_data), len(test_data)

(91269, 9851)

In [None]:
# train_data, val_data = train_test_split(full_train_data,test_size=5000,random_state=21)

In [None]:
# Hold out a stratified 10% of the training rows for validation, stratifying on
# the is_cyberbullying label.
split = StratifiedShuffleSplit(n_splits=2, test_size=0.1, random_state=23)

# split() yields one (train, val) pair of *positional* index arrays per split.
# The original loop ran through both splits and kept the last one, so the last
# pair is taken here to reproduce exactly the same partition.
train_index, val_index = list(
    split.split(full_train_data[['tweet_text', 'cleaned']], full_train_data['is_cyberbullying'])
)[-1]

# Positional indices must be used with .iloc; .loc only returned the right rows
# because read_csv gave the frame a default 0..n-1 index.
train_data = full_train_data.iloc[train_index]
val_data = full_train_data.iloc[val_index]

In [None]:
# Sizes of the train / validation / test partitions.
len(train_data), len(val_data), len(test_data)

(82142, 9127, 9851)

**Splitting into labels and features**

In [None]:
# Preview the training partition; note the row labels are the shuffled original index values.
train_data.head()

Unnamed: 0,tweet_text,is_cyberbullying,cleaned
8163,@iamlabeng peace and order at the price of HR ...,1,peac order price hr violat airport lgu achiev ...
71649,"Foo Fighters edit \n\nIt wasn't disruptive, I ...",0,foo fighter edit disrupt ask valid question ex...
29745,and make a video with screenshots exposing my ...,1,make video screenshot expos bulli bulli high s...
10622,#Feminazi's gone wild! Smh!😒,1,feminazi gone wild smh
76699,"I agree. More than one overlapping articles, p...",0,agre one overlap articl practic theme neg effe...


In [None]:
# Preview the validation partition (row labels are likewise original index values).
val_data.head()

Unnamed: 0,tweet_text,is_cyberbullying,cleaned
32316,"@Rileyyz_69 stupid fuck, riley isn't allowed t...",1,stupid fuck riley allow use kind social networ...
28244,all the girls from high school who bullied me ...,1,girl high school bulli like 1d realli wan na p...
45409,"== Hey, you didnt tell me how fucking long thi...",1,hey didnt tell fuck long block for wtf long wait
774,RT @NoToFeminism: I don’t need femisnn i heard...,0,rt don ’ t need femisnn heard femist tri write...
83717,}}\n{{WikiProject University of Oxford|class=B...,0,wikiproject univers oxfordclassbimportancemid


In [None]:
# Preview the held-out test set.
test_data.head()

Unnamed: 0,tweet_text,is_cyberbullying,cleaned
0,This video could be terrible and my weave woul...,0,video could terribl weav would still snatch
1,or so I can direct parents there around xmas t...,0,direct parent around xma time p
2,Drasko trying to use his fork to eat the bread...,0,drasko tri use fork eat breadcrumb
3,@NikkiGobel hmm okay.,0,hmm okay
4,Women have been equal socially for quite awhil...,0,women equal social quit awhil lt said author t...


In [None]:
def split_label_and_feature(data):
    """Pull the model input and the target out of a dataset.

    Args:
        data: mapping/DataFrame with the columns ``'cleaned'`` and
            ``'is_cyberbullying'``.

    Returns:
        A ``(features, labels)`` pair: the ``'cleaned'`` column first, then the
        ``'is_cyberbullying'`` column (note the order is features-then-labels,
        despite the function name).
    """
    features = data['cleaned']
    labels = data['is_cyberbullying']
    return features, labels

In [None]:
# For every partition: cleaned text goes to X_*, the is_cyberbullying column to y_*.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    split_label_and_feature(partition)
    for partition in (train_data, val_data, test_data)
)

**Model Training**

In [None]:
# Train texts first, then validation texts: later cells rely on this order when
# slicing the document-vector matrix at len(X_train).
allText = pd.concat([X_train,X_val])

In [None]:
# Number of validation texts.
len(X_val)

9127

In [None]:
# Number of training texts.
len(X_train)

82142

In [None]:
# Sanity check: should equal len(X_train) + len(X_val).
len(allText)

91269

In [None]:
def constructLabeledSentences(data):
    """Wrap every text of a pandas Series in a tagged document for Doc2Vec.

    Each text is split on whitespace and tagged ``'Text_<label>'``, where
    ``<label>`` is the row's *index label* in ``data`` (not its position).

    Args:
        data: pandas Series of strings.

    Returns:
        list of TaggedDocument, in the iteration order of ``data``.
    """
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    # TaggedDocument is the non-deprecated name for gensim's LabeledSentence.
    return [
        TaggedDocument(utils.to_unicode(text).split(), ['Text' + '_%s' % str(label)])
        for label, text in data.items()
    ]

# One tagged document per train/validation text.
sentences = constructLabeledSentences(allText)

# Doc2Vec configuration only; vocabulary building and training happen in the next cell.
# vector_size / epochs are the current names of the deprecated size / iter arguments.
# NOTE(review): gensim documents that a fixed seed is only fully reproducible with workers=1.
model = Doc2Vec(
    min_count=1, window=5, vector_size=100, sample=1e-4, negative=5,
    workers=8, epochs=100, seed=1,
)

  after removing the cwd from sys.path.


In [None]:
# Build the vocabulary (and register the document tags) from the corpus.
model.build_vocab(sentences)

# Train for the number of epochs configured on the model. model.epochs is the
# supported attribute; model.iter is a deprecated alias that triggers a warning.
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# model.save('./docEmbeddings_train_val.d2v')

In [None]:
# Number of document vectors held by the model (one per tagged document).
len(model.docvecs)

91269

In [None]:
# Should match len(model.docvecs) above: one vector per train/validation text.
len(X_train) + len(X_val)

91269

In [None]:
# Feature matrix: row p holds the document vector of the p-th text in allText.
full_array = np.zeros((len(allText), model.vector_size))

# Documents were tagged 'Text_<index label>', and allText carries the shuffled
# labels produced by the train/validation split. The lookup therefore has to go
# through allText.index: using the position (0, 1, 2, ...) as the tag would pair
# each row with the vector of a different document and misalign the features
# with y_train / y_val.
for position, doc_label in enumerate(allText.index):
    full_array[position] = model.docvecs['Text_' + str(doc_label)]

In [None]:
# allText was built as train followed by validation, so the first len(X_train)
# rows are the training features and the remainder the validation features.
n_train_rows = len(X_train)
X_train_array, X_val_array = full_array[:n_train_rows], full_array[n_train_rows:]

In [None]:
# Row counts of the two feature matrices; expected to match len(X_train) and len(X_val).
len(X_train_array), len(X_val_array)

(82142, 9127)

In [None]:
# Gradient-boosted classifier with 500 trees, fitted on the training document vectors.
xgb_model = xgb.XGBClassifier(n_estimators=500)
# Row i of X_train_array must correspond to element i of y_train.
xgb_model.fit(X_train_array, y_train)

XGBClassifier(n_estimators=500)

In [None]:
# Predicted labels for the validation document vectors.
predictions = xgb_model.predict(X_val_array)

In [None]:
# Validation confusion matrix (scikit-learn convention: rows = true class, columns = predicted class).
confusion_matrix(y_val,predictions)

array([[1856, 2657],
       [1894, 2720]])

In [None]:
# Validation F1 with scikit-learn's default binary averaging (positive class = label 1).
f1_score(y_val,predictions)

0.5444900410369332