In [1]:
import nltk
# Fetch NLTK's 'punkt' tokenizer data; when it is already installed the call
# only reports that the package is up-to-date and returns True.
# NOTE(review): presumably required by the tokenization inside text_utils — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NganLuong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
import sys

# Put the project's helper-script folder at the front of the import path so
# local modules can be imported by bare name.
# The path is relative, so it resolves against the kernel's working directory.
# The membership check keeps the cell idempotent: re-running it does not stack
# duplicate entries onto sys.path.
scripts_dir = '../../scripts/'
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir)

In [2]:
# import required packages
import numpy as np
import pandas as pd

# encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" style for figures drawn after this point.
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus

In [3]:
# Read the training and validation splits; both paths are relative to the
# kernel's working directory.
df_train, df_valid = (
    pd.read_csv(csv_path, delimiter=None)
    for csv_path in (r'../../data/train_data.csv', r'../../data/valid_data.csv')
)

# Only the validation frame is cleaned in this notebook.
# NOTE(review): train_data.csv presumably already contains a `clean_tweet`
# column produced by the same preprocessing — confirm.
df_valid['clean_tweet'] = preprocess_corpus(df_valid['tweet_text'])

# Model inputs (cleaned tweet text) and targets (bullying category) per split.
Xtr = df_train["clean_tweet"]
Ytr = df_train["cyberbullying_type"]
Xva = df_valid["clean_tweet"]
Yva = df_valid["cyberbullying_type"]

In [4]:
# Spot-check five randomly drawn training texts.
Xtr.sample(n=5)

15483                                   step post facebook
8083                    ivicong ikaw nga bully diyan crook
25689    bryce girl attacking girl bullied throughout m...
5695     say everthing happens reason fuck shit cunt pu...
25786                            gay rape joke tony really
Name: clean_tweet, dtype: object

In [5]:
# Spot-check five randomly drawn training labels.
Ytr.sample(n=5)

22772       gender
3995        gender
18280          age
1121     ethnicity
15839     religion
Name: cyberbullying_type, dtype: object

In [6]:
# Spot-check five randomly drawn validation texts.
Xva.sample(n=5)

96      found close friend story insta girl bullied hi...
1388                praisexoscar nigger youself dumb fuck
8967    tpthompson gop rep find gut stand idiot fireth...
681              look like lennyfinnegan let cat keyboard
9392    tayyoung fuck obama dumb nigger ignorance toda...
Name: clean_tweet, dtype: object

In [7]:
# Spot-check five randomly drawn validation labels.
Yva.sample(n=5)

1298    other_cyberbullying
988               ethnicity
3130                 gender
9414              ethnicity
1988      not_cyberbullying
Name: cyberbullying_type, dtype: object

In [9]:
# vectorize tweet texts
tv = TfidfVectorizer()

vectors = tv.fit_transform(Xtr.values.astype('U'))
Xtr_tv = pd.DataFrame(vectors.toarray(), columns=tv.get_feature_names())

vectors = tv.transform(Xva.values.astype('U'))
Xva_tv = pd.DataFrame(vectors.toarray(), columns=tv.get_feature_names())

In [24]:
# Convert the string class labels into integer codes. The encoder is fitted on
# the training labels and then applied as-is to the validation labels.
le = LabelEncoder()

Ytr_le = pd.Series(le.fit_transform(Ytr), name='encoded_cyberbullying_type').to_frame()
Yva_le = pd.Series(le.transform(Yva), name='encoded_cyberbullying_type').to_frame()

In [25]:
# Sanity check: feature and label frames must have matching row counts per
# split, and both feature frames must share the same number of columns.
print(f"{Xtr_tv.shape} {Ytr_le.shape}")
print(f"{Xva_tv.shape} {Yva_le.shape}")

(28614, 36281) (28614, 1)
(9539, 36281) (9539, 1)


In [26]:
# Spot-check five randomly drawn rows of the training TF-IDF matrix.
Xtr_tv.sample(n=5)

Unnamed: 0,aaa,aaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaah,aaaaaaaaaah,aaaaaaaaaajajajajajajajahahahajahaja,aaaaah,aaaaargh,aaaag,aaaah,...,zusterschap,zvakaoma,zvlahos,zyampii,zyeth,zyme,zynga,zython,zzoegrimm,zzz
11390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Spot-check five randomly drawn encoded training labels.
Ytr_le.sample(n=5)

Unnamed: 0,encoded_cyberbullying_type
2233,4
385,1
26203,4
23900,0
21382,5


In [None]:
# Fixed seed for the estimators that draw random numbers, so that re-running
# this cell reproduces the same scores.
RANDOM_STATE = 42

# Candidate classifiers, otherwise left at their default hyper-parameters.
# NOTE(review): the 'XGBoost' entry is scikit-learn's GradientBoostingClassifier,
# not the xgboost library. The key is left unchanged in case other cells look
# it up; consider renaming it to 'Gradient Boosting'.
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(random_state=RANDOM_STATE),
    'Kernel SVM': SVC(),
    'XGBoost': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Parallel lists: one entry per model, appended in the iteration order of `models`.
results = {
    'time_to_train': [],
    'accuracy': [],
    'f1': []
}

# Flatten the single-column label frames once; the estimators and metrics
# work on 1-D label arrays.
ytr_codes = Ytr_le.values.ravel()
yva_codes = Yva_le.values.ravel()

# Pin the label set for the per-class F1 so that every result row lines up
# with le.classes_, even if a class were missing from the validation data.
class_ids = np.arange(len(le.classes_))

for name, model in models.items():
    # perf_counter() is a monotonic clock meant for measuring elapsed time;
    # time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    model.fit(Xtr_tv, ytr_codes)
    end_time = time.perf_counter()

    # Prediction happens after the timer stops, so only fitting is timed.
    Ypred = model.predict(Xva_tv)

    results['time_to_train'].append(end_time - start_time)
    results['accuracy'].append(accuracy_score(yva_codes, Ypred))
    # average=None returns one F1 score per class instead of a single average.
    results['f1'].append(f1_score(yva_codes, Ypred, labels=class_ids, average=None))

model_names = list(models)

display(pd.DataFrame(results['time_to_train'], index=model_names, columns=['Time (seconds)']))

display(pd.DataFrame(results['accuracy'], index=model_names, columns=['Accuracy']))

# Transpose so that rows are classes and columns are models.
display(pd.DataFrame(np.asarray(results['f1']).T, columns=model_names, index=le.classes_))