In [223]:
import numpy as np
import pandas as pd
import regex as re
from matplotlib import pyplot as plt
import seaborn as sns

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils import class_weight
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.svm import SVC

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [184]:
data = pd.read_csv('clean_data.csv')

In [185]:
data.isnull().sum()

tweet        134
target         0
hashtag    40728
name       29594
url        43391
emoji      47252
dtype: int64

In [186]:
data = data.fillna('')

In [187]:
data['target'].value_counts(normalize=1)*100

religion               16.770108
age                    16.757527
gender                 16.717689
ethnicity              16.692527
not_cyberbullying      16.658978
other_cyberbullying    16.403170
Name: target, dtype: float64

In [188]:
LE = LabelEncoder()
LE.fit(data['target'])
data['target'] = LE.transform(data['target'])
LE_name_mapping = dict(zip(LE.classes_, LE.transform(LE.classes_)))
print(LE_name_mapping)

{'age': 0, 'ethnicity': 1, 'gender': 2, 'not_cyberbullying': 3, 'other_cyberbullying': 4, 'religion': 5}


In [189]:
data['target'].value_counts(normalize=1)*100

5    16.770108
0    16.757527
2    16.717689
1    16.692527
3    16.658978
4    16.403170
Name: target, dtype: float64

In [190]:
# getting the length from the extra columns
for col in ['hashtag','name','url','emoji']:
    data[col+"_len"] = data[col].apply(lambda x:len(x))
data

Unnamed: 0,tweet,target,hashtag,name,url,emoji,hashtag_len,name_len,url_len,emoji_len
0,in other words your food was crapilicious,3,"['katandandre', 'mkr']",,,,22,0,0,0
1,why is so white studio,3,"['aussietv', 'mkr', 'theblock', 'imacelebritya...",,,,119,0,0,0
2,a classy whore or more red velvet cupcakes,3,,['xochitlsuckkks'],,,0,18,0,0
3,meh p thanks for the heads up but not too conc...,3,,['jasongio'],,,0,12,0,0
4,this is an isis account pretending to be a kur...,3,,['rudhoeenglish'],,,0,17,0,0
...,...,...,...,...,...,...,...,...,...,...
47687,black ppl arent expected to do anything depend...,1,,,,,0,0,0,0
47688,turner did not withhold his disappointment tur...,1,,,,,0,0,0,0
47689,i swear to god this dumb nigger bitch i have g...,1,,,,,0,0,0,0
47690,yea fuck you rt if youre a nigger fucking unfo...,1,,['therealexel'],,,0,15,0,0


In [192]:
for col in ['hashtag','name','url','emoji']:
    data[col] = data[col].apply(lambda x:re.sub(r"[,']",'',x))
for col in ['hashtag','name','url','emoji']:
    data[col] = data[col].str.replace("[",'')
    data[col] = data[col].str.replace("]",'')
    if col =='emoji':
        data[col] = data[col].str.replace("_",' ')
        

  data[col] = data[col].str.replace("[",'')
  data[col] = data[col].str.replace("]",'')


In [193]:
data

Unnamed: 0,tweet,target,hashtag,name,url,emoji,hashtag_len,name_len,url_len,emoji_len
0,in other words your food was crapilicious,3,katandandre mkr,,,,22,0,0,0
1,why is so white studio,3,aussietv mkr theblock imacelebrityau today sun...,,,,119,0,0,0
2,a classy whore or more red velvet cupcakes,3,,xochitlsuckkks,,,0,18,0,0
3,meh p thanks for the heads up but not too conc...,3,,jasongio,,,0,12,0,0
4,this is an isis account pretending to be a kur...,3,,rudhoeenglish,,,0,17,0,0
...,...,...,...,...,...,...,...,...,...,...
47687,black ppl arent expected to do anything depend...,1,,,,,0,0,0,0
47688,turner did not withhold his disappointment tur...,1,,,,,0,0,0,0
47689,i swear to god this dumb nigger bitch i have g...,1,,,,,0,0,0,0
47690,yea fuck you rt if youre a nigger fucking unfo...,1,,therealexel,,,0,15,0,0


# Vectorization

In [199]:
text_col = data.loc[:,data.dtypes=='object'].columns
text_col

Index(['tweet', 'hashtag', 'name', 'url', 'emoji'], dtype='object')

In [200]:
stop_words = stopwords.words('english')
stop_words.extend(['rt', 'mkr', 'didn', 'bc', 'n', 'm', 'im', 'll', 'y', 've', 'u', 'ur', 'don', 't', 's','etc'])
for col in text_col:
    data[col] = data[col].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    print("Done! {}".format(col))
data.head()

Done! tweet
Done! hashtag
Done! name
Done! url
Done! emoji


Unnamed: 0,tweet,target,hashtag,name,url,emoji,hashtag_len,name_len,url_len,emoji_len
0,words food crapilicious,3,katandandre,,,,22,0,0,0
1,white studio,3,aussietv theblock imacelebrityau today sunrise...,,,,119,0,0,0
2,classy whore red velvet cupcakes,3,,xochitlsuckkks,,,0,18,0,0
3,meh p thanks heads concerned another angry dud...,3,,jasongio,,,0,12,0,0
4,isis account pretending kurdish account like i...,3,,rudhoeenglish,,,0,17,0,0


In [204]:
list(data['url'])

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'http://t.co/usqinyw5gn',
 'http://twitvid.com/a2tnp',
 '',
 'http://t.co/jlvke1epws',
 '',
 '',
 '',
 '',
 'http://tumblr.com/xol3xl14zy',
 '',
 'http://t.co/8b1aclczn9',
 'http://t.co/0xrozsnn',
 '',
 '',
 '',
 'http://t.co/sxtiwtp',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'http://t.co/zeglms7',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'http://t.co/fina7zfttt',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'http://t.co/b4obslitug',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'http://t.co/wgrutwf',
 '',
 'http://t.co/kskglrh1jt http://t.c…',
 '',
 '',
 '',
 'http://t.co/0m07mut5ay',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'http://t.co/dr0yql4',
 '',
 '',
 '',
 '',
 '',
 'http://t.co/ywjmayphjm',
 'http://bit.ly/rlus4m',
 'http://t.co/yyfpou0',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'http://t.co/cqajvol',
 '',
 '',
 '',
 'http://t.co/xjkhms6',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'http://t.co/6fv

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

In [205]:
cv_t = CountVectorizer(max_features = 1500,ngram_range=(1,3))
t_array = cv_t.fit_transform(data['tweet']).toarray()

In [206]:
cv_h = CountVectorizer(max_features = 500,ngram_range=(1,1))
h_array = cv_h.fit_transform(data['hashtag']).toarray()

In [207]:
cv_e = CountVectorizer(max_features = 500,ngram_range=(1,1))
e_array = cv_e.fit_transform(data['emoji']).toarray()

In [208]:
extra = data[['hashtag_len','name_len','url_len','emoji_len']].values
extra = np.asarray(extra)

In [209]:
X = np.hstack((t_array,h_array,e_array,extra))
y = data['target'].values

In [210]:
print(X.shape,y.shape)

(47692, 2214) (47692,)


In [211]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [220]:
#importing the model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,f1_score,classification_report

In [218]:
#Decision tree Classifier
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
pred= clf.predict(X_test)
acc = f1_score(pred,y_test,average='micro')
print(acc)
print(confusion_matrix(y_test,pred))

0.7743998322675333
[[1496    2    4   38   16    1]
 [   4 1570    7   24   19    3]
 [   2   11 1308  167  126   12]
 [  38   11  107  787  576   53]
 [  10   10  117  615  755   12]
 [   5    4   20   97   41 1471]]


In [222]:
print(classification_report(y_test,pred))#3 and #4 are not performimg good

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1557
           1       0.98      0.96      0.97      1627
           2       0.84      0.80      0.82      1626
           3       0.46      0.50      0.48      1572
           4       0.49      0.50      0.49      1519
           5       0.95      0.90      0.92      1638

    accuracy                           0.77      9539
   macro avg       0.78      0.77      0.77      9539
weighted avg       0.78      0.77      0.78      9539



In [224]:
forest = RandomForestClassifier(n_jobs=-1,random_state=1)
forest.fit(X, y)
y_pred = forest.predict(X)
f1_score(y,y_pred,average='micro')

0.9567223014342028

In [226]:
clf = RandomForestClassifier(n_estimators=100,random_state=1,n_jobs=-1)
clf.fit(X_train,y_train)
pred_rf = clf.predict(X_test)
f1_score(y_test,pred_rf,average='micro')

0.8070028304853757

In [227]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(n_estimators=100,random_state=1,n_jobs=-1)
scoring='f1_micro'
scores = cross_val_score(clf, X, y, scoring=scoring, cv=5,verbose=3)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.811) total time= 1.2min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV] END ................................ score: (test=0.807) total time=  50.1s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.1min remaining:    0.0s


[CV] END ................................ score: (test=0.805) total time=  51.2s
[CV] END ................................ score: (test=0.811) total time=  50.5s
[CV] END ................................ score: (test=0.806) total time=  51.6s
0.81 accuracy with a standard deviation of 0.00


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.7min finished


In [228]:
# matrix = confusion_matrix(y_test1, predictions)
# matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# # Build the plot
# plt.figure(figsize=(16,7))
# sns.set(font_scale=1.4)
# sns.heatmap(matrix, annot=True, annot_kws={'size':10},
#             cmap=plt.cm.Greens, linewidths=0.2)

# # Add labels to the plot
# class_names = ['Updates','Personal', 'Promotions','Forums','Purchases','Travel','Spam','Social']
# tick_marks = np.arange(len(class_names))
# tick_marks2 = tick_marks + 0.5
# plt.xticks(tick_marks, class_names, rotation=25)
# plt.yticks(tick_marks2, class_names, rotation=0)
# plt.xlabel('Predicted label')
# plt.ylabel('True label')
# plt.title('Confusion Matrix for XG BOOST Model')
# plt.show()

In [229]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [230]:
model_rb1 = BalancedRandomForestClassifier(n_estimators=100) # performs random under sampling of majority class in each sample
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model_rb1, X, y, scoring='f1_micro', cv=cv, n_jobs=-1,verbose=3)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 out of  30 | elapsed: 24.9min remaining:  3.8min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 25.0min finished


0.81 accuracy with a standard deviation of 0.01
