In [None]:
##Importing necessary libraries
import re
import numpy as np
import pandas as pd
import nltk
import pickle
from sklearn.model_selection  import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Fetch the NLTK data packages this notebook asks for (stop words, punkt, WordNet).
# The last download stays as the cell's final expression so its status is still displayed.
for resource in ('stopwords', 'punkt'):
  nltk.download(resource)
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
##Loading dataset
# Location of the labelled tweets CSV (read over HTTP by pandas).
DATA_URL = 'https://raw.githubusercontent.com/ShriramDayama/Cyberbulllying_Claissfication/main/cyberbullying_tweets.csv'

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type
0,0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


# **Exploratory Data Analysis**

In [None]:
# Number of tweets per label, to check how balanced the classes are.
df['cyberbullying_type'].value_counts()

religion             7998
age                  7992
gender               7973
ethnicity            7961
not_cyberbullying    7945
Name: cyberbullying_type, dtype: int64

# **Preprocessing Data**

In [None]:
# Encode each cyberbullying label as an integer class id;
# the id is simply the label's position in the list below.
dict_map = {
    label: code
    for code, label in enumerate(
        ['not_cyberbullying', 'religion', 'age', 'gender', 'ethnicity']
    )
}

# Any label that is not a key of dict_map becomes NaN in the new column.
df['cb_type'] = df['cyberbullying_type'].map(dict_map)

In [None]:
#defining a function for preprocessing the tweets

# Hashtags, @mentions and links (http://, https:// or www.) up to the next whitespace.
# Compiled once here instead of on every call.
_NOISE_PATTERN = re.compile(r"[#@]\S*|https?://\S+|www\.\S+")

# Code-point ranges stripped as emoji / pictographs.
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
# e.g. CJK ideographs; kept unchanged so existing results stay comparable.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def preprocess(string):
  """Clean a single tweet for vectorisation.

  Steps, in order:
    1. remove hashtags, @mentions and whole URLs;
    2. remove characters in the emoji ranges;
    3. tokenise on runs of word characters (this drops punctuation);
    4. lemmatise every token with NLTK's WordNetLemmatizer (default POS);
    5. join the tokens back with single spaces.

  Case is preserved and stop words are NOT removed. The stop-word list was
  previously loaded on each call but never applied, so that dead lookup is gone.

  Args:
    string: raw tweet text (str).

  Returns:
    The cleaned text as one space-separated string; empty if nothing is left.
  """
  string = _NOISE_PATTERN.sub("", string)
  string = _EMOJI_PATTERN.sub("", string)

  tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
  tokens = tokenizer.tokenize(string)

  lemmatizer = nltk.stem.WordNetLemmatizer()
  return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [None]:
# Testing the preprocess function
# Sanity check on the tweet stored under index label 0.
sent = df.loc[0, 'tweet_text']
psent = preprocess(sent)
psent

'In other word your food wa crapilicious'

In [None]:
# Clean every tweet and store the result next to the raw text.
df["processed_tweets"] = df['tweet_text'].map(preprocess)
df

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,cb_type,processed_tweets
0,0,"In other words #katandandre, your food was cra...",not_cyberbullying,0,In other word your food wa crapilicious
1,1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,0,Why is so white
2,2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,0,a classy whore Or more red velvet cupcake
3,3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,0,meh P thanks for the head up but not too conce...
4,4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,0,This is an ISIS account pretending to be a Kur...
...,...,...,...,...,...
39864,47687,"Black ppl aren't expected to do anything, depe...",ethnicity,4,Black ppl aren t expected to do anything depen...
39865,47688,Turner did not withhold his disappointment. Tu...,ethnicity,4,Turner did not withhold his disappointment Tur...
39866,47689,I swear to God. This dumb nigger bitch. I have...,ethnicity,4,I swear to God This dumb nigger bitch I have g...
39867,47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,ethnicity,4,Yea fuck you RT IF YOURE A NIGGER FUCKING UNFO...


# **Fitting the model**

In [None]:
#Creating train and test datasets
# 20% of the rows are held out for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["processed_tweets"],
    df['cb_type'],
    test_size=0.2,
    random_state=40,
)


In [None]:
# Peek at the first few cleaned tweets in the training split.
X_train.head(n=5)

7494                                Fuck off kat and andre
17233    Oh really are you above court who prove Narend...
35888                 Fuck off you dumb kike loving nigger
882      Quem mais sofre bullying Loiras burras Nerds X...
5699                             Kandre are SUCH bad sport
Name: processed_tweets, dtype: object

In [None]:
#Fitting and checking accuracy of various models
# Each candidate is trained behind the same TF-IDF vectoriser and scored on the test split.
# The tree ensembles get a fixed random_state (same value as the train/test split)
# so that re-running the cell reproduces the printed reports.
models = {
    'MultinomialNB': MultinomialNB(),
    'RandomForestClassifier': RandomForestClassifier(random_state=40),
    'SVC': SVC(),
    'XGBClassifier': XGBClassifier(random_state=40),
}

for model_name, estimator in models.items():
  # The step is named after the model so its parameters can be addressed as "<name>__<param>".
  clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    (model_name, estimator),
  ])

  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  print(model_name)
  print(classification_report(y_test, y_pred))
  print('_'*60)


MultinomialNB
              precision    recall  f1-score   support

           0       0.88      0.45      0.60      1570
           1       0.79      0.97      0.87      1599
           2       0.76      0.98      0.86      1620
           3       0.86      0.83      0.85      1587
           4       0.89      0.89      0.89      1598

    accuracy                           0.83      7974
   macro avg       0.84      0.83      0.81      7974
weighted avg       0.84      0.83      0.81      7974

____________________________________________________________
RandomForestClassifier
              precision    recall  f1-score   support

           0       0.76      0.92      0.83      1570
           1       0.96      0.94      0.95      1599
           2       0.98      0.98      0.98      1620
           3       0.95      0.80      0.87      1587
           4       0.99      0.98      0.99      1598

    accuracy                           0.92      7974
   macro avg       0.93      0.92

# **Hyperparameter Tuning**

In [None]:
# Candidate values for the randomized search over the XGBoost step.
# Keys follow the "<pipeline step name>__<parameter>" convention.
params = {
    'XGBClassifier__n_estimators': list(range(10, 60, 10)),
    'XGBClassifier__max_depth': [1, 2, 3, 4, 5, 6],
    'XGBClassifier__min_child_weight': [0, 1, 2, 3, 4],
}

In [None]:
#Using RandomizedSearchCV to find the best parameters
clf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('XGBClassifier', XGBClassifier()),
])

# n_iter candidates are drawn at random from `params` and scored with 5-fold CV.
# random_state pins which candidates are drawn, so the results table is reproducible.
xgbc = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params,
    n_iter=5,
    cv=5,
    return_train_score=False,
    random_state=40,
)

xgbc.fit(X_train, y_train)

# One row per sampled candidate.
result = pd.DataFrame(xgbc.cv_results_)


In [None]:
# Cross-validation results of the randomized search, one row per sampled candidate.
result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_XGBClassifier__n_estimators,param_XGBClassifier__min_child_weight,param_XGBClassifier__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,40.208806,1.58387,0.307004,0.103098,40,0,4,"{'XGBClassifier__n_estimators': 40, 'XGBClassi...",0.926164,0.923342,0.925067,0.921148,0.927105,0.924565,0.002118,2
1,49.669624,14.989767,0.231597,0.074883,50,2,4,"{'XGBClassifier__n_estimators': 50, 'XGBClassi...",0.928045,0.925223,0.926948,0.923499,0.927732,0.926289,0.001704,1
2,10.770565,1.013183,0.256993,0.064839,50,4,1,"{'XGBClassifier__n_estimators': 50, 'XGBClassi...",0.905628,0.904217,0.90312,0.90359,0.906412,0.904593,0.001241,5
3,24.204296,1.020519,0.201198,0.040835,40,2,3,"{'XGBClassifier__n_estimators': 40, 'XGBClassi...",0.923185,0.921775,0.922088,0.920364,0.922715,0.922025,0.000964,3
4,16.971922,1.221407,0.17698,0.014465,10,0,6,"{'XGBClassifier__n_estimators': 10, 'XGBClassi...",0.914877,0.912996,0.912212,0.910958,0.916601,0.913529,0.001994,4


In [None]:
# Score the fitted search object on the held-out test split.
pred = xgbc.predict(X_test)
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           0       0.76      0.94      0.84      1570
           1       0.97      0.93      0.95      1599
           2       0.99      0.98      0.98      1620
           3       0.95      0.82      0.88      1587
           4       1.00      0.98      0.99      1598

    accuracy                           0.93      7974
   macro avg       0.94      0.93      0.93      7974
weighted avg       0.94      0.93      0.93      7974



# **Saving the model**

In [None]:
#Saving the model
# The whole fitted search object is pickled, not just its best pipeline.
with open('cyberbullying_classifier1.pkl', mode='wb') as model_file:
  pickle.dump(xgbc, model_file)

In [None]:
#loading and testing the model
# Open the file in a `with` block so the handle is closed after loading.
# pickle.load can execute arbitrary code: only load files from a trusted source.
with open('cyberbullying_classifier1.pkl', 'rb') as f:
  model = pickle.load(f)

# The reloaded model should reproduce the report printed before saving.
pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.76      0.94      0.84      1570
           1       0.97      0.93      0.95      1599
           2       0.99      0.98      0.98      1620
           3       0.95      0.82      0.88      1587
           4       1.00      0.98      0.99      1598

    accuracy                           0.93      7974
   macro avg       0.94      0.93      0.93      7974
weighted avg       0.94      0.93      0.93      7974

