# **Step-1 Business Problem Understanding**

**The goal is to analyze tweets to identify and classify cyberbullying content effectively, helping businesses or organizations take appropriate action to reduce its impact.**

# **Step-2 Data Understanding**

In [27]:
import pandas as pd
import warnings
# Silence every warning category for the rest of the session; this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.simplefilter("ignore")

In [28]:
from google.colab import files
# Upload the dataset through Colab; the result is indexed by the saved
# file name when the CSV is read in the following cell.
uploaded = files.upload()

Saving cyberbullying_tweets.csv to cyberbullying_tweets (1).csv


In [29]:
import io

import pandas as pd

# Colab renames repeated uploads (e.g. "cyberbullying_tweets (1).csv"), so the
# key in `uploaded` changes between sessions. Look up the CSV that was actually
# uploaded instead of hard-coding one particular saved name.
csv_names = [name for name in uploaded if name.lower().endswith(".csv")]
if not csv_names:
    raise FileNotFoundError("No CSV file found among the uploaded files")

# Parse the raw uploaded bytes into a DataFrame.
df = pd.read_csv(io.BytesIO(uploaded[csv_names[0]]))
print(df)

                                              tweet_text cyberbullying_type
0      In other words #katandandre, your food was cra...  not_cyberbullying
1      Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2      @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3      @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4      @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying
...                                                  ...                ...
47687  Black ppl aren't expected to do anything, depe...          ethnicity
47688  Turner did not withhold his disappointment. Tu...          ethnicity
47689  I swear to God. This dumb nigger bitch. I have...          ethnicity
47690  Yea fuck you RT @therealexel: IF YOURE A NIGGE...          ethnicity
47691  Bro. U gotta chill RT @CHILLShrammy: Dog FUCK ...          ethnicity

[47692 rows x 2 columns]


In [30]:
# Shorter column names used by every later cell: text -> "message", class -> "label".
df = df.rename(columns={"tweet_text": "message", "cyberbullying_type": "label"})

In [31]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  47692 non-null  object
 1   label    47692 non-null  object
dtypes: object(2)
memory usage: 745.3+ KB


In [32]:
# Distinct class names present in the label column.
df["label"].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [33]:
# Number of tweets per class (checks class balance).
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
religion,7998
age,7992
gender,7973
ethnicity,7961
not_cyberbullying,7945
other_cyberbullying,7823


In [34]:
# Share of rows per class: per-class counts divided by the total row count.
df["label"].value_counts()/len(df)

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
religion,0.167701
age,0.167575
gender,0.167177
ethnicity,0.166925
not_cyberbullying,0.16659
other_cyberbullying,0.164032


# Step - 3 : Text Preprocessing

#### (Text Cleaning + Text Vectorization)

# Text Cleaning

- **Remove Punctuation**
- **Remove Stopwords**
- **Lemmatization**

In [35]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('stopwords')  # For stopwords
nltk.download('wordnet')    # For WordNet lemmatizer
nltk.download('omw-1.4')    # Optional: For language models for WordNet

# Initialize lemmatizer (the name `ps` is kept so later cells that refer to it still work)
ps = WordNetLemmatizer()

# Build the stopword set once. The previous version re-read the stopword list
# and rebuilt the set for every single token, which dominated the runtime.
stop_words = set(stopwords.words('english'))

# Preprocessing: one cleaned string per tweet, in the same order as df
corpus = []
for message in df['message']:
    # Replace every non-alphabetic character with a space, lower-case, tokenize
    tokens = re.sub('[^a-zA-Z]', " ", message).lower().split()
    # Drop stopwords and lemmatize what is left
    kept = [ps.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(kept))

# Preview only the first few cleaned tweets instead of dumping the whole list
corpus[:5]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['word katandandre food crapilicious mkr',
 'aussietv white mkr theblock imacelebrityau today sunrise studio neighbour wonderlandten etc',
 'xochitlsuckkks classy whore red velvet cupcake',
 'jason gio meh p thanks head concerned another angry dude twitter',
 'rudhoeenglish isi account pretending kurdish account like islam lie',
 'raja aab quickieleaks yes test god good bad indifferent weird whatever prof god existence',
 'itu sekolah ya bukan tempat bully ga jauh kaya neraka',
 'karma hope bite kat butt nasty mkr',
 'stockputout everything mostly priest',
 'rebecca black drop school due bullying',
 'jord dead http co usqinyw gn',
 'bully flush kd http twitvid com tnp',
 'ughhhh mkr',
 'rt kurdsnews turkish state killed child last year http co jlvke epws news google eviricitoplulu uk rt eyidee',
 'love best response hotcake managed film non committal meh adolescent mkr',
 'yasmimcaci bferrarii parem de fazer bullying comigo uhahuah bando de preto',
 'sarinhacoral victor maggi tadinhu d

## **Vectorization**

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the recorded X.shape output shows 1000 columns, so the saved
# outputs appear to come from a run with a different max_features - confirm
# which value is intended and re-run.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so the vocabulary and IDF weights also see the test rows.
tf = TfidfVectorizer(max_features=800)

# Dense TF-IDF matrix wrapped in a DataFrame with one column per vocabulary term.
tfidf_matrix = tf.fit_transform(corpus)
X = pd.DataFrame(tfidf_matrix.toarray(), columns=tf.get_feature_names_out())

In [37]:
# (rows, columns) of the TF-IDF feature frame.
X.shape

(47692, 1000)

In [38]:
# Display the TF-IDF feature frame.
X

Unnamed: 0,able,abortion,absolutely,abt,abuse,accept,account,act,acting,action,...,yard,yea,yeah,year,yes,yet,yo,young,youtube,yr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.792026,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47687,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.279232,0.0,0.0,0.0,0.0
47688,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
47689,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
47690,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.508594,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [39]:
# Integer code assigned to each class name.
label_codes = {
    "not_cyberbullying": 0,
    "gender": 1,
    "religion": 2,
    "other_cyberbullying": 3,
    "age": 4,
    "ethnicity": 5,
}
df["label"] = df["label"].replace(label_codes)

df

Unnamed: 0,message,label
0,"In other words #katandandre, your food was cra...",0
1,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,@XochitlSuckkks a classy whore? Or more red ve...,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",0
4,@RudhoeEnglish This is an ISIS account pretend...,0
...,...,...
47687,"Black ppl aren't expected to do anything, depe...",5
47688,Turner did not withhold his disappointment. Tu...,5
47689,I swear to God. This dumb nigger bitch. I have...,5
47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,5


In [40]:
# Target vector: the label column of df.
y=df["label"]

In [41]:
# Display the target vector.
y

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
47687,5
47688,5
47689,5
47690,5


## Train-Test Split

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the recorded classification reports show a support of 9539 rows,
# which would correspond to test_size=0.2 - confirm and re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Step-4: Modelling

# **1. Naive Bayes Classifier**

In [43]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, fitted on the training split.
# fit() returns the estimator itself, so the trailing expression shows the same repr.
model = MultinomialNB().fit(X_train, y_train)
model

## Evaluation

In [44]:
from sklearn.metrics import accuracy_score

# Accuracy on the rows the model was fitted on.
ypred_train = model.predict(X_train)
train_accuracy = accuracy_score(y_train, ypred_train)
print("Train Accuracy:", train_accuracy)

Train Accuracy: 0.7780253191098996


In [45]:
from sklearn.model_selection import cross_val_score

# Mean of the five fold scores obtained on the training split.
fold_scores = cross_val_score(model, X_train, y_train, cv=5)
print("Cross Validation Score:", fold_scores.mean())

Cross Validation Score: 0.7609362686139329


In [46]:
from sklearn.metrics import accuracy_score

# Predictions on the held-out split and their accuracy.
ypred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, ypred_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.7691581926826712


In [47]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix (rows: true class, columns: predicted class), followed by
# per-class precision / recall / F1 for the test predictions.
test_confusion = confusion_matrix(y_test, ypred_test)
print(test_confusion)

test_report = classification_report(y_test, ypred_test)
print(test_report)

[[ 703   58  126  380  254   51]
 [ 133 1274   31  123   32   33]
 [  25   17 1565   21    7    3]
 [ 238   69   79  790  263   80]
 [  30    4    3   18 1499    3]
 [  11   10   27   44   29 1506]]
              precision    recall  f1-score   support

           0       0.62      0.45      0.52      1572
           1       0.89      0.78      0.83      1626
           2       0.85      0.96      0.90      1638
           3       0.57      0.52      0.55      1519
           4       0.72      0.96      0.82      1557
           5       0.90      0.93      0.91      1627

    accuracy                           0.77      9539
   macro avg       0.76      0.77      0.76      9539
weighted avg       0.76      0.77      0.76      9539



In [48]:
from sklearn. linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier                # KNeighborsRegressor
from sklearn.svm import SVC                                       # SVR
from sklearn. tree import DecisionTreeClassifier                  # DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier               # RandomForestRegressor
from sklearn. ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier           # GradientBoostingRegressor
from xgboost import XGBClassifier                                 # XGBRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# **2. Logistic Regression**


In [49]:
# Modelling
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters, fitted on the training split.
# fit() returns the estimator itself, so the trailing expression shows the same repr.
log_model = LogisticRegression().fit(X_train, y_train)
log_model

In [50]:
# Predict and evaluate on the training split
ypred_train = log_model.predict(X_train)
train_accuracy = accuracy_score(y_train, ypred_train)
print("Train Accuracy :", train_accuracy)

Train Accuracy : 0.8486881765523026


In [51]:
# 5-fold cross-validated accuracy on the training split
from sklearn.model_selection import cross_val_score

fold_scores = cross_val_score(log_model, X_train, y_train, cv=5, scoring="accuracy")
print("CV Score :", fold_scores.mean())

CV Score : 0.8228712606181622


In [52]:
# Predict and evaluate on the held-out split
ypred_test = log_model.predict(X_test)
test_accuracy = accuracy_score(y_test, ypred_test)
print("Test Accuracy :", test_accuracy)

Test Accuracy : 0.8333158612013838


In [26]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix first, then the per-class metrics, for the latest test predictions.
for summary in (confusion_matrix, classification_report):
    print(summary(y_test, ypred_test))

[[ 944   42   40  482   54   10]
 [ 125 1365    6  115    8    7]
 [  73    7 1540   16    1    1]
 [ 364   49    6 1076   14   10]
 [  21    2    2   13 1519    0]
 [  10    3    3   22    3 1586]]
              precision    recall  f1-score   support

           0       0.61      0.60      0.61      1572
           1       0.93      0.84      0.88      1626
           2       0.96      0.94      0.95      1638
           3       0.62      0.71      0.66      1519
           4       0.95      0.98      0.96      1557
           5       0.98      0.97      0.98      1627

    accuracy                           0.84      9539
   macro avg       0.84      0.84      0.84      9539
weighted avg       0.85      0.84      0.84      9539

[[ 944   42   40  482   54   10]
 [ 125 1365    6  115    8    7]
 [  73    7 1540   16    1    1]
 [ 364   49    6 1076   14   10]
 [  21    2    2   13 1519    0]
 [  10    3    3   22    3 1586]]
              precision    recall  f1-score   support

    

# **3. Decision Tree**

In [54]:
# Baseline tree with no depth limit; note that this rebinds `model`.
# NOTE(review): random_state=True is presumably treated as the integer seed 1 -
# consider passing an explicit int.
model = DecisionTreeClassifier(random_state=True).fit(X_train, y_train)
model

In [55]:
# Depth actually reached by the fitted tree held in `model`.
max_depth = model.get_depth()
print("Max Depth of the Decision Tree:", max_depth)

Max Depth of the Decision Tree: 889


In [56]:
# Hyper-parameter tuning: 5-fold grid search over split criterion and tree depth.
estimator = DecisionTreeClassifier(random_state=True)
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [depth for depth in range(1, 16)],
}

# fit() returns the search object itself.
dt_grid = GridSearchCV(estimator, param_grid, scoring='accuracy', cv=5).fit(X_train, y_train)

# Best configuration, refitted on the full training split by the search.
dt = dt_grid.best_estimator_
dt

In [57]:
 #Important features
# Importance assigned by the tuned tree to every TF-IDF column, one row per feature.
dt_importances = dt.feature_importances_
feats_dt = pd.DataFrame(dt_importances, index=X.columns, columns=['Importance'])

feats_dt

Unnamed: 0,Importance
able,0.000000
abortion,0.000000
absolutely,0.000000
abt,0.000046
abuse,0.000000
...,...
yet,0.000000
yo,0.000000
young,0.000000
youtube,0.000000


In [58]:
# Important features
# These are the decision tree's importances, so the frame is named feats_dt
# (it was previously stored under the copy-pasted AdaBoost name feats_ab).
feats_dt = pd.DataFrame(data=dt.feature_importances_,
                        index=X.columns,
                        columns=['Importance'])

# Keep only the features the tuned tree actually split on (importance > 0).
important_features_dt = feats_dt[feats_dt["Importance"] > 0].index.tolist()
important_features_dt

['abt',
 'al',
 'always',
 'andre',
 'anything',
 'away',
 'ban',
 'beautiful',
 'better',
 'big',
 'bitch',
 'black',
 'blameonenotall',
 'blonde',
 'book',
 'boy',
 'bullied',
 'bullshit',
 'bully',
 'bullying',
 'bus',
 'call',
 'called',
 'change',
 'christian',
 'co',
 'colored',
 'com',
 'come',
 'constantly',
 'da',
 'dumb',
 'everyone',
 'fear',
 'female',
 'feminazi',
 'fine',
 'first',
 'forgot',
 'fuck',
 'gay',
 'get',
 'getting',
 'girl',
 'giving',
 'go',
 'god',
 'good',
 'got',
 'hadith',
 'hahaha',
 'hate',
 'high',
 'hoe',
 'however',
 'http',
 'idiot',
 'instant',
 'isi',
 'islam',
 'islamic',
 'israeliregime',
 'joke',
 'kat',
 'know',
 'last',
 'later',
 'law',
 'lesbian',
 'lgbt',
 'list',
 'live',
 'lol',
 'make',
 'married',
 'matter',
 'mkr',
 'mohammed',
 'morning',
 'much',
 'murder',
 'muslim',
 'name',
 'need',
 'negro',
 'nigger',
 'notsexist',
 'oh',
 'old',
 'one',
 'past',
 'pay',
 'people',
 'ppl',
 'racism',
 'rape',
 'real',
 'really',
 'right',
 'rt

In [None]:
from sklearn.base import clone

# Selecting train & test data: keep only the columns the tuned tree used
X_train_dt = X_train[important_features_dt]
X_test_dt = X_test[important_features_dt]

# Modelling
# Refit an unfitted copy of the tuned estimator. Fitting
# dt_grid.best_estimator_ directly would overwrite the model stored inside the
# grid search with one trained on the reduced feature set.
dt = clone(dt_grid.best_estimator_)
dt.fit(X_train_dt, y_train)

# Evaluation
ypred_train = dt.predict(X_train_dt)
ypred_test = dt.predict(X_test_dt)

print("Train Accuracy :", accuracy_score(y_train, ypred_train))
print("CV Score :", cross_val_score(dt, X_train_dt, y_train, cv=5, scoring="accuracy").mean())
print("Test Accuracy :", accuracy_score(y_test, ypred_test))

Train Accuracy : 0.8678216653998375
CV Score : 0.8647026399354362
Test Accuracy : 0.8665478561694098


In [59]:
from sklearn.metrics import classification_report, confusion_matrix

# Error breakdown for the latest test predictions: confusion matrix, then per-class metrics.
test_confusion = confusion_matrix(y_test, ypred_test)
test_report = classification_report(y_test, ypred_test)
print(test_confusion)
print(test_report)

[[ 901   45   42  511   60   13]
 [ 127 1358    6  122    6    7]
 [  58   10 1534   33    1    2]
 [ 390   54    3 1045   15   12]
 [  21    1    2   12 1521    0]
 [   8    3    2   23    1 1590]]
              precision    recall  f1-score   support

           0       0.60      0.57      0.59      1572
           1       0.92      0.84      0.88      1626
           2       0.97      0.94      0.95      1638
           3       0.60      0.69      0.64      1519
           4       0.95      0.98      0.96      1557
           5       0.98      0.98      0.98      1627

    accuracy                           0.83      9539
   macro avg       0.84      0.83      0.83      9539
weighted avg       0.84      0.83      0.84      9539



# **4.Random Forest Classifier**

In [60]:
# Hyper-parameter tuning: 5-fold grid search over the number of trees (1 to 50).
estimator = RandomForestClassifier(random_state=True)
param_grid = {'n_estimators': [n_trees for n_trees in range(1, 51)]}

# fit() returns the search object itself.
rf_grid = GridSearchCV(estimator, param_grid, scoring="accuracy", cv=5).fit(X_train, y_train)

# Best configuration, refitted on the full training split by the search.
rf = rf_grid.best_estimator_
rf

In [None]:
# Important features
# These are the random forest's importances, so the frame is named feats_rf
# (it was previously stored under the copy-pasted AdaBoost name feats_ab).
feats_rf = pd.DataFrame(data=rf.feature_importances_, index=X.columns, columns=['Importance'])

# Keep only the features with a non-zero importance in the tuned forest.
important_features_rf = feats_rf[feats_rf["Importance"] > 0].index.tolist()
important_features_rf

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [None]:
from sklearn.base import clone

# Use important features: keep only the columns the tuned forest used
X_train_rf = X_train[important_features_rf]
X_test_rf = X_test[important_features_rf]

# Modelling
# Refit an unfitted copy of the tuned estimator. Fitting
# rf_grid.best_estimator_ directly would overwrite the model stored inside the
# grid search with one trained on the reduced feature set.
rf = clone(rf_grid.best_estimator_)
rf.fit(X_train_rf, y_train)

# Evaluation
ypred_train = rf.predict(X_train_rf)
ypred_test = rf.predict(X_test_rf)

print("Train Accuracy :", accuracy_score(y_train, ypred_train))
print("CV score :", cross_val_score(rf, X_train_rf, y_train, cv=5, scoring="accuracy").mean())
print("Test Accuracy :", accuracy_score(y_test, ypred_test))

Train Accuracy : 0.9593478887636621
CV score : 0.8508113504737608
Test Accuracy : 0.8535485899989517


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix and per-class metrics for the latest test predictions.
print(confusion_matrix(y_true=y_test, y_pred=ypred_test))
print(classification_report(y_true=y_test, y_pred=ypred_test))

# **5. AdaBoost**

In [None]:
# Hyper-parameter tuning: 5-fold grid search over the number of boosting rounds (1 to 50).
estimator = AdaBoostClassifier(random_state=True)
param_grid = {'n_estimators': [n_rounds for n_rounds in range(1, 51)]}

# fit() returns the search object itself.
ab_grid = GridSearchCV(estimator, param_grid, scoring="accuracy", cv=5).fit(X_train, y_train)

# Best configuration, refitted on the full training split by the search.
ab = ab_grid.best_estimator_
ab

In [None]:
# Importance assigned by the tuned AdaBoost model to every TF-IDF column.
ab_importances = ab.feature_importances_
feats_ab = pd.DataFrame(ab_importances, index=X.columns, columns=['Importance'])

# Keep only the features with a non-zero importance.
has_importance = feats_ab["Importance"] > 0
important_features_ab = feats_ab[has_importance].index.tolist()
important_features_ab

[114, 336, 342, 395, 421, 561, 582, 601, 704, 748]

In [None]:
from sklearn.base import clone

# Keep only the columns the tuned AdaBoost model used
X_train_ab = X_train[important_features_ab]
X_test_ab = X_test[important_features_ab]

# Modelling
# Refit an unfitted copy of the tuned estimator. Fitting
# ab_grid.best_estimator_ directly would overwrite the model stored inside the
# grid search with one trained on the reduced feature set.
ab = clone(ab_grid.best_estimator_)
ab.fit(X_train_ab, y_train)

# Evaluation
ypred_train = ab.predict(X_train_ab)
ypred_test = ab.predict(X_test_ab)

print("Train Accuracy :", accuracy_score(y_train, ypred_train))
print("CV Score :", cross_val_score(ab, X_train_ab, y_train, cv=5, scoring="accuracy").mean())
print("Test Accuracy :", accuracy_score(y_test, ypred_test))

Train Accuracy : 0.8475349251697114
CV Score : 0.8475349101143452
Test Accuracy : 0.8487262815808785


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix, then per-class precision / recall / F1, for the latest test predictions.
evaluation_outputs = [
    confusion_matrix(y_test, ypred_test),
    classification_report(y_test, ypred_test),
]
for evaluation_output in evaluation_outputs:
    print(evaluation_output)

# **6. Gradient Boosting Classifier**

In [None]:
# Hyper-parameter tuning: 5-fold grid search over boosting rounds and learning rate.
# NOTE(review): the learning-rate list has no 0.6 - confirm whether that is intentional.
estimator = GradientBoostingClassifier(random_state=True)
param_grid = {
    "n_estimators": [n_rounds for n_rounds in range(1, 10)],
    "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.0],
}

# fit() returns the search object itself.
gb_grid = GridSearchCV(estimator, param_grid, scoring="accuracy", cv=5).fit(X_train, y_train)

# Best configuration, refitted on the full training split by the search.
gb = gb_grid.best_estimator_
gb

In [None]:
# Importance assigned by the tuned gradient-boosting model to every TF-IDF column.
gb_importances = gb.feature_importances_
feats_gb = pd.DataFrame(gb_importances, index=X.columns, columns=['Importance'])

# Keep only the features with a non-zero importance.
has_importance = feats_gb["Importance"] > 0
important_features_gb = feats_gb[has_importance].index.tolist()
important_features_gb

[34,
 35,
 44,
 90,
 97,
 111,
 113,
 114,
 120,
 142,
 148,
 154,
 159,
 202,
 293,
 294,
 303,
 324,
 336,
 342,
 395,
 399,
 408,
 412,
 421,
 439,
 440,
 460,
 505,
 538,
 561,
 582,
 593,
 601,
 640,
 695,
 704,
 735,
 748,
 768,
 841,
 869,
 870,
 927,
 965]

In [None]:
from sklearn.base import clone

# Keep only the columns the tuned gradient-boosting model used
X_train_gb = X_train[important_features_gb]
X_test_gb = X_test[important_features_gb]

# Modelling
# Refit an unfitted copy of the tuned estimator. Fitting
# gb_grid.best_estimator_ directly would overwrite the model stored inside the
# grid search with one trained on the reduced feature set.
gb = clone(gb_grid.best_estimator_)
gb.fit(X_train_gb, y_train)

# Evaluation
ypred_train = gb.predict(X_train_gb)
ypred_test = gb.predict(X_test_gb)

print("Train Accuracy :", accuracy_score(y_train, ypred_train))
print("CV Score :", cross_val_score(gb, X_train_gb, y_train, cv=5, scoring="accuracy").mean())
print("Test Accuracy :", accuracy_score(y_test, ypred_test))

Train Accuracy : 0.8682148192802663
CV Score : 0.8668519230640419
Test Accuracy : 0.8730474892546388


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix and per-class metrics for the latest test predictions.
test_confusion = confusion_matrix(y_test, ypred_test)
print(test_confusion)
test_report = classification_report(y_test, ypred_test)
print(test_report)