In [1]:
import pandas as pd

# Load dataset (CSV inside ZIP)
df = pd.read_csv('../data/cyberbullying_tweets.csv.zip')

# Display first few rows
df.head()


Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [2]:
# Check for null values and data types
df.info()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_text          47692 non-null  object
 1   cyberbullying_type  47692 non-null  object
dtypes: object(2)
memory usage: 745.3+ KB


tweet_text            0
cyberbullying_type    0
dtype: int64

In [3]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pushp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [4]:
df['tweet_text'] = df['tweet_text'].astype(str)  # Ensure no NaN
df['tweet_text'] = df['tweet_text'].apply(preprocess_text)

# Show cleaned text
df[['tweet_text', 'cyberbullying_type']].head()


Unnamed: 0,tweet_text,cyberbullying_type
0,words food crapilicious,not_cyberbullying
1,white,not_cyberbullying
2,classy whore red velvet cupcakes,not_cyberbullying
3,meh p thanks heads concerned another angry dud...,not_cyberbullying
4,isis account pretending kurdish account like i...,not_cyberbullying


In [5]:
from sklearn.model_selection import train_test_split

# Encode target labels
df['label'] = df['cyberbullying_type'].astype('category').cat.codes

X = df['tweet_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

# Show performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1603
           1       0.98      0.96      0.97      1603
           2       0.88      0.81      0.84      1531
           3       0.55      0.55      0.55      1624
           4       0.58      0.64      0.61      1612
           5       0.95      0.93      0.94      1566

    accuracy                           0.81      9539
   macro avg       0.81      0.81      0.81      9539
weighted avg       0.81      0.81      0.81      9539



In [8]:
import pickle

with open('../models/cyberbully_classifier.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


In [9]:
def predict_text(text):
    cleaned = preprocess_text(text)
    vect = vectorizer.transform([cleaned])
    pred = model.predict(vect)[0]
    label = df['cyberbullying_type'].astype('category').cat.categories[pred]
    return label

# Try prediction
predict_text("You're such a loser")


'other_cyberbullying'

In [17]:
predict_text("good")




'not_cyberbullying'