In [None]:
import pandas as pd

# Read the tweets directly from the zipped CSV in the data directory.
df = pd.read_csv(filepath_or_buffer='../data/cyberbullying_tweets.csv.zip')

# Preview the first five rows as this cell's output.
df.head(n=5)


In [None]:
# Column dtypes and non-null counts (info() prints its summary directly).
df.info()
# Number of missing values per column, displayed as the cell's result.
df.isna().sum()


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus, then keep the English entries in a set so
# membership checks during preprocessing are constant-time.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text, stopword_set=None):
    """Normalize a raw tweet into a space-joined string of lowercase words.

    Steps, in order: lowercase; remove URLs (``http...`` / ``www...``);
    remove @mentions and #hashtags; remove every character that is not an
    ASCII letter, whitespace or apostrophe; drop stopwords; finally strip
    the remaining apostrophes from the surviving words.

    Apostrophes are kept until *after* stopword filtering so contractions
    such as "you're" or "don't" can match their stopword entries. Stripping
    them first turned those words into "youre" / "dont", which never matched
    and therefore leaked into the features.

    Args:
        text: Raw text. ``None`` and float NaN are treated as empty text;
            any other non-string value is converted with ``str``.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words``.

    Returns:
        The cleaned text, or an empty string when nothing survives.
    """
    if text is None or (isinstance(text, float) and text != text):
        # NaN is the only float that is not equal to itself.
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    stops = stop_words if stopword_set is None else stopword_set

    text = text.lower()
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    # Treat typographic apostrophes like the ASCII one so "you’re" == "you're".
    text = text.replace("\u2019", "'").replace("\u2018", "'")
    text = re.sub(r"[^a-z\s']", '', text)

    kept = []
    for token in text.split():
        bare = token.replace("'", "")
        # Check both the quoted form ("don't") and the bare form ("the").
        if not bare or token.strip("'") in stops or bare in stops:
            continue
        kept.append(bare)
    return " ".join(kept)


In [None]:
# Replace missing tweets with an empty string *before* casting: a bare
# astype(str) would turn NaN into the literal text "nan", which would then be
# cleaned and vectorized as if it were a real word.
df['tweet_text'] = df['tweet_text'].fillna('').astype(str)
df['tweet_text'] = df['tweet_text'].apply(preprocess_text)

# Show cleaned text
df[['tweet_text', 'cyberbullying_type']].head()


In [None]:
from sklearn.model_selection import train_test_split

# Encode target labels as integer category codes; predict_text decodes them
# with the same astype('category') conversion of this column.
df['label'] = df['cyberbullying_type'].astype('category').cat.codes

X = df['tweet_text']
y = df['label']

# Hold out 20% for evaluation. Stratifying on the label keeps each class's
# share the same in both splits, so the per-class metrics computed on the test
# set are not skewed by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training tweets only, then reuse the fitted vectorizer to
# project the test tweets into the same feature space.
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train the classifier on the TF-IDF training matrix; fit() returns the
# estimator itself, so construction and fitting can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Predict label codes for the held-out tweets.
y_pred = model.predict(X_test_vec)

# Print the per-class metrics for the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))


In [None]:
import pickle
from pathlib import Path

# Make sure the output directory exists; open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError on a fresh clone.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier and the fitted vectorizer side by side: both
# are needed to score new text.
# NOTE(review): the label-code -> name mapping is not saved here; a consumer of
# these files has to rebuild it from the dataset — confirm that is intended.
with open(models_dir / 'cyberbully_classifier.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(models_dir / 'tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


In [None]:
def predict_text(text):
    """Classify a single piece of text and return its category name.

    The text is cleaned with ``preprocess_text``, vectorized with the fitted
    ``vectorizer`` and scored by ``model``. The predicted integer code is then
    looked up in the categories of ``df['cyberbullying_type']``.

    Args:
        text: The raw text to classify.

    Returns:
        The entry of the ``cyberbullying_type`` categories at the predicted
        code.
    """
    features = vectorizer.transform([preprocess_text(text)])
    class_code = model.predict(features)[0]
    # Rebuild the code -> name lookup the same way the labels were encoded.
    class_names = df['cyberbullying_type'].astype('category').cat.categories
    return class_names[class_code]

# Try the full pipeline on an insulting sentence; the returned category name
# is displayed as this cell's output.
predict_text("You're such a loser")


In [None]:
# Second check with a single-word input.
predict_text("good")


