In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Thesis/bully_not_bully_dataset.csv")


In [None]:
# Data Preprocessing
def preprocess_text(text):
    text = text.strip()
    text = text.replace("\n", " ")  # Remove newlines
    return text

df['comment'] = df['comment'].apply(preprocess_text)

In [None]:
# Train-Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(df['comment'], df['bully_label'], test_size=0.2, random_state=42)


In [None]:
# Text Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [None]:
# Encode Labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_test = label_encoder.transform(test_labels)

In [None]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Naive Bayes": MultinomialNB()
}

In [None]:
# Train and Evaluate Models
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Training Logistic Regression...
Logistic Regression Accuracy: 0.7906
              precision    recall  f1-score   support

       bully       0.80      0.91      0.85      5732
   not bully       0.77      0.57      0.66      3069

    accuracy                           0.79      8801
   macro avg       0.78      0.74      0.75      8801
weighted avg       0.79      0.79      0.78      8801

Training Support Vector Machine...
Support Vector Machine Accuracy: 0.8013
              precision    recall  f1-score   support

       bully       0.80      0.92      0.86      5732
   not bully       0.80      0.57      0.67      3069

    accuracy                           0.80      8801
   macro avg       0.80      0.75      0.76      8801
weighted avg       0.80      0.80      0.79      8801

Training Random Forest...
Random Forest Accuracy: 0.7872
              precision    recall  f1-score   support

       bully       0.79      0.92      0.85      5732
   not bully       0.78      0.54   