**SVM**

In [None]:
# Install dependencies if needed
!pip install emoji scikit-learn

import pandas as pd
import emoji
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


In [None]:
def preprocess_text(text):
    """Turn the emojis in ``text`` into plain words that TF-IDF can tokenize.

    Each emoji is replaced by the ``:name:`` shortcode produced by
    ``emoji.demojize`` (English names). Every underscore in the resulting
    string is then replaced by a space, so the words of a shortcode become
    separate tokens. Underscores already present in the input are replaced too.

    Args:
        text: The raw comment string.

    Returns:
        The demojized string with all underscores replaced by spaces.
    """
    demojized = emoji.demojize(text, language='en')
    return demojized.replace("_", " ")


In [None]:
!pip install pandas odfpy

In [None]:
# Load the labelled comments from the OpenDocument spreadsheet.
df = pd.read_excel("Dataset.ods")

# Rename the spreadsheet columns to the names every later cell relies on:
# "text" for the comment and "label" for its class.
df = df.rename(columns={"COMMENTS": "text", "LABELS": "label"})
df

In [None]:

# Replace emojis by their text names, in place, in the "text" column.
df["text"] = df["text"].apply(preprocess_text)

# Encode labels as integers. `le` is kept so predictions can be mapped back
# to the original label values later (le.inverse_transform / le.classes_).
# NOTE(review): df["label"] is overwritten in place, so re-running this cell
# re-fits `le` on the already-encoded integers and le.classes_ no longer holds
# the original labels. Restart the kernel and run all instead of re-running it.
le = LabelEncoder()
df["label"] = le.fit_transform(df["label"])

In [None]:
df

In [None]:
# Hold out 20% of the comments for testing. Stratifying on the label keeps the
# class proportions the same in both parts; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)


In [None]:
# SVM baseline: TF-IDF features feeding a linear SVM. Wrapping both in one
# Pipeline means the vectorizer is fitted together with the classifier and is
# applied automatically whenever svm_clf.predict() is given raw strings.
svm_clf = Pipeline([
    ('tfidf', TfidfVectorizer(
        sublinear_tf=True,   # 1 + log(tf) term-frequency scaling
        max_features=2000,   # cap on the vocabulary size
        ngram_range=(1, 2)   # unigrams + bigrams
    )),
    # class_weight="balanced" weights classes inversely to their frequency
    ('svm', LinearSVC(class_weight="balanced"))
])


In [None]:
# Fit the TF-IDF vocabulary and the SVM on the training comments only.
svm_clf.fit(X_train, y_train)


In [None]:
# Score the held-out comments. svm_preds is reused by the comparison section
# at the end of the notebook.
svm_preds = svm_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, svm_preds))
# target_names=le.classes_ labels the report rows with the original label values
print(classification_report(y_test, svm_preds, target_names=le.classes_))


In [None]:
def predict_comment(text):
    """Classify one raw comment with the fitted SVM pipeline.

    The comment goes through the same emoji preprocessing as the training
    data, is scored by ``svm_clf`` (which vectorizes it internally), and the
    encoded prediction is mapped back to its original label with ``le``.

    Args:
        text: The raw comment string.

    Returns:
        The original (un-encoded) label predicted for the comment.
    """
    cleaned = preprocess_text(text)
    encoded = svm_clf.predict([cleaned])
    decoded = le.inverse_transform([encoded[0]])
    return decoded[0]

print(predict_comment("You are so stupid üò°"))
print(predict_comment("Have a nice day üòä"))


**RANDOM FOREST**

In [None]:
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# 1. Optional: preprocess emojis into readable words
# -------------------------------------------------------
def preprocess_text(text):
    """Replace emojis by their English names and underscores by spaces.

    Behaves the same as the ``preprocess_text`` defined in the SVM section;
    running this cell simply rebinds the name.
    """
    # emoji.demojize yields ``:name_with_underscores:`` shortcodes
    demojized = emoji.demojize(text, language="en")
    return demojized.replace("_", " ")


In [None]:
# Same stratified 80/20 split (same random_state) as in the SVM section, so the
# Random Forest is trained and scored on the same comments.
X_train, X_test, y_train, y_test = train_test_split(
   df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

In [None]:
# TF-IDF features for the Random Forest. The vectorizer is fitted on the
# training comments only and then applied unchanged to the test comments.
tfidf = TfidfVectorizer(
    max_features=20000,   # vocabulary cap (the SVM pipeline above uses 2000)
    ngram_range=(1, 2),   # unigrams + bigrams
    sublinear_tf=True     # 1 + log(tf) term-frequency scaling
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# 4. Random Forest model
# -------------------------------------------------------
rf = RandomForestClassifier(
    n_estimators=400,         # number of trees
    max_depth=None,           # no depth limit: grow trees fully
    class_weight="balanced",  # weight classes inversely to their frequency
    n_jobs=-1,                # use all CPU cores
    random_state=42           # fixed seed for repeatable forests
)
# rf is trained on TF-IDF vectors, so anything passed to rf.predict() later
# must first go through tfidf.transform().
rf.fit(X_train_tfidf, y_train)

In [None]:
# 5. Predict & evaluate
# -------------------------------------------------------
# rafo_preds is reused by the comparison section at the end of the notebook.
rafo_preds = rf.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, rafo_preds))
# No target_names are passed here, so the report rows are labelled with the
# encoded integer classes rather than the original label values.
print(classification_report(y_test, rafo_preds))

In [None]:
def predict_comment1(text):
    """Classify one raw comment with the Random Forest model.

    The comment gets the same emoji preprocessing as the training data, is
    turned into a TF-IDF vector with the fitted ``tfidf`` vectorizer, scored by
    ``rf``, and the encoded prediction is mapped back to its original label.

    Args:
        text: The raw comment string.

    Returns:
        The original (un-encoded) label predicted for the comment.
    """
    text = preprocess_text(text)
    # rf was fitted on TF-IDF vectors, not on raw strings, so vectorize first.
    features = tfidf.transform([text])
    pred = rf.predict(features)[0]
    return le.inverse_transform([pred])[0]

# Exercise the Random Forest helper defined above (predict_comment is the SVM one).
print(predict_comment1("You are so stupid üò°"))
print(predict_comment1("Have a nice day üòä"))


**DISTILBERT**

In [None]:
!pip install transformers datasets emoji scikit-learn


In [None]:
import emoji
import pandas as pd
from sklearn.model_selection import train_test_split

# Example structure: df["text"], df["label"]
# label: 1 = Non-bullying, 0 = Bullying

def clean_text(text):
    """Replace emojis by their English ``:name:`` shortcodes, with spaces for underscores.

    Args:
        text: The raw comment string.

    Returns:
        The demojized string in which every underscore is replaced by a space.
    """
    with_names = emoji.demojize(text, language="en")
    return with_names.replace("_", " ")

# Store the cleaned text in its own column, which the tokenizer reads later.
# NOTE(review): df["text"] was already run through preprocess_text in the SVM
# section, which performs the same two steps as clean_text.
df["clean_text"] = df["text"].apply(clean_text)

# Stratified 80/20 split of the whole frame, with the same random_state and
# test_size as the earlier splits.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)


In [None]:
from transformers import DistilBertTokenizerFast

# Tokenizer for the same "distilbert-base-uncased" checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    """Tokenize the ``clean_text`` entries of a batch with the global tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping with a ``"clean_text"`` entry holding the texts.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = batch["clean_text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=128)
    return encoded


In [None]:
from datasets import Dataset

# Wrap the pandas splits as Hugging Face datasets.
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

# Add the tokenizer outputs as new columns, processing rows in batches.
train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# The sequence-classification model takes its targets under the name "labels".
train_ds = train_ds.rename_column("label", "labels")
test_ds = test_ds.rename_column("label", "labels")

# Hand rows back as torch tensors.
train_ds.set_format(type="torch")
test_ds.set_format(type="torch")


In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for a batch of predictions.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over axis 1 is the predicted class).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = np.argmax(pred.predictions, axis=1)
    metrics = {"accuracy": accuracy_score(y_true, y_hat)}
    metrics["f1"] = f1_score(y_true, y_hat, average="macro")
    return metrics


In [None]:
from transformers import DistilBertForSequenceClassification

# Pretrained DistilBERT with a 2-class classification head (bullying vs. not).
# NOTE: the global name `model` is rebound later by the LSTM section.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)


In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best "f1" value when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",   # the "f1" key returned by compute_metrics
    greater_is_better=True,

    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,

    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=20,

    fp16=False,         # True if GPU supports FP16
    report_to="none"    # no external experiment trackers
)


In [None]:
from transformers import Trainer

# NOTE(review): test_ds is used as the evaluation set during training, and
# training_args reloads the best checkpoint by "f1" on that same set, so the
# final test scores are also the model-selection scores (optimistic estimate).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


In [None]:
# Metrics on test_ds. NOTE: the name `results` is reassigned to a list in the
# comparison section further down.
results = trainer.evaluate()
print(results)


In [None]:
import torch

# Use the GPU when one is available; otherwise stay on the CPU so this cell
# does not raise on CPU-only machines.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torch

def bert_predict(text):
    """Classify one raw comment with the fine-tuned DistilBERT model.

    Uses the global ``clean_text``, ``tokenizer`` and ``model``. The model is
    switched to evaluation mode as a side effect.

    Args:
        text: The raw comment string.

    Returns:
        ``"Bullying"`` when the predicted class id is 0, otherwise
        ``"Not bullying"``.
    """
    text = clean_text(text)

    # Inference mode: with dropout off the same text always gets the same answer.
    model.eval()

    # 1. Tokenize, truncating at the same 128-token limit used for training
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )

    # 2. Move inputs to the same device as model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # 3. Predict
    with torch.no_grad():
        outputs = model(**inputs)

    # Named pred_id so it is not confused with the notebook-level bert_preds list.
    pred_id = outputs.logits.argmax(dim=1).item()
    return "Bullying" if pred_id == 0 else "Not bullying"


In [None]:
print(bert_predict("Red"))

In [None]:
# Collect DistilBERT predictions as 0/1 labels for the comparison section.
# X_test is the held-out text from the most recent train_test_split above.
bert_preds = []
for text in X_test:
    # bert_predict returns a string; map it back to the integer encoding (0 = bullying)
    bert_preds.append(0 if bert_predict(text) == "Bullying" else 1)



**LSTM**

In [None]:
pip install emoji torch numpy scikit-learn

In [None]:
import emoji
import re

def clean_text(text):
    """Normalize a comment for the LSTM: drop emoji names, lowercase, strip.

    Steps, in order:
      1. emojis become ``:name:`` shortcodes (English names);
      2. every underscore becomes a space;
      3. every ``:...:`` span made only of ASCII letters, digits and spaces is
         deleted, which removes the shortcodes created in step 1;
      4. the result is lowercased and surrounding whitespace is stripped.

    NOTE(review): step 3 also deletes colon-delimited spans in ordinary text
    (``"a: b c: d"`` becomes ``"a d"``) and leaves shortcodes containing other
    characters, such as a hyphen, untouched. This definition rebinds the
    ``clean_text`` defined in the DistilBERT section.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned, lowercased string.
    """
    demojized = emoji.demojize(text, language="en")
    spaced = demojized.replace("_", " ")
    without_names = re.sub(r":[a-zA-Z0-9 ]+:", "", spaced)
    return without_names.lower().strip()


In [None]:
# Plain Python lists of comments and encoded labels for the LSTM pipeline.
X = df["text"].tolist()
y = df["label"].tolist()   # 1 = non-bullying, 0 = bullying

# LSTM-specific cleaning (emoji names removed, lowercased).
X = [clean_text(t) for t in X]

from sklearn.model_selection import train_test_split

# Same test_size / random_state / stratification as the earlier splits.
# NOTE: this rebinds X_train, X_test, y_train and y_test for all later cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from collections import Counter
import torch
import numpy as np

# Build vocabulary
def build_vocab(texts, vocab_size=20000):
    """Build a word-to-id mapping from whitespace-separated texts.

    Ids 0 and 1 are reserved for ``"<PAD>"`` and ``"<UNK>"``. The remaining
    ``vocab_size - 2`` slots go to the most frequent words, numbered from 2 in
    order of decreasing frequency.

    Args:
        texts: Iterable of strings; words are obtained with ``str.split()``.
        vocab_size: Maximum number of entries, including the two special ones.

    Returns:
        Dict mapping each kept word (and the two special tokens) to its id.
    """
    counts = Counter(token for line in texts for token in line.split())

    vocab = {"<PAD>": 0, "<UNK>": 1}
    ranked = counts.most_common(vocab_size - 2)
    for index, (token, _count) in enumerate(ranked, start=2):
        vocab[token] = index
    return vocab

vocab = build_vocab(X_train, vocab_size=20000)

def encode(text, max_len=40):
    """Convert a text into a fixed-length list of word ids from the global ``vocab``.

    Words missing from ``vocab`` map to 1 (``<UNK>``). Sequences longer than
    ``max_len`` are cut off at the end; shorter ones are right-padded with
    0 (``<PAD>``).

    Args:
        text: Whitespace-separated string.
        max_len: Length of the returned list.

    Returns:
        List of ``max_len`` integer ids.
    """
    ids = [vocab.get(token, 1) for token in text.split()][:max_len]
    padding = [0] * (max_len - len(ids))
    return ids + padding


In [None]:
# Fixed sequence length (in words) fed to the LSTM.
max_len = 40

# Encode every comment as max_len word ids using the training vocabulary.
X_train_ids = [encode(t, max_len) for t in X_train]
X_test_ids  = [encode(t, max_len) for t in X_test]

# Convert to tensors for the DataLoaders.
# NOTE: y_train / y_test are rebound from lists to tensors here, and later
# cells (logistic regression, comparison) use these tensor versions.
X_train_ids = torch.tensor(X_train_ids)
X_test_ids  = torch.tensor(X_test_ids)
y_train     = torch.tensor(y_train)
y_test      = torch.tensor(y_test)


In [None]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 32

# Training batches are reshuffled every epoch.
train_dl = DataLoader(
    TensorDataset(X_train_ids, y_train),
    batch_size=batch_size,
    shuffle=True
)

# No shuffling for the test loader, so predictions stay in the order of y_test.
test_dl = DataLoader(
    TensorDataset(X_test_ids, y_test),
    batch_size=batch_size
)


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over word-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each word embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output logits.

    NOTE(review): the classifier reads the hidden state after the last time
    step and no packing or masking is applied, so for right-padded inputs the
    padding positions are processed by the LSTM as well.
    """

    def __init__(self, vocab_size, embed_dim=300, hidden_dim=256, num_classes=2):
        super().__init__()
        # padding_idx=0: id 0 is the padding id, its embedding row gets no gradient.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        """Return class logits for a batch-first tensor of word ids."""
        embedded = self.embedding(x)
        _, (final_hidden, _) = self.lstm(embedded)
        # final_hidden[-1] is the last layer's hidden state after the final step.
        return self.fc(final_hidden[-1])


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed torch before the model is built so that weight initialisation and the
# DataLoader shuffling repeat from run to run (42 matches the random_state used
# for the data splits). GPU kernels may still add small run-to-run differences.
torch.manual_seed(42)

# NOTE: this rebinds the global `model`, which held the DistilBERT model before.
model = LSTMClassifier(vocab_size=len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 200

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for Xb, yb in train_dl:
        Xb, yb = Xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(Xb)

        loss = criterion(preds, yb)
        loss.backward()

        optimizer.step()
        total_loss += loss.item()

    # total_loss is the sum of the per-batch losses of this epoch (not a mean).
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Evaluate the trained LSTM on the held-out batches.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for Xb, yb in test_dl:
        Xb = Xb.to(device)
        # arg-max over the class logits, moved back to the CPU for sklearn
        preds = model(Xb).argmax(dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("Accuracy:", accuracy_score(all_labels, all_preds))
print("F1 Score:", f1_score(all_labels, all_preds, average="macro"))


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a Keras tokenizer on the training texts.
# NOTE(review): the PyTorch LSTM above was trained on ids produced by
# build_vocab()/encode(), not by this tokenizer, so ids from this tokenizer do
# not correspond to the rows of the trained embedding.
# NOTE(review): this also rebinds the global `tokenizer`, which held the
# DistilBERT tokenizer; bert_predict() reads that global and will break if it
# is called after this cell has run.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)


In [None]:
def lstm_predict(text):
    """Classify one raw comment with the trained PyTorch LSTM.

    The comment is cleaned with ``clean_text`` and encoded with ``encode`` and
    the global ``max_len``, which is the same vocabulary, ids and right-padding
    the model was trained on. The model is switched to evaluation mode as a
    side effect.

    Args:
        text: The raw comment string.

    Returns:
        ``"Bullying"`` when the predicted class id is 0, otherwise
        ``"Not bullying"``.
    """
    text = clean_text(text)

    # Encode exactly as the training data was encoded. Ids from any other
    # tokenizer would index unrelated rows of the trained embedding.
    ids = encode(text, max_len)

    model.eval()
    device = next(model.parameters()).device
    batch = torch.tensor([ids], dtype=torch.long).to(device)  # batch of one

    with torch.no_grad():
        output = model(batch)
        pred = output.argmax(dim=1).item()

    return "Bullying" if pred == 0 else "Not bullying"


In [None]:
lstm_predict("Saale randi")

In [None]:
# Collect LSTM predictions as 0/1 labels for the comparison section.
# X_test here is the cleaned text list from the LSTM split.
lstm_preds = []
for text in X_test:
    # lstm_predict returns a string; map it back to the integer encoding (0 = bullying)
    lstm_preds.append(0 if lstm_predict(text) == "Bullying" else 1)


**LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression baseline: TF-IDF features feeding LogisticRegression.
# NOTE(review): passing a custom `preprocessor` presumably replaces the
# vectorizer's built-in lowercasing step. Confirm against the scikit-learn docs.
log_reg_clf = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=3000,          # vocabulary cap
        ngram_range=(1,2),          # unigrams + bigrams
        preprocessor=preprocess_text
    )),
    ("clf", LogisticRegression(
        class_weight="balanced",    # weight classes inversely to their frequency
        max_iter=300                # iteration limit for the solver
    ))
])

In [None]:
# NOTE(review): X_train / y_train are whatever the most recent split left in
# the kernel. In notebook order that is the LSTM section's cleaned text list
# and the label tensor created there.
log_reg_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# log_preds is reused by the comparison section below.
log_preds = log_reg_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, log_preds))
# No target_names are passed, so report rows show the encoded integer classes.
print(classification_report(y_test, log_preds))


In [None]:
# NOTE(review): this redefines predict_comment, replacing the SVM helper of the
# same name defined earlier in the notebook.
def predict_comment(text):
    """Classify one raw comment with the logistic-regression pipeline.

    The text is passed as-is: the pipeline's vectorizer applies
    preprocess_text itself through its ``preprocessor`` setting.

    Returns "Bullying" when the predicted class id is 0, else "Not bullying".
    """
    pred = log_reg_clf.predict([text])[0]
    return "Bullying" if pred == 0 else "Not bullying"
print(predict_comment("Red"))
print(predict_comment("Have a nice day üòä"))

**COMPARISON**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(name, true, pred, results, pos_label=1):
    """Score one model's predictions, record the scores and print them.

    Precision, recall and F1 are binary metrics computed for ``pos_label``.
    According to the comments elsewhere in this notebook the encoding is
    0 = bullying and 1 = non-bullying, so the default reports the metrics of
    the non-bullying class; pass ``pos_label=0`` to score the bullying class.

    Args:
        name: Display name of the model.
        true: True labels.
        pred: Predicted labels, in the same order as ``true``.
        results: List that receives one dict per call with the keys
            "Model", "Accuracy", "Precision", "Recall" and "F1 Score".
        pos_label: Class treated as positive by the binary metrics
            (default 1, the previous implicit behavior).

    Returns:
        None. ``results`` is extended in place.
    """
    acc = accuracy_score(true, pred)
    prec = precision_score(true, pred, average="binary", pos_label=pos_label)
    rec = recall_score(true, pred, average="binary", pos_label=pos_label)
    f1 = f1_score(true, pred, average="binary", pos_label=pos_label)

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1
    })

    print(f"\n{name} Results:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")


In [None]:
results = []


In [None]:
# Score every model against the same y_test.
# NOTE(review): y_test is the label tensor from the LSTM section, while
# svm_preds / rafo_preds / bert_preds were computed on earlier splits. The
# comparison is only valid because every split uses the same test_size,
# random_state and stratification. Keep them in sync.
evaluate_model("Logistic Regression", y_test, log_preds, results)
evaluate_model("SVM", y_test, svm_preds, results)
evaluate_model("Random Forest", y_test, rafo_preds, results)
evaluate_model("DistilBERT", y_test, bert_preds, results)
evaluate_model("LSTM", y_test, lstm_preds, results)

In [None]:
import pandas as pd

df_results = pd.DataFrame(results)
df_results


In [None]:
# Grouped bar chart: one group per model, bars for accuracy and F1.
df_results.plot(x="Model", y=["Accuracy", "F1 Score"], kind="bar", figsize=(10,5))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


In [None]:
# Predictions of every model on the shared held-out split, keyed by display name.
models = {
    "LSTM": lstm_preds,
    "DistilBERT": bert_preds,
    "Random Forest": rafo_preds,
    "SVM": svm_preds,
    "Logistic Regression": log_preds
}

# Encoded labels follow the convention used throughout the notebook:
# 0 = bullying, 1 = not bullying. Passing labels=class_ids pins the row/column
# order of the matrix so it always lines up with class_names.
class_ids = [0, 1]
class_names = ["Bullying", "Not Bullying"]

for name, pred in models.items():
    cm = confusion_matrix(y_test, pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(cm, display_labels=class_names)

    # Draw on an explicitly created axes. Calling plt.figure() and then
    # disp.plot() without `ax` leaves an empty extra figure behind.
    fig, ax = plt.subplots(figsize=(6, 5))
    disp.plot(ax=ax, cmap="Blues", values_format="d")
    ax.set_title(f"{name} - Confusion Matrix")
    plt.show()


In [None]:
!pip install nbformat nbconvert
!jupyter nbconvert --to notebook --ClearOutputPreprocessor.enabled=True Bullying.ipynb
