In [None]:
# hate_speech_models.py

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer, BertModel,
    DistilBertTokenizer, DistilBertModel,
    RobertaTokenizer, RobertaModel,
    get_scheduler
)
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

In [None]:
# -------------------- Dataset --------------------

class HateSpeechDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of raw texts (list, tuple, numpy array, pandas
            Series, ...). Each item is passed through ``str()`` before
            tokenization.
        labels: Iterable of class labels aligned with ``texts``, or ``None``
            for unlabeled data, in which case every item gets label ``0``.
        tokenizer: Callable tokenizer following the Hugging Face
            ``tokenizer(text, ...)`` interface; it must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``labels`` is given and its length differs from
            the length of ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialize to plain lists so that ``__getitem__`` always indexes by
        # position. A pandas Series coming out of ``train_test_split`` keeps
        # its shuffled index, and ``series[idx]`` would look up by label
        # instead (KeyError or the wrong row).
        self.texts = list(texts)
        self.labels = list(labels) if labels is not None else None
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output flattened to 1-D) and ``'labels'`` (a ``torch.long``
            scalar tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx] if self.labels is not None else 0
        # Calling the tokenizer directly is the documented entry point and is
        # equivalent to ``encode_plus`` for a single string.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; drop it so
            # the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

In [None]:
# -------------------- Models --------------------

class LSTMModel(nn.Module):
    """Single-layer unidirectional LSTM text classifier.

    Token ids are embedded, run through the LSTM, and the final hidden state
    is projected to ``output_dim`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits (classes).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            input_ids: Integer tensor indexed as ``(batch, seq_len)``.
            attention_mask: Optional tensor of the same shape with 1 for real
                tokens and 0 for padding. When given, the LSTM stops at each
                sequence's last real token instead of running over the
                padding. This assumes right-padding (all 1s before all 0s).
                When ``None`` every position is processed.

        Returns:
            Tensor of logits, one row of ``output_dim`` values per sequence.
        """
        embedded = self.embedding(input_ids)
        if attention_mask is None:
            _, (hidden, _) = self.lstm(embedded)
            return self.fc(hidden[-1])

        # Without packing, the "final" hidden state is taken after the LSTM
        # has consumed the trailing pad tokens, which dilutes the signal from
        # the real text. pack_padded_sequence needs CPU lengths >= 1, so rows
        # that are entirely padding are treated as length 1.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to('cpu', dtype=torch.int64)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        return self.fc(hidden[-1])

class TransformerClassifier(nn.Module):
    """Linear classification head on top of a transformer encoder.

    Args:
        transformer_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
        hidden_size: Size of the encoder's hidden vectors (input width of the
            linear head).
        num_classes: Number of output logits. Defaults to 2, the value that
            was previously hard-coded.
    """

    def __init__(self, transformer_model, hidden_size, num_classes=2):
        super(TransformerClassifier, self).__init__()
        self.transformer = transformer_model
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the first token's hidden state."""
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of each sequence is used as the pooled representation.
        cls_output = outputs.last_hidden_state[:, 0, :]
        return self.fc(cls_output)

class TextCNN(nn.Module):
    """Convolutional text classifier with parallel kernels and max-over-time pooling.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        num_classes: Number of output logits.
        kernel_sizes: Heights (in tokens) of the parallel convolution kernels.
            Any non-empty iterable of ints is accepted.
        num_filters: Number of output channels per kernel size.

    Raises:
        ValueError: If ``kernel_sizes`` is empty.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes=(3, 4, 5), num_filters=100):
        super(TextCNN, self).__init__()
        # Immutable default plus a local copy: avoids the shared mutable
        # default-argument pitfall and lets callers pass any iterable.
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            input_ids: Integer tensor indexed as ``(batch, seq_len)``.
            attention_mask: Accepted for interface parity with the other
                models; it is not used.

        Returns:
            Tensor of logits with one row of ``num_classes`` values per sequence.
        """
        embedded = self.embedding(input_ids)
        # A convolution cannot slide over a sequence shorter than its kernel,
        # so zero-pad the sequence axis up to the widest kernel.
        widest_kernel = max(conv.kernel_size[0] for conv in self.convs)
        shortfall = widest_kernel - embedded.size(1)
        if shortfall > 0:
            embedded = F.pad(embedded, (0, 0, 0, shortfall))

        x = embedded.unsqueeze(1)  # add the single input channel expected by Conv2d
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        return self.fc(x)

In [None]:
# -------------------- Training Utilities --------------------

def get_optimizer_scheduler(model, train_loader, lr, epochs):
    """Create an AdamW optimizer and a linear, warmup-free LR scheduler.

    Args:
        model: Module whose ``parameters()`` will be optimized.
        train_loader: Sized iterable of training batches; its length is the
            number of scheduler steps per epoch.
        lr: Initial learning rate passed to AdamW.
        epochs: Number of epochs the schedule should span.

    Returns:
        Tuple ``(optimizer, scheduler)``.
    """
    adamw = optim.AdamW(model.parameters(), lr=lr)
    # The schedule covers one step per batch for every epoch.
    steps_per_epoch = len(train_loader)
    lr_schedule = get_scheduler(
        "linear",
        optimizer=adamw,
        num_warmup_steps=0,
        num_training_steps=steps_per_epoch * epochs,
    )
    return adamw, lr_schedule


In [None]:

# -------------------- Run Model --------------------

def _build_model_and_tokenizer(model_name):
    """Instantiate the tokenizer and the (not yet fine-tuned) classifier.

    The LSTM and CNN models reuse the BERT tokenizer purely as a vocabulary /
    id mapper; their embeddings are trained from scratch.

    Raises:
        ValueError: If ``model_name`` is not a supported model name.
    """
    if model_name == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TransformerClassifier(BertModel.from_pretrained('bert-base-uncased'), 768)
    elif model_name == 'distilbert':
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TransformerClassifier(DistilBertModel.from_pretrained('distilbert-base-uncased'), 768)
    elif model_name == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TransformerClassifier(RobertaModel.from_pretrained('roberta-base'), 768)
    elif model_name == 'lstm':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = LSTMModel(tokenizer.vocab_size, embedding_dim=100, hidden_dim=128, output_dim=2)
    elif model_name == 'cnn':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TextCNN(vocab_size=tokenizer.vocab_size, embed_dim=100, num_classes=2)
    else:
        raise ValueError("Unsupported model name")
    return model, tokenizer


def _train_one_epoch(model, loader, loss_fn, optimizer, scheduler, device):
    """Run one optimization pass over ``loader`` and return the mean batch loss."""
    model.train()
    total_loss = 0.0
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch
        total_loss += loss.item()
    return total_loss / len(loader)


def _evaluate(model, loader, device):
    """Return ``(predictions, labels)`` as flat Python lists for ``loader``."""
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask)
            all_preds.extend(logits.argmax(dim=1).cpu().tolist())
            all_labels.extend(batch['labels'].tolist())
    return all_preds, all_labels


def run_model(model_name='bert', data_path="/content/(2) dev_en.xlsx", epochs=3,
              lr=2e-5, batch_size=32, max_len=64, seed=42):
    """Train and validate one of the supported hate-speech classifiers.

    Reads the ``text`` and ``HS`` columns from an Excel file, makes a
    stratified 80/20 train/validation split, trains for ``epochs`` epochs and
    prints the loss and a classification report after each one. Whenever the
    validation accuracy improves, the weights are saved to
    ``best_model_{model_name}.pt`` in the current working directory.

    Args:
        model_name: One of ``'bert'``, ``'distilbert'``, ``'roberta'``,
            ``'lstm'`` or ``'cnn'``.
        data_path: Path of the Excel file to load.
        epochs: Number of training epochs. The LR schedule is sized from the
            same value.
        lr: Initial AdamW learning rate.
        batch_size: Batch size for both data loaders.
        max_len: Tokenizer ``max_length`` (sequences are padded/truncated to it).
        seed: Seed for the data split and for torch's RNG; ``None`` leaves
            both unseeded.

    Returns:
        Tuple ``(model, tokenizer, device)``. The returned model holds the
        weights from the final epoch, which can differ from the best
        checkpoint saved on disk.

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    if seed is not None:
        # Makes head initialization, dropout and batch shuffling repeatable.
        torch.manual_seed(seed)

    df = pd.read_excel(data_path)
    df = df[['text', 'HS']].dropna()
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'], df['HS'], test_size=0.2, stratify=df['HS'], random_state=seed
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model, tokenizer = _build_model_and_tokenizer(model_name)
    model = model.to(device)

    train_dataset = HateSpeechDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_len=max_len)
    val_dataset = HateSpeechDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, max_len=max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # The scheduler must be sized with the same epoch count the loop uses.
    # Previously it was built for 5 epochs while training ran for 3, so the
    # linear decay never reached its end.
    optimizer, scheduler = get_optimizer_scheduler(model, train_loader, lr=lr, epochs=epochs)
    loss_fn = nn.CrossEntropyLoss()
    best_accuracy = 0.0

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        train_loss = _train_one_epoch(model, train_loader, loss_fn, optimizer, scheduler, device)
        print(f"Train Loss: {train_loss:.4f}")

        # Evaluation
        all_preds, all_labels = _evaluate(model, val_loader, device)
        report = classification_report(all_labels, all_preds, output_dict=True)
        val_accuracy = report['accuracy']
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(classification_report(all_labels, all_preds))

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), f"best_model_{model_name}.pt")
            print("✅ Best model saved!")

    return model, tokenizer, device

In [None]:
# Available models: 'bert', 'distilbert', 'roberta', 'lstm', 'cnn'.
# Swap which call below is uncommented to train a different model.
# run_model('bert')
run_model('distilbert')
# run_model('roberta')
# run_model('lstm')
# run_model('cnn')