# Installing Libraries and Dependencies


In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
import re

In [2]:
class Config:
    MODEL_NAME = "bert-base-multilingual-cased"
    BATCH_SIZE = 16
    LEARNING_RATE = 2e-5
    EPOCHS = 4
    MAX_LENGTH = 128 # Max token length for sentences
    RANDOM_STATE = 42

# Set random seed for reproducibility
np.random.seed(Config.RANDOM_STATE)
torch.manual_seed(Config.RANDOM_STATE)
torch.cuda.manual_seed_all(Config.RANDOM_STATE)

# Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and Split the Dataset

In [3]:
# IMPORTANT: Replace "Dataset - Sheet1.csv" with the actual path to your file.
try:
    df = pd.read_csv("/content/Dataset - Sheet1.csv")
    print("Dataset loaded successfully. Displaying first 5 rows:")
    print(df.head())
except FileNotFoundError:
    print("Error: The file 'Dataset - Sheet1.csv' was not found.")
    print("Please update the path to point to your actual CSV file and run the script again.")
    exit()

Dataset loaded successfully. Displaying first 5 rows:
   ID                                               Text  Label
0   1  Kadiri naman yung nakikita ko dito sa twitter ...      1
1   2       Mukhang masaya ang mga kasama mo sa picture.      0
2   3  hahaha tangina niyo skunky pa ang nais ni negr...      1
3   4  PAG SINAGOT KA NI BAYS WALANG SUMBUNGAN KAY TA...      1
4   5  Deadline na po ng mga proposals bukas, tapusin...      0


In [4]:
# Ensure no missing values in critical columns
df = df.dropna(subset=['Text', 'Label']).reset_index(drop=True)
# Ensure labels are integers
df['Label'] = df['Label'].astype(int)

stopwords_tl = {'ang', 'ng', 'sa', 'na', 'at', 'ay', 'ni', 'kay'}
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stopwords_tl]
    return ' '.join(filtered_tokens)

df['clean_text'] = df['Text'].apply(preprocess_text)

In [5]:
train_val_df, test_df = train_test_split(
    df,
    test_size=0.30,
    random_state=Config.RANDOM_STATE,
    stratify=df['Label']
)

val_df, train_df = train_test_split(
    train_val_df,
    test_size=0.8571,
    random_state=Config.RANDOM_STATE,
    stratify=train_val_df['Label']
)

# Print the final split sizes
print(f"Training set size: {len(train_df)} ({len(train_df)/len(df):.0%})")
print(f"Test set size: {len(test_df)} ({len(test_df)/len(df):.0%})")
print(f"Validation set size: {len(val_df)} ({len(val_df)/len(df):.0%})")

Training set size: 414 (60%)
Test set size: 207 (30%)
Validation set size: 68 (10%)


In [6]:
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)

class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_len,
            return_token_type_ids=False, padding='max_length',
            truncation=True, return_attention_mask=True, return_tensors='pt',
        )
        return {
            'text': text, 'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = HateSpeechDataset(texts=df.clean_text.to_numpy(), labels=df.Label.to_numpy(), tokenizer=tokenizer, max_len=max_len)
    return DataLoader(ds, batch_size=batch_size, num_workers=2)

train_data_loader = create_data_loader(train_df, tokenizer, Config.MAX_LENGTH, Config.BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer, Config.MAX_LENGTH, Config.BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, Config.MAX_LENGTH, Config.BATCH_SIZE)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [7]:
class BertClassifier(nn.Module):
    def __init__(self, n_classes):
        super(BertClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(Config.MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        output = self.drop(pooled_output)
        return self.out(output)

model = BertClassifier(n_classes=2).to(device)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [8]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    model = model.train()
    losses = []
    total_predictions = 0
    correct_predictions = 0

    print("Starting training epoch...")
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        total_predictions += len(labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    print("Training epoch finished.")
    accuracy = correct_predictions.double() / total_predictions
    return accuracy, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device):
    model = model.eval()
    losses = []
    all_labels = []
    all_preds = []

    print("Starting evaluation...")
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            losses.append(loss.item())
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    print("Evaluation finished.")
    report = classification_report(all_labels, all_preds, target_names=['Non-Hate', 'Hate'], output_dict=True, zero_division=0)
    return np.mean(losses), report


# Train the Model

In [9]:
optimizer = AdamW(model.parameters(), lr=Config.LEARNING_RATE)
total_steps = len(train_data_loader) * Config.EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

history = {'train_acc': [], 'train_loss': [], 'val_acc': [], 'val_loss': [], 'val_f1': []}
best_accuracy = 0

for epoch in range(Config.EPOCHS):
    print(f'\nEpoch {epoch + 1}/{Config.EPOCHS}')
    print('-' * 20)

    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler)
    print(f'Train -> Loss: {train_loss:.4f} | Accuracy: {train_acc:.4f}')

    val_loss, val_report = eval_model(model, val_data_loader, loss_fn, device)
    val_acc = val_report['accuracy']
    val_f1 = val_report['Hate']['f1-score']
    print(f'Validation -> Loss: {val_loss:.4f} | Accuracy: {val_acc:.4f} | F1 (Hate): {val_f1:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    history['val_f1'].append(val_f1)

    if val_acc > best_accuracy:
        print("Validation accuracy improved. Saving model...")
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc


Epoch 1/4
--------------------
Starting training epoch...
Training epoch finished.
Train -> Loss: 0.4989 | Accuracy: 0.7464
Starting evaluation...
Evaluation finished.
Validation -> Loss: 0.3083 | Accuracy: 0.8676 | F1 (Hate): 0.8767
Validation accuracy improved. Saving model...

Epoch 2/4
--------------------
Starting training epoch...
Training epoch finished.
Train -> Loss: 0.2023 | Accuracy: 0.9420
Starting evaluation...
Evaluation finished.
Validation -> Loss: 0.3452 | Accuracy: 0.8676 | F1 (Hate): 0.8889

Epoch 3/4
--------------------
Starting training epoch...
Training epoch finished.
Train -> Loss: 0.0814 | Accuracy: 0.9758
Starting evaluation...
Evaluation finished.
Validation -> Loss: 0.2816 | Accuracy: 0.8971 | F1 (Hate): 0.9067
Validation accuracy improved. Saving model...

Epoch 4/4
--------------------
Starting training epoch...
Training epoch finished.
Train -> Loss: 0.0684 | Accuracy: 0.9807
Starting evaluation...
Evaluation finished.
Validation -> Loss: 0.3259 | Accur

In [10]:
model.load_state_dict(torch.load('best_model_state.bin'))

# Generate the final classification report string
all_labels_test = []
all_preds_test = []
with torch.no_grad():
    for d in test_data_loader:
        outputs = model(input_ids=d["input_ids"].to(device), attention_mask=d["attention_mask"].to(device))
        _, preds = torch.max(outputs, dim=1)
        all_labels_test.extend(d["labels"].cpu().numpy())
        all_preds_test.extend(preds.cpu().numpy())

final_report_str = classification_report(all_labels_test, all_preds_test, target_names=['Non-Hate', 'Hate'])
print("\nFinal Test Set Performance Report:")
print(final_report_str)


Final Test Set Performance Report:
              precision    recall  f1-score   support

    Non-Hate       0.99      0.97      0.98        96
        Hate       0.97      0.99      0.98       111

    accuracy                           0.98       207
   macro avg       0.98      0.98      0.98       207
weighted avg       0.98      0.98      0.98       207



# Inference

In [13]:
# Load the best model checkpoint
inference_model = BertClassifier(n_classes=2)
inference_model.load_state_dict(torch.load('best_model_state.bin'))
inference_model = inference_model.to(device)
inference_model.eval()

def predict_hate_speech(text, model, tokenizer, device, max_len=128):

    cleaned_text = preprocess_text(text)

    encoding = tokenizer.encode_plus(
        cleaned_text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, prediction = torch.max(outputs, dim=1)

    class_names = ['Non-Hate Speech', 'Hate Speech']
    return class_names[prediction.item()]

# Start inference loop
print("\nModel is ready. Type a sentence to classify it, or type 'quit' to exit.")

while True:
    user_input = input("Enter a sentence (or type 'quit' to exit): ")
    if user_input.lower().strip() == 'quit':
        print("Exiting inference mode.")
        break

    prediction = predict_hate_speech(user_input, inference_model, tokenizer, device, Config.MAX_LENGTH)
    print(f"\nInput Text: {user_input}")
    print(f"Prediction: {prediction}")
    print("-" * 40)


Model is ready. Type a sentence to classify it, or type 'quit' to exit.
Enter a sentence (or type 'quit' to exit): may crush ako sa kabilang section

Input Text: may crush ako sa kabilang section
Prediction: Non-Hate Speech
----------------------------------------
Enter a sentence (or type 'quit' to exit): ang ganda mo beh san ka nagpagupit?

Input Text: ang ganda mo beh san ka nagpagupit?
Prediction: Non-Hate Speech
----------------------------------------
Enter a sentence (or type 'quit' to exit): hehehe gusto mo ba makipag date bukas?

Input Text: hehehe gusto mo ba makipag date bukas?
Prediction: Non-Hate Speech
----------------------------------------
Enter a sentence (or type 'quit' to exit): TANGINANG BUHAY TO bakit ba ang daming bakla sa mundo

Input Text: TANGINANG BUHAY TO bakit ba ang daming bakla sa mundo
Prediction: Hate Speech
----------------------------------------
Enter a sentence (or type 'quit' to exit): ang bantot mo naman kaya walang nagkakagusto sayo dahil ang pa