In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1A_dev_test.tsv
/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1A_dev.tsv
/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1C_dev_test.tsv
/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1C_dev.tsv
/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1A_train.tsv
/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1C_train.tsv


In [None]:
# =========================
# 0. Reproducibility
# =========================  
import os, random
import numpy as np
import pandas as pd
import torch
from torch import nn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)
print("‚úÖ Seeds set")

# =========================
# 1. Paths & Config
# =========================
train_path = "/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1C_train.tsv"
dev_path   = "/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1C_dev.tsv"
test_path  = "/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1C_dev_test.tsv"

MODEL_NAME = "csebuetnlp/banglabert"
MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 5
LR = 2e-5
MODEL_TAG = "BanglaBERT_CSEBUET"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("üñ•Ô∏è Device:", device)

# =========================
# 2. Load datasets
# =========================
train_df = pd.read_csv(train_path, sep="\t")
dev_df   = pd.read_csv(dev_path,   sep="\t")
test_df  = pd.read_csv(test_path,  sep="\t")

print("üìÇ Train shape:", train_df.shape)
print("üìÇ Dev shape  :", dev_df.shape)
print("üìÇ Test shape :", test_df.shape)
print("üîπ Sample train row:\n", train_df.head(1))

# =========================
# 3. Encode labels
# =========================
le_type = LabelEncoder()
le_severity = LabelEncoder()
le_whom = LabelEncoder()

train_df["hate_type"]      = le_type.fit_transform(train_df["hate_type"])
train_df["hate_severity"]  = le_severity.fit_transform(train_df["hate_severity"])
train_df["to_whom"]        = le_whom.fit_transform(train_df["to_whom"])

dev_df["hate_type"]        = le_type.transform(dev_df["hate_type"])
dev_df["hate_severity"]    = le_severity.transform(dev_df["hate_severity"])
dev_df["to_whom"]          = le_whom.transform(dev_df["to_whom"])

print("‚úÖ Labels encoded")
print("  hate_type classes    :", list(le_type.classes_))
print("  hate_severity classes:", list(le_severity.classes_))
print("  to_whom classes      :", list(le_whom.classes_))

# =========================
# 4. Tokenizer
# =========================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("‚úÖ Tokenizer loaded:", MODEL_NAME)

# =========================
# 5. Dataset
# =========================
class HateSpeechDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=64, is_test=False):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = str(row["text"])

        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
        }

        if self.is_test:
            item["id"] = int(row["id"])
        else:
            item["hate_type"]     = torch.tensor(int(row["hate_type"]), dtype=torch.long)
            item["hate_severity"] = torch.tensor(int(row["hate_severity"]), dtype=torch.long)
            item["to_whom"]       = torch.tensor(int(row["to_whom"]), dtype=torch.long)

        return item

# =========================
# 6. Datasets & Loaders
# =========================
train_dataset = HateSpeechDataset(train_df, tokenizer, max_len=MAX_LEN, is_test=False)
dev_dataset   = HateSpeechDataset(dev_df,   tokenizer, max_len=MAX_LEN, is_test=False)
test_dataset  = HateSpeechDataset(test_df,  tokenizer, max_len=MAX_LEN, is_test=True)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader   = DataLoader(dev_dataset,   batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False)

print("‚úÖ DataLoaders ready")

# =========================
# 7. Model
# =========================
class MultiTaskBERT(nn.Module):
    def __init__(self, model_name, num_type, num_severity, num_whom, dropout=0.45):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        hidden = self.encoder.config.hidden_size

        self.type_head     = nn.Linear(hidden, num_type)
        self.severity_head = nn.Linear(hidden, num_severity)
        self.whom_head     = nn.Linear(hidden, num_whom)

    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)

        if hasattr(out, "pooler_output") and out.pooler_output is not None:
            pooled = out.pooler_output
        else:
            pooled = out.last_hidden_state[:, 0, :]

        pooled = self.dropout(pooled)

        return (
            self.type_head(pooled),
            self.severity_head(pooled),
            self.whom_head(pooled),
        )

model = MultiTaskBERT(
    MODEL_NAME,
    num_type=len(le_type.classes_),
    num_severity=len(le_severity.classes_),
    num_whom=len(le_whom.classes_),
).to(device)

optimizer = AdamW(model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss()
print("‚úÖ Model & Optimizer ready")

# =========================
# 8. Training
# =========================
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for step, batch in enumerate(train_loader, start=1):
        optimizer.zero_grad(set_to_none=True)

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        y_type = batch["hate_type"].to(device)
        y_sev  = batch["hate_severity"].to(device)
        y_whom = batch["to_whom"].to(device)

        logits_type, logits_sev, logits_whom = model(input_ids, attention_mask)

        loss = (loss_fn(logits_type, y_type) +
                loss_fn(logits_sev,  y_sev)  +
                loss_fn(logits_whom, y_whom)) / 3.0

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if step % 100 == 0:
            print(f"Epoch {epoch} | Step {step} | Batch Loss: {loss.item():.4f}")

    avg_loss = total_loss / max(1, len(train_loader))
    print(f"üìâ Epoch {epoch}/{EPOCHS} | Avg Loss: {avg_loss:.4f}")

# =========================
# 9. Evaluation on Dev
# =========================
model.eval()
y_true_type, y_pred_type = [], []
y_true_sev,  y_pred_sev  = [], []
y_true_whom, y_pred_whom = [], []

with torch.no_grad():
    for batch in dev_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits_type, logits_sev, logits_whom = model(input_ids, attention_mask)

        y_true_type.extend(batch["hate_type"].cpu().numpy())
        y_true_sev.extend(batch["hate_severity"].cpu().numpy())
        y_true_whom.extend(batch["to_whom"].cpu().numpy())

        y_pred_type.extend(logits_type.argmax(dim=1).cpu().numpy())
        y_pred_sev.extend(logits_sev.argmax(dim=1).cpu().numpy())
        y_pred_whom.extend(logits_whom.argmax(dim=1).cpu().numpy())

f1_type = f1_score(y_true_type, y_pred_type, average="micro")
f1_severity = f1_score(y_true_sev,  y_pred_sev,  average="micro")
f1_whom = f1_score(y_true_whom, y_pred_whom, average="micro")
final_f1 = (f1_type + f1_severity + f1_whom) / 3.0
print(f"üìä Dev Avg Micro-F1: {final_f1:.4f} "
      f"(type={f1_type:.4f}, severity={f1_severity:.4f}, whom={f1_whom:.4f})")

# =========================
# 10. Predict for Submission
# =========================
preds_type, preds_sev, preds_whom, ids = [], [], [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits_type, logits_sev, logits_whom = model(input_ids, attention_mask)

        preds_type.extend(logits_type.argmax(dim=1).cpu().numpy().tolist())
        preds_sev.extend(logits_sev.argmax(dim=1).cpu().numpy().tolist())
        preds_whom.extend(logits_whom.argmax(dim=1).cpu().numpy().tolist())

        if isinstance(batch["id"], torch.Tensor):
            ids.extend(batch["id"].cpu().numpy().astype(int).tolist())
        else:
            ids.extend([int(x) for x in batch["id"]])

# Decode back to original labels
preds_type = le_type.inverse_transform(np.array(preds_type))
preds_sev  = le_severity.inverse_transform(np.array(preds_sev))
preds_whom = le_whom.inverse_transform(np.array(preds_whom))

# Create submission DataFrame
submission = pd.DataFrame({
    "id": ids,
    "hate_type": preds_type,
    "hate_severity": preds_sev,
    "to_whom": preds_whom,
    "model": [MODEL_TAG] * len(ids)
})

# ‚úÖ Fix invalid values (Codabench checker strict)
submission["hate_type"] = submission["hate_type"].fillna("None")
submission["hate_severity"] = submission["hate_severity"].fillna("Little to None")
submission["to_whom"] = submission["to_whom"].fillna("None")

# Strip accidental spaces
for col in ["hate_type", "hate_severity", "to_whom"]:
    submission[col] = submission[col].astype(str).str.strip()

# Save with header (checker skips header itself)
submission.to_csv("submission.tsv", sep="\t", index=False)
print("‚úÖ Submission file saved as submission.tsv")
print(submission.head(20))


‚úÖ Seeds set
üñ•Ô∏è Device: cuda
üìÇ Train shape: (35522, 5)
üìÇ Dev shape  : (2512, 5)
üìÇ Test shape : (2512, 2)
üîπ Sample train row:
        id                                               text hate_type  \
0  147963  ‡¶ß‡¶®‡ßç‡¶Ø‡¶¨‡¶æ‡¶¶ ‡¶¨‡¶∞‡ßç‡¶°‡¶æ‡¶∞ ‡¶ó‡¶æ‡¶∞‡ßç‡¶° ‡¶¶‡ßá‡¶∞‡¶ï‡ßá ‡¶è‡¶≠‡¶æ‡¶¨‡ßá ‡¶™‡¶æ‡¶π‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡¶§‡ßá ‡¶π...       NaN   

    hate_severity to_whom  
0  Little to None     NaN  
‚úÖ Labels encoded
  hate_type classes    : ['Abusive', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism', nan]
  hate_severity classes: ['Little to None', 'Mild', 'Severe']
  to_whom classes      : ['Community', 'Individual', 'Organization', 'Society', nan]
‚úÖ Tokenizer loaded: csebuetnlp/banglabert
‚úÖ DataLoaders ready


2025-08-20 07:04:06.159950: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755673446.181719      88 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755673446.188341      88 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

‚úÖ Model & Optimizer ready
Epoch 1 | Step 100 | Batch Loss: 0.9671
Epoch 1 | Step 200 | Batch Loss: 0.8617
Epoch 1 | Step 300 | Batch Loss: 0.4732
Epoch 1 | Step 400 | Batch Loss: 0.9671
Epoch 1 | Step 500 | Batch Loss: 0.6258
Epoch 1 | Step 600 | Batch Loss: 1.1698
Epoch 1 | Step 700 | Batch Loss: 1.0837
Epoch 1 | Step 800 | Batch Loss: 1.2145
