In [1]:
!pip install transformers

Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Downloading huggingface_hub-0.35.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 1.0.0rc2
    Uninstalling huggingface-hub-1.0.0rc2:
      Successfully uninstalled huggingface-hub-1.0.0rc2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 4.1.1 requires pyarrow>=21.0.0, but you have pyarrow 19.0.1 which is incompatible.
gradio 5.38.1 requires pydantic<2.12,>=2.0, but you have pydantic 2.12.0a1 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface-hub-0.35.3

In [2]:
# Show the installed transformers package metadata (version, location, dependencies).
!pip show transformers

Name: transformers
Version: 4.53.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: kaggle-environments, peft, sentence-transformers


In [3]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, XLMRobertaTokenizer, XLMRobertaModel, ElectraTokenizer, ElectraModel
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [4]:
# Read the train / dev / test splits; the files are tab-separated, which is
# read_table's default delimiter.
df_train = pd.read_table('/kaggle/input/blp-2025-task-1/data/subtask_1C/blp25_hatespeech_subtask_1C_train.tsv')
df_valid = pd.read_table('/kaggle/input/blp-2025-task-1/data/subtask_1C/blp25_hatespeech_subtask_1C_dev.tsv')
df_test = pd.read_table('/kaggle/input/blp-2025-task-1/data/subtask_1C/blp25_hatespeech_subtask_1C_test.tsv')

In [5]:
# Preview the first rows of the training split (raw columns, before label encoding).
df_train.head(15)

Unnamed: 0,id,text,hate_type,hate_severity,to_whom
0,147963,ধন্যবাদ বর্ডার গার্ড দেরকে এভাবে পাহারা দিতে হ...,,Little to None,
1,214275,ছোটবেলায় অনেক কষ্ট করে কিছু গালাগালি শিখছিলাম...,,Little to None,
2,849172,অতিরিক্ত এ নিজেকে বাদুর বানাইয়া ফেলছেন রে,Abusive,Little to None,Individual
3,821985,চিন ভারত রাশিয়া এই তিন দেশ এক থাকলে বিশ্বকে শা...,,Little to None,
4,477288,এটার বিচার কে করবেযে বিচার করবে সেই তো হলো এই ...,Abusive,Severe,Individual
5,933728,তুরা কিসের জন্য দুভাই যাবি অন্য দেশে কেন জাস না,,Little to None,
6,398351,দেশ বিভাগের সময়ে পশ্চিম পাকিস্তানে ২৫ শতাংশ স...,,Little to None,
7,786609,ইরান পারমাণবিক বোমা বানাবে বানাবে বলতে বলতে বি...,Abusive,Little to None,Society
8,917115,আজকে এই উৎসব কেনো,,Little to None,
9,415453,ইমরান ছাড়া পাকিস্তান কখনোই ঘুরে দাড়াতে পারবে ন...,,Little to None,


In [6]:
# Summary statistics of the numeric columns of the training split.
df_train.describe()

Unnamed: 0,id
count,35522.0
mean,470131.934435
std,271256.805054
min,96.0
25%,235179.0
50%,470483.0
75%,705906.25
max,939762.0


In [7]:
# Column dtypes and non-null counts of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35522 entries, 0 to 35521
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             35522 non-null  int64 
 1   text           35522 non-null  object
 2   hate_type      15568 non-null  object
 3   hate_severity  35522 non-null  object
 4   to_whom        14332 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.4+ MB


In [8]:
# DataFrame.info() prints its report itself and returns None, so passing it to
# print() emits the report first and then a stray "None" between the two tables.
# Call it on its own and print the previews separately, in reading order.
df_valid.info()
print(df_valid.head(10))
print(df_valid.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2512 entries, 0 to 2511
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             2512 non-null   int64 
 1   text           2512 non-null   object
 2   hate_type      1061 non-null   object
 3   hate_severity  2512 non-null   object
 4   to_whom        976 non-null    object
dtypes: int64(1), object(4)
memory usage: 98.3+ KB
       id                                               text       hate_type  \
0  166449  ইন্ডিয়া কি মাছ ধরা বন্ধ রাখছেএক নদীতে দুইনীতি ...  Political Hate   
1  267692  লক্ষ টাকা ঘুষ দিয়ে অযোগ্য আর দায়িত্বহীন মানস...         Abusive   
2  184031                                    ওহা ভবনের দালাল             NaN   
3  939131  আর কতো শিখবে আমার সোনার ছেলেরা এগুলো কে টাকা দ...         Abusive   
4  210284                             কি সাংঘাতিক ভাই রে তুই         Abusive   
5  712332        লঞ্চ মালিকদের অভিশপ্ত চক্ষু পদ্মা সেতুর উপর  

In [10]:
# Preprocess: missing annotations become the explicit class 'None' in every
# label column of the labelled splits (train and dev).
for df in (df_train, df_valid):
    for label_column in ('hate_type', 'hate_severity', 'to_whom'):
        df[label_column] = df[label_column].fillna('None')

# Fixed label vocabularies for the three tasks (hard-coded, not derived from the data).
hate_type_unique = [
    'None',
    'Abusive',
    'Profane',
    'Religious Hate',
    'Political Hate',
    'Sexism',
]
hate_severity_unique = [
    'Little to None',
    'Severe',
    'Mild',
    'None',
]
to_whom_unique = [
    'None',
    'Individual',
    'Society',
    'Community',
    'Organization',
]

In [11]:
# One LabelEncoder per task, fitted on the fixed vocabularies.
# Note: LabelEncoder assigns ids in sorted class order, not in list order.
le_hate_type = LabelEncoder()
le_hate_type.fit(hate_type_unique)

le_hate_severity = LabelEncoder()
le_hate_severity.fit(hate_severity_unique)

le_to_whom = LabelEncoder()
le_to_whom.fit(to_whom_unique)

# Output width of each classification head.
num_hate_types, num_severities, num_to_whoms = (
    len(vocabulary)
    for vocabulary in (hate_type_unique, hate_severity_unique, to_whom_unique)
)

In [12]:
# Prints only the estimators' reprs; the fitted classes live in each encoder's `.classes_`.
print(le_hate_type, le_hate_severity, le_to_whom)

LabelEncoder() LabelEncoder() LabelEncoder()


In [13]:
# Number of classes per task, in order: hate type, severity, target ("to whom").
print(num_hate_types, num_severities, num_to_whoms)

6 4 5


In [14]:
# Add an integer `<column>_label` column for every task to the train and dev frames.
label_encoders = {
    'hate_type': le_hate_type,
    'hate_severity': le_hate_severity,
    'to_whom': le_to_whom,
}
for frame in (df_train, df_valid):
    for column, encoder in label_encoders.items():
        frame[f'{column}_label'] = encoder.transform(frame[column])

In [15]:
# Preview the training split again, now including the integer `*_label` columns.
df_train.head(15)

Unnamed: 0,id,text,hate_type,hate_severity,to_whom,hate_type_label,hate_severity_label,to_whom_label
0,147963,ধন্যবাদ বর্ডার গার্ড দেরকে এভাবে পাহারা দিতে হ...,,Little to None,,1,0,2
1,214275,ছোটবেলায় অনেক কষ্ট করে কিছু গালাগালি শিখছিলাম...,,Little to None,,1,0,2
2,849172,অতিরিক্ত এ নিজেকে বাদুর বানাইয়া ফেলছেন রে,Abusive,Little to None,Individual,0,0,1
3,821985,চিন ভারত রাশিয়া এই তিন দেশ এক থাকলে বিশ্বকে শা...,,Little to None,,1,0,2
4,477288,এটার বিচার কে করবেযে বিচার করবে সেই তো হলো এই ...,Abusive,Severe,Individual,0,3,1
5,933728,তুরা কিসের জন্য দুভাই যাবি অন্য দেশে কেন জাস না,,Little to None,,1,0,2
6,398351,দেশ বিভাগের সময়ে পশ্চিম পাকিস্তানে ২৫ শতাংশ স...,,Little to None,,1,0,2
7,786609,ইরান পারমাণবিক বোমা বানাবে বানাবে বলতে বলতে বি...,Abusive,Little to None,Society,0,0,4
8,917115,আজকে এই উৎসব কেনো,,Little to None,,1,0,2
9,415453,ইমরান ছাড়া পাকিস্তান কখনোই ঘুরে দাড়াতে পারবে ন...,,Little to None,,1,0,2


In [16]:
# Dataset Class
class MultiTaskDataset(Dataset):
    """Tokenise texts on the fly and attach whichever task labels were supplied.

    Args:
        texts: Indexable sequence of raw strings.
        hate_types: Optional sequence of integer hate-type labels aligned with ``texts``.
        severities: Optional sequence of integer severity labels aligned with ``texts``.
        to_whoms: Optional sequence of integer target labels aligned with ``texts``.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every sample is padded or truncated to.

    Each item is a dict with ``'input_ids'`` and ``'attention_mask'`` (1-D
    tensors) plus one ``torch.long`` scalar per label sequence that is not
    ``None`` (keys ``'hate_type'``, ``'hate_severity'``, ``'to_whom'``).
    """

    # (output key, attribute holding that task's label sequence)
    _LABEL_FIELDS = (
        ('hate_type', 'hate_types'),
        ('hate_severity', 'severities'),
        ('to_whom', 'to_whoms'),
    )

    def __init__(self, texts, hate_types=None, severities=None, to_whoms=None, tokenizer=None, max_len=128):
        self.texts = texts
        self.hate_types = hate_types
        self.severities = severities
        self.to_whoms = to_whoms
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated `encode_plus`; the arguments are unchanged.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis of size 1; drop it so
        # the DataLoader can stack samples itself.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        # Each label set is checked on its own, so a partially labelled dataset
        # no longer fails by indexing into None.
        for key, attribute in self._LABEL_FIELDS:
            labels = getattr(self, attribute)
            if labels is not None:
                item[key] = torch.tensor(labels[idx], dtype=torch.long)

        return item

In [17]:
# Multi-task model base
class MultiTaskModel(nn.Module):
    """Shared encoder with three linear classification heads.

    Args:
        base_model: Encoder module exposing ``config.hidden_size`` and returning
            an object with ``last_hidden_state`` (and optionally ``pooler_output``)
            when called with ``input_ids`` and ``attention_mask``.
        num_hate_types: Output width of the hate-type head.
        num_severities: Output width of the severity head.
        num_to_whoms: Output width of the target ("to whom") head.
        dropout: Dropout probability applied to the sentence representation
            before the heads.
    """

    def __init__(self, base_model, num_hate_types, num_severities, num_to_whoms, dropout=0.1):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout)
        hidden_size = self.base_model.config.hidden_size
        self.hate_type_head = nn.Linear(hidden_size, num_hate_types)
        self.severity_head = nn.Linear(hidden_size, num_severities)
        self.to_whom_head = nn.Linear(hidden_size, num_to_whoms)

    def forward(self, input_ids, attention_mask):
        """Return ``(hate_type_logits, severity_logits, to_whom_logits)`` for a batch."""
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Some encoders expose no `pooler_output` at all, others expose the
        # attribute but leave it as None (pooling layer disabled). In both cases
        # fall back to the hidden state of the first token.
        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            pooled_output = outputs.last_hidden_state[:, 0]

        dropout_output = self.dropout(pooled_output)
        hate_type_logits = self.hate_type_head(dropout_output)
        severity_logits = self.severity_head(dropout_output)
        to_whom_logits = self.to_whom_head(dropout_output)
        return hate_type_logits, severity_logits, to_whom_logits

In [18]:
# Training function
def train_model(model, train_loader, valid_loader, optimizer, scheduler, device, epochs=2):
    """Fine-tune a multi-task model and report validation accuracy after every epoch.

    The model is wrapped in ``nn.DataParallel`` for the duration of training.
    The loss is the unweighted sum of the three heads' cross-entropy losses,
    and the scheduler is stepped once per batch.

    Args:
        model: Module returning three logit tensors for ``(input_ids, attention_mask)``.
        train_loader: Iterable of batches with ``'input_ids'``, ``'attention_mask'``,
            ``'hate_type'``, ``'hate_severity'`` and ``'to_whom'`` entries.
        valid_loader: Iterable of batches with the same entries, used for evaluation.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler, stepped after each optimizer step.
        device: Device the batches and the model are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The underlying module, without the ``DataParallel`` wrapper.
    """
    label_keys = ('hate_type', 'hate_severity', 'to_whom')
    criterion = nn.CrossEntropyLoss()
    parallel_model = nn.DataParallel(model).to(device)

    for epoch_idx in range(epochs):
        # ---- training pass ----
        parallel_model.train()
        running_loss = 0
        for batch in tqdm(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = [batch[key].to(device) for key in label_keys]

            optimizer.zero_grad()
            head_logits = parallel_model(input_ids, attention_mask)

            per_task = [
                criterion(logits, target)
                for logits, target in zip(head_logits, targets)
            ]
            loss = per_task[0] + per_task[1] + per_task[2]
            running_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_loss = running_loss / len(train_loader)
        print(f'Epoch {epoch_idx+1}/{epochs}, Train Loss: {avg_loss:.4f}')

        # ---- validation pass ----
        parallel_model.eval()
        predictions = {key: [] for key in label_keys}
        references = {key: [] for key in label_keys}
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                targets = [batch[key].to(device) for key in label_keys]

                head_logits = parallel_model(input_ids, attention_mask)

                for key, logits, target in zip(label_keys, head_logits, targets):
                    predictions[key].extend(torch.argmax(logits, dim=1).cpu().numpy())
                    references[key].extend(target.cpu().numpy())

        hate_type_acc, severity_acc, to_whom_acc = (
            accuracy_score(references[key], predictions[key]) for key in label_keys
        )
        print(f'Validation - Hate Type Acc: {hate_type_acc:.4f}, Severity Acc: {severity_acc:.4f}, To Whom Acc: {to_whom_acc:.4f}')

    # Hand back the plain module so callers are not tied to DataParallel.
    return parallel_model.module

In [19]:
# Adjusted prediction function to handle different tokenizers
def predict_ensemble_separate(models, test_datasets, device):
    for model in models:
        model = nn.DataParallel(model).to(device)
        model.eval()

    # Assume test_datasets = [test_dataset_bangla, test_dataset_xlmr, ...]
    test_loaders = [DataLoader(ds, batch_size=32) for ds in test_datasets]

    num_samples = len(test_datasets[0])
    hate_type_logits_sum = torch.zeros((num_samples, num_hate_types), device=device)
    severity_logits_sum = torch.zeros((num_samples, num_severities), device=device)
    to_whom_logits_sum = torch.zeros((num_samples, num_to_whoms), device=device)

    with torch.no_grad():
        for loader, model in zip(test_loaders, models):
            local_idx = 0
            for batch in loader:
                batch_size = batch['input_ids'].size(0)
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                hate_type_logits, severity_logits, to_whom_logits = model(input_ids, attention_mask)

                hate_type_logits_sum[local_idx:local_idx+batch_size] += hate_type_logits
                severity_logits_sum[local_idx:local_idx+batch_size] += severity_logits
                to_whom_logits_sum[local_idx:local_idx+batch_size] += to_whom_logits

                local_idx += batch_size

    # Average
    num_models = len(models)
    hate_type_avg_logits = hate_type_logits_sum / num_models
    severity_avg_logits = severity_logits_sum / num_models
    to_whom_avg_logits = to_whom_logits_sum / num_models

    hate_type_probs = torch.softmax(hate_type_avg_logits, dim=1).cpu().numpy()
    severity_probs = torch.softmax(severity_avg_logits, dim=1).cpu().numpy()
    to_whom_probs = torch.softmax(to_whom_avg_logits, dim=1).cpu().numpy()

    hate_type_pred = np.argmax(hate_type_probs, axis=1)
    severity_pred = np.argmax(severity_probs, axis=1)
    to_whom_pred = np.argmax(to_whom_probs, axis=1)

    hate_type_labels = le_hate_type.inverse_transform(hate_type_pred)
    severity_labels = le_hate_severity.inverse_transform(severity_pred)
    to_whom_labels = le_to_whom.inverse_transform(to_whom_pred)

    return hate_type_labels, severity_labels, to_whom_labels

In [20]:
# Main: run on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [8]:
# Bangla-BERT (sagorsarker/bangla-bert-base)
# Ensemble member 1: build tokenizer + encoder, wrap the three splits, fine-tune.

# One constant drives both the scheduler length and the training loop, so the
# learning-rate decay always ends exactly when training does.
NUM_EPOCHS = 2

bangla_tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base')
bangla_base = BertModel.from_pretrained('sagorsarker/bangla-bert-base')
model_bangla = MultiTaskModel(bangla_base, num_hate_types, num_severities, num_to_whoms)

train_dataset_bangla = MultiTaskDataset(
    df_train['text'].tolist(),
    df_train['hate_type_label'].tolist(),
    df_train['hate_severity_label'].tolist(),
    df_train['to_whom_label'].tolist(),
    bangla_tokenizer,
)
valid_dataset_bangla = MultiTaskDataset(
    df_valid['text'].tolist(),
    df_valid['hate_type_label'].tolist(),
    df_valid['hate_severity_label'].tolist(),
    df_valid['to_whom_label'].tolist(),
    bangla_tokenizer,
)
# The test split has no labels, so only texts and tokenizer are passed.
test_dataset_bangla = MultiTaskDataset(df_test['text'].tolist(), tokenizer=bangla_tokenizer)

train_loader_bangla = DataLoader(train_dataset_bangla, batch_size=32, shuffle=True)
valid_loader_bangla = DataLoader(valid_dataset_bangla, batch_size=32)

optimizer_bangla = AdamW(model_bangla.parameters(), lr=2.5e-5)
# train_model steps the scheduler once per batch, so it must span every batch of every epoch.
total_steps = len(train_loader_bangla) * NUM_EPOCHS
scheduler_bangla = get_linear_schedule_with_warmup(optimizer_bangla, num_warmup_steps=0, num_training_steps=total_steps)

model_bangla = train_model(
    model_bangla,
    train_loader_bangla,
    valid_loader_bangla,
    optimizer_bangla,
    scheduler_bangla,
    device,
    epochs=NUM_EPOCHS,
)

In [38]:
# XLMR (xlm-roberta-base)
# Ensemble member 2: build tokenizer + encoder, wrap the three splits, fine-tune.

# One constant drives both the scheduler length and the training loop, so the
# learning-rate decay always ends exactly when training does.
NUM_EPOCHS = 2

xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmr_base = XLMRobertaModel.from_pretrained('xlm-roberta-base')
model_xlmr = MultiTaskModel(xlmr_base, num_hate_types, num_severities, num_to_whoms)

train_dataset_xlmr = MultiTaskDataset(
    df_train['text'].tolist(),
    df_train['hate_type_label'].tolist(),
    df_train['hate_severity_label'].tolist(),
    df_train['to_whom_label'].tolist(),
    xlmr_tokenizer,
)
valid_dataset_xlmr = MultiTaskDataset(
    df_valid['text'].tolist(),
    df_valid['hate_type_label'].tolist(),
    df_valid['hate_severity_label'].tolist(),
    df_valid['to_whom_label'].tolist(),
    xlmr_tokenizer,
)
# The test split has no labels, so only texts and tokenizer are passed.
test_dataset_xlmr = MultiTaskDataset(df_test['text'].tolist(), tokenizer=xlmr_tokenizer)

train_loader_xlmr = DataLoader(train_dataset_xlmr, batch_size=32, shuffle=True)
valid_loader_xlmr = DataLoader(valid_dataset_xlmr, batch_size=32)

optimizer_xlmr = AdamW(model_xlmr.parameters(), lr=2e-5)
# train_model steps the scheduler once per batch, so it must span every batch of every epoch.
total_steps = len(train_loader_xlmr) * NUM_EPOCHS
scheduler_xlmr = get_linear_schedule_with_warmup(optimizer_xlmr, num_warmup_steps=0, num_training_steps=total_steps)

model_xlmr = train_model(
    model_xlmr,
    train_loader_xlmr,
    valid_loader_xlmr,
    optimizer_xlmr,
    scheduler_xlmr,
    device,
    epochs=NUM_EPOCHS,
)

In [40]:
# BUET BanglaBERT (csebuetnlp/banglabert)
# Ensemble member 3: build tokenizer + encoder, wrap the three splits, fine-tune.

# One constant drives both the scheduler length and the training loop, so the
# learning-rate decay always ends exactly when training does.
NUM_EPOCHS = 2

buet_tokenizer = ElectraTokenizer.from_pretrained('csebuetnlp/banglabert')
buet_base = ElectraModel.from_pretrained('csebuetnlp/banglabert')
model_buet = MultiTaskModel(buet_base, num_hate_types, num_severities, num_to_whoms)

train_dataset_buet = MultiTaskDataset(
    df_train['text'].tolist(),
    df_train['hate_type_label'].tolist(),
    df_train['hate_severity_label'].tolist(),
    df_train['to_whom_label'].tolist(),
    buet_tokenizer,
)
valid_dataset_buet = MultiTaskDataset(
    df_valid['text'].tolist(),
    df_valid['hate_type_label'].tolist(),
    df_valid['hate_severity_label'].tolist(),
    df_valid['to_whom_label'].tolist(),
    buet_tokenizer,
)
# The test split has no labels, so only texts and tokenizer are passed.
test_dataset_buet = MultiTaskDataset(df_test['text'].tolist(), tokenizer=buet_tokenizer)

train_loader_buet = DataLoader(train_dataset_buet, batch_size=32, shuffle=True)
valid_loader_buet = DataLoader(valid_dataset_buet, batch_size=32)

optimizer_buet = AdamW(model_buet.parameters(), lr=2e-5)
# train_model steps the scheduler once per batch, so it must span every batch of every epoch.
total_steps = len(train_loader_buet) * NUM_EPOCHS
scheduler_buet = get_linear_schedule_with_warmup(optimizer_buet, num_warmup_steps=0, num_training_steps=total_steps)

model_buet = train_model(
    model_buet,
    train_loader_buet,
    valid_loader_buet,
    optimizer_buet,
    scheduler_buet,
    device,
    epochs=NUM_EPOCHS,
)

In [None]:
# Pair every fine-tuned model with the test dataset built by its own tokenizer;
# the two lists must stay in the same order.
models = [model_bangla, model_xlmr, model_buet]
test_datasets = [test_dataset_bangla, test_dataset_xlmr, test_dataset_buet]

hate_type_pred, severity_pred, to_whom_pred = predict_ensemble_separate(models, test_datasets, device)

# Attach the predicted labels (and the system name) to the test frame.
prediction_columns = {
    'hate_type': hate_type_pred,
    'hate_severity': severity_pred,
    'to_whom': to_whom_pred,
    'model': 'ensemble',
}
for column_name, column_values in prediction_columns.items():
    df_test[column_name] = column_values

# Write the submission file: tab-separated, without the index column.
submission_columns = ['id', 'hate_type', 'hate_severity', 'to_whom', 'model']
df_test[submission_columns].to_csv('subtask_1C.tsv', sep='\t', index=False)