In [2]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModel
from torch.optim import AdamW  # <-- use this instead
from collections import defaultdict
from tqdm import tqdm as tq
from sklearn.model_selection import train_test_split


MODEL_NAME ="csebuetnlp/banglabert"

# You can keep these as they are or tune them
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 2e-5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


In [4]:
train_file = 'blp25_hatespeech_subtask_1A_train.tsv'
validation_file = 'blp25_hatespeech_subtask_1A_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1A_test.tsv'

#tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Load train/val/test DataFrames
train_df = pd.read_csv(train_file, sep="\t")
dev_df = pd.read_csv(validation_file , sep="\t")
test_df = pd.read_csv(test_file, sep="\t")






In [5]:
l2id = {
    'None': 0,
    'Religious Hate': 1,
    'Sexism': 2,
    'Political Hate': 3,
    'Profane': 4,
    'Abusive': 5
}
id2l = {v: k for k, v in l2id.items()}


def clean_label(x):
    # handle missing or NaN ‚Üí "None"
    if pd.isna(x) or x == 'None':
        return 'None'
    # already list-like e.g. ['Abusive']
    if isinstance(x, list):
        return x[0] if len(x) > 0 else 'None'
    # string cases like "[]" or "[Abusive]" or "[Political Hate]"
    x = x.strip("[]").strip()
    if x == "":
        return 'None'
    return x


def process_df(df):
    # Ensure labels are proper lists
    df["label"] = df["label"].apply(clean_label)
    df["label"] = df["label"].fillna("None")
    # Now create binary label
    df["toxic"] = df["label"].apply(lambda x: 0 if x == "None" else 1)
    df["label_id"] = df["label"].map(l2id)

    

  

    return df

train_df = process_df(train_df)
dev_df  = process_df(dev_df)

train_df

Unnamed: 0,id,text,label,toxic,label_id
0,147963,‡¶ß‡¶®‡ßç‡¶Ø‡¶¨‡¶æ‡¶¶ ‡¶¨‡¶∞‡ßç‡¶°‡¶æ‡¶∞ ‡¶ó‡¶æ‡¶∞‡ßç‡¶° ‡¶¶‡ßá‡¶∞‡¶ï‡ßá ‡¶è‡¶≠‡¶æ‡¶¨‡ßá ‡¶™‡¶æ‡¶π‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡¶§‡ßá ‡¶π...,,0,0
1,214275,‡¶õ‡ßã‡¶ü‡¶¨‡ßá‡¶≤‡¶æ‡¶Ø‡¶º ‡¶Ö‡¶®‡ßá‡¶ï ‡¶ï‡¶∑‡ßç‡¶ü ‡¶ï‡¶∞‡ßá ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ó‡¶æ‡¶≤‡¶æ‡¶ó‡¶æ‡¶≤‡¶ø ‡¶∂‡¶ø‡¶ñ‡¶õ‡¶ø‡¶≤‡¶æ‡¶Æ...,,0,0
2,849172,‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá,Abusive,1,5
3,821985,‡¶ö‡¶ø‡¶® ‡¶≠‡¶æ‡¶∞‡¶§ ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ ‡¶è‡¶á ‡¶§‡¶ø‡¶® ‡¶¶‡ßá‡¶∂ ‡¶è‡¶ï ‡¶•‡¶æ‡¶ï‡¶≤‡ßá ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶ï‡ßá ‡¶∂‡¶æ...,,0,0
4,477288,‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...,Abusive,1,5
...,...,...,...,...,...
35517,790325,‡¶§‡¶á‡¶ì‡ßü‡¶æ‡¶®‡ßá‡¶∞ ‡¶è‡¶§ ‡¶ï‡ßç‡¶∑‡¶Æ‡¶§‡¶æ ‡¶π‡ßü‡¶®‡¶ø ‡¶Ø‡ßá ‡¶è‡¶ï ‡¶ü‡ßÅ‡¶ï‡¶∞‡ßã ‡¶ú‡¶æ‡ßü‡¶ó‡¶æ ‡¶®‡¶∑‡ßç‡¶ü...,,0,0
35518,328377,‡¶ö‡ßÅ‡¶∞‡ßá‡¶∞ ‡¶ò‡¶∞‡ßá‡¶∞ ‡¶ö‡ßÅ‡¶∞ ‡¶π‡¶æ‡¶≤‡¶æ,Profane,1,4
35519,69803,‡¶ú‡¶æ‡¶π‡¶æ‡¶ô‡ßç‡¶ó‡ßÄ‡¶∞ ‡¶¨‡ßÅ‡¶¶‡ßç‡¶ß‡¶ø ‡¶®‡ßá‡¶á ‡¶Æ‡¶æ‡¶†‡ßá ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶Ø‡¶æ‡¶¨‡ßá,Abusive,1,5
35520,419984,‡¶è‡¶ï‡¶ü‡¶æ ‡¶´‡ßá‡¶á‡¶≤‡ßç‡¶° ‡¶è‡¶∏‡ßç‡¶ü‡ßá‡¶ü ‡¶è‡¶ì ‡¶∏‡ßÅ‡¶∑‡ßç‡¶†‡ßÅ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶π‡ßü ‡¶®‡ßá‡¶§‡¶æ‡¶∞...,Abusive,1,5


In [6]:
df_train = train_df[train_df['toxic'] == 1].copy()

df_val = dev_df[dev_df['toxic'] == 1].copy()

df_val

Unnamed: 0,id,text,label,toxic,label_id
0,166449,‡¶á‡¶®‡ßç‡¶°‡¶ø‡ßü‡¶æ ‡¶ï‡¶ø ‡¶Æ‡¶æ‡¶õ ‡¶ß‡¶∞‡¶æ ‡¶¨‡¶®‡ßç‡¶ß ‡¶∞‡¶æ‡¶ñ‡¶õ‡ßá‡¶è‡¶ï ‡¶®‡¶¶‡ßÄ‡¶§‡ßá ‡¶¶‡ßÅ‡¶á‡¶®‡ßÄ‡¶§‡¶ø ...,Political Hate,1,3
1,267692,‡¶≤‡¶ï‡ßç‡¶∑ ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶ò‡ßÅ‡¶∑ ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá ‡¶Ö‡¶Ø‡ßã‡¶ó‡ßç‡¶Ø ‡¶Ü‡¶∞ ‡¶¶‡¶æ‡¶Ø‡¶º‡¶ø‡¶§‡ßç‡¶¨‡¶π‡ßÄ‡¶® ‡¶Æ‡¶æ‡¶®‡¶∏...,Abusive,1,5
3,939131,‡¶Ü‡¶∞ ‡¶ï‡¶§‡ßã ‡¶∂‡¶ø‡¶ñ‡¶¨‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá‡¶∞‡¶æ ‡¶è‡¶ó‡ßÅ‡¶≤‡ßã ‡¶ï‡ßá ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶¶...,Abusive,1,5
4,210284,‡¶ï‡¶ø ‡¶∏‡¶æ‡¶Ç‡¶ò‡¶æ‡¶§‡¶ø‡¶ï ‡¶≠‡¶æ‡¶á ‡¶∞‡ßá ‡¶§‡ßÅ‡¶á,Abusive,1,5
5,712332,‡¶≤‡¶û‡ßç‡¶ö ‡¶Æ‡¶æ‡¶≤‡¶ø‡¶ï‡¶¶‡ßá‡¶∞ ‡¶Ö‡¶≠‡¶ø‡¶∂‡¶™‡ßç‡¶§ ‡¶ö‡¶ï‡ßç‡¶∑‡ßÅ ‡¶™‡¶¶‡ßç‡¶Æ‡¶æ ‡¶∏‡ßá‡¶§‡ßÅ‡¶∞ ‡¶â‡¶™‡¶∞,Abusive,1,5
...,...,...,...,...,...
2496,653048,‡¶ï‡¶ø‡¶∞‡ßá ‡¶Æ‡¶æ‡¶®‡¶ø‡¶ï ‡¶ö‡ßã‡¶∞‡¶æ ‡¶§‡ßÅ‡¶á‡¶ì ‡¶Ü‡¶õ‡ßã‡¶∏,Abusive,1,5
2503,121961,‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∏‡¶¨‡¶á ‡¶§‡ßã ‡¶ö‡ßÅ‡¶∞‡¶ø ‡¶π‡ßü‡ßá ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßá ‡¶Ü‡¶∞ ‡¶ö‡ßã‡¶∞‡¶¶‡ßá‡¶∞ ‡¶ß‡¶∞‡¶æ‡¶∞ ‡¶ï‡ßã...,Abusive,1,5
2504,555021,‡¶´‡¶ï‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶¶‡ßá‡¶∂ ‡¶ï‡¶æ‡¶∞‡ßá‡¶®‡ßç‡¶ü ‡¶®‡¶æ‡¶á,Abusive,1,5
2505,858412,‡¶ï‡¶æ‡¶ï‡ßÅ ‡¶∏‡¶æ‡¶Æ‡¶®‡ßá ‡¶ö‡¶∂‡¶Æ‡¶æ ‡¶•‡¶æ‡¶ï‡¶¨‡ßá ‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ ‡¶ó‡ßç‡¶≤‡¶æ‡¶∏ ‡¶•‡¶æ‡¶ï‡¶¨‡ßá‡¶®‡¶æ‡¶™‡¶æ‡¶õ‡¶æ...,Profane,1,4


In [7]:
toxic_df = df_train[df_train['toxic'] == 1].copy()
target_list = sorted(toxic_df['label'].unique().tolist()) # Sort for consistent column order
print(f"Target Categories: {target_list}")

Target Categories: ['Abusive', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism']


In [8]:
df_train = pd.get_dummies(df_train, columns=['label'], prefix='', prefix_sep='')[['text'] + target_list]

df_val = pd.get_dummies(df_val, columns=['label'], prefix='', prefix_sep='')[['text'] + target_list]

df_val

Unnamed: 0,text,Abusive,Political Hate,Profane,Religious Hate,Sexism
0,‡¶á‡¶®‡ßç‡¶°‡¶ø‡ßü‡¶æ ‡¶ï‡¶ø ‡¶Æ‡¶æ‡¶õ ‡¶ß‡¶∞‡¶æ ‡¶¨‡¶®‡ßç‡¶ß ‡¶∞‡¶æ‡¶ñ‡¶õ‡ßá‡¶è‡¶ï ‡¶®‡¶¶‡ßÄ‡¶§‡ßá ‡¶¶‡ßÅ‡¶á‡¶®‡ßÄ‡¶§‡¶ø ...,False,True,False,False,False
1,‡¶≤‡¶ï‡ßç‡¶∑ ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶ò‡ßÅ‡¶∑ ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá ‡¶Ö‡¶Ø‡ßã‡¶ó‡ßç‡¶Ø ‡¶Ü‡¶∞ ‡¶¶‡¶æ‡¶Ø‡¶º‡¶ø‡¶§‡ßç‡¶¨‡¶π‡ßÄ‡¶® ‡¶Æ‡¶æ‡¶®‡¶∏...,True,False,False,False,False
3,‡¶Ü‡¶∞ ‡¶ï‡¶§‡ßã ‡¶∂‡¶ø‡¶ñ‡¶¨‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá‡¶∞‡¶æ ‡¶è‡¶ó‡ßÅ‡¶≤‡ßã ‡¶ï‡ßá ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶¶...,True,False,False,False,False
4,‡¶ï‡¶ø ‡¶∏‡¶æ‡¶Ç‡¶ò‡¶æ‡¶§‡¶ø‡¶ï ‡¶≠‡¶æ‡¶á ‡¶∞‡ßá ‡¶§‡ßÅ‡¶á,True,False,False,False,False
5,‡¶≤‡¶û‡ßç‡¶ö ‡¶Æ‡¶æ‡¶≤‡¶ø‡¶ï‡¶¶‡ßá‡¶∞ ‡¶Ö‡¶≠‡¶ø‡¶∂‡¶™‡ßç‡¶§ ‡¶ö‡¶ï‡ßç‡¶∑‡ßÅ ‡¶™‡¶¶‡ßç‡¶Æ‡¶æ ‡¶∏‡ßá‡¶§‡ßÅ‡¶∞ ‡¶â‡¶™‡¶∞,True,False,False,False,False
...,...,...,...,...,...,...
2496,‡¶ï‡¶ø‡¶∞‡ßá ‡¶Æ‡¶æ‡¶®‡¶ø‡¶ï ‡¶ö‡ßã‡¶∞‡¶æ ‡¶§‡ßÅ‡¶á‡¶ì ‡¶Ü‡¶õ‡ßã‡¶∏,True,False,False,False,False
2503,‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∏‡¶¨‡¶á ‡¶§‡ßã ‡¶ö‡ßÅ‡¶∞‡¶ø ‡¶π‡ßü‡ßá ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßá ‡¶Ü‡¶∞ ‡¶ö‡ßã‡¶∞‡¶¶‡ßá‡¶∞ ‡¶ß‡¶∞‡¶æ‡¶∞ ‡¶ï‡ßã...,True,False,False,False,False
2504,‡¶´‡¶ï‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶¶‡ßá‡¶∂ ‡¶ï‡¶æ‡¶∞‡ßá‡¶®‡ßç‡¶ü ‡¶®‡¶æ‡¶á,True,False,False,False,False
2505,‡¶ï‡¶æ‡¶ï‡ßÅ ‡¶∏‡¶æ‡¶Æ‡¶®‡ßá ‡¶ö‡¶∂‡¶Æ‡¶æ ‡¶•‡¶æ‡¶ï‡¶¨‡ßá ‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ ‡¶ó‡ßç‡¶≤‡¶æ‡¶∏ ‡¶•‡¶æ‡¶ï‡¶¨‡ßá‡¶®‡¶æ‡¶™‡¶æ‡¶õ‡¶æ...,False,False,True,False,False


In [9]:

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len, target_list):
        self.tokenizer = tokenizer
        self.df = df
        # Corrected column name from 'title' or 'Text' to 'text'
        self.texts = list(df['text']) 
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index]),
            'text': text
        }

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN, target_list)
val_dataset = CustomDataset(df_val, tokenizer, MAX_LEN, target_list)

train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False)


In [10]:
# class BERTClass(torch.nn.Module):
#     def __init__(self, model_name, target_list):
#         super(BERTClass, self).__init__()
#         self.bert_model = AutoModel.from_pretrained(model_name, return_dict=True)
#         self.dropout = torch.nn.Dropout(0.3)
#         self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, len(target_list))
    
#     def forward(self, input_ids, attn_mask, token_type_ids):
#         output = self.bert_model(
#             input_ids, 
#             attention_mask=attn_mask, 
#             token_type_ids=token_type_ids
#         )
        
#         # --- FIX IS HERE ---
#         # Instead of pooler_output, we take the last hidden state of the [CLS] token
#         # output.last_hidden_state has shape (batch_size, sequence_length, hidden_size)
#         # We select the [CLS] token by indexing with [:, 0, :]
#         cls_output = output.last_hidden_state[:, 0, :]
        
#         output_dropout = self.dropout(cls_output)
#         final_output = self.linear(output_dropout)
        
#         return final_output

# # Instantiate the model
# model = BERTClass(MODEL_NAME, target_list)
# model.to(device)

# # ==================================
# # 4. Loss Function and Optimizer
# # ==================================
# # For multi-label classification, BCEWithLogitsLoss is the correct choice.
# # It combines a Sigmoid layer and the BCELoss in one single class.
# def loss_fn(outputs, targets):
#     return nn.BCEWithLogitsLoss()(outputs, targets)

# optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# # ==================================
# # 5. Training and Evaluation Functions
# # ==================================
# # Your train_model and eval_model functions are well-written and can be used as they are.
# # I've just adjusted the tqdm progress bar description for more clarity.

# def train_model(training_loader, model, optimizer):
#     model.train()
#     losses = []
#     correct_predictions = 0
#     num_samples = 0
    
#     loop = tq(training_loader, leave=True)
#     for batch_idx, data in enumerate(loop):
#         ids = data['input_ids'].to(device, dtype=torch.long)
#         mask = data['attention_mask'].to(device, dtype=torch.long)
#         token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
#         targets = data['targets'].to(device, dtype=torch.float)

#         outputs = model(ids, mask, token_type_ids)
#         loss = loss_fn(outputs, targets)
#         losses.append(loss.item())
        
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         # Accuracy calculation
#         outputs_sigmoid = torch.sigmoid(outputs).cpu().detach().numpy().round()
#         targets_np = targets.cpu().detach().numpy()
#         correct_predictions += np.sum(outputs_sigmoid == targets_np)
#         num_samples += targets_np.size
        
#         loop.set_description(f"Train - Loss: {loss.item():.4f}")

#     return model, float(correct_predictions) / num_samples, np.mean(losses)


# def eval_model(validation_loader, model):
#     model.eval()
#     losses = []
#     correct_predictions = 0
#     num_samples = 0
    
#     with torch.no_grad():
#         for data in validation_loader:
#             ids = data['input_ids'].to(device, dtype=torch.long)
#             mask = data['attention_mask'].to(device, dtype=torch.long)
#             token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
#             targets = data['targets'].to(device, dtype=torch.float)
            
#             outputs = model(ids, mask, token_type_ids)
#             loss = loss_fn(outputs, targets)
#             losses.append(loss.item())

#             outputs_sigmoid = torch.sigmoid(outputs).cpu().detach().numpy().round()
#             targets_np = targets.cpu().detach().numpy()
#             correct_predictions += np.sum(outputs_sigmoid == targets_np)
#             num_samples += targets_np.size

#     return float(correct_predictions) / num_samples, np.mean(losses)


# # ==================================
# # 6. Training Loop
# # ==================================
# history = defaultdict(list)
# best_accuracy = 0

# for epoch in range(1, EPOCHS + 1):
#     print(f'\n--- Epoch {epoch}/{EPOCHS} ---')
    
#     model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
#     val_acc, val_loss = eval_model(val_data_loader, model)

#     print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
#     print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

#     history['train_acc'].append(train_acc)
#     history['train_loss'].append(train_loss)
#     history['val_acc'].append(val_acc)
#     history['val_loss'].append(val_loss)

#     if val_acc > best_accuracy:
#         torch.save(model.state_dict(), "best_model_state.bin")
#         best_accuracy = val_acc


--- Epoch 1/10 ---


Train - Loss: 0.2005:  24%|‚ñà‚ñà‚ñç       | 237/973 [00:53<02:47,  4.41it/s]


KeyboardInterrupt: 

In [11]:

MODEL_NAME ="csebuetnlp/banglabert"

# You can keep these as they are or tune them
MAX_LEN = 256
BATCH_SIZE = 16
#VAL_BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 2e-5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')





# ==================================
# 4. Model 1: Binary Toxicity Detector
# ==================================
print("\n--- Training Model 1: Binary Toxicity Detector ---")

class CustomDataset_Binary(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.texts = list(df['text'])
        self.targets = list(df['toxic'])
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_len, padding='max_length',
            return_token_type_ids=True, truncation=True, return_attention_mask=True, return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor([self.targets[index]])
        }

class BERTClass_Binary(nn.Module):
    def __init__(self, model_name):
        super(BERTClass_Binary, self).__init__()
        self.bert_model = AutoModel.from_pretrained(model_name, return_dict=True)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(self.bert_model.config.hidden_size, 1) # Output is 1 for binary

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids)
        cls_output = output.last_hidden_state[:, 0, :]
        output_dropout = self.dropout(cls_output)
        final_output = self.linear(output_dropout)
        return final_output

# Create DataLoaders
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_dataset_1 = CustomDataset_Binary(train_df, tokenizer, MAX_LEN)
dev_dataset_1 = CustomDataset_Binary(dev_df, tokenizer, MAX_LEN)
train_loader_1 = DataLoader(train_dataset_1, batch_size=BATCH_SIZE, shuffle=True)
dev_loader_1 = DataLoader(dev_dataset_1, batch_size=BATCH_SIZE, shuffle=False)

# Instantiate Model, Loss, Optimizer
model_1 = BERTClass_Binary(MODEL_NAME).to(device)
loss_fn_1 = nn.BCEWithLogitsLoss()
optimizer_1 = AdamW(model_1.parameters(), lr=LEARNING_RATE)

# Training Loop (simplified for brevity)
model_1.train()
for epoch in range(EPOCHS):
    for data in tq(train_loader_1, desc=f"Model 1 - Epoch {epoch+1}"):
        ids, mask, token_ids, targets = data['input_ids'].to(device), data['attention_mask'].to(device), data['token_type_ids'].to(device), data['targets'].to(device)
        optimizer_1.zero_grad()
        outputs = model_1(ids, mask, token_ids)
        loss = loss_fn_1(outputs, targets)
        loss.backward()
        optimizer_1.step()
torch.save(model_1.state_dict(), "model_1_binary_state.bin")
print("Model 1 training complete.")





--- Training Model 1: Binary Toxicity Detector ---


Model 1 - Epoch 1:   7%|‚ñã         | 160/2221 [00:34<07:26,  4.61it/s]


KeyboardInterrupt: 

In [50]:

print("\n--- Evaluating Pipeline on Test Set ---")

# Load the trained model weights

model_1.load_state_dict(torch.load("model_1_binary_state.bin"))

model.load_state_dict(torch.load("best_model_state.bin"))

model_1.eval()

model.eval()




--- Evaluating Pipeline on Test Set ---


BERTClass(
  (bert_model): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps

In [52]:
import numpy as np
import torch
from tqdm.notebook import  tqdm

def predict_toxicity_pipeline(text, tokenizer, model_1, model_2, device, max_len, target_list):
    # --- Tokenization (no changes here) ---
    inputs = tokenizer.encode_plus(
        text, add_special_tokens=True, max_length=max_len, padding='max_length',
        return_token_type_ids=True, truncation=True, return_attention_mask=True, return_tensors='pt'
    )
    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)
    
    # Check if 'token_type_ids' exists in the tokenizer output
    token_ids = inputs['token_type_ids'].to(device) if 'token_type_ids' in inputs else None

    with torch.no_grad():
        # Step 1: Check if toxic with Model 1
        # The model forward pass may vary depending on its definition
        output_1 = model_1(ids, mask, token_ids)
        # Assuming the output is logits from a SequenceClassifierOutput object
        prob_1 = torch.sigmoid(output_1).item()

        if prob_1 < 0.5:
            # Not toxic, return a vector of all zeros
            return np.zeros(len(target_list), dtype=int)
        else:
            # Toxic, proceed to Model 2
            output_2 =  model_2(ids, mask, token_ids)
            
            # --- üëá KEY CHANGE IS HERE ---
            # 1. Find the index of the label with the highest score (logit)
            # We use argmax directly on the logits, which is efficient.
            # --- AFTER ---
            pred_index = torch.argmax(output_2, dim=1).item()
            
            # 2. Create a one-hot encoded vector
            # This creates an array of zeros...
            one_hot_prediction = np.zeros(len(target_list), dtype=int)
            # ...and sets the predicted index to 1.
            one_hot_prediction[pred_index] = 1
            
            return one_hot_prediction

# --- Your evaluation loop (assuming target_list is defined) ---
# Example: target_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

all_predictions = []
# Ensure test_df, tokenizer, model_1, etc. are correctly defined and loaded
for text in tq(test_df['text'], desc="Predicting on test data"):
    prediction = predict_toxicity_pipeline(text, tokenizer, model_1, model, device, MAX_LEN, target_list)
    all_predictions.append(prediction)

y_pred = np.array(all_predictions)

# Now y_pred will be a 2D array where each row has at most one '1'. 

Predicting on test data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2512/2512 [00:22<00:00, 111.19it/s]


In [53]:
y_pred

array([[0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [54]:
import numpy as np
import pandas as pd


#'Abusive', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism']

# Your mapping
id2l = {
    0: 'Abusive',
    1: 'Political Hate',
    2: 'Profane',
    3:  'Religious Hate',
    4: 'Sexism'
}

# Example y_pred
# y_pred = np.array([[0,0,0,0,0],[0,0,0,1,0],[1,0,0,0,0]])

def decode_labels(row):
    indices = np.where(row == 1)[0]
    if len(indices) == 0:
        return "None"
    # If multiple labels, join them with comma
    return ", ".join([id2l[i] for i in indices])

# Convert predictions into a DataFrame column
df = pd.DataFrame()
df["Predicted_Label"] = [decode_labels(row) for row in y_pred]

print(df.head())


  Predicted_Label
0            None
1         Profane
2            None
3            None
4            None


In [55]:
test_df['label']=df['Predicted_Label']
test_df['model']='bangla-bert'
test_df

Unnamed: 0,id,text,label,model
0,879187,‡¶∂‡ßÅ‡¶≠ ‡¶ï‡¶æ‡¶Æ‡¶®‡¶æ ‡¶∞‡¶á‡¶≤ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶á‡¶®‡¶∂‡¶æ‡¶Ü‡¶≤‡ßç‡¶≤‡¶æ‡¶π ‡¶ú‡¶Ø‡¶º ‡¶π‡¶¨‡ßá,,bangla-bert
1,316919,‡¶ó‡ßã‡ßü‡¶æ ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡ßü‡ßá ‡¶Ü‡¶õ‡ßá ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Æ‡¶æ‡¶¶‡¶æ‡¶∞‡¶ö‡ßã‡¶¶ ‡¶®‡¶ø‡¶â‡¶ú ‡¶ï‡¶∞‡ßá ...,Profane,bangla-bert
2,916242,‡¶≠‡¶æ‡¶á‡¶Ø‡¶º‡¶æ ‡¶Ü‡¶™‡¶®‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶®‡ßá‡¶§‡¶æ ‡¶π‡¶á‡¶Ø‡¶º‡ßá‡¶® ‡¶®‡¶æ ‡¶®‡¶æ ‡¶π‡¶≤‡ßá ‡¶∏‡¶¨‡¶æ‡¶á ‡¶¨‡¶æ‡¶ö‡ßç...,,bangla-bert
3,786824,‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞‡ßã ‡¶§‡¶æ‡¶á ‡¶¶‡ßá‡¶ñ‡¶õ‡¶ø,,bangla-bert
4,47284,‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶ï‡¶§‡¶ü‡¶æ ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶®‡¶ø‡ßü‡ßá,,bangla-bert
...,...,...,...,...
2507,776466,‡¶∏‡¶§‡ßç‡¶Ø ‡¶ï‡¶•‡¶æ ‡¶§‡ßá‡¶§‡ßÅ ‡¶≤‡¶æ‡¶ó‡ßá,,bangla-bert
2508,849227,‡¶è‡¶á ‡¶´‡¶ï‡¶ø‡¶®‡¶®‡¶ø ‡¶Æ‡¶æ‡¶ó‡ßÄ‡¶ü‡¶æ ‡¶Ü‡¶∞ ‡¶ï‡¶§ ‡¶®‡¶æ‡¶ü‡¶ï ‡¶¶‡ßá‡¶ñ‡¶æ‡¶¨‡ßá,Profane,bangla-bert
2509,532697,‡¶¶‡ßá‡¶ñ‡ßã ‡¶Ü‡¶ú‡¶ï‡ßá ‡¶ï‡¶æ‡¶∞ ‡¶´‡¶ø‡¶ü‡¶®‡ßá‡¶∏ ‡¶ï‡ßã‡¶•‡¶æ‡¶Ø‡¶º ‡¶¶‡¶æ‡¶Å‡¶°‡¶º‡¶ø‡¶Ø‡¶º‡ßá‡¶õ‡ßá ‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ö...,Profane,bangla-bert
2510,861411,‡¶õ‡ßã‡¶ü ‡¶≠‡¶æ‡¶á‡¶ü‡¶ø‡¶∞ ‡¶™‡¶æ‡¶∏‡ßá ‡¶•‡¶æ‡¶ï‡ßÅ‡¶® ‡¶ó‡ßá‡¶Æ ‡¶≠‡¶ø‡ßú‡¶ø‡¶ì ‡¶¨‡¶æ‡¶®‡¶æ‡¶á,,bangla-bert


In [56]:
test_df = test_df[['id', 'label', 'model']]
test_df

Unnamed: 0,id,label,model
0,879187,,bangla-bert
1,316919,Profane,bangla-bert
2,916242,,bangla-bert
3,786824,,bangla-bert
4,47284,,bangla-bert
...,...,...,...
2507,776466,,bangla-bert
2508,849227,Profane,bangla-bert
2509,532697,Profane,bangla-bert
2510,861411,,bangla-bert


In [57]:
test_df.to_csv("final_banth_v7.tsv", sep="\t", index=False)
print("Saved to final_ensemble.tsv")

Saved to final_ensemble.tsv
