In [31]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModel
from torch.optim import AdamW  # <-- use this instead
from collections import defaultdict
from tqdm import tqdm as tq
from sklearn.model_selection import train_test_split


MODEL_NAME ="csebuetnlp/banglabert"

# You can keep these as they are or tune them
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [32]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


In [33]:
train_file = 'blp25_hatespeech_subtask_1A_train.tsv'
validation_file = 'blp25_hatespeech_subtask_1A_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1A_test.tsv'


validation_file2 = 'blp25_hatespeech_subtask_1A_dev_test_with_labels.tsv'

test_file = 'blp25_hatespeech_subtask_1A_test_with_labels.tsv'

#tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Load train/val/test DataFrames
#tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Load train/val/test DataFrames
#tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Load train/val/test DataFrames
train_df = pd.read_csv(train_file, sep="\t")
dev_df1 = pd.read_csv(validation_file , sep="\t")
dev_df2 = pd.read_csv(validation_file2 , sep="\t")
test_df = pd.read_csv(test_file, sep="\t")



# Concatenate them into one dataframe
train_df = pd.concat([train_df, dev_df1, dev_df2], ignore_index=True)
dev_df =test_df





In [34]:
l2id = {
    'None': 0,
    'Religious Hate': 1,
    'Sexism': 2,
    'Political Hate': 3,
    'Profane': 4,
    'Abusive': 5
}
id2l = {v: k for k, v in l2id.items()}


def clean_label(x):
    # handle missing or NaN ‚Üí "None"
    if pd.isna(x) or x == 'None':
        return 'None'
    # already list-like e.g. ['Abusive']
    if isinstance(x, list):
        return x[0] if len(x) > 0 else 'None'
    # string cases like "[]" or "[Abusive]" or "[Political Hate]"
    x = x.strip("[]").strip()
    if x == "":
        return 'None'
    return x


def process_df(df):
    # Ensure labels are proper lists
    df["label"] = df["label"].apply(clean_label)
    df["label"] = df["label"].fillna("None")
    # Now create binary label
    df["toxic"] = df["label"].apply(lambda x: 0 if x == "None" else 1)
    df["label_id"] = df["label"].map(l2id)

    

  

    return df

train_df = process_df(train_df)
dev_df  = process_df(dev_df)

train_df

Unnamed: 0,id,text,label,toxic,label_id
0,147963,‡¶ß‡¶®‡ßç‡¶Ø‡¶¨‡¶æ‡¶¶ ‡¶¨‡¶∞‡ßç‡¶°‡¶æ‡¶∞ ‡¶ó‡¶æ‡¶∞‡ßç‡¶° ‡¶¶‡ßá‡¶∞‡¶ï‡ßá ‡¶è‡¶≠‡¶æ‡¶¨‡ßá ‡¶™‡¶æ‡¶π‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡¶§‡ßá ‡¶π...,,0,0
1,214275,‡¶õ‡ßã‡¶ü‡¶¨‡ßá‡¶≤‡¶æ‡¶Ø‡¶º ‡¶Ö‡¶®‡ßá‡¶ï ‡¶ï‡¶∑‡ßç‡¶ü ‡¶ï‡¶∞‡ßá ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ó‡¶æ‡¶≤‡¶æ‡¶ó‡¶æ‡¶≤‡¶ø ‡¶∂‡¶ø‡¶ñ‡¶õ‡¶ø‡¶≤‡¶æ‡¶Æ...,,0,0
2,849172,‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá,Abusive,1,5
3,821985,‡¶ö‡¶ø‡¶® ‡¶≠‡¶æ‡¶∞‡¶§ ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ ‡¶è‡¶á ‡¶§‡¶ø‡¶® ‡¶¶‡ßá‡¶∂ ‡¶è‡¶ï ‡¶•‡¶æ‡¶ï‡¶≤‡ßá ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶ï‡ßá ‡¶∂‡¶æ...,,0,0
4,477288,‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...,Abusive,1,5
...,...,...,...,...,...
40541,776466,‡¶∏‡¶§‡ßç‡¶Ø ‡¶ï‡¶•‡¶æ ‡¶§‡ßá‡¶§‡ßÅ ‡¶≤‡¶æ‡¶ó‡ßá,,0,0
40542,849227,‡¶è‡¶á ‡¶´‡¶ï‡¶ø‡¶®‡¶®‡¶ø ‡¶Æ‡¶æ‡¶ó‡ßÄ‡¶ü‡¶æ ‡¶Ü‡¶∞ ‡¶ï‡¶§ ‡¶®‡¶æ‡¶ü‡¶ï ‡¶¶‡ßá‡¶ñ‡¶æ‡¶¨‡ßá,Abusive,1,5
40543,532697,‡¶¶‡ßá‡¶ñ‡ßã ‡¶Ü‡¶ú‡¶ï‡ßá ‡¶ï‡¶æ‡¶∞ ‡¶´‡¶ø‡¶ü‡¶®‡ßá‡¶∏ ‡¶ï‡ßã‡¶•‡¶æ‡¶Ø‡¶º ‡¶¶‡¶æ‡¶Å‡¶°‡¶º‡¶ø‡¶Ø‡¶º‡ßá‡¶õ‡ßá ‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ö...,Profane,1,4
40544,861411,‡¶õ‡ßã‡¶ü ‡¶≠‡¶æ‡¶á‡¶ü‡¶ø‡¶∞ ‡¶™‡¶æ‡¶∏‡ßá ‡¶•‡¶æ‡¶ï‡ßÅ‡¶® ‡¶ó‡ßá‡¶Æ ‡¶≠‡¶ø‡ßú‡¶ø‡¶ì ‡¶¨‡¶æ‡¶®‡¶æ‡¶á,,0,0


In [35]:
df_train = train_df

df_val = dev_df

df_val

Unnamed: 0,id,text,label,toxic,label_id
0,12764,‡¶á‡¶ú‡¶∞‡¶æ‡¶Ø‡¶º‡ßá‡¶≤‡ßá‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶π‡¶ì‡¶Ø‡¶º‡¶æ ‡¶â‡¶ö‡¶ø‡ßé,Abusive,1,5
1,202933,‡¶∂‡¶æ‡¶Æ‡ßÄ‡¶Æ ‡¶ì‡¶∏‡¶æ‡¶Æ‡¶æ ‡¶¨‡¶ø‡¶® ‡¶π‡¶æ‡¶∏‡¶ø‡¶®‡¶æ,,0,0
2,165894,‡¶π‡ßá‡¶® ‡¶ï‡¶æ‡¶™ ‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶∞‡ßá ‡¶Ö‡¶®‡ßç‡¶Ø‡¶∞‡¶æ ‡¶§‡¶æ‡¶π‡¶≤‡ßá ‡¶™‡¶≤‡¶ø‡¶∂‡ßá‡¶∞ ‡¶ï‡¶ø ‡¶π‡¶¨...,,0,0
3,124999,‡¶Ü‡¶≤‡ßç‡¶≤‡¶æ‡¶π‡ßç ‡¶è‡¶∏‡¶¨ ‡¶ú‡¶æ‡¶®‡ßã‡¶Ø‡¶º‡¶æ‡¶∞‡¶¶‡ßá‡¶∞ ‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞‡ßá ‡¶¶‡¶æ‡¶ì,Profane,1,4
4,535301,‡¶á‡¶π‡ßÅ‡¶¶‡¶ø‡¶∞ ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶á‡¶π‡ßÅ‡¶¶‡ßÄ ‡¶á ‡¶π‡¶¨‡ßá,Religious Hate,1,1
...,...,...,...,...,...
10195,908819,‡¶∂‡¶æ‡¶≤‡¶æ ‡¶ú‡¶ô‡ßç‡¶ó‡¶ø ‡¶®‡¶ø‡¶ö‡ßÅ ‡¶ú‡¶æ‡¶§ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∂‡¶æ‡¶≤‡¶æ ‡¶®‡¶ø‡¶ú‡ßá‡¶∞ ‡¶¶‡ßá‡¶∂ ...,Profane,1,4
10196,597085,‡¶è‡¶∞‡ßá ‡¶π‡ßá‡¶§‡¶ø ‡¶π‡¶æ‡¶ó‡¶≤ ‡¶Ö‡¶á ‡¶ó‡ßá‡¶õ‡ßá ‡¶π‡ßá‡¶§‡¶ø‡¶∞‡ßá ‡¶™‡¶æ‡¶¨‡¶®‡¶æ ‡¶®‡ßá,Abusive,1,5
10197,901448,‡¶è‡¶ñ‡¶æ‡¶® ‡¶•‡ßá‡¶ï‡ßá ‡¶™‡ßç‡¶∞‡¶Æ‡¶æ‡¶® ‡¶π‡¶≤‡ßã ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶∞ ‡¶Æ‡ßÄ‡¶∞ ‡¶ú‡¶æ‡¶´‡¶∞ ‡¶ö‡¶æ‡¶á‡¶≤‡ßá‡¶á ‡¶§‡¶æ...,,0,0
10198,617821,‡¶∂‡ßÅ‡¶®‡¶ø ‡¶Ø‡ßá ‡¶è‡¶á ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∏‡¶Ç‡¶¨‡¶ø‡¶ß‡¶æ‡¶® ‡¶ó‡¶£‡¶§‡¶æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡¶ø‡¶ï ‡¶∏‡¶Ç‡¶¨‡¶ø‡¶ß‡¶æ‡¶® ‡¶ï...,,0,0


In [36]:
toxic_df = df_train
target_list = sorted(toxic_df['label'].unique().tolist()) # Sort for consistent column order
print(f"Target Categories: {target_list}")

Target Categories: ['Abusive', 'None', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism']


In [37]:
df_train = pd.get_dummies(df_train, columns=['label'], prefix='', prefix_sep='')[['text'] + target_list]

df_val = pd.get_dummies(df_val, columns=['label'], prefix='', prefix_sep='')[['text'] + target_list]

df_val

Unnamed: 0,text,Abusive,None,Political Hate,Profane,Religious Hate,Sexism
0,‡¶á‡¶ú‡¶∞‡¶æ‡¶Ø‡¶º‡ßá‡¶≤‡ßá‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶π‡¶ì‡¶Ø‡¶º‡¶æ ‡¶â‡¶ö‡¶ø‡ßé,True,False,False,False,False,False
1,‡¶∂‡¶æ‡¶Æ‡ßÄ‡¶Æ ‡¶ì‡¶∏‡¶æ‡¶Æ‡¶æ ‡¶¨‡¶ø‡¶® ‡¶π‡¶æ‡¶∏‡¶ø‡¶®‡¶æ,False,True,False,False,False,False
2,‡¶π‡ßá‡¶® ‡¶ï‡¶æ‡¶™ ‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶∞‡ßá ‡¶Ö‡¶®‡ßç‡¶Ø‡¶∞‡¶æ ‡¶§‡¶æ‡¶π‡¶≤‡ßá ‡¶™‡¶≤‡¶ø‡¶∂‡ßá‡¶∞ ‡¶ï‡¶ø ‡¶π‡¶¨...,False,True,False,False,False,False
3,‡¶Ü‡¶≤‡ßç‡¶≤‡¶æ‡¶π‡ßç ‡¶è‡¶∏‡¶¨ ‡¶ú‡¶æ‡¶®‡ßã‡¶Ø‡¶º‡¶æ‡¶∞‡¶¶‡ßá‡¶∞ ‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞‡ßá ‡¶¶‡¶æ‡¶ì,False,False,False,True,False,False
4,‡¶á‡¶π‡ßÅ‡¶¶‡¶ø‡¶∞ ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶á‡¶π‡ßÅ‡¶¶‡ßÄ ‡¶á ‡¶π‡¶¨‡ßá,False,False,False,False,True,False
...,...,...,...,...,...,...,...
10195,‡¶∂‡¶æ‡¶≤‡¶æ ‡¶ú‡¶ô‡ßç‡¶ó‡¶ø ‡¶®‡¶ø‡¶ö‡ßÅ ‡¶ú‡¶æ‡¶§ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∂‡¶æ‡¶≤‡¶æ ‡¶®‡¶ø‡¶ú‡ßá‡¶∞ ‡¶¶‡ßá‡¶∂ ...,False,False,False,True,False,False
10196,‡¶è‡¶∞‡ßá ‡¶π‡ßá‡¶§‡¶ø ‡¶π‡¶æ‡¶ó‡¶≤ ‡¶Ö‡¶á ‡¶ó‡ßá‡¶õ‡ßá ‡¶π‡ßá‡¶§‡¶ø‡¶∞‡ßá ‡¶™‡¶æ‡¶¨‡¶®‡¶æ ‡¶®‡ßá,True,False,False,False,False,False
10197,‡¶è‡¶ñ‡¶æ‡¶® ‡¶•‡ßá‡¶ï‡ßá ‡¶™‡ßç‡¶∞‡¶Æ‡¶æ‡¶® ‡¶π‡¶≤‡ßã ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶∞ ‡¶Æ‡ßÄ‡¶∞ ‡¶ú‡¶æ‡¶´‡¶∞ ‡¶ö‡¶æ‡¶á‡¶≤‡ßá‡¶á ‡¶§‡¶æ...,False,True,False,False,False,False
10198,‡¶∂‡ßÅ‡¶®‡¶ø ‡¶Ø‡ßá ‡¶è‡¶á ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∏‡¶Ç‡¶¨‡¶ø‡¶ß‡¶æ‡¶® ‡¶ó‡¶£‡¶§‡¶æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡¶ø‡¶ï ‡¶∏‡¶Ç‡¶¨‡¶ø‡¶ß‡¶æ‡¶® ‡¶ï...,False,True,False,False,False,False


In [38]:

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len, target_list):
        self.tokenizer = tokenizer
        self.df = df
        # Corrected column name from 'title' or 'Text' to 'text'
        self.texts = list(df['text']) 
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index]),
            'text': text
        }

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN, target_list)
val_dataset = CustomDataset(df_val, tokenizer, MAX_LEN, target_list)

train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False)


In [39]:
class BERTClass(torch.nn.Module):
    def __init__(self, model_name, target_list):
        super(BERTClass, self).__init__()
        self.bert_model = AutoModel.from_pretrained(model_name, return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, len(target_list))
    
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        
        # --- FIX IS HERE ---
        # Instead of pooler_output, we take the last hidden state of the [CLS] token
        # output.last_hidden_state has shape (batch_size, sequence_length, hidden_size)
        # We select the [CLS] token by indexing with [:, 0, :]
        cls_output = output.last_hidden_state[:, 0, :]
        
        output_dropout = self.dropout(cls_output)
        final_output = self.linear(output_dropout)
        
        return final_output

# Instantiate the model
model = BERTClass(MODEL_NAME, target_list)
model.to(device)

# ==================================
# 4. Loss Function and Optimizer
# ==================================
# For multi-label classification, BCEWithLogitsLoss is the correct choice.
# It combines a Sigmoid layer and the BCELoss in one single class.
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# ==================================
# 5. Training and Evaluation Functions
# ==================================
# Your train_model and eval_model functions are well-written and can be used as they are.
# I've just adjusted the tqdm progress bar description for more clarity.

def train_model(training_loader, model, optimizer):
    model.train()
    losses = []
    correct_predictions = 0
    num_samples = 0
    
    loop = tq(training_loader, leave=True)
    for batch_idx, data in enumerate(loop):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accuracy calculation
        outputs_sigmoid = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets_np = targets.cpu().detach().numpy()
        correct_predictions += np.sum(outputs_sigmoid == targets_np)
        num_samples += targets_np.size
        
        loop.set_description(f"Train - Loss: {loss.item():.4f}")

    return model, float(correct_predictions) / num_samples, np.mean(losses)


def eval_model(validation_loader, model):
    model.eval()
    losses = []
    correct_predictions = 0
    num_samples = 0
    
    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            outputs_sigmoid = torch.sigmoid(outputs).cpu().detach().numpy().round()
            targets_np = targets.cpu().detach().numpy()
            correct_predictions += np.sum(outputs_sigmoid == targets_np)
            num_samples += targets_np.size

    return float(correct_predictions) / num_samples, np.mean(losses)


# ==================================
# 6. Training Loop
# ==================================
history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS + 1):
    print(f'\n--- Epoch {epoch}/{EPOCHS} ---')
    
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model)

    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), "all_label_best_model_state.bin")
        best_accuracy = val_acc


--- Epoch 1/3 ---


Train - Loss: 0.2473: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2535/2535 [09:36<00:00,  4.40it/s]


Train Loss: 0.2380, Train Acc: 0.8987
Val Loss: 0.2090, Val Acc: 0.9078

--- Epoch 2/3 ---


Train - Loss: 0.0587: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2535/2535 [09:36<00:00,  4.40it/s]


Train Loss: 0.1907, Train Acc: 0.9196
Val Loss: 0.2111, Val Acc: 0.9080

--- Epoch 3/3 ---


Train - Loss: 0.2893: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2535/2535 [09:34<00:00,  4.41it/s]


Train Loss: 0.1550, Train Acc: 0.9372
Val Loss: 0.2326, Val Acc: 0.9063


In [42]:

print("\n--- Evaluating Pipeline on Test Set ---")

model.load_state_dict(torch.load("all_label_best_model_state.bin"))



model.eval()




--- Evaluating Pipeline on Test Set ---


BERTClass(
  (bert_model): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps

In [43]:
import numpy as np
import torch
from tqdm.notebook import  tqdm

def predict_toxicity_pipeline(text, tokenizer, model_2, device, max_len, target_list):
    # --- Tokenization (no changes here) ---
    inputs = tokenizer.encode_plus(
        text, add_special_tokens=True, max_length=max_len, padding='max_length',
        return_token_type_ids=True, truncation=True, return_attention_mask=True, return_tensors='pt'
    )
    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)
    
    # Check if 'token_type_ids' exists in the tokenizer output
    token_ids = inputs['token_type_ids'].to(device) if 'token_type_ids' in inputs else None

    with torch.no_grad():
       
        output_2 =  model_2(ids, mask, token_ids)
            
            # --- üëá KEY CHANGE IS HERE ---
            # 1. Find the index of the label with the highest score (logit)
            # We use argmax directly on the logits, which is efficient.
            # --- AFTER ---
        pred_index = torch.argmax(output_2, dim=1).item()
            
            # 2. Create a one-hot encoded vector
            # This creates an array of zeros...
        one_hot_prediction = np.zeros(len(target_list), dtype=int)
            # ...and sets the predicted index to 1.
        one_hot_prediction[pred_index] = 1
            
        return one_hot_prediction

# --- Your evaluation loop (assuming target_list is defined) ---
# Example: target_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

all_predictions = []
# Ensure test_df, tokenizer, model_1, etc. are correctly defined and loaded
for text in tq(test_df['text'], desc="Predicting on test data"):
    prediction = predict_toxicity_pipeline(text, tokenizer, model, device, MAX_LEN, target_list)
    all_predictions.append(prediction)

y_pred = np.array(all_predictions)

# Now y_pred will be a 2D array where each row has at most one '1'. 

Predicting on test data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10200/10200 [01:56<00:00, 87.37it/s]


In [44]:
y_pred

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0]])

In [45]:
import numpy as np
import pandas as pd

#'Abusive', 'None', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism'
#'Abusive', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism']

# Your mapping
id2l = {
    0: 'Abusive',
    1: 'None',
    2: 'Political Hate',
    3: 'Profane',
    4:  'Religious Hate',
    5: 'Sexism'
}

# Example y_pred
# y_pred = np.array([[0,0,0,0,0],[0,0,0,1,0],[1,0,0,0,0]])

def decode_labels(row):
    indices = np.where(row == 1)[0]
    if len(indices) == 0:
        return "None"
    # If multiple labels, join them with comma
    return ", ".join([id2l[i] for i in indices])

# Convert predictions into a DataFrame column
df = pd.DataFrame()
df["Predicted_Label"] = [decode_labels(row) for row in y_pred]

print(df.head())


  Predicted_Label
0         Abusive
1            None
2            None
3         Profane
4  Religious Hate


In [46]:
test_df['label']=df['Predicted_Label']
test_df['model']='bangla-bert'
test_df

Unnamed: 0,id,text,label,toxic,label_id,model
0,12764,‡¶á‡¶ú‡¶∞‡¶æ‡¶Ø‡¶º‡ßá‡¶≤‡ßá‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶π‡¶ì‡¶Ø‡¶º‡¶æ ‡¶â‡¶ö‡¶ø‡ßé,Abusive,1,5,bangla-bert
1,202933,‡¶∂‡¶æ‡¶Æ‡ßÄ‡¶Æ ‡¶ì‡¶∏‡¶æ‡¶Æ‡¶æ ‡¶¨‡¶ø‡¶® ‡¶π‡¶æ‡¶∏‡¶ø‡¶®‡¶æ,,0,0,bangla-bert
2,165894,‡¶π‡ßá‡¶® ‡¶ï‡¶æ‡¶™ ‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶∞‡ßá ‡¶Ö‡¶®‡ßç‡¶Ø‡¶∞‡¶æ ‡¶§‡¶æ‡¶π‡¶≤‡ßá ‡¶™‡¶≤‡¶ø‡¶∂‡ßá‡¶∞ ‡¶ï‡¶ø ‡¶π‡¶¨...,,0,0,bangla-bert
3,124999,‡¶Ü‡¶≤‡ßç‡¶≤‡¶æ‡¶π‡ßç ‡¶è‡¶∏‡¶¨ ‡¶ú‡¶æ‡¶®‡ßã‡¶Ø‡¶º‡¶æ‡¶∞‡¶¶‡ßá‡¶∞ ‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞‡ßá ‡¶¶‡¶æ‡¶ì,Profane,1,4,bangla-bert
4,535301,‡¶á‡¶π‡ßÅ‡¶¶‡¶ø‡¶∞ ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶á‡¶π‡ßÅ‡¶¶‡ßÄ ‡¶á ‡¶π‡¶¨‡ßá,Religious Hate,1,1,bangla-bert
...,...,...,...,...,...,...
10195,908819,‡¶∂‡¶æ‡¶≤‡¶æ ‡¶ú‡¶ô‡ßç‡¶ó‡¶ø ‡¶®‡¶ø‡¶ö‡ßÅ ‡¶ú‡¶æ‡¶§ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∂‡¶æ‡¶≤‡¶æ ‡¶®‡¶ø‡¶ú‡ßá‡¶∞ ‡¶¶‡ßá‡¶∂ ...,Profane,1,4,bangla-bert
10196,597085,‡¶è‡¶∞‡ßá ‡¶π‡ßá‡¶§‡¶ø ‡¶π‡¶æ‡¶ó‡¶≤ ‡¶Ö‡¶á ‡¶ó‡ßá‡¶õ‡ßá ‡¶π‡ßá‡¶§‡¶ø‡¶∞‡ßá ‡¶™‡¶æ‡¶¨‡¶®‡¶æ ‡¶®‡ßá,Abusive,1,5,bangla-bert
10197,901448,‡¶è‡¶ñ‡¶æ‡¶® ‡¶•‡ßá‡¶ï‡ßá ‡¶™‡ßç‡¶∞‡¶Æ‡¶æ‡¶® ‡¶π‡¶≤‡ßã ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶∞ ‡¶Æ‡ßÄ‡¶∞ ‡¶ú‡¶æ‡¶´‡¶∞ ‡¶ö‡¶æ‡¶á‡¶≤‡ßá‡¶á ‡¶§‡¶æ...,Abusive,0,0,bangla-bert
10198,617821,‡¶∂‡ßÅ‡¶®‡¶ø ‡¶Ø‡ßá ‡¶è‡¶á ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∏‡¶Ç‡¶¨‡¶ø‡¶ß‡¶æ‡¶® ‡¶ó‡¶£‡¶§‡¶æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡¶ø‡¶ï ‡¶∏‡¶Ç‡¶¨‡¶ø‡¶ß‡¶æ‡¶® ‡¶ï...,,0,0,bangla-bert


In [47]:
test_df = test_df[['id', 'label', 'model']]
test_df

Unnamed: 0,id,label,model
0,12764,Abusive,bangla-bert
1,202933,,bangla-bert
2,165894,,bangla-bert
3,124999,Profane,bangla-bert
4,535301,Religious Hate,bangla-bert
...,...,...,...
10195,908819,Profane,bangla-bert
10196,597085,Abusive,bangla-bert
10197,901448,Abusive,bangla-bert
10198,617821,,bangla-bert


In [48]:
test_df.to_csv("finalllllll_task1.tsv", sep="\t", index=False)
print("Saved to final_ensemble.tsv")

Saved to final_ensemble.tsv
