In [2]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModel, AdamW
from collections import defaultdict
from tqdm import tqdm as tq
from sklearn.model_selection import train_test_split


MODEL_NAME ="aplycaebous/tb-BanglaBERT-fpt"

# You can keep these as they are or tune them
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 2e-5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


In [4]:
train_file = 'blp25_hatespeech_subtask_1A_train.tsv'
validation_file = 'blp25_hatespeech_subtask_1A_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1A_dev_test.tsv'

#tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Load train/val/test DataFrames
train_df = pd.read_csv(train_file, sep="\t")
dev_df = pd.read_csv(validation_file , sep="\t")
test_df = pd.read_csv(test_file, sep="\t")






In [5]:
l2id = {
    'None': 0,
    'Religious Hate': 1,
    'Sexism': 2,
    'Political Hate': 3,
    'Profane': 4,
    'Abusive': 5
}
id2l = {v: k for k, v in l2id.items()}


def clean_label(x):
    # handle missing or NaN → "None"
    if pd.isna(x) or x == 'None':
        return 'None'
    # already list-like e.g. ['Abusive']
    if isinstance(x, list):
        return x[0] if len(x) > 0 else 'None'
    # string cases like "[]" or "[Abusive]" or "[Political Hate]"
    x = x.strip("[]").strip()
    if x == "":
        return 'None'
    return x


def process_df(df):
    # Ensure labels are proper lists
    df["label"] = df["label"].apply(clean_label)
    df["label"] = df["label"].fillna("None")
    # Now create binary label
    df["toxic"] = df["label"].apply(lambda x: 0 if x == "None" else 1)
    df["label_id"] = df["label"].map(l2id)

    

  

    return df

train_df = process_df(train_df)
dev_df  = process_df(dev_df)

train_df

Unnamed: 0,id,text,label,toxic,label_id
0,147963,ধন্যবাদ বর্ডার গার্ড দেরকে এভাবে পাহারা দিতে হ...,,0,0
1,214275,ছোটবেলায় অনেক কষ্ট করে কিছু গালাগালি শিখছিলাম...,,0,0
2,849172,অতিরিক্ত এ নিজেকে বাদুর বানাইয়া ফেলছেন রে,Abusive,1,5
3,821985,চিন ভারত রাশিয়া এই তিন দেশ এক থাকলে বিশ্বকে শা...,,0,0
4,477288,এটার বিচার কে করবেযে বিচার করবে সেই তো হলো এই ...,Abusive,1,5
...,...,...,...,...,...
35517,790325,তইওয়ানের এত ক্ষমতা হয়নি যে এক টুকরো জায়গা নষ্ট...,,0,0
35518,328377,চুরের ঘরের চুর হালা,Profane,1,4
35519,69803,জাহাঙ্গীর বুদ্ধি নেই মাঠে মারা যাবে,Abusive,1,5
35520,419984,একটা ফেইল্ড এস্টেট এও সুষ্ঠু নির্বাচন হয় নেতার...,Abusive,1,5


In [6]:
df_train = train_df

df_val = dev_df

df_val

Unnamed: 0,id,text,label,toxic,label_id
0,166449,ইন্ডিয়া কি মাছ ধরা বন্ধ রাখছেএক নদীতে দুইনীতি ...,Political Hate,1,3
1,267692,লক্ষ টাকা ঘুষ দিয়ে অযোগ্য আর দায়িত্বহীন মানস...,Abusive,1,5
2,184031,ওহা ভবনের দালাল,,0,0
3,939131,আর কতো শিখবে আমার সোনার ছেলেরা এগুলো কে টাকা দ...,Abusive,1,5
4,210284,কি সাংঘাতিক ভাই রে তুই,Abusive,1,5
...,...,...,...,...,...
2507,350971,পুরোনো ইতিহাস তুলে ধরার জন্য সময় সংবাদ কে ধন্...,,0,0
2508,539053,এই জন্যই আমাদের মেয়েরা কোরিয়া চলে যেতে চায়,,0,0
2509,200314,এই শালা ইবলিশ এর বস এবলিশ এদের দেখে ভয় পায়,Abusive,1,5
2510,788171,আমি কিনে ফেলছি আই ফোন ১৪,,0,0


In [7]:
toxic_df = df_train
target_list = sorted(toxic_df['label'].unique().tolist()) # Sort for consistent column order
print(f"Target Categories: {target_list}")

Target Categories: ['Abusive', 'None', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism']


In [8]:
df_train = pd.get_dummies(df_train, columns=['label'], prefix='', prefix_sep='')[['text'] + target_list]

df_val = pd.get_dummies(df_val, columns=['label'], prefix='', prefix_sep='')[['text'] + target_list]

df_val

Unnamed: 0,text,Abusive,None,Political Hate,Profane,Religious Hate,Sexism
0,ইন্ডিয়া কি মাছ ধরা বন্ধ রাখছেএক নদীতে দুইনীতি ...,False,False,True,False,False,False
1,লক্ষ টাকা ঘুষ দিয়ে অযোগ্য আর দায়িত্বহীন মানস...,True,False,False,False,False,False
2,ওহা ভবনের দালাল,False,True,False,False,False,False
3,আর কতো শিখবে আমার সোনার ছেলেরা এগুলো কে টাকা দ...,True,False,False,False,False,False
4,কি সাংঘাতিক ভাই রে তুই,True,False,False,False,False,False
...,...,...,...,...,...,...,...
2507,পুরোনো ইতিহাস তুলে ধরার জন্য সময় সংবাদ কে ধন্...,False,True,False,False,False,False
2508,এই জন্যই আমাদের মেয়েরা কোরিয়া চলে যেতে চায়,False,True,False,False,False,False
2509,এই শালা ইবলিশ এর বস এবলিশ এদের দেখে ভয় পায়,True,False,False,False,False,False
2510,আমি কিনে ফেলছি আই ফোন ১৪,False,True,False,False,False,False


In [9]:

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len, target_list):
        self.tokenizer = tokenizer
        self.df = df
        # Corrected column name from 'title' or 'Text' to 'text'
        self.texts = list(df['text']) 
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index]),
            'text': text
        }

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN, target_list)
val_dataset = CustomDataset(df_val, tokenizer, MAX_LEN, target_list)

train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False)


tokenizer_config.json:   0%|          | 0.00/343 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
class BERTClass(torch.nn.Module):
    def __init__(self, model_name, target_list):
        super(BERTClass, self).__init__()
        self.bert_model = AutoModel.from_pretrained(model_name, return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, len(target_list))
    
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        
        # --- FIX IS HERE ---
        # Instead of pooler_output, we take the last hidden state of the [CLS] token
        # output.last_hidden_state has shape (batch_size, sequence_length, hidden_size)
        # We select the [CLS] token by indexing with [:, 0, :]
        cls_output = output.last_hidden_state[:, 0, :]
        
        output_dropout = self.dropout(cls_output)
        final_output = self.linear(output_dropout)
        
        return final_output

# Instantiate the model
model = BERTClass(MODEL_NAME, target_list)
model.to(device)

# ==================================
# 4. Loss Function and Optimizer
# ==================================
# For multi-label classification, BCEWithLogitsLoss is the correct choice.
# It combines a Sigmoid layer and the BCELoss in one single class.
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# ==================================
# 5. Training and Evaluation Functions
# ==================================
# Your train_model and eval_model functions are well-written and can be used as they are.
# I've just adjusted the tqdm progress bar description for more clarity.

def train_model(training_loader, model, optimizer):
    model.train()
    losses = []
    correct_predictions = 0
    num_samples = 0
    
    loop = tq(training_loader, leave=True)
    for batch_idx, data in enumerate(loop):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accuracy calculation
        outputs_sigmoid = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets_np = targets.cpu().detach().numpy()
        correct_predictions += np.sum(outputs_sigmoid == targets_np)
        num_samples += targets_np.size
        
        loop.set_description(f"Train - Loss: {loss.item():.4f}")

    return model, float(correct_predictions) / num_samples, np.mean(losses)


def eval_model(validation_loader, model):
    model.eval()
    losses = []
    correct_predictions = 0
    num_samples = 0
    
    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            outputs_sigmoid = torch.sigmoid(outputs).cpu().detach().numpy().round()
            targets_np = targets.cpu().detach().numpy()
            correct_predictions += np.sum(outputs_sigmoid == targets_np)
            num_samples += targets_np.size

    return float(correct_predictions) / num_samples, np.mean(losses)


# ==================================
# 6. Training Loop
# ==================================
history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS + 1):
    print(f'\n--- Epoch {epoch}/{EPOCHS} ---')
    
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model)

    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), "all_label_best_model_state.bin")
        best_accuracy = val_acc

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]




--- Epoch 1/5 ---



  0%|          | 0/2221 [00:00<?, ?it/s][A
Train - Loss: 0.6666:   0%|          | 0/2221 [00:00<?, ?it/s][A
Train - Loss: 0.6666:   0%|          | 1/2221 [00:00<19:06,  1.94it/s][A
Train - Loss: 0.6494:   0%|          | 1/2221 [00:00<19:06,  1.94it/s][A
Train - Loss: 0.6494:   0%|          | 2/2221 [00:00<11:38,  3.18it/s][A
Train - Loss: 0.6071:   0%|          | 2/2221 [00:00<11:38,  3.18it/s][A
Train - Loss: 0.6071:   0%|          | 3/2221 [00:00<09:15,  3.99it/s][A
Train - Loss: 0.5736:   0%|          | 3/2221 [00:01<09:15,  3.99it/s][A
Train - Loss: 0.5736:   0%|          | 4/2221 [00:01<08:08,  4.54it/s][A
Train - Loss: 0.5414:   0%|          | 4/2221 [00:01<08:08,  4.54it/s][A
Train - Loss: 0.5414:   0%|          | 5/2221 [00:01<07:31,  4.91it/s][A
Train - Loss: 0.5467:   0%|          | 5/2221 [00:01<07:31,  4.91it/s][A
Train - Loss: 0.5467:   0%|          | 6/2221 [00:01<07:08,  5.17it/s][A
Train - Loss: 0.5233:   0%|          | 6/2221 [00:01<07:08,  5.17it/s][A
T

Train Loss: 0.2489, Train Acc: 0.8954
Val Loss: 0.2136, Val Acc: 0.9053

--- Epoch 2/5 ---


Train - Loss: 0.3225: 100%|██████████| 2221/2221 [06:32<00:00,  5.65it/s]


Train Loss: 0.2013, Train Acc: 0.9131
Val Loss: 0.2068, Val Acc: 0.9057

--- Epoch 3/5 ---


Train - Loss: 0.0947: 100%|██████████| 2221/2221 [14:28<00:00,  2.56it/s]


Train Loss: 0.1705, Train Acc: 0.9291
Val Loss: 0.2420, Val Acc: 0.9029

--- Epoch 4/5 ---


Train - Loss: 0.0097: 100%|██████████| 2221/2221 [14:28<00:00,  2.56it/s]


Train Loss: 0.1354, Train Acc: 0.9467
Val Loss: 0.2659, Val Acc: 0.9019

--- Epoch 5/5 ---


Train - Loss: 0.0353: 100%|██████████| 2221/2221 [14:28<00:00,  2.56it/s]


Train Loss: 0.1029, Train Acc: 0.9625
Val Loss: 0.2779, Val Acc: 0.8974


In [11]:

print("\n--- Evaluating Pipeline on Test Set ---")

model.load_state_dict(torch.load("all_label_best_model_state.bin"))



model.eval()




--- Evaluating Pipeline on Test Set ---


BERTClass(
  (bert_model): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps

In [12]:
import numpy as np
import torch
from tqdm.notebook import  tqdm

def predict_toxicity_pipeline(text, tokenizer, model_2, device, max_len, target_list):
    # --- Tokenization (no changes here) ---
    inputs = tokenizer.encode_plus(
        text, add_special_tokens=True, max_length=max_len, padding='max_length',
        return_token_type_ids=True, truncation=True, return_attention_mask=True, return_tensors='pt'
    )
    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)
    
    # Check if 'token_type_ids' exists in the tokenizer output
    token_ids = inputs['token_type_ids'].to(device) if 'token_type_ids' in inputs else None

    with torch.no_grad():
       
        output_2 =  model_2(ids, mask, token_ids)
            
            # --- 👇 KEY CHANGE IS HERE ---
            # 1. Find the index of the label with the highest score (logit)
            # We use argmax directly on the logits, which is efficient.
            # --- AFTER ---
        pred_index = torch.argmax(output_2, dim=1).item()
            
            # 2. Create a one-hot encoded vector
            # This creates an array of zeros...
        one_hot_prediction = np.zeros(len(target_list), dtype=int)
            # ...and sets the predicted index to 1.
        one_hot_prediction[pred_index] = 1
            
        return one_hot_prediction

# --- Your evaluation loop (assuming target_list is defined) ---
# Example: target_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

all_predictions = []
# Ensure test_df, tokenizer, model_1, etc. are correctly defined and loaded
for text in tq(test_df['text'], desc="Predicting on test data"):
    prediction = predict_toxicity_pipeline(text, tokenizer, model, device, MAX_LEN, target_list)
    all_predictions.append(prediction)

y_pred = np.array(all_predictions)

# Now y_pred will be a 2D array where each row has at most one '1'. 

Predicting on test data: 100%|██████████| 2512/2512 [00:49<00:00, 50.50it/s]


In [13]:
y_pred

array([[0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0]])

In [14]:
import numpy as np
import pandas as pd

#'Abusive', 'None', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism'
#'Abusive', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism']

# Your mapping
id2l = {
    0: 'Abusive',
    1: 'None',
    2: 'Political Hate',
    3: 'Profane',
    4:  'Religious Hate',
    5: 'Sexism'
}

# Example y_pred
# y_pred = np.array([[0,0,0,0,0],[0,0,0,1,0],[1,0,0,0,0]])

def decode_labels(row):
    indices = np.where(row == 1)[0]
    if len(indices) == 0:
        return "None"
    # If multiple labels, join them with comma
    return ", ".join([id2l[i] for i in indices])

# Convert predictions into a DataFrame column
df = pd.DataFrame()
df["Predicted_Label"] = [decode_labels(row) for row in y_pred]

print(df.head())


  Predicted_Label
0            None
1         Profane
2            None
3            None
4            None


In [15]:
test_df['label']=df['Predicted_Label']
test_df['model']='bangla-bert'
test_df

Unnamed: 0,id,text,label,model
0,879187,শুভ কামনা রইল বাংলাদেশ জন্য ইনশাআল্লাহ জয় হবে,,bangla-bert
1,316919,গোয়া মারা দিয়ে আছে বাংলাদেশ মাদারচোদ নিউজ করে ...,Profane,bangla-bert
2,916242,ভাইয়া আপনি অভিনেতা হইয়েন না না হলে সবাই বাচ্...,,bangla-bert
3,786824,আমাদেরো তাই দেখছি,,bangla-bert
4,47284,পুলিশ কতটা টাকা নিয়ে,,bangla-bert
...,...,...,...,...
2507,776466,সত্য কথা তেতু লাগে,,bangla-bert
2508,849227,এই ফকিননি মাগীটা আর কত নাটক দেখাবে,Profane,bangla-bert
2509,532697,দেখো আজকে কার ফিটনেস কোথায় দাঁড়িয়েছে তুমি চ...,Profane,bangla-bert
2510,861411,ছোট ভাইটির পাসে থাকুন গেম ভিড়িও বানাই,,bangla-bert


In [16]:
test_df = test_df[['id', 'label', 'model']]
test_df

Unnamed: 0,id,label,model
0,879187,,bangla-bert
1,316919,Profane,bangla-bert
2,916242,,bangla-bert
3,786824,,bangla-bert
4,47284,,bangla-bert
...,...,...,...
2507,776466,,bangla-bert
2508,849227,Profane,bangla-bert
2509,532697,Profane,bangla-bert
2510,861411,,bangla-bert


In [17]:
test_df.to_csv("final_banth_v9.tsv", sep="\t", index=False)
print("Saved to final_ensemble.tsv")

Saved to final_ensemble.tsv
