In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


In [29]:
# Locations of the three TSV splits read by the loading cell.
train_file, validation_file, test_file = (
    'blp25_hatespeech_subtask_1A_train.tsv',
    'blp25_hatespeech_subtask_1A_dev.tsv',
    'blp25_hatespeech_subtask_1A_dev_test.tsv',
)

In [30]:
# (disabled alternative) tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Read the train / dev / test splits; every file is tab-separated.
train_df, dev_df, test_df = (
    pd.read_csv(path, sep="\t")
    for path in (train_file, validation_file, test_file)
)





In [4]:
# Six-way label vocabulary: position in the tuple is the integer id,
# so 'None' (no hate speech) is 0 and 'Abusive' is 5.
l2id = {
    name: idx
    for idx, name in enumerate(
        ('None', 'Religious Hate', 'Sexism', 'Political Hate', 'Profane', 'Abusive')
    )
}
# Reverse lookup: integer id -> label name.
id2l = dict(zip(l2id.values(), l2id.keys()))


def clean_label(x):
    """Normalise one raw label cell to a plain label string.

    Parameters
    ----------
    x : list, str, None or NaN
        Raw value from the ``label`` column.

    Returns
    -------
    The first element for a non-empty list, the bracket/quote-stripped text
    for a string, and the literal string ``'None'`` for missing values,
    empty lists and empty strings.
    """
    # Lists must be handled before pd.isna: pd.isna(list) returns an array,
    # whose truth value is ambiguous for more than one element.
    if isinstance(x, list):
        return x[0] if len(x) > 0 else 'None'
    # handle missing or NaN -> "None"
    if pd.isna(x) or x == 'None':
        return 'None'
    # string cases like "[]", "[Abusive]", "[Political Hate]" or "['Abusive']";
    # quotes are stripped too so the result can match the label vocabulary.
    x = x.strip("[]").strip().strip("'\"").strip()
    if x == "":
        return 'None'
    return x


def process_df(df):
    """Return a copy of ``df`` with cleaned labels and two derived columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column holding raw label values.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``label`` has been passed through
        ``clean_label``, ``toxic`` is 0 where the label is ``"None"`` and 1
        otherwise, and ``label_id`` is the label looked up in the
        module-level ``l2id`` mapping (NaN for labels absent from it).
    """
    # Work on a copy so the caller's frame is not modified and the cell can
    # be re-run from the same raw input.
    df = df.copy()

    # Normalise every label cell to a plain string.
    df["label"] = df["label"].apply(clean_label)
    df["label"] = df["label"].fillna("None")

    # Binary target for the first-stage classifier.
    df["toxic"] = df["label"].apply(lambda x: 0 if x == "None" else 1)

    # Integer id taken from whatever l2id is bound to when this runs.
    df["label_id"] = df["label"].map(l2id)

    return df

# Clean labels and add the toxic / label_id columns on both labelled splits.
train_df, dev_df = process_df(train_df), process_df(dev_df)

train_df

Unnamed: 0,id,text,label,toxic,label_id
0,147963,‡¶ß‡¶®‡ßç‡¶Ø‡¶¨‡¶æ‡¶¶ ‡¶¨‡¶∞‡ßç‡¶°‡¶æ‡¶∞ ‡¶ó‡¶æ‡¶∞‡ßç‡¶° ‡¶¶‡ßá‡¶∞‡¶ï‡ßá ‡¶è‡¶≠‡¶æ‡¶¨‡ßá ‡¶™‡¶æ‡¶π‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡¶§‡ßá ‡¶π...,,0,0
1,214275,‡¶õ‡ßã‡¶ü‡¶¨‡ßá‡¶≤‡¶æ‡¶Ø‡¶º ‡¶Ö‡¶®‡ßá‡¶ï ‡¶ï‡¶∑‡ßç‡¶ü ‡¶ï‡¶∞‡ßá ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ó‡¶æ‡¶≤‡¶æ‡¶ó‡¶æ‡¶≤‡¶ø ‡¶∂‡¶ø‡¶ñ‡¶õ‡¶ø‡¶≤‡¶æ‡¶Æ...,,0,0
2,849172,‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá,Abusive,1,5
3,821985,‡¶ö‡¶ø‡¶® ‡¶≠‡¶æ‡¶∞‡¶§ ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ ‡¶è‡¶á ‡¶§‡¶ø‡¶® ‡¶¶‡ßá‡¶∂ ‡¶è‡¶ï ‡¶•‡¶æ‡¶ï‡¶≤‡ßá ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶ï‡ßá ‡¶∂‡¶æ...,,0,0
4,477288,‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...,Abusive,1,5
...,...,...,...,...,...
35517,790325,‡¶§‡¶á‡¶ì‡ßü‡¶æ‡¶®‡ßá‡¶∞ ‡¶è‡¶§ ‡¶ï‡ßç‡¶∑‡¶Æ‡¶§‡¶æ ‡¶π‡ßü‡¶®‡¶ø ‡¶Ø‡ßá ‡¶è‡¶ï ‡¶ü‡ßÅ‡¶ï‡¶∞‡ßã ‡¶ú‡¶æ‡ßü‡¶ó‡¶æ ‡¶®‡¶∑‡ßç‡¶ü...,,0,0
35518,328377,‡¶ö‡ßÅ‡¶∞‡ßá‡¶∞ ‡¶ò‡¶∞‡ßá‡¶∞ ‡¶ö‡ßÅ‡¶∞ ‡¶π‡¶æ‡¶≤‡¶æ,Profane,1,4
35519,69803,‡¶ú‡¶æ‡¶π‡¶æ‡¶ô‡ßç‡¶ó‡ßÄ‡¶∞ ‡¶¨‡ßÅ‡¶¶‡ßç‡¶ß‡¶ø ‡¶®‡ßá‡¶á ‡¶Æ‡¶æ‡¶†‡ßá ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶Ø‡¶æ‡¶¨‡ßá,Abusive,1,5
35520,419984,‡¶è‡¶ï‡¶ü‡¶æ ‡¶´‡ßá‡¶á‡¶≤‡ßç‡¶° ‡¶è‡¶∏‡ßç‡¶ü‡ßá‡¶ü ‡¶è‡¶ì ‡¶∏‡ßÅ‡¶∑‡ßç‡¶†‡ßÅ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶π‡ßü ‡¶®‡ßá‡¶§‡¶æ‡¶∞...,Abusive,1,5


In [5]:

# Show the first five texts next to their binary toxic flag.
print(train_df.loc[:, ['text', 'toxic']].head(5))

                                                text  toxic
0  ‡¶ß‡¶®‡ßç‡¶Ø‡¶¨‡¶æ‡¶¶ ‡¶¨‡¶∞‡ßç‡¶°‡¶æ‡¶∞ ‡¶ó‡¶æ‡¶∞‡ßç‡¶° ‡¶¶‡ßá‡¶∞‡¶ï‡ßá ‡¶è‡¶≠‡¶æ‡¶¨‡ßá ‡¶™‡¶æ‡¶π‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡¶§‡ßá ‡¶π...      0
1  ‡¶õ‡ßã‡¶ü‡¶¨‡ßá‡¶≤‡¶æ‡¶Ø‡¶º ‡¶Ö‡¶®‡ßá‡¶ï ‡¶ï‡¶∑‡ßç‡¶ü ‡¶ï‡¶∞‡ßá ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ó‡¶æ‡¶≤‡¶æ‡¶ó‡¶æ‡¶≤‡¶ø ‡¶∂‡¶ø‡¶ñ‡¶õ‡¶ø‡¶≤‡¶æ‡¶Æ...      0
2          ‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá      1
3  ‡¶ö‡¶ø‡¶® ‡¶≠‡¶æ‡¶∞‡¶§ ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ ‡¶è‡¶á ‡¶§‡¶ø‡¶® ‡¶¶‡ßá‡¶∂ ‡¶è‡¶ï ‡¶•‡¶æ‡¶ï‡¶≤‡ßá ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶ï‡ßá ‡¶∂‡¶æ...      0
4  ‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...      1


In [6]:
# List the columns currently present on the training frame.
train_df.columns

Index(['id', 'text', 'label', 'toxic', 'label_id'], dtype='object')

In [7]:
# Keep only the rows flagged toxic; .copy() gives independent frames that
# can be modified later without touching train_df / dev_df.
toxic_train_df = train_df.loc[train_df['toxic'] == 1].copy()
toxic_dev_df = dev_df.loc[dev_df['toxic'] == 1].copy()

# Peek at the first few toxic training rows.
print(toxic_train_df.loc[:, ['text', 'label']].head(5))

                                                 text    label
2           ‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá  Abusive
4   ‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...  Abusive
7   ‡¶á‡¶∞‡¶æ‡¶® ‡¶™‡¶æ‡¶∞‡¶Æ‡¶æ‡¶£‡¶¨‡¶ø‡¶ï ‡¶¨‡ßã‡¶Æ‡¶æ ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶ø...  Abusive
11  ‡¶Æ‡ßÅ‡¶∏‡¶≤‡¶ø‡¶Æ ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ‡¶ó‡ßÅ‡¶≤‡ßã ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶™‡ßá‡¶∞‡ßá ‡¶™‡ßá‡¶∞‡ßá ‡¶ó‡ßã‡¶ü‡¶æ ‡¶™‡ßÉ‡¶•‡¶ø‡¶¨‡ßÄ...  Profane
15                                   ‡¶¨‡¶æ‡ßú‡¶ø ‡¶¨‡¶æ‡¶≤ ‡¶´‡¶æ‡¶≤‡¶æ‡ßü‡¶õ‡ßá  Profane


In [25]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# --- Stage 1: binary toxic / non-toxic classifier ---

# Tokenizer and a two-label classification head on top of the checkpoint.
model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_1 = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Disabled helper kept for reference: character-level truncation of texts.
# def truncate_long_texts(df, max_chars=10000):
#     df['text'] = df['text'].astype(str).apply(lambda x: x[:max_chars])
#     return df
#
# train_df = truncate_long_texts(train_df)
# dev_df = truncate_long_texts(dev_df)


def tokenize(batch):
    """Tokenize a batch of texts, truncating each to at most 512 tokens."""
    return tokenizer(batch['text'], truncation=True, max_length=512)


# Build the datasets from (text, toxic); the Trainer expects the target
# column to be called 'label', hence the rename. Tokenize right away.
train_dataset_1 = Dataset.from_pandas(
    train_df[['text', 'toxic']].rename(columns={'toxic': 'label'})
).map(tokenize, batched=True)
dev_dataset_1 = Dataset.from_pandas(
    dev_df[['text', 'toxic']].rename(columns={'toxic': 'label'})
).map(tokenize, batched=True)

# Expose only the model inputs and the target, as torch tensors.
train_dataset_1.set_format("torch", columns=["input_ids", "attention_mask", "label"])
dev_dataset_1.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Pads each batch to the length of its longest member.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def compute_metrics_binary(p):
    """Return accuracy and binary F1 for the argmax of the predictions."""
    gold = p.label_ids
    preds = np.argmax(p.predictions, axis=1)
    return dict(
        accuracy=accuracy_score(gold, preds),
        f1=f1_score(gold, preds),
    )


# Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
training_args_1 = TrainingArguments(
    output_dir="./results_model_1",
    logging_dir="./logs_1",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer_1 = Trainer(
    model=model_1,
    args=training_args_1,
    data_collator=data_collator,
    train_dataset=train_dataset_1,
    eval_dataset=dev_dataset_1,
    compute_metrics=compute_metrics_binary,
)

# Fine-tune, then persist the resulting model for the inference cell.
trainer_1.train()
trainer_1.save_model("./final_model_1")

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/35522 [00:00<?, ? examples/s]

Map:   0%|          | 0/2512 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4614,0.467177,0.784236,0.744098
2,0.3695,0.497184,0.797771,0.763501
3,0.2634,0.669088,0.787022,0.749883


In [9]:
# Display the toxic-only training subset.
toxic_train_df

Unnamed: 0,id,text,label,toxic,label_id
2,849172,‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá,Abusive,1,5
4,477288,‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...,Abusive,1,5
7,786609,‡¶á‡¶∞‡¶æ‡¶® ‡¶™‡¶æ‡¶∞‡¶Æ‡¶æ‡¶£‡¶¨‡¶ø‡¶ï ‡¶¨‡ßã‡¶Æ‡¶æ ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶ø...,Abusive,1,5
11,124917,‡¶Æ‡ßÅ‡¶∏‡¶≤‡¶ø‡¶Æ ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ‡¶ó‡ßÅ‡¶≤‡ßã ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶™‡ßá‡¶∞‡ßá ‡¶™‡ßá‡¶∞‡ßá ‡¶ó‡ßã‡¶ü‡¶æ ‡¶™‡ßÉ‡¶•‡¶ø‡¶¨‡ßÄ...,Profane,1,4
15,432369,‡¶¨‡¶æ‡ßú‡¶ø ‡¶¨‡¶æ‡¶≤ ‡¶´‡¶æ‡¶≤‡¶æ‡ßü‡¶õ‡ßá,Profane,1,4
...,...,...,...,...,...
35514,668158,‡¶ï‡ßÉ‡¶∑‡¶ø ‡¶Æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßÄ‡¶∞ ‡¶∏‡¶æ‡¶∞‡ßá‡¶∞ ‡¶¨‡ßç‡¶Ø‡¶¨‡¶∏‡¶æ ‡¶ï‡¶∞‡ßá ‡¶§‡¶æ‡¶á ‡¶è ‡¶Ö‡¶¨‡¶∏‡ßç‡¶•‡¶æ,Political Hate,1,3
35518,328377,‡¶ö‡ßÅ‡¶∞‡ßá‡¶∞ ‡¶ò‡¶∞‡ßá‡¶∞ ‡¶ö‡ßÅ‡¶∞ ‡¶π‡¶æ‡¶≤‡¶æ,Profane,1,4
35519,69803,‡¶ú‡¶æ‡¶π‡¶æ‡¶ô‡ßç‡¶ó‡ßÄ‡¶∞ ‡¶¨‡ßÅ‡¶¶‡ßç‡¶ß‡¶ø ‡¶®‡ßá‡¶á ‡¶Æ‡¶æ‡¶†‡ßá ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶Ø‡¶æ‡¶¨‡ßá,Abusive,1,5
35520,419984,‡¶è‡¶ï‡¶ü‡¶æ ‡¶´‡ßá‡¶á‡¶≤‡ßç‡¶° ‡¶è‡¶∏‡ßç‡¶ü‡ßá‡¶ü ‡¶è‡¶ì ‡¶∏‡ßÅ‡¶∑‡ßç‡¶†‡ßÅ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶π‡ßü ‡¶®‡ßá‡¶§‡¶æ‡¶∞...,Abusive,1,5


In [10]:
# Display the toxic-only dev subset.
toxic_dev_df

Unnamed: 0,id,text,label,toxic,label_id
0,166449,‡¶á‡¶®‡ßç‡¶°‡¶ø‡ßü‡¶æ ‡¶ï‡¶ø ‡¶Æ‡¶æ‡¶õ ‡¶ß‡¶∞‡¶æ ‡¶¨‡¶®‡ßç‡¶ß ‡¶∞‡¶æ‡¶ñ‡¶õ‡ßá‡¶è‡¶ï ‡¶®‡¶¶‡ßÄ‡¶§‡ßá ‡¶¶‡ßÅ‡¶á‡¶®‡ßÄ‡¶§‡¶ø ...,Political Hate,1,3
1,267692,‡¶≤‡¶ï‡ßç‡¶∑ ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶ò‡ßÅ‡¶∑ ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá ‡¶Ö‡¶Ø‡ßã‡¶ó‡ßç‡¶Ø ‡¶Ü‡¶∞ ‡¶¶‡¶æ‡¶Ø‡¶º‡¶ø‡¶§‡ßç‡¶¨‡¶π‡ßÄ‡¶® ‡¶Æ‡¶æ‡¶®‡¶∏...,Abusive,1,5
3,939131,‡¶Ü‡¶∞ ‡¶ï‡¶§‡ßã ‡¶∂‡¶ø‡¶ñ‡¶¨‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá‡¶∞‡¶æ ‡¶è‡¶ó‡ßÅ‡¶≤‡ßã ‡¶ï‡ßá ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶¶...,Abusive,1,5
4,210284,‡¶ï‡¶ø ‡¶∏‡¶æ‡¶Ç‡¶ò‡¶æ‡¶§‡¶ø‡¶ï ‡¶≠‡¶æ‡¶á ‡¶∞‡ßá ‡¶§‡ßÅ‡¶á,Abusive,1,5
5,712332,‡¶≤‡¶û‡ßç‡¶ö ‡¶Æ‡¶æ‡¶≤‡¶ø‡¶ï‡¶¶‡ßá‡¶∞ ‡¶Ö‡¶≠‡¶ø‡¶∂‡¶™‡ßç‡¶§ ‡¶ö‡¶ï‡ßç‡¶∑‡ßÅ ‡¶™‡¶¶‡ßç‡¶Æ‡¶æ ‡¶∏‡ßá‡¶§‡ßÅ‡¶∞ ‡¶â‡¶™‡¶∞,Abusive,1,5
...,...,...,...,...,...
2496,653048,‡¶ï‡¶ø‡¶∞‡ßá ‡¶Æ‡¶æ‡¶®‡¶ø‡¶ï ‡¶ö‡ßã‡¶∞‡¶æ ‡¶§‡ßÅ‡¶á‡¶ì ‡¶Ü‡¶õ‡ßã‡¶∏,Abusive,1,5
2503,121961,‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∏‡¶¨‡¶á ‡¶§‡ßã ‡¶ö‡ßÅ‡¶∞‡¶ø ‡¶π‡ßü‡ßá ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßá ‡¶Ü‡¶∞ ‡¶ö‡ßã‡¶∞‡¶¶‡ßá‡¶∞ ‡¶ß‡¶∞‡¶æ‡¶∞ ‡¶ï‡ßã...,Abusive,1,5
2504,555021,‡¶´‡¶ï‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶¶‡ßá‡¶∂ ‡¶ï‡¶æ‡¶∞‡ßá‡¶®‡ßç‡¶ü ‡¶®‡¶æ‡¶á,Abusive,1,5
2505,858412,‡¶ï‡¶æ‡¶ï‡ßÅ ‡¶∏‡¶æ‡¶Æ‡¶®‡ßá ‡¶ö‡¶∂‡¶Æ‡¶æ ‡¶•‡¶æ‡¶ï‡¶¨‡ßá ‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ ‡¶ó‡ßç‡¶≤‡¶æ‡¶∏ ‡¶•‡¶æ‡¶ï‡¶¨‡ßá‡¶®‡¶æ‡¶™‡¶æ‡¶õ‡¶æ...,Profane,1,4


In [11]:

# Label ids for the second-stage (toxic-only) classifier: the five hate
# categories numbered from 0.
# NOTE(review): this rebinds the module-level l2id / id2l, replacing the
# six-class mapping that includes 'None'. process_df reads l2id at call time,
# so re-running it after this cell would produce different label_id values.
l2id = {
    'Religious Hate': 0,
    'Sexism': 1,
    'Political Hate': 2,
    'Profane': 3,
    'Abusive': 4
}
id2l = {v: k for k, v in l2id.items()}

# Encode label names as integer ids. l2id.get(value, value) leaves values
# that are already encoded unchanged, so re-running this cell does not turn
# every label into NaN (a plain .map(l2id) would, because the integer ids are
# not keys of the mapping).
toxic_train_df["label"] = toxic_train_df["label"].map(lambda value: l2id.get(value, value))
toxic_dev_df["label"] = toxic_dev_df["label"].map(lambda value: l2id.get(value, value))

# Fail early if anything other than a known class id is left in the column.
assert toxic_train_df["label"].isin(list(id2l)).all(), "unexpected label in toxic_train_df"
assert toxic_dev_df["label"].isin(list(id2l)).all(), "unexpected label in toxic_dev_df"

# Distinct class ids present in the dev subset (displayed as the cell output).
toxic_dev_df['label'].unique()

array([2, 4, 3, 1, 0])

In [12]:
# Display the toxic-only training subset again to inspect its current 'label' values.
toxic_train_df

Unnamed: 0,id,text,label,toxic,label_id
2,849172,‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá,4,1,5
4,477288,‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...,4,1,5
7,786609,‡¶á‡¶∞‡¶æ‡¶® ‡¶™‡¶æ‡¶∞‡¶Æ‡¶æ‡¶£‡¶¨‡¶ø‡¶ï ‡¶¨‡ßã‡¶Æ‡¶æ ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶ø...,4,1,5
11,124917,‡¶Æ‡ßÅ‡¶∏‡¶≤‡¶ø‡¶Æ ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ‡¶ó‡ßÅ‡¶≤‡ßã ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶™‡ßá‡¶∞‡ßá ‡¶™‡ßá‡¶∞‡ßá ‡¶ó‡ßã‡¶ü‡¶æ ‡¶™‡ßÉ‡¶•‡¶ø‡¶¨‡ßÄ...,3,1,4
15,432369,‡¶¨‡¶æ‡ßú‡¶ø ‡¶¨‡¶æ‡¶≤ ‡¶´‡¶æ‡¶≤‡¶æ‡ßü‡¶õ‡ßá,3,1,4
...,...,...,...,...,...
35514,668158,‡¶ï‡ßÉ‡¶∑‡¶ø ‡¶Æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßÄ‡¶∞ ‡¶∏‡¶æ‡¶∞‡ßá‡¶∞ ‡¶¨‡ßç‡¶Ø‡¶¨‡¶∏‡¶æ ‡¶ï‡¶∞‡ßá ‡¶§‡¶æ‡¶á ‡¶è ‡¶Ö‡¶¨‡¶∏‡ßç‡¶•‡¶æ,2,1,3
35518,328377,‡¶ö‡ßÅ‡¶∞‡ßá‡¶∞ ‡¶ò‡¶∞‡ßá‡¶∞ ‡¶ö‡ßÅ‡¶∞ ‡¶π‡¶æ‡¶≤‡¶æ,3,1,4
35519,69803,‡¶ú‡¶æ‡¶π‡¶æ‡¶ô‡ßç‡¶ó‡ßÄ‡¶∞ ‡¶¨‡ßÅ‡¶¶‡ßç‡¶ß‡¶ø ‡¶®‡ßá‡¶á ‡¶Æ‡¶æ‡¶†‡ßá ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶Ø‡¶æ‡¶¨‡ßá,4,1,5
35520,419984,‡¶è‡¶ï‡¶ü‡¶æ ‡¶´‡ßá‡¶á‡¶≤‡ßç‡¶° ‡¶è‡¶∏‡ßç‡¶ü‡ßá‡¶ü ‡¶è‡¶ì ‡¶∏‡ßÅ‡¶∑‡ßç‡¶†‡ßÅ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶π‡ßü ‡¶®‡ßá‡¶§‡¶æ‡¶∞...,4,1,5


In [13]:

# import torch
# device = torch.device("cpu")
# model_2.to(device)


In [24]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding  # <-- Import the data collator
)
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# --- Stage 2: five-way classifier trained on the toxic-only subsets ---

# --- 1. Load Tokenizer and Model ---
model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_2 = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)


# def truncate_long_texts(df, max_chars=10000):
#     df['text'] = df['text'].astype(str).apply(lambda x: x[:max_chars])
#     return df

# train_df = truncate_long_texts(train_df)
# dev_df = truncate_long_texts(dev_df)

# --- 2. Create Datasets ---
# NOTE(review): the 'label' column is expected to already hold integer class
# ids in the range 0-4 (num_labels=5) -- confirm before running this cell.
train_dataset_2 = Dataset.from_pandas(toxic_train_df[['text', 'label']])
dev_dataset_2 = Dataset.from_pandas(toxic_dev_df[['text', 'label']])

def tokenize(batch):
    """Tokenize a batch of texts, truncating each to at most 512 tokens."""
    return tokenizer(batch['text'], truncation=True, max_length=512)



train_dataset_2 = train_dataset_2.map(tokenize, batched=True)
dev_dataset_2 = dev_dataset_2.map(tokenize, batched=True)

# Set the format to torch tensors and specify columns
train_dataset_2.set_format("torch", columns=["input_ids", "attention_mask", "label"])
dev_dataset_2.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# --- 3. Define Training ---

# Create a data collator that will dynamically pad the batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics_multiclass(p):
    """Return accuracy and weighted F1 for the argmax of the predictions."""
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1_weighted": f1_score(p.label_ids, preds, average="weighted"),
    }

# NOTE(review): in the recorded run the validation loss rises after epoch 2
# while training loss keeps falling; consider fewer epochs or early stopping.
training_args_2 = TrainingArguments(
    output_dir="./results_model_2",
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    # Was "./logs_1", which mixed these logs into the stage-1 log directory.
    logging_dir="./logs_2",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer_2 = Trainer(
    model=model_2,
    args=training_args_2,
    train_dataset=train_dataset_2,
    eval_dataset=dev_dataset_2,
    compute_metrics=compute_metrics_multiclass,
    data_collator=data_collator, # <-- Add the data collator here
)

# --- 4. Start Fine-tuning ---
trainer_2.train()
trainer_2.save_model("./final_model_2")

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15568 [00:00<?, ? examples/s]

Map:   0%|          | 0/1061 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.8522,0.635245,0.760603,0.758042
2,0.603,0.616558,0.762488,0.757744
3,0.4441,0.716113,0.762488,0.757096
4,0.2881,0.934421,0.744581,0.740676
5,0.2081,1.131335,0.749293,0.748
6,0.1352,1.324743,0.726673,0.725817
7,0.1124,1.614933,0.727615,0.727433
8,0.0699,1.779849,0.741753,0.740247
9,0.054,1.958204,0.734213,0.730717
10,0.041,2.078136,0.742696,0.739061


In [31]:
from transformers import pipeline
from sklearn.metrics import classification_report
import numpy as np

# --- 1. Load the fine-tuned models with truncation ---
# Stage 1: binary toxic / non-toxic classifier.
pipe_1 = pipeline(
    "text-classification",
    model="./final_model_1",
    tokenizer=tokenizer,
    device=-1,   # force CPU
    truncation=True,
    max_length=512
)

# Stage 2: five-way toxic-category classifier; return_all_scores=True makes
# it return one score per class instead of only the top one.
pipe_2 = pipeline(
    "text-classification",
    model="./final_model_2",
    tokenizer=tokenizer,
    device=-1,
    truncation=True,
    max_length=512,
    return_all_scores=True
)

# Loop-invariant lookups, hoisted out of the prediction loop.
num_toxic_classes = pipe_2.model.config.num_labels
toxic_label2id = pipe_2.model.config.label2id  # e.g. 'LABEL_3' -> 3

# --- 2. Run the pipeline on the test set ---
predictions = []
for text in test_df['text']:
    # Model 1: Is it toxic?
    result_1 = pipe_1(text, truncation=True, max_length=512)[0]
    is_toxic = 1 if result_1['label'] == 'LABEL_1' else 0

    pred_labels = [0] * num_toxic_classes  # default: all zero

    if is_toxic:
        # Model 2: Which toxic class? (highest score = predicted label)
        result_2 = pipe_2(text, truncation=True, max_length=512)[0]
        best_label = max(result_2, key=lambda x: x['score'])
        # Set the one-hot position from the model's own label->id mapping
        # rather than from the order in which the pipeline lists the scores.
        pred_labels[toxic_label2id[best_label['label']]] = 1

    predictions.append(pred_labels)

# One row per test text, one column per toxic class.
y_pred = np.array(predictions)
print(y_pred.shape)


Device set to use cpu
Device set to use cpu


(2512, 5)


In [32]:
# Display the test frame.
test_df


Unnamed: 0,id,text
0,879187,‡¶∂‡ßÅ‡¶≠ ‡¶ï‡¶æ‡¶Æ‡¶®‡¶æ ‡¶∞‡¶á‡¶≤ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶á‡¶®‡¶∂‡¶æ‡¶Ü‡¶≤‡ßç‡¶≤‡¶æ‡¶π ‡¶ú‡¶Ø‡¶º ‡¶π‡¶¨‡ßá
1,316919,‡¶ó‡ßã‡ßü‡¶æ ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡ßü‡ßá ‡¶Ü‡¶õ‡ßá ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Æ‡¶æ‡¶¶‡¶æ‡¶∞‡¶ö‡ßã‡¶¶ ‡¶®‡¶ø‡¶â‡¶ú ‡¶ï‡¶∞‡ßá ...
2,916242,‡¶≠‡¶æ‡¶á‡¶Ø‡¶º‡¶æ ‡¶Ü‡¶™‡¶®‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶®‡ßá‡¶§‡¶æ ‡¶π‡¶á‡¶Ø‡¶º‡ßá‡¶® ‡¶®‡¶æ ‡¶®‡¶æ ‡¶π‡¶≤‡ßá ‡¶∏‡¶¨‡¶æ‡¶á ‡¶¨‡¶æ‡¶ö‡ßç...
3,786824,‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞‡ßã ‡¶§‡¶æ‡¶á ‡¶¶‡ßá‡¶ñ‡¶õ‡¶ø
4,47284,‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶ï‡¶§‡¶ü‡¶æ ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶®‡¶ø‡ßü‡ßá
...,...,...
2507,776466,‡¶∏‡¶§‡ßç‡¶Ø ‡¶ï‡¶•‡¶æ ‡¶§‡ßá‡¶§‡ßÅ ‡¶≤‡¶æ‡¶ó‡ßá
2508,849227,‡¶è‡¶á ‡¶´‡¶ï‡¶ø‡¶®‡¶®‡¶ø ‡¶Æ‡¶æ‡¶ó‡ßÄ‡¶ü‡¶æ ‡¶Ü‡¶∞ ‡¶ï‡¶§ ‡¶®‡¶æ‡¶ü‡¶ï ‡¶¶‡ßá‡¶ñ‡¶æ‡¶¨‡ßá
2509,532697,‡¶¶‡ßá‡¶ñ‡ßã ‡¶Ü‡¶ú‡¶ï‡ßá ‡¶ï‡¶æ‡¶∞ ‡¶´‡¶ø‡¶ü‡¶®‡ßá‡¶∏ ‡¶ï‡ßã‡¶•‡¶æ‡¶Ø‡¶º ‡¶¶‡¶æ‡¶Å‡¶°‡¶º‡¶ø‡¶Ø‡¶º‡ßá‡¶õ‡ßá ‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ö...
2510,861411,‡¶õ‡ßã‡¶ü ‡¶≠‡¶æ‡¶á‡¶ü‡¶ø‡¶∞ ‡¶™‡¶æ‡¶∏‡ßá ‡¶•‡¶æ‡¶ï‡ßÅ‡¶® ‡¶ó‡ßá‡¶Æ ‡¶≠‡¶ø‡ßú‡¶ø‡¶ì ‡¶¨‡¶æ‡¶®‡¶æ‡¶á


In [33]:
import numpy as np
import pandas as pd

# Five-class id -> name mapping used to decode the prediction matrix columns.
id2l = dict(enumerate(
    ('Religious Hate', 'Sexism', 'Political Hate', 'Profane', 'Abusive')
))


# Six-class name -> id mapping (with 'None' at 0).
l2id = {
    name: idx
    for idx, name in enumerate(
        ('None', 'Religious Hate', 'Sexism', 'Political Hate', 'Profane', 'Abusive')
    )
}

# Example y_pred
# y_pred = np.array([[0,0,0,0,0],[0,0,0,1,0],[1,0,0,0,0]])

def decode_labels(row, id_to_label=None):
    """Turn one 0/1 indicator row into a label string.

    Parameters
    ----------
    row : array-like of int
        One-dimensional indicator vector; position ``i`` set to 1 means
        class ``i`` is predicted.
    id_to_label : mapping, optional
        Class index -> label name. Defaults to the module-level ``id2l``.

    Returns
    -------
    str
        ``"None"`` when no position is 1, otherwise the names of the set
        positions joined with ``", "``.
    """
    mapping = id2l if id_to_label is None else id_to_label
    # np.asarray makes the elementwise comparison work for plain lists too;
    # `some_list == 1` would be a single False and always decode to "None".
    indices = np.where(np.asarray(row) == 1)[0]
    if len(indices) == 0:
        return "None"
    # If multiple labels, join them with comma
    return ", ".join(mapping[int(i)] for i in indices)

# Decode every prediction row and collect the names in a one-column frame.
df = pd.DataFrame({"Predicted_Label": [decode_labels(row) for row in y_pred]})

print(df.head(5))


  Predicted_Label
0            None
1         Profane
2            None
3            None
4            None


In [34]:
# Display the frame of decoded predictions.
df

Unnamed: 0,Predicted_Label
0,
1,Profane
2,
3,
4,
...,...
2507,
2508,Profane
2509,Profane
2510,


In [35]:
# Attach the predictions by position: they were produced by iterating over
# test_df['text'] in order, so row i of df belongs to row i of test_df.
# .to_numpy() bypasses index alignment, which would insert NaN if the two
# frames' indexes ever differed, and raises on a length mismatch instead.
test_df['label'] = df['Predicted_Label'].to_numpy()
# Constant tag identifying which model produced these predictions.
test_df['model'] = 'bangla-bert'
test_df

Unnamed: 0,id,text,label,model
0,879187,‡¶∂‡ßÅ‡¶≠ ‡¶ï‡¶æ‡¶Æ‡¶®‡¶æ ‡¶∞‡¶á‡¶≤ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶á‡¶®‡¶∂‡¶æ‡¶Ü‡¶≤‡ßç‡¶≤‡¶æ‡¶π ‡¶ú‡¶Ø‡¶º ‡¶π‡¶¨‡ßá,,bangla-bert
1,316919,‡¶ó‡ßã‡ßü‡¶æ ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡ßü‡ßá ‡¶Ü‡¶õ‡ßá ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Æ‡¶æ‡¶¶‡¶æ‡¶∞‡¶ö‡ßã‡¶¶ ‡¶®‡¶ø‡¶â‡¶ú ‡¶ï‡¶∞‡ßá ...,Profane,bangla-bert
2,916242,‡¶≠‡¶æ‡¶á‡¶Ø‡¶º‡¶æ ‡¶Ü‡¶™‡¶®‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶®‡ßá‡¶§‡¶æ ‡¶π‡¶á‡¶Ø‡¶º‡ßá‡¶® ‡¶®‡¶æ ‡¶®‡¶æ ‡¶π‡¶≤‡ßá ‡¶∏‡¶¨‡¶æ‡¶á ‡¶¨‡¶æ‡¶ö‡ßç...,,bangla-bert
3,786824,‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞‡ßã ‡¶§‡¶æ‡¶á ‡¶¶‡ßá‡¶ñ‡¶õ‡¶ø,,bangla-bert
4,47284,‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶ï‡¶§‡¶ü‡¶æ ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶®‡¶ø‡ßü‡ßá,,bangla-bert
...,...,...,...,...
2507,776466,‡¶∏‡¶§‡ßç‡¶Ø ‡¶ï‡¶•‡¶æ ‡¶§‡ßá‡¶§‡ßÅ ‡¶≤‡¶æ‡¶ó‡ßá,,bangla-bert
2508,849227,‡¶è‡¶á ‡¶´‡¶ï‡¶ø‡¶®‡¶®‡¶ø ‡¶Æ‡¶æ‡¶ó‡ßÄ‡¶ü‡¶æ ‡¶Ü‡¶∞ ‡¶ï‡¶§ ‡¶®‡¶æ‡¶ü‡¶ï ‡¶¶‡ßá‡¶ñ‡¶æ‡¶¨‡ßá,Profane,bangla-bert
2509,532697,‡¶¶‡ßá‡¶ñ‡ßã ‡¶Ü‡¶ú‡¶ï‡ßá ‡¶ï‡¶æ‡¶∞ ‡¶´‡¶ø‡¶ü‡¶®‡ßá‡¶∏ ‡¶ï‡ßã‡¶•‡¶æ‡¶Ø‡¶º ‡¶¶‡¶æ‡¶Å‡¶°‡¶º‡¶ø‡¶Ø‡¶º‡ßá‡¶õ‡ßá ‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ö...,Profane,bangla-bert
2510,861411,‡¶õ‡ßã‡¶ü ‡¶≠‡¶æ‡¶á‡¶ü‡¶ø‡¶∞ ‡¶™‡¶æ‡¶∏‡ßá ‡¶•‡¶æ‡¶ï‡ßÅ‡¶® ‡¶ó‡ßá‡¶Æ ‡¶≠‡¶ø‡ßú‡¶ø‡¶ì ‡¶¨‡¶æ‡¶®‡¶æ‡¶á,,bangla-bert


In [36]:
#test_df['model']='bert-base-multilingual-cased'

In [37]:
# List the columns currently present on the test frame.
test_df.columns

Index(['id', 'text', 'label', 'model'], dtype='object')

In [38]:
# Keep only the columns written to the submission file, in this order.
# Note that this rebinds test_df, so the 'text' column is no longer available
# on it afterwards.
test_df = test_df.loc[:, ['id', 'label', 'model']]
test_df

Unnamed: 0,id,label,model
0,879187,,bangla-bert
1,316919,Profane,bangla-bert
2,916242,,bangla-bert
3,786824,,bangla-bert
4,47284,,bangla-bert
...,...,...,...
2507,776466,,bangla-bert
2508,849227,Profane,bangla-bert
2509,532697,Profane,bangla-bert
2510,861411,,bangla-bert


In [39]:
# Write the submission as a tab-separated file without the index column.
# The confirmation message is built from the same path that is written, so
# it cannot name a different file (it previously said "final_ensemble.tsv").
output_path = "final_bert_v2.tsv"
test_df.to_csv(output_path, sep="\t", index=False)
print(f"Saved to {output_path}")

Saved to final_ensemble.tsv
