In [21]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


In [22]:
train_file = 'blp25_hatespeech_subtask_1A_train.tsv'
validation_file = 'blp25_hatespeech_subtask_1A_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1A_dev_test.tsv'

In [23]:
#tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Load train/val/test DataFrames
train_df = pd.read_csv(train_file, sep="\t")
dev_df = pd.read_csv(validation_file , sep="\t")
test_df = pd.read_csv(test_file, sep="\t")





In [4]:
l2id = {
    'None': 0,
    'Religious Hate': 1,
    'Sexism': 2,
    'Political Hate': 3,
    'Profane': 4,
    'Abusive': 5
}
id2l = {v: k for k, v in l2id.items()}


def clean_label(x):
    # handle missing or NaN ‚Üí "None"
    if pd.isna(x) or x == 'None':
        return 'None'
    # already list-like e.g. ['Abusive']
    if isinstance(x, list):
        return x[0] if len(x) > 0 else 'None'
    # string cases like "[]" or "[Abusive]" or "[Political Hate]"
    x = x.strip("[]").strip()
    if x == "":
        return 'None'
    return x


def process_df(df):
    # Ensure labels are proper lists
    df["label"] = df["label"].apply(clean_label)
    df["label"] = df["label"].fillna("None")
    # Now create binary label
    df["toxic"] = df["label"].apply(lambda x: 0 if x == "None" else 1)
    df["label_id"] = df["label"].map(l2id)

    

  

    return df

train_df = process_df(train_df)
dev_df  = process_df(dev_df)

train_df

Unnamed: 0,id,text,label,toxic,label_id
0,147963,‡¶ß‡¶®‡ßç‡¶Ø‡¶¨‡¶æ‡¶¶ ‡¶¨‡¶∞‡ßç‡¶°‡¶æ‡¶∞ ‡¶ó‡¶æ‡¶∞‡ßç‡¶° ‡¶¶‡ßá‡¶∞‡¶ï‡ßá ‡¶è‡¶≠‡¶æ‡¶¨‡ßá ‡¶™‡¶æ‡¶π‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡¶§‡ßá ‡¶π...,,0,0
1,214275,‡¶õ‡ßã‡¶ü‡¶¨‡ßá‡¶≤‡¶æ‡¶Ø‡¶º ‡¶Ö‡¶®‡ßá‡¶ï ‡¶ï‡¶∑‡ßç‡¶ü ‡¶ï‡¶∞‡ßá ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ó‡¶æ‡¶≤‡¶æ‡¶ó‡¶æ‡¶≤‡¶ø ‡¶∂‡¶ø‡¶ñ‡¶õ‡¶ø‡¶≤‡¶æ‡¶Æ...,,0,0
2,849172,‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá,Abusive,1,5
3,821985,‡¶ö‡¶ø‡¶® ‡¶≠‡¶æ‡¶∞‡¶§ ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ ‡¶è‡¶á ‡¶§‡¶ø‡¶® ‡¶¶‡ßá‡¶∂ ‡¶è‡¶ï ‡¶•‡¶æ‡¶ï‡¶≤‡ßá ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶ï‡ßá ‡¶∂‡¶æ...,,0,0
4,477288,‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...,Abusive,1,5
...,...,...,...,...,...
35517,790325,‡¶§‡¶á‡¶ì‡ßü‡¶æ‡¶®‡ßá‡¶∞ ‡¶è‡¶§ ‡¶ï‡ßç‡¶∑‡¶Æ‡¶§‡¶æ ‡¶π‡ßü‡¶®‡¶ø ‡¶Ø‡ßá ‡¶è‡¶ï ‡¶ü‡ßÅ‡¶ï‡¶∞‡ßã ‡¶ú‡¶æ‡ßü‡¶ó‡¶æ ‡¶®‡¶∑‡ßç‡¶ü...,,0,0
35518,328377,‡¶ö‡ßÅ‡¶∞‡ßá‡¶∞ ‡¶ò‡¶∞‡ßá‡¶∞ ‡¶ö‡ßÅ‡¶∞ ‡¶π‡¶æ‡¶≤‡¶æ,Profane,1,4
35519,69803,‡¶ú‡¶æ‡¶π‡¶æ‡¶ô‡ßç‡¶ó‡ßÄ‡¶∞ ‡¶¨‡ßÅ‡¶¶‡ßç‡¶ß‡¶ø ‡¶®‡ßá‡¶á ‡¶Æ‡¶æ‡¶†‡ßá ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶Ø‡¶æ‡¶¨‡ßá,Abusive,1,5
35520,419984,‡¶è‡¶ï‡¶ü‡¶æ ‡¶´‡ßá‡¶á‡¶≤‡ßç‡¶° ‡¶è‡¶∏‡ßç‡¶ü‡ßá‡¶ü ‡¶è‡¶ì ‡¶∏‡ßÅ‡¶∑‡ßç‡¶†‡ßÅ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶π‡ßü ‡¶®‡ßá‡¶§‡¶æ‡¶∞...,Abusive,1,5


In [5]:

# Display the result
print(train_df[['text', 'toxic']].head())

                                                text  toxic
0  ‡¶ß‡¶®‡ßç‡¶Ø‡¶¨‡¶æ‡¶¶ ‡¶¨‡¶∞‡ßç‡¶°‡¶æ‡¶∞ ‡¶ó‡¶æ‡¶∞‡ßç‡¶° ‡¶¶‡ßá‡¶∞‡¶ï‡ßá ‡¶è‡¶≠‡¶æ‡¶¨‡ßá ‡¶™‡¶æ‡¶π‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡¶§‡ßá ‡¶π...      0
1  ‡¶õ‡ßã‡¶ü‡¶¨‡ßá‡¶≤‡¶æ‡¶Ø‡¶º ‡¶Ö‡¶®‡ßá‡¶ï ‡¶ï‡¶∑‡ßç‡¶ü ‡¶ï‡¶∞‡ßá ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ó‡¶æ‡¶≤‡¶æ‡¶ó‡¶æ‡¶≤‡¶ø ‡¶∂‡¶ø‡¶ñ‡¶õ‡¶ø‡¶≤‡¶æ‡¶Æ...      0
2          ‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá      1
3  ‡¶ö‡¶ø‡¶® ‡¶≠‡¶æ‡¶∞‡¶§ ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ ‡¶è‡¶á ‡¶§‡¶ø‡¶® ‡¶¶‡ßá‡¶∂ ‡¶è‡¶ï ‡¶•‡¶æ‡¶ï‡¶≤‡ßá ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶ï‡ßá ‡¶∂‡¶æ...      0
4  ‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...      1


In [6]:
train_df.columns

Index(['id', 'text', 'label', 'toxic', 'label_id'], dtype='object')

In [7]:
# Filter for toxic comments only
toxic_train_df = train_df[train_df['toxic'] == 1].copy()
toxic_dev_df = dev_df[dev_df['toxic'] == 1].copy()


# Display the result
print(toxic_train_df[['text', 'label']].head())

                                                 text    label
2           ‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá  Abusive
4   ‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...  Abusive
7   ‡¶á‡¶∞‡¶æ‡¶® ‡¶™‡¶æ‡¶∞‡¶Æ‡¶æ‡¶£‡¶¨‡¶ø‡¶ï ‡¶¨‡ßã‡¶Æ‡¶æ ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶ø...  Abusive
11  ‡¶Æ‡ßÅ‡¶∏‡¶≤‡¶ø‡¶Æ ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ‡¶ó‡ßÅ‡¶≤‡ßã ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶™‡ßá‡¶∞‡ßá ‡¶™‡ßá‡¶∞‡ßá ‡¶ó‡ßã‡¶ü‡¶æ ‡¶™‡ßÉ‡¶•‡¶ø‡¶¨‡ßÄ...  Profane
15                                   ‡¶¨‡¶æ‡ßú‡¶ø ‡¶¨‡¶æ‡¶≤ ‡¶´‡¶æ‡¶≤‡¶æ‡ßü‡¶õ‡ßá  Profane


In [80]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Ensure your labels are numpy arrays
labels = train_df['toxic'].to_numpy()
class_names = np.unique(labels)

# Calculate weights
weights = compute_class_weight(class_weight='balanced', classes=class_names, y=labels)

print(f"Class names: {class_names}")
print(f"Calculated weights: {weights}")



Class names: [0 1]
Calculated weights: [0.89009722 1.14086588]


In [39]:
# import torch
# import torch.nn as nn
# from transformers import AutoModel, AutoConfig, PreTrainedModel

# class AttentionPooling(nn.Module):
#     def __init__(self, hidden_size):
#         super().__init__()
#         self.attention = nn.Linear(hidden_size, 1)

#     def forward(self, hidden_states, mask):
#         # hidden_states: [batch_size, seq_len, hidden_size]
#         # mask: [batch_size, seq_len]
        
#         # Compute raw attention scores
#         scores = self.attention(hidden_states).squeeze(-1)  # [batch_size, seq_len]
        
#         # Apply mask (very important!)
#         scores = scores.masked_fill(mask == 0, -1e9)
        
#         # Normalize into probabilities
#         attn_weights = torch.softmax(scores, dim=-1)  # [batch_size, seq_len]
        
#         # Weighted sum
#         pooled = torch.sum(hidden_states * attn_weights.unsqueeze(-1), dim=1)  # [batch_size, hidden_size]
#         return pooled


# class BanglaBERTWithAttention(PreTrainedModel):
#     def __init__(self, model_name, num_labels):
#         config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
#         super().__init__(config)
        
#         self.bert = AutoModel.from_pretrained(model_name, config=config)
#         self.attention = AttentionPooling(config.hidden_size)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.classifier = nn.Linear(config.hidden_size, num_labels)

#     def forward(self, input_ids, attention_mask=None, labels=None):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        
#         # Get last hidden state
#         hidden_states = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
        
#         # Apply attention pooling
#         pooled_output = self.attention(hidden_states, attention_mask)
#         pooled_output = self.dropout(pooled_output)
        
#         # Classification head
#         logits = self.classifier(pooled_output)
        
#         loss = None
#         if labels is not None:
#             loss_fn = nn.CrossEntropyLoss()
#             loss = loss_fn(logits, labels)
        
#         return {"loss": loss, "logits": logits}

import torch
import torch.nn as nn
from transformers import AutoModel, AutoConfig, PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

class BanglaBERTWithLSTM(PreTrainedModel):
    def __init__(self, model_name, num_labels, class_weights=None):
        config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
        super().__init__(config)

        self.bert = AutoModel.from_pretrained(model_name, config=config)

        self.lstm = nn.LSTM(
            input_size=config.hidden_size,
            hidden_size=config.hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=False
        )

        classifier_input_size = config.hidden_size * 2  # CLS + LSTM hidden

        self.classifier_head = nn.Sequential(
            nn.LayerNorm(classifier_input_size),
            nn.Linear(classifier_input_size, config.hidden_size),
            nn.GELU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, num_labels)
        )

        # ‚úÖ Properly register class weights
        if class_weights is not None:
            self.register_buffer("class_weights", torch.tensor(class_weights, dtype=torch.float))
        else:
            self.class_weights = None

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        hidden_states = outputs.last_hidden_state  # [batch, seq, hidden]

        cls_output = hidden_states[:, 0, :]  # [batch, hidden]

        # ‚úÖ Use final hidden state from LSTM instead of last timestep
        _, (h_n, _) = self.lstm(hidden_states)
        lstm_pooled = h_n[-1]  # [batch, hidden]

        combined_output = torch.cat((cls_output, lstm_pooled), dim=1)

        logits = self.classifier_head(combined_output)

        loss = None
        if labels is not None:
            labels = labels.long()
            loss_fn = nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fn(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [91]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding  # <-- Import the data collator
)
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# --- 1. Load Tokenizer and Model ---
model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_1 = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


# def truncate_long_texts(df, max_chars=10000):
#     df['text'] = df['text'].astype(str).apply(lambda x: x[:max_chars])
#     return df

# train_df = truncate_long_texts(train_df)
# dev_df = truncate_long_texts(dev_df)

# --- 2. Create Datasets ---
train_dataset_1 = Dataset.from_pandas(train_df[['text', 'toxic']].rename(columns={'toxic': 'label'}))
dev_dataset_1 = Dataset.from_pandas(dev_df[['text', 'toxic']].rename(columns={'toxic': 'label'}))

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=512)



train_dataset_1 = train_dataset_1.map(tokenize, batched=True)
dev_dataset_1 = dev_dataset_1.map(tokenize, batched=True)

# Set the format to torch tensors and specify columns
train_dataset_1.set_format("torch", columns=["input_ids", "attention_mask", "label"])
dev_dataset_1.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# --- 3. Define Training ---

# Create a data collator that will dynamically pad the batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics_binary(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds), "f1": f1_score(p.label_ids, preds)}

training_args_1 = TrainingArguments(
    output_dir="./results_model_1",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs_1",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer_1 = Trainer(
    model=model_1,
    args=training_args_1,
    train_dataset=train_dataset_1,
    eval_dataset=dev_dataset_1,
    compute_metrics=compute_metrics_binary,
    data_collator=data_collator, # <-- Add the data collator here
)

# --- 4. Start Fine-tuning ---
trainer_1.train()
trainer_1.save_model("./final_model_1")

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/35522 [00:00<?, ? examples/s]

Map:   0%|          | 0/2512 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4719,0.47636,0.77707,0.727626
2,0.4223,0.483991,0.793392,0.758267
3,0.3456,0.541837,0.776672,0.755129
4,0.2707,0.704302,0.774682,0.743194
5,0.2366,0.696191,0.763137,0.728682
6,0.1813,0.931934,0.768312,0.728291
7,0.1522,0.926242,0.766322,0.728366
8,0.1157,1.036777,0.764331,0.724138
9,0.0968,1.177546,0.765525,0.719657
10,0.0753,1.235437,0.769108,0.720077


In [23]:
THis one works good
# from transformers import (
#     AutoTokenizer, 
#     AutoModelForSequenceClassification, 
#     Trainer, 
#     TrainingArguments,
#     DataCollatorWithPadding  # <-- Import the data collator
# )
# from datasets import Dataset
# import numpy as np
# from sklearn.metrics import accuracy_score, f1_score

# # --- 1. Load Tokenizer and Model ---
# model_name = "sagorsarker/bangla-bert-base"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model_1 = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


# # def truncate_long_texts(df, max_chars=10000):
# #     df['text'] = df['text'].astype(str).apply(lambda x: x[:max_chars])
# #     return df

# # train_df = truncate_long_texts(train_df)
# # dev_df = truncate_long_texts(dev_df)

# # --- 2. Create Datasets ---
# train_dataset_1 = Dataset.from_pandas(train_df[['text', 'toxic']].rename(columns={'toxic': 'label'}))
# dev_dataset_1 = Dataset.from_pandas(dev_df[['text', 'toxic']].rename(columns={'toxic': 'label'}))

# def tokenize(batch):
#     return tokenizer(batch['text'], truncation=True, max_length=512)



# train_dataset_1 = train_dataset_1.map(tokenize, batched=True)
# dev_dataset_1 = dev_dataset_1.map(tokenize, batched=True)

# # Set the format to torch tensors and specify columns
# train_dataset_1.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# dev_dataset_1.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# # --- 3. Define Training ---

# # Create a data collator that will dynamically pad the batches
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# def compute_metrics_binary(p):
#     preds = np.argmax(p.predictions, axis=1)
#     return {"accuracy": accuracy_score(p.label_ids, preds), "f1": f1_score(p.label_ids, preds)}

# training_args_1 = TrainingArguments(
#     output_dir="./results_model_1",
#     num_train_epochs=10,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     evaluation_strategy="epoch",
#     logging_dir="./logs_1",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
# )

# trainer_1 = Trainer(
#     model=model_1,
#     args=training_args_1,
#     train_dataset=train_dataset_1,
#     eval_dataset=dev_dataset_1,
#     compute_metrics=compute_metrics_binary,
#     data_collator=data_collator, # <-- Add the data collator here
# )

# # --- 4. Start Fine-tuning ---
# trainer_1.train()
# trainer_1.save_model("./final_model_1")

SyntaxError: invalid syntax (452620182.py, line 1)

In [92]:
toxic_train_df

Unnamed: 0,id,text,label,toxic,label_id
2,849172,‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá,4,1,5
4,477288,‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...,4,1,5
7,786609,‡¶á‡¶∞‡¶æ‡¶® ‡¶™‡¶æ‡¶∞‡¶Æ‡¶æ‡¶£‡¶¨‡¶ø‡¶ï ‡¶¨‡ßã‡¶Æ‡¶æ ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶ø...,4,1,5
11,124917,‡¶Æ‡ßÅ‡¶∏‡¶≤‡¶ø‡¶Æ ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ‡¶ó‡ßÅ‡¶≤‡ßã ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶™‡ßá‡¶∞‡ßá ‡¶™‡ßá‡¶∞‡ßá ‡¶ó‡ßã‡¶ü‡¶æ ‡¶™‡ßÉ‡¶•‡¶ø‡¶¨‡ßÄ...,3,1,4
15,432369,‡¶¨‡¶æ‡ßú‡¶ø ‡¶¨‡¶æ‡¶≤ ‡¶´‡¶æ‡¶≤‡¶æ‡ßü‡¶õ‡ßá,3,1,4
...,...,...,...,...,...
35514,668158,‡¶ï‡ßÉ‡¶∑‡¶ø ‡¶Æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßÄ‡¶∞ ‡¶∏‡¶æ‡¶∞‡ßá‡¶∞ ‡¶¨‡ßç‡¶Ø‡¶¨‡¶∏‡¶æ ‡¶ï‡¶∞‡ßá ‡¶§‡¶æ‡¶á ‡¶è ‡¶Ö‡¶¨‡¶∏‡ßç‡¶•‡¶æ,2,1,3
35518,328377,‡¶ö‡ßÅ‡¶∞‡ßá‡¶∞ ‡¶ò‡¶∞‡ßá‡¶∞ ‡¶ö‡ßÅ‡¶∞ ‡¶π‡¶æ‡¶≤‡¶æ,3,1,4
35519,69803,‡¶ú‡¶æ‡¶π‡¶æ‡¶ô‡ßç‡¶ó‡ßÄ‡¶∞ ‡¶¨‡ßÅ‡¶¶‡ßç‡¶ß‡¶ø ‡¶®‡ßá‡¶á ‡¶Æ‡¶æ‡¶†‡ßá ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶Ø‡¶æ‡¶¨‡ßá,4,1,5
35520,419984,‡¶è‡¶ï‡¶ü‡¶æ ‡¶´‡ßá‡¶á‡¶≤‡ßç‡¶° ‡¶è‡¶∏‡ßç‡¶ü‡ßá‡¶ü ‡¶è‡¶ì ‡¶∏‡ßÅ‡¶∑‡ßç‡¶†‡ßÅ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶π‡ßü ‡¶®‡ßá‡¶§‡¶æ‡¶∞...,4,1,5


In [57]:
toxic_dev_df

Unnamed: 0,id,text,label,toxic,label_id
0,166449,‡¶á‡¶®‡ßç‡¶°‡¶ø‡ßü‡¶æ ‡¶ï‡¶ø ‡¶Æ‡¶æ‡¶õ ‡¶ß‡¶∞‡¶æ ‡¶¨‡¶®‡ßç‡¶ß ‡¶∞‡¶æ‡¶ñ‡¶õ‡ßá‡¶è‡¶ï ‡¶®‡¶¶‡ßÄ‡¶§‡ßá ‡¶¶‡ßÅ‡¶á‡¶®‡ßÄ‡¶§‡¶ø ...,2,1,3
1,267692,‡¶≤‡¶ï‡ßç‡¶∑ ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶ò‡ßÅ‡¶∑ ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá ‡¶Ö‡¶Ø‡ßã‡¶ó‡ßç‡¶Ø ‡¶Ü‡¶∞ ‡¶¶‡¶æ‡¶Ø‡¶º‡¶ø‡¶§‡ßç‡¶¨‡¶π‡ßÄ‡¶® ‡¶Æ‡¶æ‡¶®‡¶∏...,4,1,5
3,939131,‡¶Ü‡¶∞ ‡¶ï‡¶§‡ßã ‡¶∂‡¶ø‡¶ñ‡¶¨‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá‡¶∞‡¶æ ‡¶è‡¶ó‡ßÅ‡¶≤‡ßã ‡¶ï‡ßá ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶¶...,4,1,5
4,210284,‡¶ï‡¶ø ‡¶∏‡¶æ‡¶Ç‡¶ò‡¶æ‡¶§‡¶ø‡¶ï ‡¶≠‡¶æ‡¶á ‡¶∞‡ßá ‡¶§‡ßÅ‡¶á,4,1,5
5,712332,‡¶≤‡¶û‡ßç‡¶ö ‡¶Æ‡¶æ‡¶≤‡¶ø‡¶ï‡¶¶‡ßá‡¶∞ ‡¶Ö‡¶≠‡¶ø‡¶∂‡¶™‡ßç‡¶§ ‡¶ö‡¶ï‡ßç‡¶∑‡ßÅ ‡¶™‡¶¶‡ßç‡¶Æ‡¶æ ‡¶∏‡ßá‡¶§‡ßÅ‡¶∞ ‡¶â‡¶™‡¶∞,4,1,5
...,...,...,...,...,...
2496,653048,‡¶ï‡¶ø‡¶∞‡ßá ‡¶Æ‡¶æ‡¶®‡¶ø‡¶ï ‡¶ö‡ßã‡¶∞‡¶æ ‡¶§‡ßÅ‡¶á‡¶ì ‡¶Ü‡¶õ‡ßã‡¶∏,4,1,5
2503,121961,‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶∏‡¶¨‡¶á ‡¶§‡ßã ‡¶ö‡ßÅ‡¶∞‡¶ø ‡¶π‡ßü‡ßá ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßá ‡¶Ü‡¶∞ ‡¶ö‡ßã‡¶∞‡¶¶‡ßá‡¶∞ ‡¶ß‡¶∞‡¶æ‡¶∞ ‡¶ï‡ßã...,4,1,5
2504,555021,‡¶´‡¶ï‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶¶‡ßá‡¶∂ ‡¶ï‡¶æ‡¶∞‡ßá‡¶®‡ßç‡¶ü ‡¶®‡¶æ‡¶á,4,1,5
2505,858412,‡¶ï‡¶æ‡¶ï‡ßÅ ‡¶∏‡¶æ‡¶Æ‡¶®‡ßá ‡¶ö‡¶∂‡¶Æ‡¶æ ‡¶•‡¶æ‡¶ï‡¶¨‡ßá ‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ ‡¶ó‡ßç‡¶≤‡¶æ‡¶∏ ‡¶•‡¶æ‡¶ï‡¶¨‡ßá‡¶®‡¶æ‡¶™‡¶æ‡¶õ‡¶æ...,3,1,4


In [55]:

l2id = {
   
    'Religious Hate': 0,
    'Sexism': 1,
    'Political Hate': 2,
    'Profane': 3,
    'Abusive': 4
}
id2l = {v: k for k, v in l2id.items()}


toxic_train_df["label"] = toxic_train_df["label"].map(l2id)
toxic_train_df['label'].unique()

toxic_dev_df["label"] = toxic_dev_df["label"].map(l2id)
toxic_dev_df['label'].unique()

array([2, 4, 3, 1, 0])

In [56]:
toxic_train_df

Unnamed: 0,id,text,label,toxic,label_id
2,849172,‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶è ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶¨‡¶æ‡¶¶‡ßÅ‡¶∞ ‡¶¨‡¶æ‡¶®‡¶æ‡¶á‡ßü‡¶æ ‡¶´‡ßá‡¶≤‡¶õ‡ßá‡¶® ‡¶∞‡ßá,4,1,5
4,477288,‡¶è‡¶ü‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡ßá ‡¶ï‡¶∞‡¶¨‡ßá‡¶Ø‡ßá ‡¶¨‡¶ø‡¶ö‡¶æ‡¶∞ ‡¶ï‡¶∞‡¶¨‡ßá ‡¶∏‡ßá‡¶á ‡¶§‡ßã ‡¶π‡¶≤‡ßã ‡¶è‡¶á ...,4,1,5
7,786609,‡¶á‡¶∞‡¶æ‡¶® ‡¶™‡¶æ‡¶∞‡¶Æ‡¶æ‡¶£‡¶¨‡¶ø‡¶ï ‡¶¨‡ßã‡¶Æ‡¶æ ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶æ‡¶®‡¶æ‡¶¨‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶≤‡¶§‡ßá ‡¶¨‡¶ø...,4,1,5
11,124917,‡¶Æ‡ßÅ‡¶∏‡¶≤‡¶ø‡¶Æ ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ‡¶ó‡ßÅ‡¶≤‡ßã ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶™‡ßá‡¶∞‡ßá ‡¶™‡ßá‡¶∞‡ßá ‡¶ó‡ßã‡¶ü‡¶æ ‡¶™‡ßÉ‡¶•‡¶ø‡¶¨‡ßÄ...,3,1,4
15,432369,‡¶¨‡¶æ‡ßú‡¶ø ‡¶¨‡¶æ‡¶≤ ‡¶´‡¶æ‡¶≤‡¶æ‡ßü‡¶õ‡ßá,3,1,4
...,...,...,...,...,...
35514,668158,‡¶ï‡ßÉ‡¶∑‡¶ø ‡¶Æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßÄ‡¶∞ ‡¶∏‡¶æ‡¶∞‡ßá‡¶∞ ‡¶¨‡ßç‡¶Ø‡¶¨‡¶∏‡¶æ ‡¶ï‡¶∞‡ßá ‡¶§‡¶æ‡¶á ‡¶è ‡¶Ö‡¶¨‡¶∏‡ßç‡¶•‡¶æ,2,1,3
35518,328377,‡¶ö‡ßÅ‡¶∞‡ßá‡¶∞ ‡¶ò‡¶∞‡ßá‡¶∞ ‡¶ö‡ßÅ‡¶∞ ‡¶π‡¶æ‡¶≤‡¶æ,3,1,4
35519,69803,‡¶ú‡¶æ‡¶π‡¶æ‡¶ô‡ßç‡¶ó‡ßÄ‡¶∞ ‡¶¨‡ßÅ‡¶¶‡ßç‡¶ß‡¶ø ‡¶®‡ßá‡¶á ‡¶Æ‡¶æ‡¶†‡ßá ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶Ø‡¶æ‡¶¨‡ßá,4,1,5
35520,419984,‡¶è‡¶ï‡¶ü‡¶æ ‡¶´‡ßá‡¶á‡¶≤‡ßç‡¶° ‡¶è‡¶∏‡ßç‡¶ü‡ßá‡¶ü ‡¶è‡¶ì ‡¶∏‡ßÅ‡¶∑‡ßç‡¶†‡ßÅ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶π‡ßü ‡¶®‡ßá‡¶§‡¶æ‡¶∞...,4,1,5


In [58]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Ensure your labels are numpy arrays
labels = toxic_train_df['label'].to_numpy()
class_names = np.unique(labels)

# Calculate weights
weights = compute_class_weight(class_weight='balanced', classes=class_names, y=labels)

print(f"Class names: {class_names}")
print(f"Calculated weights: {weights}")



Class names: [0 1 2 3 4]
Calculated weights: [ 4.60591716 25.52131148  0.73659806  1.33573574  0.37915246]


In [31]:
# # # import torch
# # # import torch.nn as nn
# # # from transformers import AutoModel, AutoConfig, PreTrainedModel

# # # class AttentionPooling(nn.Module):
# # #     def __init__(self, hidden_size):
# # #         super().__init__()
# # #         self.attention = nn.Linear(hidden_size, 1)

# # #     def forward(self, hidden_states, mask):
# # #         # hidden_states: [batch_size, seq_len, hidden_size]
# # #         # mask: [batch_size, seq_len]
        
# # #         # Compute raw attention scores
# # #         scores = self.attention(hidden_states).squeeze(-1)  # [batch_size, seq_len]
        
# # #         # Apply mask (very important!)
# # #         scores = scores.masked_fill(mask == 0, -1e9)
        
# # #         # Normalize into probabilities
# # #         attn_weights = torch.softmax(scores, dim=-1)  # [batch_size, seq_len]
        
# # #         # Weighted sum
# # #         pooled = torch.sum(hidden_states * attn_weights.unsqueeze(-1), dim=1)  # [batch_size, hidden_size]
# # #         return pooled


# # # class BanglaBERTWithAttention(PreTrainedModel):
# # #     def __init__(self, model_name, num_labels):
# # #         config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
# # #         super().__init__(config)
        
# # #         self.bert = AutoModel.from_pretrained(model_name, config=config)
# # #         self.attention = AttentionPooling(config.hidden_size)
# # #         self.dropout = nn.Dropout(config.hidden_dropout_prob)
# # #         self.classifier = nn.Linear(config.hidden_size, num_labels)

# # #     def forward(self, input_ids, attention_mask=None, labels=None):
# # #         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        
# # #         # Get last hidden state
# # #         hidden_states = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
        
# # #         # Apply attention pooling
# # #         pooled_output = self.attention(hidden_states, attention_mask)
# # #         pooled_output = self.dropout(pooled_output)
        
# # #         # Classification head
# # #         logits = self.classifier(pooled_output)
        
# # #         loss = None
# # #         if labels is not None:
# # #             loss_fn = nn.CrossEntropyLoss()
# # #             loss = loss_fn(logits, labels)
        
# # #         return {"loss": loss, "logits": logits}


# # import torch
# # import torch.nn as nn
# # from transformers import AutoModel, AutoConfig, PreTrainedModel
# # from transformers.modeling_outputs import SequenceClassifierOutput


# # class MultiHeadAttentionPooling(nn.Module):
# #     def __init__(self, hidden_size, num_heads=4):
# #         super().__init__()
# #         self.num_heads = num_heads
# #         self.proj = nn.Linear(hidden_size, num_heads)
# #         nn.init.xavier_uniform_(self.proj.weight)
# #         if self.proj.bias is not None:
# #             nn.init.zeros_(self.proj.bias)

# #     def forward(self, hidden_states, mask):
# #         # hidden_states: [batch, seq, hidden]
# #         # mask: [batch, seq]
# #         scores = self.proj(hidden_states)  # [batch, seq, heads]
# #         scores = scores.masked_fill(mask.unsqueeze(-1) == 0, -1e9)

# #         attn_weights = torch.softmax(scores, dim=1)  # [batch, seq, heads]

# #         # Weighted sum per head
# #         pooled = torch.einsum(
# #             "bsh,bsd->bhd", attn_weights, hidden_states
# #         )  # [batch, heads, hidden]

# #         return pooled.reshape(hidden_states.size(0), -1)  # concat heads


# # class BanglaBERTWithAttention(PreTrainedModel):
# #     def __init__(self, model_name, num_labels, class_weights=None, num_heads=4):
# #         config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
# #         super().__init__(config)

# #         self.bert = AutoModel.from_pretrained(model_name, config=config)
# #         self.attention = MultiHeadAttentionPooling(config.hidden_size, num_heads=num_heads)
# #         self.norm = nn.LayerNorm(config.hidden_size * num_heads)
# #         self.dropout = nn.Dropout(config.hidden_dropout_prob)
# #         self.classifier = nn.Linear(config.hidden_size * num_heads, num_labels)

# #         # Handle class imbalance
# #         if class_weights is not None:
# #             self.register_buffer("class_weights", torch.tensor(class_weights))
# #         else:
# #             self.class_weights = None

# #     def forward(self, input_ids, attention_mask=None, labels=None):
# #         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)

# #         hidden_states = outputs.last_hidden_state  # [batch, seq, hidden]

# #         pooled_output = self.attention(hidden_states, attention_mask)
# #         pooled_output = self.norm(pooled_output)
# #         pooled_output = self.dropout(pooled_output)

# #         logits = self.classifier(pooled_output)

# #         if labels is not None:
# #             if self.config.num_labels == 1:
# #                 # regression
# #                 loss_fn = nn.MSELoss()
# #                 loss = loss_fn(logits.view(-1), labels.view(-1))
# #             else:
# #                 # classification
# #                 labels = labels.long()   # <---- important fix
# #                 loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
# #                 loss = loss_fn(logits, labels)


# #         return SequenceClassifierOutput(
# #             loss=loss,
# #             logits=logits,
# #             hidden_states=outputs.hidden_states,
# #             attentions=outputs.attentions,
# #         )




# # import torch
# # import torch.nn as nn
# # from transformers import AutoModel, AutoConfig, PreTrainedModel

# # class AttentionPooling(nn.Module):
# #     def __init__(self, hidden_size):
# #         super().__init__()
# #         self.attention = nn.Linear(hidden_size, 1)

# #     def forward(self, hidden_states, mask):
# #         # hidden_states: [batch_size, seq_len, hidden_size]
# #         # mask: [batch_size, seq_len]
        
# #         # Compute raw attention scores
# #         scores = self.attention(hidden_states).squeeze(-1)  # [batch_size, seq_len]
        
# #         # Apply mask (very important!)
# #         scores = scores.masked_fill(mask == 0, -1e9)
        
# #         # Normalize into probabilities
# #         attn_weights = torch.softmax(scores, dim=-1)  # [batch_size, seq_len]
        
# #         # Weighted sum
# #         pooled = torch.sum(hidden_states * attn_weights.unsqueeze(-1), dim=1)  # [batch_size, hidden_size]
# #         return pooled


# # class BanglaBERTWithAttention(PreTrainedModel):
# #     def __init__(self, model_name, num_labels):
# #         config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
# #         super().__init__(config)
        
# #         self.bert = AutoModel.from_pretrained(model_name, config=config)
# #         self.attention = AttentionPooling(config.hidden_size)
# #         self.dropout = nn.Dropout(config.hidden_dropout_prob)
# #         self.classifier = nn.Linear(config.hidden_size, num_labels)

# #     def forward(self, input_ids, attention_mask=None, labels=None):
# #         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        
# #         # Get last hidden state
# #         hidden_states = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
        
# #         # Apply attention pooling
# #         pooled_output = self.attention(hidden_states, attention_mask)
# #         pooled_output = self.dropout(pooled_output)
        
# #         # Classification head
# #         logits = self.classifier(pooled_output)
        
# #         loss = None
# #         if labels is not None:
# #             loss_fn = nn.CrossEntropyLoss()
# #             loss = loss_fn(logits, labels)
        
# #         return {"loss": loss, "logits": logits}


# import torch
# import torch.nn as nn
# from transformers import AutoModel, AutoConfig, PreTrainedModel
# from transformers.modeling_outputs import SequenceClassifierOutput


# import torch
# import torch.nn as nn
# from transformers import AutoModel, AutoConfig, PreTrainedModel
# from transformers.modeling_outputs import SequenceClassifierOutput

# # Your MultiHeadAttentionPooling class remains the same, it's well-implemented.
# class MultiHeadAttentionPooling(nn.Module):
#     def __init__(self, hidden_size, num_heads=4):
#         super().__init__()
#         self.num_heads = num_heads
#         self.proj = nn.Linear(hidden_size, num_heads)
#         nn.init.xavier_uniform_(self.proj.weight)
#         if self.proj.bias is not None:
#             nn.init.zeros_(self.proj.bias)

#     def forward(self, hidden_states, mask):
#         # hidden_states: [batch, seq, hidden]
#         # mask: [batch, seq]
#         scores = self.proj(hidden_states)  # [batch, seq, heads]
#         scores = scores.masked_fill(mask.unsqueeze(-1) == 0, -1e9)
#         attn_weights = torch.softmax(scores, dim=1)  # [batch, seq, heads]
#         # Weighted sum per head
#         pooled = torch.einsum(
#             "bsh,bsd->bhd", attn_weights, hidden_states
#         )  # [batch, heads, hidden]
#         return pooled.reshape(hidden_states.size(0), -1)  # concat heads


# class BanglaBERTWithAttention(PreTrainedModel):
#     # ... (your existing __init__ method) ...

#     def forward(self, input_ids, attention_mask=None, labels=None):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)

#         hidden_states = outputs.last_hidden_state
#         cls_output = hidden_states[:, 0, :]
#         attention_pooled_output = self.attention(hidden_states, attention_mask)
#         combined_output = torch.cat((cls_output, attention_pooled_output), dim=1)
#         logits = self.classifier_head(combined_output)

#         loss = None
#         if labels is not None:
#             # ‚≠êÔ∏è Fix: Ensure the labels are of type torch.long
#             labels = labels.long()
#             loss_fn = nn.CrossEntropyLoss(weight=self.class_weights)
#             loss = loss_fn(logits, labels)

#         return SequenceClassifierOutput(
#             loss=loss,
#             logits=logits,
#             hidden_states=outputs.hidden_states,
#             attentions=outputs.attentions,
#         )

#     def forward(self, input_ids, attention_mask=None, labels=None):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        
#         hidden_states = outputs.last_hidden_state  # [batch, seq, hidden]
        
#         # ‚≠êÔ∏è 1. Get the standard [CLS] token representation
#         cls_output = hidden_states[:, 0, :]  # [batch, hidden]
        
#         # 2. Get your custom attention-pooled representation
#         attention_pooled_output = self.attention(hidden_states, attention_mask)
        
#         # ‚≠êÔ∏è 3. Concatenate both representations to combine their strengths
#         combined_output = torch.cat((cls_output, attention_pooled_output), dim=1)
        
#         # 4. Pass the combined representation through the new classifier head
#         logits = self.classifier_head(combined_output)

#         loss = None
#         if labels is not None:
#             loss_fn = nn.CrossEntropyLoss(weight=self.class_weights)
#             loss = loss_fn(logits, labels)

#         return SequenceClassifierOutput(
#             loss=loss,
#             logits=logits,
#             hidden_states=outputs.hidden_states,
#             attentions=outputs.attentions,
#         )

In [93]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding  # <-- Import the data collator
)
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# --- 1. Load Tokenizer and Model ---
model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_2 = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)


# def truncate_long_texts(df, max_chars=10000):
#     df['text'] = df['text'].astype(str).apply(lambda x: x[:max_chars])
#     return df

# train_df = truncate_long_texts(train_df)
# dev_df = truncate_long_texts(dev_df)

# --- 2. Create Datasets ---
train_dataset_2 = Dataset.from_pandas(toxic_train_df[['text', 'label']])
dev_dataset_2 = Dataset.from_pandas(toxic_dev_df[['text', 'label']])

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=512)



train_dataset_2 = train_dataset_2.map(tokenize, batched=True)
dev_dataset_2 = dev_dataset_2.map(tokenize, batched=True)

# Set the format to torch tensors and specify columns
train_dataset_2.set_format("torch", columns=["input_ids", "attention_mask", "label"])
dev_dataset_2.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# --- 3. Define Training ---

# Create a data collator that will dynamically pad the batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics_multiclass(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1_weighted": f1_score(p.label_ids, preds, average="weighted"),
    }

training_args_2 = TrainingArguments(
    output_dir="./results_model_2",
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs_1",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer_2 = Trainer(
    model=model_2,
    args=training_args_2,
    train_dataset=train_dataset_2,
    eval_dataset=dev_dataset_2,
    compute_metrics=compute_metrics_multiclass,
    data_collator=data_collator, # <-- Add the data collator here
)

# --- 4. Start Fine-tuning ---
trainer_2.train()
trainer_2.save_model("./final_model_2")

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15568 [00:00<?, ? examples/s]

Map:   0%|          | 0/1061 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.8475,0.634301,0.763431,0.759543
2,0.6093,0.653703,0.754948,0.750066
3,0.4601,0.711636,0.762488,0.75877
4,0.3236,0.887283,0.741753,0.736669
5,0.2187,1.096036,0.752121,0.74913
6,0.141,1.333614,0.730443,0.729564
7,0.1102,1.672756,0.740811,0.73803
8,0.091,1.518482,0.744581,0.740286
9,0.0727,1.918614,0.709708,0.709611
10,0.0501,2.197331,0.734213,0.728959


In [10]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding  # <-- Import the data collator
)
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
# print("Tokenizer vocab size:", len(tokenizer))
# print("Model embedding size:", pipe_1.model.get_input_embeddings().weight.shape[0])
model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [24]:
from transformers import pipeline
from sklearn.metrics import classification_report
import numpy as np

# --- 1. Load the fine-tuned models with truncation ---
pipe_1 = pipeline(
    "text-classification",
    model="./final_model_1",
    tokenizer=tokenizer,
    device=-1,   # force CPU
    truncation=True,
    max_length=512
)

pipe_2 = pipeline(
    "text-classification",
    model="./final_model_2",
    tokenizer=tokenizer,
    device=-1,
    truncation=True,
    max_length=512,
    return_all_scores=True
)

# --- 2. Run the pipeline on the test set ---
predictions = []
for text in test_df['text']:
    # Model 1: Is it toxic?
    result_1 = pipe_1(text, truncation=True, max_length=512)[0]
    print(result_1)
    is_toxic = 1 if result_1['label'] == 'LABEL_1' else 0
    print(is_toxic)
    
    pred_labels = [0] * pipe_2.model.config.num_labels  # default: all zero
    
    if is_toxic:
        # Model 2: Which toxic class? (highest score = predicted label)
        result_2 = pipe_2(text, truncation=True, max_length=512)[0]
        best_label = max(result_2, key=lambda x: x['score'])
        pred_labels = [1 if r['label'] == best_label['label'] else 0 for r in result_2]
    
    predictions.append(pred_labels)

y_pred = np.array(predictions)
print(y_pred.shape)


Device set to use cpu
Device set to use cpu


{'label': 'LABEL_0', 'score': 0.9264912009239197}
0
{'label': 'LABEL_1', 'score': 0.821557343006134}
1
{'label': 'LABEL_0', 'score': 0.7928609251976013}
0
{'label': 'LABEL_0', 'score': 0.9106222987174988}
0
{'label': 'LABEL_0', 'score': 0.5984629988670349}
0
{'label': 'LABEL_1', 'score': 0.7769083380699158}
1
{'label': 'LABEL_0', 'score': 0.8854161500930786}
0
{'label': 'LABEL_1', 'score': 0.8128166794776917}
1
{'label': 'LABEL_1', 'score': 0.574408769607544}
1
{'label': 'LABEL_1', 'score': 0.5870243310928345}
1
{'label': 'LABEL_0', 'score': 0.8181209564208984}
0
{'label': 'LABEL_0', 'score': 0.8814177513122559}
0
{'label': 'LABEL_0', 'score': 0.7579877376556396}
0
{'label': 'LABEL_1', 'score': 0.6353529095649719}
1
{'label': 'LABEL_1', 'score': 0.8202230930328369}
1
{'label': 'LABEL_1', 'score': 0.6423068046569824}
1
{'label': 'LABEL_0', 'score': 0.8138402700424194}
0
{'label': 'LABEL_0', 'score': 0.7992917895317078}
0
{'label': 'LABEL_0', 'score': 0.9114345908164978}
0
{'label': 'LAB

In [25]:
# # from transformers import pipeline
# # from sklearn.metrics import classification_report
# # import numpy as np

# # pipe_1 = pipeline(
# #     "text-classification",
# #     model="./final_model_1",
# #     tokenizer=tokenizer,
# #     truncation=True,
# #     max_length=512,
# #     device=-1,
# # )

# # pipe_2 = pipeline(
# #     "text-classification",
# #     model="./final_model_2",
# #     tokenizer=tokenizer,
# #     device=-1,
# #     truncation=True,
# #     max_length=512,
# #     return_all_scores=True
# # )

# # --- 2. Run the pipeline on the test set ---
# predictions = []
# for text in test_df['text']:
#     # # Model 1: Is it toxic?
#     # print(text)
#     # result_1 = pipe_1(text)[0]
#     is_toxic = 1 
#     #if result_1['label'] == 'LABEL_1' else 0
    
#     pred_labels = [0] * pipe_2.model.config.num_labels  # default: all zero
    
#     if is_toxic:
#         # Model 2: Which toxic class? (highest score = predicted label)
#         result_2 = pipe_2(text, truncation=True, max_length=512)[0]
#         best_label = max(result_2, key=lambda x: x['score'])
#         pred_labels = [1 if r['label'] == best_label['label'] else 0 for r in result_2]
    
#     predictions.append(pred_labels)

# y_pred = np.array(predictions)
# print(y_pred.shape)


In [26]:
test_df

Unnamed: 0,id,text
0,879187,‡¶∂‡ßÅ‡¶≠ ‡¶ï‡¶æ‡¶Æ‡¶®‡¶æ ‡¶∞‡¶á‡¶≤ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶á‡¶®‡¶∂‡¶æ‡¶Ü‡¶≤‡ßç‡¶≤‡¶æ‡¶π ‡¶ú‡¶Ø‡¶º ‡¶π‡¶¨‡ßá
1,316919,‡¶ó‡ßã‡ßü‡¶æ ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡ßü‡ßá ‡¶Ü‡¶õ‡ßá ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Æ‡¶æ‡¶¶‡¶æ‡¶∞‡¶ö‡ßã‡¶¶ ‡¶®‡¶ø‡¶â‡¶ú ‡¶ï‡¶∞‡ßá ...
2,916242,‡¶≠‡¶æ‡¶á‡¶Ø‡¶º‡¶æ ‡¶Ü‡¶™‡¶®‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶®‡ßá‡¶§‡¶æ ‡¶π‡¶á‡¶Ø‡¶º‡ßá‡¶® ‡¶®‡¶æ ‡¶®‡¶æ ‡¶π‡¶≤‡ßá ‡¶∏‡¶¨‡¶æ‡¶á ‡¶¨‡¶æ‡¶ö‡ßç...
3,786824,‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞‡ßã ‡¶§‡¶æ‡¶á ‡¶¶‡ßá‡¶ñ‡¶õ‡¶ø
4,47284,‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶ï‡¶§‡¶ü‡¶æ ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶®‡¶ø‡ßü‡ßá
...,...,...
2507,776466,‡¶∏‡¶§‡ßç‡¶Ø ‡¶ï‡¶•‡¶æ ‡¶§‡ßá‡¶§‡ßÅ ‡¶≤‡¶æ‡¶ó‡ßá
2508,849227,‡¶è‡¶á ‡¶´‡¶ï‡¶ø‡¶®‡¶®‡¶ø ‡¶Æ‡¶æ‡¶ó‡ßÄ‡¶ü‡¶æ ‡¶Ü‡¶∞ ‡¶ï‡¶§ ‡¶®‡¶æ‡¶ü‡¶ï ‡¶¶‡ßá‡¶ñ‡¶æ‡¶¨‡ßá
2509,532697,‡¶¶‡ßá‡¶ñ‡ßã ‡¶Ü‡¶ú‡¶ï‡ßá ‡¶ï‡¶æ‡¶∞ ‡¶´‡¶ø‡¶ü‡¶®‡ßá‡¶∏ ‡¶ï‡ßã‡¶•‡¶æ‡¶Ø‡¶º ‡¶¶‡¶æ‡¶Å‡¶°‡¶º‡¶ø‡¶Ø‡¶º‡ßá‡¶õ‡ßá ‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ö...
2510,861411,‡¶õ‡ßã‡¶ü ‡¶≠‡¶æ‡¶á‡¶ü‡¶ø‡¶∞ ‡¶™‡¶æ‡¶∏‡ßá ‡¶•‡¶æ‡¶ï‡ßÅ‡¶® ‡¶ó‡ßá‡¶Æ ‡¶≠‡¶ø‡ßú‡¶ø‡¶ì ‡¶¨‡¶æ‡¶®‡¶æ‡¶á


In [27]:
import numpy as np
import pandas as pd

# Your mapping
id2l = {
    0: 'Religious Hate',
    1: 'Sexism',
    2: 'Political Hate',
    3: 'Profane',
    4: 'Abusive'
}

# Example y_pred
# y_pred = np.array([[0,0,0,0,0],[0,0,0,1,0],[1,0,0,0,0]])

def decode_labels(row):
    indices = np.where(row == 1)[0]
    if len(indices) == 0:
        return "None"
    # If multiple labels, join them with comma
    return ", ".join([id2l[i] for i in indices])

# Convert predictions into a DataFrame column
df = pd.DataFrame()
df["Predicted_Label"] = [decode_labels(row) for row in y_pred]

print(df.head())


  Predicted_Label
0            None
1         Profane
2            None
3            None
4            None


In [28]:
df

Unnamed: 0,Predicted_Label
0,
1,Profane
2,
3,
4,
...,...
2507,
2508,Profane
2509,Profane
2510,


In [29]:
test_df['label']=df['Predicted_Label']
test_df['model']='bangla-bert'
test_df

Unnamed: 0,id,text,label,model
0,879187,‡¶∂‡ßÅ‡¶≠ ‡¶ï‡¶æ‡¶Æ‡¶®‡¶æ ‡¶∞‡¶á‡¶≤ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶á‡¶®‡¶∂‡¶æ‡¶Ü‡¶≤‡ßç‡¶≤‡¶æ‡¶π ‡¶ú‡¶Ø‡¶º ‡¶π‡¶¨‡ßá,,bangla-bert
1,316919,‡¶ó‡ßã‡ßü‡¶æ ‡¶Æ‡¶æ‡¶∞‡¶æ ‡¶¶‡¶ø‡ßü‡ßá ‡¶Ü‡¶õ‡ßá ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Æ‡¶æ‡¶¶‡¶æ‡¶∞‡¶ö‡ßã‡¶¶ ‡¶®‡¶ø‡¶â‡¶ú ‡¶ï‡¶∞‡ßá ...,Profane,bangla-bert
2,916242,‡¶≠‡¶æ‡¶á‡¶Ø‡¶º‡¶æ ‡¶Ü‡¶™‡¶®‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶®‡ßá‡¶§‡¶æ ‡¶π‡¶á‡¶Ø‡¶º‡ßá‡¶® ‡¶®‡¶æ ‡¶®‡¶æ ‡¶π‡¶≤‡ßá ‡¶∏‡¶¨‡¶æ‡¶á ‡¶¨‡¶æ‡¶ö‡ßç...,,bangla-bert
3,786824,‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞‡ßã ‡¶§‡¶æ‡¶á ‡¶¶‡ßá‡¶ñ‡¶õ‡¶ø,,bangla-bert
4,47284,‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶ï‡¶§‡¶ü‡¶æ ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶®‡¶ø‡ßü‡ßá,,bangla-bert
...,...,...,...,...
2507,776466,‡¶∏‡¶§‡ßç‡¶Ø ‡¶ï‡¶•‡¶æ ‡¶§‡ßá‡¶§‡ßÅ ‡¶≤‡¶æ‡¶ó‡ßá,,bangla-bert
2508,849227,‡¶è‡¶á ‡¶´‡¶ï‡¶ø‡¶®‡¶®‡¶ø ‡¶Æ‡¶æ‡¶ó‡ßÄ‡¶ü‡¶æ ‡¶Ü‡¶∞ ‡¶ï‡¶§ ‡¶®‡¶æ‡¶ü‡¶ï ‡¶¶‡ßá‡¶ñ‡¶æ‡¶¨‡ßá,Profane,bangla-bert
2509,532697,‡¶¶‡ßá‡¶ñ‡ßã ‡¶Ü‡¶ú‡¶ï‡ßá ‡¶ï‡¶æ‡¶∞ ‡¶´‡¶ø‡¶ü‡¶®‡ßá‡¶∏ ‡¶ï‡ßã‡¶•‡¶æ‡¶Ø‡¶º ‡¶¶‡¶æ‡¶Å‡¶°‡¶º‡¶ø‡¶Ø‡¶º‡ßá‡¶õ‡ßá ‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ö...,Profane,bangla-bert
2510,861411,‡¶õ‡ßã‡¶ü ‡¶≠‡¶æ‡¶á‡¶ü‡¶ø‡¶∞ ‡¶™‡¶æ‡¶∏‡ßá ‡¶•‡¶æ‡¶ï‡ßÅ‡¶® ‡¶ó‡ßá‡¶Æ ‡¶≠‡¶ø‡ßú‡¶ø‡¶ì ‡¶¨‡¶æ‡¶®‡¶æ‡¶á,,bangla-bert


In [30]:
#test_df['model']='bert-base-multilingual-cased'

In [31]:
test_df.columns

Index(['id', 'text', 'label', 'model'], dtype='object')

In [32]:
test_df = test_df[['id', 'label', 'model']]
test_df

Unnamed: 0,id,label,model
0,879187,,bangla-bert
1,316919,Profane,bangla-bert
2,916242,,bangla-bert
3,786824,,bangla-bert
4,47284,,bangla-bert
...,...,...,...
2507,776466,,bangla-bert
2508,849227,Profane,bangla-bert
2509,532697,Profane,bangla-bert
2510,861411,,bangla-bert


In [33]:
test_df.to_csv("final_bert_v6.tsv", sep="\t", index=False)
print("Saved to final_ensemble.tsv")

Saved to final_ensemble.tsv
