In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lastly/collect_preprocessed_dataset.csv


In [3]:
import torch
import pandas as pd
import numpy as np

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, precision_score,
    recall_score, accuracy_score, hamming_loss
)

from tqdm import tqdm


In [5]:
df = pd.read_csv("/kaggle/input/lastly/collect_preprocessed_dataset.csv")




In [6]:
df.columns


Index(['Data', 'Love', 'Joy', 'Anger', 'Surprise', 'Sadness', 'Fear', 'Hate',
       'topic', 'Domain'],
      dtype='object')

In [7]:
emotion_cols = ["Love","Sadness","Anger","Surprise","Fear","Joy","Hate"]

df = df[["Data"] + emotion_cols]
df.dropna(inplace=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (27808, 8)


Unnamed: 0,Data,Love,Sadness,Anger,Surprise,Fear,Joy,Hate
0,shitkale pampers pore ghumai,0,0,0,1,0,0,0
1,ekta dokane 40 lakh taka,0,0,0,0,0,0,1
2,ami ekta aghatojnit smriti somporke obogoto ho...,0,0,0,0,1,0,0
3,tuder opor hobe gojob na kar opore hobe,0,0,0,0,1,0,0
4,update deoyar por onek valo hoye gese godi 100...,1,0,0,0,0,0,0


In [8]:
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [9]:
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df["Data"].tolist()
        self.labels = df[emotion_cols].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }


In [10]:
MODEL_NAME = "csebuetnlp/banglishbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_ds = EmotionDataset(train_df, tokenizer)
val_ds   = EmotionDataset(val_df, tokenizer)
test_ds  = EmotionDataset(test_df, tokenizer)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=8)
test_loader  = DataLoader(test_ds, batch_size=8)


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=7,
    problem_type="multi_label_classification"
).to(device)


2025-12-19 04:55:07.481247: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766120107.670537      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766120107.720321      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglishbert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [13]:
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,          # BanglishBERT sweet spot
    weight_decay=0.01
)

criterion = torch.nn.BCEWithLogitsLoss()


In [15]:
def train_epoch(model, loader):
    model.train()
    total_loss = 0

    for batch in tqdm(loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)


In [17]:
def evaluate(model, loader):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["labels"].cpu().numpy()

            logits = model(input_ids, attention_mask=mask).logits
            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs > 0.5).astype(int)

            all_preds.append(preds)
            all_labels.append(labels)

    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)

    return {
        "subset_accuracy": accuracy_score(y_true, y_pred),
        "hamming_accuracy": 1 - hamming_loss(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average="macro"),
        "precision": precision_score(y_true, y_pred, average="macro"),
        "recall": recall_score(y_true, y_pred, average="macro"),
    }


In [19]:
for epoch in range(8):
    loss = train_epoch(model, train_loader)
    metrics = evaluate(model, val_loader)

    print(f"\nEpoch {epoch+1}")
    print("Train Loss:", loss)
    print(metrics)


100%|██████████| 2781/2781 [09:01<00:00,  5.13it/s]



Epoch 1
Train Loss: 0.14656824647311317
{'subset_accuracy': 0.5767709457029845, 'hamming_accuracy': 0.8999332203215699, 'macro_f1': 0.6632221516395489, 'precision': 0.695605035804963, 'recall': 0.6392820382233195}


100%|██████████| 2781/2781 [09:02<00:00,  5.13it/s]



Epoch 2
Train Loss: 0.11917847162280037
{'subset_accuracy': 0.5875584322186264, 'hamming_accuracy': 0.9048646427287204, 'macro_f1': 0.6754138146671644, 'precision': 0.7143239815441973, 'recall': 0.6424257450080458}


100%|██████████| 2781/2781 [09:01<00:00,  5.13it/s]



Epoch 3
Train Loss: 0.09746176513037164
{'subset_accuracy': 0.5886371808701906, 'hamming_accuracy': 0.8989058406534135, 'macro_f1': 0.6648269176261193, 'precision': 0.6955059417125743, 'recall': 0.6458330592949547}


100%|██████████| 2781/2781 [09:01<00:00,  5.13it/s]



Epoch 4
Train Loss: 0.07965243684247816
{'subset_accuracy': 0.594750089895721, 'hamming_accuracy': 0.8991626855704525, 'macro_f1': 0.6686707910858994, 'precision': 0.6894773362577, 'recall': 0.6538641357148883}


100%|██████████| 2781/2781 [09:02<00:00,  5.13it/s]



Epoch 5
Train Loss: 0.0662141822201308
{'subset_accuracy': 0.6120100683207479, 'hamming_accuracy': 0.905840653413469, 'macro_f1': 0.6867701694242176, 'precision': 0.7049678934346423, 'recall': 0.674146991078706}


100%|██████████| 2781/2781 [09:01<00:00,  5.13it/s]



Epoch 6
Train Loss: 0.05906678329279505
{'subset_accuracy': 0.6105717367853291, 'hamming_accuracy': 0.9033749422098937, 'macro_f1': 0.674719093440743, 'precision': 0.7025253461523301, 'recall': 0.6525204247911536}


100%|██████████| 2781/2781 [09:01<00:00,  5.13it/s]



Epoch 7
Train Loss: 0.048590290725515156
{'subset_accuracy': 0.6073354908306364, 'hamming_accuracy': 0.9019366106744747, 'macro_f1': 0.6888441509541671, 'precision': 0.6959301924971752, 'recall': 0.6875918220513102}


100%|██████████| 2781/2781 [09:02<00:00,  5.13it/s]



Epoch 8
Train Loss: 0.044456189430616154
{'subset_accuracy': 0.6130888169723121, 'hamming_accuracy': 0.9023989315251452, 'macro_f1': 0.6865134277730831, 'precision': 0.6949597926762773, 'recall': 0.6800154952358473}


In [20]:
SAVE_PATH = "/kaggle/working/banglishBert_model"

model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

# Save thresholds
np.save(f"{SAVE_PATH}/thresholds.npy", np.array(best_thresholds))


NameError: name 'best_thresholds' is not defined

In [21]:
import numpy as np
from sklearn.metrics import f1_score

def tune_thresholds(y_true, y_probs):
    best_thresholds = []

    for i in range(y_true.shape[1]):
        best_f1 = 0
        best_t = 0.5

        for t in np.arange(0.1, 0.9, 0.05):
            preds = (y_probs[:, i] > t).astype(int)
            f1 = f1_score(y_true[:, i], preds)

            if f1 > best_f1:
                best_f1 = f1
                best_t = t

        best_thresholds.append(best_t)

    return best_thresholds


In [23]:
def apply_thresholds(probs, thresholds):
    preds = np.zeros_like(probs)

    for i, t in enumerate(thresholds):
        preds[:, i] = (probs[:, i] > t).astype(int)

    return preds


In [24]:
# After validation
val_probs, val_labels = [], []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].cpu().numpy()

        logits = model(ids, attention_mask=mask).logits
        probs = torch.sigmoid(logits).cpu().numpy()

        val_probs.append(probs)
        val_labels.append(labels)

val_probs = np.vstack(val_probs)
val_labels = np.vstack(val_labels)

best_thresholds = tune_thresholds(val_labels, val_probs)
print("Best thresholds:", best_thresholds)


Best thresholds: [0.6000000000000002, 0.30000000000000004, 0.5500000000000002, 0.8000000000000002, 0.8000000000000002, 0.6000000000000002, 0.20000000000000004]


In [25]:
test_probs, test_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].cpu().numpy()

        logits = model(ids, attention_mask=mask).logits
        probs = torch.sigmoid(logits).cpu().numpy()

        test_probs.append(probs)
        test_labels.append(labels)

test_probs = np.vstack(test_probs)
test_labels = np.vstack(test_labels)

test_preds = apply_thresholds(test_probs, best_thresholds)

print("Macro F1:", f1_score(test_labels, test_preds, average="macro"))


Macro F1: 0.6804339120819751


In [26]:
SAVE_PATH = "/kaggle/working/banglishBert_model"

model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

# Save thresholds
np.save(f"{SAVE_PATH}/thresholds.npy", np.array(best_thresholds))


In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(SAVE_PATH).to(device)
tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)

thresholds = np.load(f"{SAVE_PATH}/thresholds.npy")


In [28]:
emotion_cols = ["Love","Sad","Angry","Surprise","Fear","Joy","Hate"]

def predict_emotion(text):
    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    ).to(device)

    with torch.no_grad():
        logits = model(**enc).logits
        probs = torch.sigmoid(logits)[0].cpu().numpy()

    results = []
    for emo, p, t in zip(emotion_cols, probs, thresholds):
        if p > t:
            results.append((emo, round(float(p), 3)))

    return results


In [29]:
predict_emotion("ami onek khushi but ektu voy lagche")


[('Fear', 1.0)]

In [30]:
def predict_topk(text, k=3):
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.sigmoid(logits)[0].cpu().numpy()

    scores = list(zip(emotion_cols, probs))
    scores.sort(key=lambda x: x[1], reverse=True)

    return scores[:k]


In [31]:
predict_topk("ami onek khushi but ektu voy lagche", k=3)


[('Fear', 0.99963), ('Sad', 0.00045022933), ('Angry', 0.0001944166)]

In [32]:
def predict_threshold(text):
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.sigmoid(logits)[0].cpu().numpy()

    results = []
    for emo, p in zip(emotion_cols, probs):
        if p >= thresholds[emo]:
            results.append((emo, round(float(p), 3)))

    if not results:
        # fallback
        top_idx = np.argmax(probs)
        results.append((emotion_cols[top_idx], round(float(probs[top_idx]), 3)))

    return results


In [33]:
predict_topk("ami onek khushi but ektu voy lagche", k=3)


[('Fear', 0.99963), ('Sad', 0.00045022933), ('Angry', 0.0001944166)]