In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lastly/collect_preprocessed_dataset.csv


In [19]:
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score


In [8]:
# Load the preprocessed dataset, keep only the text column plus the seven
# emotion indicator columns, drop incomplete rows, and shuffle reproducibly.
emotion_cols = [
    'Love', 'Joy', 'Anger', 'Surprise', 'Sadness', 'Fear', 'Hate'
]

df = (
    pd.read_csv("/kaggle/input/lastly/collect_preprocessed_dataset.csv")
    .loc[:, ['Data'] + emotion_cols]
    # Non-mutating dropna: calling dropna(inplace=True) on a column subset can
    # raise SettingWithCopyWarning, and a single chain keeps the cell
    # re-runnable from the raw CSV every time.
    .dropna()
    .sample(frac=1, random_state=42)   # full shuffle with a fixed seed
    .reset_index(drop=True)
)

print(df.head())


                                                Data  Love  Joy  Anger  \
0                       shitkale pampers pore ghumai     0    0      0   
1                           ekta dokane 40 lakh taka     0    0      0   
2  ami ekta aghatojnit smriti somporke obogoto ho...     0    0      0   
3            tuder opor hobe gojob na kar opore hobe     0    0      0   
4  update deoyar por onek valo hoye gese godi 100...     1    0      0   

   Surprise  Sadness  Fear  Hate  
0         1        0     0     0  
1         0        0     0     1  
2         0        0     1     0  
3         0        0     1     0  
4         0        0     0     0  


In [7]:
# Inspect the column names. NOTE(review): the recorded output below also lists
# 'topic' and 'Domain', and this cell's execution count (7) precedes the
# loading cell's (8) — it appears to have been run before `df` was narrowed to
# the text and emotion columns; re-run the notebook in order to refresh it.
df.columns

Index(['Data', 'Love', 'Joy', 'Anger', 'Surprise', 'Sadness', 'Fear', 'Hate',
       'topic', 'Domain'],
      dtype='object')

In [9]:
# 80 / 10 / 10 split: carve off 20 % of the rows, then halve that hold-out
# into validation and test. Both calls use the same fixed seed.
split_kwargs = {"random_state": 42}
train_df, temp_df = train_test_split(df, test_size=0.2, **split_kwargs)
val_df, test_df = train_test_split(temp_df, test_size=0.5, **split_kwargs)


In [10]:
# Convert each pandas split into a Hugging Face `Dataset`.
# NOTE(review): the tokenization cell further down rebuilds all three datasets
# from the same DataFrames, so these objects are overwritten before use.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)


In [24]:
# Checkpoint identifier; the same name is passed to the tokenizer here and to
# the sequence-classification model loader later in the notebook.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize a batch of texts and attach per-row multi-label targets.

    Args:
        batch: Mapping of column name to a sequence of values. Must contain
            "Data" and every column named in the module-level `emotion_cols`.

    Returns:
        The tokenizer output for `batch["Data"]` (padded and truncated to a
        length of 128) with an added "labels" entry: one list of floats per
        row, ordered as in `emotion_cols`.
    """
    encoded = tokenizer(
        batch["Data"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

    # One row of float targets per example, columns in `emotion_cols` order.
    row_count = len(batch["Data"])
    encoded["labels"] = [
        [float(batch[col][row]) for col in emotion_cols]
        for row in range(row_count)
    ]
    return encoded

# Tokenize every split in batches, then expose only the model inputs and the
# labels as torch tensors.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_df).map(tokenize, batched=True)
    for split_df in (train_df, val_df, test_df)
)

for tokenized_split in (train_ds, val_ds, test_ds):
    tokenized_split.set_format(
        "torch", columns=["input_ids", "attention_mask", "labels"]
    )


Map:   0%|          | 0/22246 [00:00<?, ? examples/s]

Map:   0%|          | 0/2781 [00:00<?, ? examples/s]

Map:   0%|          | 0/2781 [00:00<?, ? examples/s]

In [26]:
# Load the pretrained encoder with a classification head sized to the number
# of emotion columns, configured for multi-label classification.
id2label = dict(enumerate(emotion_cols))
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_cols),
    problem_type="multi_label_classification",
    # Record the emotion names in the model config so saved checkpoints and
    # downstream predictions are labelled by name rather than by the generic
    # LABEL_0 ... LABEL_6 placeholders.
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def compute_metrics(eval_pred):
    """Macro-averaged precision, recall and F1 for multi-label predictions.

    NOTE(review): a second `compute_metrics` is defined later in this notebook
    and replaces this one once that cell has run.

    Args:
        eval_pred: Pair `(logits, labels)`.

    Returns:
        Dict with the keys "precision", "recall" and "f1".
    """
    raw_scores, targets = eval_pred
    # Sigmoid per label, then a strict > 0.5 cut-off gives 0/1 predictions.
    predictions = (torch.sigmoid(torch.tensor(raw_scores)).numpy() > 0.5).astype(int)

    scorers = {"precision": precision_score, "recall": recall_score, "f1": f1_score}
    return {
        key: scorer(targets, predictions, average="macro", zero_division=0)
        for key, scorer in scorers.items()
    }


In [32]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    hamming_loss
)
import torch
import numpy as np


def compute_metrics(eval_pred):
    """Score multi-label predictions for the evaluation loop.

    Logits are passed through a sigmoid and a strict > 0.5 threshold to obtain
    0/1 predictions, which are then compared with the labels.

    Args:
        eval_pred: Pair `(logits, labels)`.

    Returns:
        Dict with the keys:
            subset_accuracy: exact-match accuracy over whole label rows.
            hamming_accuracy: 1 minus the Hamming loss.
            macro_f1, precision, recall: macro-averaged, zero_division=0.
    """
    raw_scores, targets = eval_pred
    predictions = (torch.sigmoid(torch.tensor(raw_scores)).numpy() > 0.5).astype(int)

    # Shared settings for the three macro-averaged scores.
    macro = {"average": "macro", "zero_division": 0}

    return {
        "subset_accuracy": accuracy_score(targets, predictions),
        "hamming_accuracy": 1 - hamming_loss(targets, predictions),
        "macro_f1": f1_score(targets, predictions, **macro),
        "precision": precision_score(targets, predictions, **macro),
        "recall": recall_score(targets, predictions, **macro),
    }


In [33]:
# Training configuration: evaluate and checkpoint once per epoch.
#
# The run recorded in this notebook aborted with a serialization RuntimeError
# while training, after 10 of the 20 epochs had been evaluated.
# NOTE(review): the most likely cause is the output disk filling up, since a
# full checkpoint was written every epoch with no retention limit — confirm
# against the disk quota. Checkpoints are therefore capped below.
training_args = TrainingArguments(
    output_dir="./mbert_results",
    eval_strategy="epoch",        # current name of the former `evaluation_strategy` argument
    save_strategy="epoch",        # kept equal to eval_strategy, as load_best_model_at_end requires
    save_total_limit=2,           # keep only the best and the most recent checkpoint on disk
    load_best_model_at_end=True,  # restore the best-scoring epoch when training finishes
    metric_for_best_model="macro_f1",  # key returned by compute_metrics
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=100,
    report_to="none"
)

print("✅ training_args created")


✅ training_args created


In [34]:
# Wire the model, the tokenized splits and the metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # `tokenizer=` is deprecated on Trainer (the recorded output of this cell
    # still shows the tail of that warning); `processing_class=` replaces it.
    # NOTE(review): needs a transformers release that accepts
    # `processing_class` — confirm the installed version.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

print("✅ Trainer initialized")


✅ Trainer initialized


  trainer = Trainer(


In [35]:
# Sanity check: show the label tensor of the first training example and its dtype.
first_example_labels = train_ds[0]["labels"]
print(first_example_labels)
print(first_example_labels.dtype)


tensor([0., 0., 0., 0., 0., 1., 0.])
torch.float32


In [36]:
# Run fine-tuning with the configured Trainer. NOTE(review): the recorded
# output below stops after the epoch-10 metrics with a RuntimeError, so the
# saved run did not complete all configured epochs.
trainer.train()




Epoch,Training Loss,Validation Loss,Subset Accuracy,Hamming Accuracy,Macro F1,Precision,Recall
1,0.1828,0.285194,0.5347,0.891971,0.605072,0.676328,0.568342
2,0.1498,0.282676,0.571737,0.897365,0.648315,0.69198,0.618362
3,0.1575,0.276213,0.563466,0.897005,0.650881,0.69265,0.61762
4,0.1286,0.2883,0.588997,0.900601,0.667322,0.705356,0.641442
5,0.1037,0.320816,0.599065,0.899625,0.663487,0.690533,0.644883
6,0.0871,0.325917,0.612729,0.903837,0.679847,0.709263,0.654964
7,0.0693,0.354146,0.609853,0.900755,0.680078,0.685487,0.678329
8,0.0498,0.378005,0.612729,0.902193,0.68069,0.696233,0.669074
9,0.0461,0.390712,0.635023,0.905738,0.694355,0.711952,0.6814
10,0.0353,0.423983,0.624955,0.903632,0.688993,0.695933,0.684394




RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 683460288 vs 683460176