In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe_connected'

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.nn.functional import softmax
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
import time
from collections import defaultdict

In [24]:
# Turn off the tokenizers library's internal parallelism. The huggingface/tokenizers
# warning emitted during training asks for this variable to be set explicitly
# because the process is forked after parallelism has been used.
# NOTE(review): this cell's execution count (24) is later than the training cell's (20),
# so it appears to have been run after training; consider moving it next to the imports.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ***Experiments***

In [3]:
# NOTE(review): DEVICE is not referenced by the code visible in this notebook;
# train() and the inference helper choose the device via torch.cuda.is_available().
DEVICE = "cuda"
# Default tokenized sequence length used by BERTDataset (pad/truncate target).
MAX_LEN = 64
# Batch sizes passed to the training and validation DataLoaders in train().
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
# Number of passes over the training set performed by train().
EPOCHS = 5
# CSV splits; the code reads the `title` and `category_label_encoded` columns from them.
TRAINING_FILE = "/kaggle/input/data-splits-product-categorization/train_set_product_categorization.csv"
VALIDATION_FILE = "/kaggle/input/data-splits-product-categorization/validation_set_product_categorization.csv"
TEST_FILE = "/kaggle/input/data-splits-product-categorization/test_set_product_categorization.csv"

In [4]:
class BERTDataset(Dataset):
    """Dataset that tokenizes a single title each time an item is requested.

    Args:
        title: Indexable collection of titles; each entry is passed through
            ``str()`` before tokenization.
        target: Indexable collection of class labels aligned with ``title``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: Length every encoded title is padded or truncated to.
    """

    def __init__(self, title, target, tokenizer, max_len=MAX_LEN):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = title
        self.target = target

    def __len__(self):
        """Return the number of titles held by the dataset."""
        return len(self.title)

    def __getitem__(self, item):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row."""
        text = str(self.title[item])

        # Encode to exactly `max_len` positions, including the tokenizer's
        # special tokens, and ask for PyTorch tensors.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # `return_tensors='pt'` yields a leading batch axis of size one;
        # remove it so the DataLoader can stack samples itself.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.target[item], dtype=torch.long)
        return sample

In [5]:
# Loss function
# Module-level so that both train_loop and test_loop can call it; in both places
# it is applied to the model's raw logits and the batch's integer labels.
criterion = nn.CrossEntropyLoss()

# Training function
def train_loop(data_loader, model, optimizer, device, scheduler):
    """Run one training epoch and return its mean batch loss and accuracy.

    Args:
        data_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        model: Model whose forward output exposes ``.logits``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch, right
            after the optimizer.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of training samples whose arg-max prediction matched
        the label (measured on the fly, while weights are still changing).
    """
    model.train()
    batch_losses = []
    total_correct = 0
    total_samples = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # Forward pass; the loss is computed on the raw logits.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)

        # Backward pass, then advance both the optimizer and the LR schedule.
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())

        # Predicted class is the arg-max over the class axis. `preds` is
        # already one entry per sample, so it is compared to `labels`
        # directly, exactly as test_loop does.
        preds = torch.argmax(logits, dim=1)
        total_correct += (preds == labels).sum().item()
        total_samples += labels.size(0)

    avg_loss = np.mean(batch_losses)
    accuracy = total_correct / total_samples
    return avg_loss, accuracy


# Evaluation function
def test_loop(data_loader, model, device):
    """Evaluate the model on every batch of ``data_loader`` without gradients.

    Args:
        data_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        model: Model whose forward output exposes ``.logits``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_val_loss, accuracy, f1, category_accuracies)``: mean
        per-batch loss, overall accuracy, weighted F1 score, and a dict
        mapping each label seen in the data to the accuracy on that label.
    """
    model.eval()

    batch_losses = []
    n_correct = 0
    n_seen = 0
    all_preds = []
    all_labels = []

    # Per-label tallies used for the per-category accuracy.
    hits_per_class = defaultdict(int)
    count_per_class = defaultdict(int)

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            batch_losses.append(criterion(logits, labels).item())

            preds = torch.argmax(logits, dim=1)
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

            # Bring predictions and labels to host memory once and reuse
            # the arrays for both the flat lists and the per-label tallies.
            preds_np = preds.cpu().numpy()
            labels_np = labels.cpu().numpy()
            all_preds.extend(preds_np)
            all_labels.extend(labels_np)

            for true_cls, pred_cls in zip(labels_np, preds_np):
                count_per_class[true_cls] += 1
                if true_cls == pred_cls:
                    hits_per_class[true_cls] += 1

    category_accuracies = {
        cls: hits_per_class[cls] / seen
        for cls, seen in count_per_class.items()
    }

    avg_val_loss = np.mean(batch_losses)
    accuracy = n_correct / n_seen
    f1 = f1_score(all_labels, all_preds, average="weighted")

    return avg_val_loss, accuracy, f1, category_accuracies

In [20]:
# Main function to run training and evaluation
def train(model_name='roberta-base-uncased', max_len=64):
    """Fine-tune a sequence-classification checkpoint on the product titles.

    Reads TRAINING_FILE and VALIDATION_FILE, trains for EPOCHS epochs with
    AdamW and a linear learning-rate decay (no warm-up), and evaluates on the
    validation split after every epoch.

    Args:
        model_name: Checkpoint name passed to ``AutoTokenizer`` and
            ``AutoModelForSequenceClassification``. "/" is replaced by "_"
            when it is used in output file names.
            NOTE(review): the recorded output of this notebook shows
            ``bert-base-uncased`` being loaded, and the default
            'roberta-base-uncased' does not look like a published
            checkpoint name; confirm before relying on the default.
        max_len: Token length every title is padded or truncated to.

    Side effects (all written to the current working directory):
        * ``{model}_max_len_{max_len}_product_categorization.pth``: state
          dict of the epoch with the highest validation accuracy so far.
        * ``training_time_{model}_max_len_{max_len}.txt``: wall-clock time.
        * ``training_metrics_{model}_max_len_{max_len}.csv``: per-epoch
          losses, accuracies and validation F1.
    """
    start_time = time.time()

    # Resolved up front: this names the checkpoint, timing and metrics files,
    # so it must exist even when no epoch ever takes the "new best" branch.
    safe_model_name = model_name.replace("/", "_")

    # Read training and validation datasets
    df_train = pd.read_csv(TRAINING_FILE)
    df_valid = pd.read_csv(VALIDATION_FILE)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Create dataset objects and loaders for training and validation
    train_dataset = BERTDataset(
        title=df_train.title.values,
        target=df_train.category_label_encoded.values,
        tokenizer=tokenizer,
        max_len=max_len,
    )
    train_data_loader = DataLoader(
        train_dataset, TRAIN_BATCH_SIZE, shuffle=True, num_workers=4
    )

    valid_dataset = BERTDataset(
        title=df_valid.title.values,
        target=df_valid.category_label_encoded.values,
        tokenizer=tokenizer,
        max_len=max_len,
    )
    valid_data_loader = DataLoader(
        valid_dataset, VALID_BATCH_SIZE, num_workers=1
    )

    # Prepare the model (classification head sized for 10 classes)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=10)
    model.to(device)

    # Apply weight decay to everything except biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # train_loop steps the scheduler once per batch, so the schedule must span
    # (batches per epoch, including a final partial batch) * EPOCHS steps.
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    max_val_accuracy = 0
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1s = []

    log_string = "Epoch: {}/{} Train loss: {} Training accuracy: {} %  Val loss {} Val accuracy {} %  Val F1 {}"

    # Iterate over the whole dataset EPOCHS times
    for epoch in range(EPOCHS):

        # Per epoch training and validation loops.
        # Per-category accuracies are returned by test_loop but not logged here.
        train_loss, train_accuracy = train_loop(train_data_loader, model, optimizer, device, scheduler)
        val_loss, val_accuracy, val_f1, per_category_accuracy = test_loop(valid_data_loader, model, device)

        # Store metrics for plotting
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        val_f1s.append(val_f1)

        # Print the training results per epoch
        print(log_string.format(epoch + 1,
                                EPOCHS,
                                round(train_loss, 3),
                                round(100 * train_accuracy, 2),
                                round(val_loss, 3),
                                round(100 * val_accuracy, 2),
                                round(100 * val_f1, 2)
                                ))

        # Save the model with the highest validation accuracy
        if val_accuracy > max_val_accuracy:
            torch.save(model.state_dict(), f"{safe_model_name}_max_len_{max_len}_product_categorization.pth")
            max_val_accuracy = val_accuracy

    # Track the total training time
    total_time = time.time() - start_time
    hours, rem = divmod(total_time, 3600)
    minutes, seconds = divmod(rem, 60)
    print(f"\nTotal training time: {int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}")

    # Save the training time
    with open(f"training_time_{safe_model_name}_max_len_{max_len}.txt", "w") as f:
        f.write(f"Total training time: {int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}\n")

    # Save statistics for plotting later
    epochs = range(1, EPOCHS + 1)
    metrics_df = pd.DataFrame({
        "epoch": epochs,
        "train_loss": train_losses,
        "val_loss": val_losses,
        "train_accuracy": train_accuracies,
        "val_accuracy": val_accuracies,
        "val_f1": val_f1s
    })

    metrics_df.to_csv(f"training_metrics_{safe_model_name}_max_len_{max_len}.csv", index=False)


if __name__ == "__main__":
    # Fine-tune the checkpoint that the recorded output shows being loaded and
    # that the inference section evaluates (bert-base-uncased, max_len=64), so
    # the saved .pth file name matches the one inference looks for.
    train(model_name='bert-base-uncased', max_len=64)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training:   0%|          | 0/883 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch: 1/5 Train loss: 0.243 Training accuracy: 93.7 %  Val loss 0.064 Val accuracy 98.39 %  Val F1 98.39


Training:   0%|          | 0/883 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Epoch: 2/5 Train loss: 0.046 Training accuracy: 98.83 %  Val loss 0.047 Val accuracy 98.87 %  Val F1 98.87


Training:   0%|          | 0/883 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Epoch: 3/5 Train loss: 0.022 Training accuracy: 99.45 %  Val loss 0.041 Val accuracy 99.01 %  Val F1 99.01


Training:   0%|          | 0/883 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Epoch: 4/5 Train loss: 0.012 Training accuracy: 99.76 %  Val loss 0.035 Val accuracy 99.26 %  Val F1 99.26


Training:   0%|          | 0/883 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Epoch: 5/5 Train loss: 0.008 Training accuracy: 99.85 %  Val loss 0.035 Val accuracy 99.32 %  Val F1 99.32

Total training time: 00:14:47


# ***Inference***

In [21]:
# Display names for the encoded classes, indexed by encoded label id
# (presumably the label encoder's class order; TODO confirm).
label_names = [
    'CPUs',
    'Digital Cameras',
    'Dishwashers',
    'Freezers',
    'Fridge Freezers',
    'Fridges',
    'Microwaves',
    'Mobile Phones',
    'TVs',
    'Washing Machines',
]

# Held-out test split and its ground-truth encoded labels.
df_test = pd.read_csv(TEST_FILE)
all_labels = df_test["category_label_encoded"].values

# Last expression of the cell: render the test frame.
df_test

Unnamed: 0,title,category_label_encoded
0,constructa einbau k hlschrank ck 60430,5
1,siemens gs36nvi30g freestanding freezer,3
2,bosch kgn36vw35g exxcel frost free fridge free...,4
3,blomberg lwf29441w 1400 spin 9kg washing machine,9
4,sony slt a77 mark ii geh use ilca77m2,1
...,...,...
3526,iq700 wm14yh79gb 9kg 1400 spin washing machine,9
3527,smeg right hand hinge free standing fridge fre...,4
3528,htc wildfire s sim free mobile phone silverwhite,7
3529,siemens sn558s02me geschirrsp ler integriert 60cm,2


In [22]:
def run_inference_with_eval_and_plots(model_name, df, max_len=64, batch_size=32, label_names=None, output_dir="inference_outputs"):
    """Score a fine-tuned checkpoint on ``df`` and save metrics and figures.

    Loads the base checkpoint ``model_name`` with a 10-class head, restores the
    fine-tuned weights from
    ``/kaggle/working/{model}_max_len_{max_len}_product_categorization.pth``
    ("/" in the model name replaced by "_"), predicts every title in ``df`` in
    batches, and writes a metrics CSV plus three PNG figures to ``output_dir``.

    Args:
        model_name: Checkpoint name used for the tokenizer, the model
            architecture and the output file names.
        df: DataFrame with a ``title`` column and a ``category_label_encoded``
            column holding the true class ids.
        max_len: Maximum token length; longer titles are truncated.
        batch_size: Number of titles scored per forward pass.
        label_names: Optional class display names, passed to
            ``classification_report`` and used as confusion-matrix labels.
        output_dir: Directory for the CSV and PNG outputs (created if missing).

    Returns:
        Tuple ``(all_preds, all_probs, metrics)``: the predicted class ids
        (1-D array), the softmax probabilities (one row per title), and a
        dict with ``model``, ``max_len``, ``accuracy``, ``f1_score`` and the
        ``classification_report`` dict.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Prepare safe names and device
    safe_model_name = model_name.replace("/", "_")
    file_prefix = f"{output_dir}/{safe_model_name}_maxlen{max_len}"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model and tokenizer, then restore the fine-tuned weights
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=10)
    model.load_state_dict(torch.load(f"/kaggle/working/{safe_model_name}_max_len_{max_len}_product_categorization.pth", map_location=device))
    model.to(device).eval()

    all_labels = df["category_label_encoded"].values
    all_preds, all_probs = [], []

    for start_idx in tqdm(range(0, len(df), batch_size), desc="Inferencing"):
        batch = df.iloc[start_idx:start_idx + batch_size]
        inputs = tokenizer(
            # Stringify titles the same way BERTDataset does during training,
            # so non-string cells do not break tokenization.
            batch["title"].astype(str).tolist(),
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )

        inputs = {k: v.to(device) for k, v in inputs.items()}
        if model_name in ["roberta-base", "microsoft/deberta-v3-base"]:
            # Drop segment ids for these checkpoints if the tokenizer produced any.
            inputs.pop("token_type_ids", None)

        with torch.no_grad():
            logits = model(**inputs).logits
            probs = softmax(logits, dim=1)
            preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())

    all_preds = np.array(all_preds)
    all_probs = np.array(all_probs)

    # Compute metrics (the report is computed once and reused by both plots)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    report = classification_report(all_labels, all_preds, target_names=label_names, output_dict=True)

    # Save metrics. The summary is a single row of scalars, so it is wrapped in
    # a list: a dict of bare scalars cannot be turned into a DataFrame directly.
    summary_row = {
        "model": model_name,
        "max_len": max_len,
        "accuracy": acc,
        "f1_score": f1
    }
    pd.DataFrame([summary_row]).to_csv(f"{file_prefix}_metrics.csv", index=False)
    metrics = {**summary_row, "classification_report": report}

    # Plot: Confusion Matrix
    cm = confusion_matrix(all_labels, all_preds)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)

    fig, ax = plt.subplots(figsize=(10, 8))
    disp.plot(ax=ax, cmap="Blues", xticks_rotation=45)
    plt.title(f"Confusion Matrix ({safe_model_name}, max_len = {max_len})")
    plt.tight_layout()
    plt.savefig(f"{file_prefix}_confusion_matrix.png")
    plt.close()

    # Per-class rows only: drop the aggregate rows by name rather than by position.
    per_class_df = pd.DataFrame(report).transpose().drop(
        ["accuracy", "macro avg", "weighted avg"], errors="ignore"
    )

    # Plot: Classification Report Heatmap (support column left out)
    plt.figure(figsize=(12, 6))
    sns.heatmap(per_class_df[["precision", "recall", "f1-score"]], annot=True, cmap="YlGnBu", fmt=".2f", cbar=True)
    plt.title(f"Classification Report Heatmap ({safe_model_name}, max_len = {max_len})")
    plt.tight_layout()
    plt.savefig(f"{file_prefix}_classification_heatmap.png")
    plt.close()

    # Plot: per class F1 scores
    plt.figure(figsize=(10, 6))
    sns.barplot(x=per_class_df.index, y=per_class_df["f1-score"], palette="viridis")
    plt.ylabel("F1 Score")
    plt.xlabel("Category")
    plt.title(f"F1 Scores per Category ({safe_model_name}, max_len = {max_len})")
    plt.xticks(rotation=45)
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.savefig(f"{file_prefix}_f1_per_category.png")
    plt.close()

    return all_preds, all_probs, metrics

In [23]:
# Evaluate the fine-tuned bert-base-uncased checkpoint (max_len=64) on the test
# split; the helper is told to write its outputs under "bert_maxlen64".
preds, probs, metrics = run_inference_with_eval_and_plots(
    model_name="bert-base-uncased",
    df=df_test,
    max_len=64,
    batch_size=32,
    label_names=label_names,
    output_dir="bert_maxlen64"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Inferencing: 100%|██████████| 111/111 [00:04<00:00, 26.79it/s]
