In [2]:
from IPython.display import display

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe_connected'

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
from torch.nn.functional import softmax
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score, roc_curve
import time
from collections import defaultdict

In [3]:
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this
# process. Presumably this silences the tokenizers fork/parallelism warning
# when multi-worker DataLoaders are used below — TODO confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# ***Experiments***

In [4]:
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook does not assume CUDA on CPU-only machines. The functions below
# detect the device the same way.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_LEN = 64  # default token budget per title pair (default of BERTDataset.max_len)
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 5

# All three splits live in the same Kaggle input dataset.
DATA_DIR = "/kaggle/input/data-splits-20-06-2025"
TRAINING_FILE = f"{DATA_DIR}/train_set.csv"
VALIDATION_FILE = f"{DATA_DIR}/validation_set.csv"
TEST_FILE = f"{DATA_DIR}/test_set.csv"

In [5]:
class BERTDataset(Dataset):
    """Dataset of product-title pairs encoded as a single two-segment input.

    Each item is tokenized on access: both titles are passed to the tokenizer
    as a text pair and padded/truncated to ``max_len`` tokens.

    Args:
        product1: Indexable sequence holding the first title of each pair.
        product2: Indexable sequence holding the second title of each pair.
        target: Indexable sequence of numeric labels aligned with the titles.
        tokenizer: Callable tokenizer accepting ``(text, text_pair, ...)`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_len: Sequence length every encoded pair is padded/truncated to.
    """

    def __init__(self, product1, product2, target, tokenizer, max_len=MAX_LEN):
        self.product1 = product1
        self.product2 = product2
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of title pairs (taken from ``product1``)."""
        return len(self.product1)

    def __getitem__(self, item):
        """Encode pair ``item`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch dimension
            squeezed away) and ``labels`` (a float scalar tensor).
        """
        # str() guards against non-string cells coming out of the DataFrame.
        title1 = str(self.product1[item])
        title2 = str(self.product2[item])

        # Tokenize both titles as one pair and pad/truncate to max_len.
        # Calling the tokenizer directly replaces the deprecated
        # `encode_plus`, and matches how the inference code tokenizes.
        encoding = self.tokenizer(
            title1,
            title2,
            add_special_tokens=True,  # Adds [CLS] and [SEP]
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dim of 1; drop it so the
        # DataLoader can stack items itself.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        # Float label because the training loss is BCE-with-logits.
        label = torch.tensor(self.target[item], dtype=torch.float)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label
        }

In [6]:
# Define the custom loss function
def loss_fn(outputs, labels):
    """Mean binary cross-entropy computed on raw logits.

    Args:
        outputs: Tensor of un-normalised logits.
        labels: Float tensor of targets with the same shape as ``outputs``.

    Returns:
        Scalar tensor holding the batch-averaged loss.
    """
    criterion = nn.BCEWithLogitsLoss()
    return criterion(outputs, labels)

# Training function
def train_loop(data_loader, model, optimizer, device, scheduler):
    """Run one optimisation pass over ``data_loader``.

    For every batch: forward pass, loss via ``loss_fn``, backward pass, then
    one optimizer step followed by one scheduler step.

    Args:
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``labels`` tensors.
        model: Model whose output exposes ``.logits``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: LR scheduler stepped once per batch.

    Returns:
        Tuple ``(mean of the per-batch losses, training accuracy)`` where
        accuracy is the fraction of samples whose rounded sigmoid output
        equals the label.
    """
    model.train()
    batch_losses = []
    n_correct, n_seen = 0, 0

    for batch in tqdm(data_loader, desc="Training"):
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        optimizer.zero_grad()

        # Forward pass; drop the trailing logit dim so logits align with targets.
        model_out = model(ids, attention_mask=mask)
        logits = model_out.logits.squeeze(-1)

        # BCE-with-logits on the raw scores.
        loss = loss_fn(logits, targets)

        # Backward pass, then parameter and learning-rate updates.
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())

        # Threshold at 0.5 by rounding the sigmoid output.
        hard_preds = torch.sigmoid(logits).round()
        n_correct += (hard_preds.squeeze() == targets).sum().item()
        n_seen += targets.size(0)

    return np.mean(batch_losses), n_correct / n_seen


# Evaluation function
def test_loop(data_loader, model, device, return_details=False):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``labels`` tensors.
        model: Model whose output exposes ``.logits``.
        device: Device the batch tensors are moved to.
        return_details: When True, also return per-sample predictions, labels
            and losses.

    Returns:
        ``(avg_val_loss, accuracy, f1)`` where ``avg_val_loss`` is the mean of
        the per-batch losses and ``f1`` is the weighted F1 score. With
        ``return_details=True`` the tuple is extended with
        ``(all_preds, all_labels, all_losses)`` as flat lists.
    """
    model.eval()
    val_loss = []
    total_correct = 0
    total_samples = 0
    all_preds, all_labels, all_losses = [], [], []

    # Per-sample losses are computed from the logits, not from sigmoid
    # probabilities. Probabilities saturate in float32 and BCELoss clamps its
    # log term, which distorts the loss of confidently wrong predictions.
    # This also matches the criterion used by the inference function.
    per_sample_criterion = nn.BCEWithLogitsLoss(reduction="none")

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze(-1)  # drop trailing logit dim
            loss = loss_fn(logits, labels)
            val_loss.append(loss.item())

            probs = torch.sigmoid(logits)
            preds = probs.round()  # threshold at 0.5

            total_correct += (preds == labels).sum().item()
            total_samples += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_losses.extend(per_sample_criterion(logits, labels).cpu().numpy())

    avg_val_loss = np.mean(val_loss)
    accuracy = total_correct / total_samples
    f1 = f1_score(all_labels, all_preds, average='weighted')

    if return_details:
        return avg_val_loss, accuracy, f1, all_preds, all_labels, all_losses

    return avg_val_loss, accuracy, f1

In [16]:
# Main function to run training and evaluation
def train(model_name='distilbert-base-uncased', max_len=128):
    """Fine-tune a sequence-classification checkpoint on the title-pair data.

    Reads TRAINING_FILE and VALIDATION_FILE, trains for EPOCHS epochs with
    AdamW and a linear LR schedule, and evaluates on the validation split
    after every epoch.

    Files written to the current working directory:
        * ``{safe_model_name}_max_len_{max_len}_product_matching.pth`` — the
          state dict, overwritten whenever validation accuracy improves.
        * ``training_time_{safe_model_name}_max_len_{max_len}_product_matching.txt``
        * ``training_metrics_{safe_model_name}_max_len_{max_len}_product_matching.csv``

    Args:
        model_name: Hugging Face model identifier used for both tokenizer and
            model.
        max_len: Token length each encoded title pair is padded/truncated to.

    Returns:
        None.
    """
    # Start timing for training (includes data loading and model setup)
    start_time = time.time()

    # File-system-safe model identifier. Defined up front because the output
    # files below are written even if no epoch ever improves on the initial
    # accuracy (previously a NameError in that case).
    safe_model_name = model_name.replace("/", "_")

    # Read training and validation datasets
    df_train = pd.read_csv(TRAINING_FILE)
    df_valid = pd.read_csv(VALIDATION_FILE)

    # Define the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Create dataset objects for training and validation
    train_dataset = BERTDataset(
        product1=df_train.title_1.values,
        product2=df_train.title_2.values,
        target=df_train.label.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    train_data_loader = DataLoader(
        train_dataset, TRAIN_BATCH_SIZE, shuffle=True, num_workers=4
    )

    valid_dataset = BERTDataset(
        product1=df_valid.title_1.values,
        product2=df_valid.title_2.values,
        target=df_valid.label.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    valid_data_loader = DataLoader(
        valid_dataset, VALID_BATCH_SIZE, num_workers=1
    )

    # Prepare the model (single logit output, trained with BCE-with-logits)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
    model.to(device)

    # Prepare optimizer and scheduler: parameters whose name contains one of
    # the `no_decay` substrings are excluded from weight decay.
    # NOTE(review): matching is by substring, so layer norms not named
    # "LayerNorm" would still receive weight decay — confirm against
    # model.named_parameters() for the chosen architecture.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # The scheduler is stepped once per batch, so the schedule length must be
    # the real number of batches (the last partial batch counts), not
    # len(df) / batch_size truncated. Otherwise the LR hits zero early.
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    max_val_accuracy = 0
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1s = []

    # Iterate over the whole dataset EPOCHS times
    for epoch in range(EPOCHS):

        # Per epoch training and validation loops
        train_loss, train_accuracy = train_loop(train_data_loader, model, optimizer, device, scheduler)
        val_loss, val_accuracy, val_f1 = test_loop(valid_data_loader, model, device)

        # Store metrics for plotting
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        val_f1s.append(val_f1)

        # Print the training results per epoch
        log_string = "Epoch: {}/{} Train Loss: {} Train Accuracy: {} %  Val Loss {} Val Accuracy {} %  Val F1 {}"
        print(log_string.format(epoch + 1,
                                EPOCHS,
                                round(train_loss, 3),
                                round(100 * train_accuracy, 2),
                                round(val_loss, 3),
                                round(100 * val_accuracy, 2),
                                round(100 * val_f1, 2)
                                ))

        # Save the model with the highest validation accuracy
        if val_accuracy > max_val_accuracy:
            torch.save(model.state_dict(), f"{safe_model_name}_max_len_{max_len}_product_matching.pth")
            max_val_accuracy = val_accuracy

    # Track the total training time
    end_time = time.time()
    total_time = end_time - start_time
    hours, rem = divmod(total_time, 3600)
    minutes, seconds = divmod(rem, 60)
    print(f"\nTotal training time: {int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}")

    # Save the training time
    with open(f"training_time_{safe_model_name}_max_len_{max_len}_product_matching.txt", "w") as f:
        f.write(f"Total training time: {int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}\n")

    # Save statistics for plotting later
    epochs = range(1, EPOCHS + 1)
    metrics_df = pd.DataFrame({
        "epoch": epochs,
        "train_loss": train_losses,
        "val_loss": val_losses,
        "train_accuracy": train_accuracies,
        "val_accuracy": val_accuracies,
        "val_f1": val_f1s
    })

    metrics_df.to_csv(f"training_metrics_{safe_model_name}_max_len_{max_len}_product_matching.csv", index=False)


# Entry point: fine-tune DistilBERT with a 128-token budget per title pair.
if __name__ == "__main__":
    train(
        model_name='distilbert-base-uncased',
        max_len=128,
    )

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 303/303 [00:59<00:00,  5.11it/s]
Evaluating: 100%|██████████| 76/76 [00:02<00:00, 32.73it/s]


Epoch: 1/5 Train Loss: 0.334 Train Accuracy: 84.73 %  Val Loss 0.209 Val Accuracy 91.56 %  Val F1 91.64


Training: 100%|██████████| 303/303 [01:00<00:00,  5.05it/s]
Evaluating: 100%|██████████| 76/76 [00:02<00:00, 32.88it/s]


Epoch: 2/5 Train Loss: 0.152 Train Accuracy: 94.6 %  Val Loss 0.141 Val Accuracy 95.7 %  Val F1 95.7


Training: 100%|██████████| 303/303 [01:00<00:00,  5.05it/s]
Evaluating: 100%|██████████| 76/76 [00:02<00:00, 32.80it/s]


Epoch: 3/5 Train Loss: 0.096 Train Accuracy: 96.64 %  Val Loss 0.144 Val Accuracy 95.53 %  Val F1 95.54


Training: 100%|██████████| 303/303 [00:59<00:00,  5.05it/s]
Evaluating: 100%|██████████| 76/76 [00:02<00:00, 32.76it/s]


Epoch: 4/5 Train Loss: 0.069 Train Accuracy: 97.81 %  Val Loss 0.148 Val Accuracy 95.7 %  Val F1 95.7


Training: 100%|██████████| 303/303 [00:59<00:00,  5.05it/s]
Evaluating: 100%|██████████| 76/76 [00:02<00:00, 32.72it/s]


Epoch: 5/5 Train Loss: 0.051 Train Accuracy: 98.37 %  Val Loss 0.142 Val Accuracy 95.78 %  Val F1 95.78

Total training time: 00:05:17


# ***Inference***

In [8]:
# Load the test split and keep its ground-truth labels as an array.
df_test = pd.read_csv(TEST_FILE)
all_labels = df_test.label.values
df_test  # last expression of the cell -> rendered as its output

Unnamed: 0,title_1,title_2,label
0,cyrus outdoor mobile phone cm8 solid unlocked,archos core 60s 15.2 cm 6 2 gb 16 gb dual sim ...,0
1,blackberry z10 3g smartphone sim free touch sc...,blackberry z10,1
2,new zte blade a430 android 8gb sim free unlock...,nokia 6700 slide mobile phone,0
3,doro 7354 2404 2g uk sim free mobile phone dua...,doro 2404 blackwhite 2.4 2g unlocked sim free,1
4,blackberry 8800 mobile phone,sony xperia l1 sim free smartphone black,0
...,...,...,...
1205,beafon sl340 eu001r bea fon sl340 red,bea fon sl340 blau,1
1206,xiaomi redmi 6 dual sim 32gb black eu,xiaomi redmi 5 dual sim black 32gb,0
1207,samsung g600 pink mobile phone,sim free nokia 7 plus 64gb mobile phone blackc...,0
1208,swisstone sc330 schwarz,swisstone sc230,0


In [10]:
def run_inference_with_eval_and_plots(
    model_name,
    df,
    max_len=128,
    batch_size=32,
    label_names=None,
    output_dir="inference_outputs",
    checkpoint_dir="/kaggle/working",
):
    """Score ``df`` with a fine-tuned checkpoint and write metrics and plots.

    Loads ``{safe_model_name}_max_len_{max_len}_product_matching.pth`` from
    ``checkpoint_dir``, predicts a match probability for every
    ``title_1``/``title_2`` pair, and writes the following into ``output_dir``:
    a one-row metrics CSV, a confusion-matrix PNG, a classification-report
    heatmap PNG, a per-class F1 bar-chart PNG, and a CSV of all rows sorted by
    descending per-sample loss.

    Args:
        model_name: Hugging Face model identifier (tokenizer + architecture).
        df: DataFrame with ``title_1``, ``title_2`` and ``label`` columns.
        max_len: Maximum token length; also part of the checkpoint file name.
        batch_size: Number of rows scored per forward pass.
        label_names: Optional class names used for the confusion-matrix ticks
            and classification-report rows. ``None`` keeps the default labels.
        output_dir: Directory for all generated files (created if missing).
        checkpoint_dir: Directory containing the ``.pth`` state dict.

    Returns:
        ``(all_preds, all_probs, metrics_df)``. ``all_preds`` (int) and
        ``all_probs`` keep the model's trailing logit dimension, i.e. shape
        ``(len(df), 1)`` for the ``num_labels=1`` head used here.

    Note:
        ``inference_time_sec`` is measured from before the tokenizer/model are
        loaded, so it includes loading time, not just the forward passes.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Start timing (covers model loading as well as the inference loop)
    start_time = time.time()

    # Prepare safe names and device
    safe_model_name = model_name.replace("/", "_")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
    checkpoint_path = os.path.join(
        checkpoint_dir, f"{safe_model_name}_max_len_{max_len}_product_matching.pth"
    )
    # The checkpoint is a plain state dict, so restrict unpickling to tensors.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device, weights_only=True))
    model.to(device).eval()

    # Initialize accumulators
    all_preds, all_probs, all_losses = [], [], []
    all_labels = df["label"].values
    criterion = BCEWithLogitsLoss(reduction="none")  # per-sample loss for error analysis

    # Run inference with loss
    for start_idx in tqdm(range(0, len(df), batch_size), desc="Inferencing"):
        batch = df.iloc[start_idx:start_idx + batch_size]
        # Cast to str like BERTDataset does, so non-string cells (e.g. NaN)
        # are tokenized the same way at train and inference time.
        inputs = tokenizer(
            batch["title_1"].astype(str).tolist(),
            batch["title_2"].astype(str).tolist(),
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )
        # unsqueeze(1) so labels match the (batch, 1) logits for the criterion
        labels = torch.tensor(batch["label"].values, dtype=torch.float32).unsqueeze(1).to(device)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        if model_name in ["roberta-base", "microsoft/deberta-v3-base"]:
            inputs.pop("token_type_ids", None)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long()

            loss = criterion(logits, labels)

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_losses.extend(loss.cpu().numpy())

    # End timing
    end_time = time.time()
    inference_duration = end_time - start_time

    all_preds = np.array(all_preds).astype(int)
    all_probs = np.array(all_probs)
    all_losses = np.array(all_losses)

    # Flat view for sklearn / pandas; the returned arrays keep their shape.
    y_pred = all_preds.ravel()

    # ------------------ Metrics ------------------
    acc = accuracy_score(all_labels, y_pred)
    f1 = f1_score(all_labels, y_pred, average="weighted")
    report = classification_report(all_labels, y_pred, target_names=label_names, output_dict=True)

    metrics_df = pd.DataFrame([{
        "model": model_name,
        "max_len": max_len,
        "accuracy": acc,
        "f1_score": f1,
        "inference_time_sec": round(inference_duration, 2)
    }])
    metrics_df.to_csv(f"{output_dir}/{safe_model_name}_maxlen{max_len}_metrics_product_matching.csv", index=False)

    # ------------------ Plots ------------------
    cm = confusion_matrix(all_labels, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)

    fig, ax = plt.subplots(figsize=(10, 8))
    disp.plot(ax=ax, cmap="Blues")
    plt.title(f"Confusion Matrix ({safe_model_name}, max_len = {max_len})")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{safe_model_name}_maxlen{max_len}_confusion_matrix.png")
    plt.close()

    # Per-class rows only; the last column (support) is left out of the heatmap.
    df_report = pd.DataFrame(report).transpose().drop(["accuracy", "macro avg", "weighted avg"], errors="ignore")
    plt.figure(figsize=(12, 6))
    sns.heatmap(df_report.iloc[:, :-1], annot=True, cmap="YlGnBu", fmt=".2f", cbar=True)
    plt.title(f"Classification Report Heatmap ({safe_model_name}, max_len = {max_len})")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{safe_model_name}_maxlen{max_len}_classification_heatmap.png")
    plt.close()

    report_df = pd.DataFrame(report).transpose()
    # Drop the three trailing summary rows of the report.
    per_class_df = report_df.iloc[:-3]
    plt.figure(figsize=(10, 6))
    sns.barplot(x=per_class_df.index, y=per_class_df["f1-score"], palette="viridis")
    plt.ylabel("F1 Score")
    plt.xlabel("Category")
    plt.title(f"F1 Scores per Category ({safe_model_name}, max_len = {max_len})")
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{safe_model_name}_maxlen{max_len}_f1_per_category.png")
    plt.close()

    # ------------------ Error Analysis ------------------
    df_errors = df.copy()
    df_errors["predicted_label"] = y_pred
    df_errors["probability"] = all_probs.ravel()
    df_errors["loss"] = all_losses.ravel()
    df_errors_sorted = df_errors.sort_values(by="loss", ascending=False)

    # Display top-20 most confusing predictions
    print("\nTop 20 Most Confusing Predictions (by loss):")
    display(df_errors_sorted.head(20))

    # Save every row, sorted by loss, to CSV
    df_errors_sorted.to_csv(f"{output_dir}/{safe_model_name}_maxlen{max_len}_top_test_errors.csv", index=False)

    return all_preds, all_probs, metrics_df

In [17]:
# Evaluate the fine-tuned DistilBERT (max_len=128) checkpoint on the test split.
preds, probs, metrics_df = run_inference_with_eval_and_plots(
    "distilbert-base-uncased",
    df_test,
    max_len=128,
    batch_size=32,
    output_dir="distilbert_maxlen128",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Inferencing: 100%|██████████| 38/38 [00:01<00:00, 25.18it/s]



Top 20 Most Confusing Predictions (by loss):


Unnamed: 0,title_1,title_2,label,predicted_label,probability,loss
489,motorola moto g 5.7 single sim 4g 3gb 32gb 300...,sim free motorola moto g6 32gb mobile phone de...,1,0,0.00135,6.607663
922,motorola moto g 5.7 single sim 4g 3gb 32gb 300...,motorola moto g6 indigo 5.7 32gb 4g unlocked s...,1,0,0.00365,5.6129
340,htc one x pj46100 16gb grey unlocked smartphon...,htc one x grey,0,1,0.995852,5.485095
59,sim free sony xperia xz1 mobile phone blue,sim free sony xperia xz1 64gb mobile phone black,0,1,0.995722,5.454176
181,doro phoneeasy 609l black,doro phoneeasy 609 black,1,0,0.004644,5.372153
1044,nokia 3310 mobile phone in azure,nokia 3310 3g,0,1,0.995218,5.342935
722,huawei p smart 32gb sim free 4g lte smartphone...,sim free huawei p smart 32gb mobile phone black,0,1,0.994877,5.274025
826,oneplus 5t a5010 64gb dual sim factory unlocke...,oneplus 5t midnight black 64gb6gb 4g dual sim ...,1,0,0.006817,4.988318
378,oneplus 5t midnight black 64gb6gb 4g dual sim ...,oneplus 5t a5010 64gb dual sim factory unlocke...,1,0,0.006986,4.963792
791,lg k10 2017 m250n 13.5 cm 5.3 16 gb 13 mp andr...,lg m250e 13 46 cm 5 3 zoll smartphone k10,0,1,0.991164,4.728958
