In [1]:
# Cell 1: Install necessary libraries if not already available
!pip install datasets transformers evaluate --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9.0.13 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.
torch 2.6.0+cu124 requires nvid

In [2]:
# Cell 2: Import libraries, configure warnings, and set up configurations

import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Suppress specific warning from torch
import warnings
warnings.filterwarnings("ignore", message="Was asked to gather along dimension 0")

# Datasets for handling data, Evaluate for metrics
from datasets import Dataset, DatasetDict
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)


2025-05-25 14:19:55.642017: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748182795.834154      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748182795.887727      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
# Cell 3: Data setup
# Directory of the Kaggle input dataset; every split file below sits directly in it.
DATA_PATH = "/kaggle/input/touche-task4-2025-subtask2"

# Each split (train / validation / test) comes as a responses file plus a labels file.
(
    train_responses_file,
    train_labels_file,
    val_responses_file,
    val_labels_file,
    test_responses_file,
    test_labels_file,
) = (
    os.path.join(DATA_PATH, file_name)
    for file_name in (
        "responses-train.jsonl",
        "responses-train-labels.jsonl",
        "responses-validation.jsonl",
        "responses-validation-labels.jsonl",
        "responses-test.jsonl",
        "responses-test-labels.jsonl",
    )
)

def load_jsonl(file_path):
    """Read a JSON Lines file (UTF-8) into a list.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped, and each remaining line is parsed as one
    JSON document.

    Returns:
        list: the parsed records, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [json.loads(text) for text in stripped_lines if text]

def merge_data(responses_file, labels_file):
    """Join responses with their labels through the shared "id" field.

    Both files are read with `load_jsonl`. A response whose id does not occur
    in the labels file is left out of the result.

    Returns:
        list[dict]: one dict per labelled response with the keys "id",
        "text" (taken from the response's "response" field) and "label".
    """
    response_rows = load_jsonl(responses_file)
    label_by_id = {row["id"]: row["label"] for row in load_jsonl(labels_file)}

    return [
        {
            "id": row["id"],
            "text": row["response"],
            "label": label_by_id[row["id"]],
        }
        for row in response_rows
        if row["id"] in label_by_id
    ]

In [10]:
# Cell 4: Create Hugging Face Datasets for train/validation/test
# Merge responses with labels, one list of records per split.
train_data, val_data, test_data = (
    merge_data(responses_path, labels_path)
    for responses_path, labels_path in (
        (train_responses_file, train_labels_file),
        (val_responses_file, val_labels_file),
        (test_responses_file, test_labels_file),
    )
)

# Wrap each list of records in a Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_list(records) for records in (train_data, val_data, test_data)
)

# Group the three splits under the names used by the rest of the notebook.
dataset = DatasetDict(
    dict(train=train_dataset, validation=val_dataset, test=test_dataset)
)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 11487
    })
    validation: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 2600
    })
})


In [11]:
# Cell 5: Tokenize using DeBERTa V3 Base tokenizer

# Suppress the sentencepiece warning. The filter has to be registered BEFORE the
# tokenizer is loaded: the message concerns the slow-to-fast tokenizer conversion,
# which happens while the tokenizer is being created, so a filter installed
# afterwards comes too late to hide it.
warnings.filterwarnings("ignore", message="The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option")

model_checkpoint = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Maximum number of tokens kept per response; longer texts are truncated.
# Lowering this value (e.g. to 256) is the first knob to turn if memory runs short.
MAX_LENGTH = 512

def tokenize_function(examples):
    """Tokenize the "text" column of a batch, truncating to MAX_LENGTH tokens.

    No padding is applied here; `examples` is the batch dict handed over by
    `Dataset.map(..., batched=True)`.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)

# Drop the raw "id"/"text" columns so that only the tokenizer outputs and "label" remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["id", "text"])


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/11487 [00:00<?, ? examples/s]

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/2600 [00:00<?, ? examples/s]

In [None]:
# Cell 6: Create the model and data collator

# Collator that pads the tokenized examples when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sequence-classification model initialised from the checkpoint, with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)


In [None]:
# Cell 7: Define evaluation metrics and configure the Trainer

# Load evaluation metrics (`evaluate` is imported once, with the other imports in Cell 2).
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy, precision, recall and F1.

    The predicted class is the argmax over the last axis of the logits.
    Precision, recall and F1 are computed with average="binary".

    Returns:
        dict: keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    shared_args = {"predictions": predictions, "references": labels}

    scores = {"accuracy": accuracy_metric.compute(**shared_args)["accuracy"]}
    binary_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for metric_name, metric in binary_metrics:
        scores[metric_name] = metric.compute(average="binary", **shared_args)[metric_name]
    return scores

# Enable gradient checkpointing to reduce memory consumption during training
model.gradient_checkpointing_enable()

# The evaluation-schedule argument is called `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. The install is unpinned, so use
# whichever name the installed TrainingArguments declares instead of failing with an
# unexpected-keyword TypeError.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    **{eval_strategy_kwarg: "epoch"},    # Evaluate once per epoch, in step with save_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,       # Small per-device batch to keep memory usage low
    per_device_eval_batch_size=8,        # Same small batch size for evaluation
    gradient_accumulation_steps=8,       # 8 samples x 8 accumulated steps = effective batch size of 64 per device
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",          # "f1" is one of the keys returned by compute_metrics
    report_to="none",
    fp16=True                          # Enable mixed precision training for memory efficiency
)

# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=` here; confirm against the installed version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
# Cell 8: Train the model
# Runs training with the model, arguments and datasets configured on `trainer`.
# As the last expression of the cell, the value returned by `train()` is shown as
# the cell output.
trainer.train()


In [None]:
# Cell 9: Evaluate on the test set
# Scores the held-out test split instead of the validation split the Trainer was
# configured with; the metrics come from `compute_metrics`.
# NOTE(review): the returned keys presumably keep the default "eval_" prefix even
# though this is the test split — confirm before comparing with validation logs.
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print("Test set metrics:", test_metrics)

In [None]:
# Cell 10: Save the fine-tuned model and tokenizer
# Build the directory name from the checkpoint that was actually fine-tuned, so the
# saved artifact always carries the right model size (this notebook trains
# DeBERTa V3 base, not large).
model_save_path = f"./{model_checkpoint.split('/')[-1]}-ad-detector-finetuned"
trainer.save_model(model_save_path)
# Store the tokenizer next to the weights so the directory can be loaded on its own.
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

In [None]:
# Cell 11: Compute and plot the confusion matrix for the test set
test_output = trainer.predict(tokenized_datasets["test"])
true_labels = test_output.label_ids
# Predicted class = index of the largest score along the last axis.
predictions = test_output.predictions.argmax(axis=-1)

cm = confusion_matrix(true_labels, predictions)

# Draw the matrix as an annotated heatmap: rows are true labels, columns are predictions.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix on Test Set")
plt.show()

In [None]:
# Cell 12: Extract training loss logs and plot average loss per epoch
log_history = trainer.state.log_history

# Training logs are written every `logging_steps` steps, so the "epoch" value of a
# log entry is fractional (e.g. 0.56, 1.11). Grouping by that raw value would put
# every entry in its own bucket and average nothing, so each entry is assigned to
# the epoch it falls in: (0, 1] -> 1, (1, 2] -> 2, and so on.
epoch_loss = {}
for entry in log_history:
    # Only training-loss entries carry a plain "loss" key.
    if "epoch" in entry and "loss" in entry:
        epoch = max(1, int(np.ceil(entry["epoch"])))
        epoch_loss.setdefault(epoch, []).append(entry["loss"])

epoch_loss_avg = {epoch: sum(losses) / len(losses) for epoch, losses in epoch_loss.items()}
sorted_epochs = sorted(epoch_loss_avg.keys())
sorted_loss = [epoch_loss_avg[epoch] for epoch in sorted_epochs]

plt.figure(figsize=(8,5))
plt.plot(sorted_epochs, sorted_loss, marker="o", linestyle="-")
plt.xticks(sorted_epochs)  # one tick per whole epoch
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.title("Average Training Loss vs. Epoch")
plt.grid(True)
plt.show()