In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install --upgrade transformers


In [None]:
pip install --upgrade transformers


In [None]:
import torch
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd

# Report whether torch can see a CUDA device (informational only).
print(f"Is GPU available? {torch.cuda.is_available()}")

# Read the competition's train and test tables.
train = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

# Collapse the three winner_* columns into one integer class id: for every row,
# idxmax(axis=1) yields the name of the column with the largest value, and the
# map turns that name into 0 / 1 / 2.
# NOTE(review): presumably exactly one winner_* column is set per row - confirm.
label_map = {'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2}
winner_columns = list(label_map)
train = train.assign(
    label=train[winner_columns].idxmax(axis=1).map(label_map)
)[['prompt', 'response_a', 'response_b', 'label']]

# Only the id and the three text fields are needed from the test table.
test = test[['id', 'prompt', 'response_a', 'response_b']]

# Tokenizer used for every row below.
# NOTE(review): the training cell loads "prajjwal1/bert-tiny" as the model;
# confirm that this tokenizer's vocabulary is the one that model expects.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenization function
def preprocess(example):
    """Tokenize one row as ``prompt <sep> response_a <sep> response_b``.

    Args:
        example: Mapping with ``'prompt'``, ``'response_a'`` and
            ``'response_b'`` entries. A missing value (``None`` or float NaN)
            is treated as an empty string so that one incomplete row does not
            abort the whole ``Dataset.map`` run; other non-string values are
            converted with ``str()``.

    Returns:
        The result of calling the module-level ``tokenizer`` on the joined
        text, padded and truncated to exactly 512 tokens.
    """
    def _as_text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return value if isinstance(value, str) else str(value)

    separator = tokenizer.sep_token
    text = separator.join(
        _as_text(example[field]) for field in ("prompt", "response_a", "response_b")
    )
    # The three segments are joined into ONE string before tokenization, so
    # truncation (right-sided unless the tokenizer is configured otherwise)
    # cuts from the tail: response_b is the first segment to lose tokens when
    # the combined text exceeds 512 tokens.
    return tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# Turn each frame into a Dataset and tokenize it one row at a time.
train_dataset = Dataset.from_pandas(train).map(preprocess, batched=False)
test_dataset = Dataset.from_pandas(test).map(preprocess, batched=False)

# Persist both tokenized datasets so later cells can reload them from disk.
for tokenized, target_dir in (
    (train_dataset, "/kaggle/working/tokenized_train_dataset"),
    (test_dataset, "/kaggle/working/tokenized_test_dataset"),
):
    tokenized.save_to_disk(target_dir)

print("Tokenization completed and datasets saved.")


In [None]:
# --- Imports
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_from_disk

# --- Check CPU/GPU (informational only)
print(f"Is GPU available? {torch.cuda.is_available()}")

# --- Load tokenized dataset
train_dataset = load_from_disk("/kaggle/working/tokenized_train_dataset")

# --- Fix labels if missing
# Fallback for a saved dataset that has no "label" column: rebuild the labels
# from the raw CSV and attach them by position.
if "label" not in train_dataset.features:
    import pandas as pd
    train_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
    label_map = {'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2}
    train_df['label'] = train_df[['winner_model_a', 'winner_model_b', 'winner_tie']].idxmax(axis=1).map(label_map)

    # Labels are matched to rows purely by position, so the row counts must agree.
    # NOTE(review): this also assumes the saved dataset keeps the CSV row order - confirm.
    if len(train_df) != len(train_dataset):
        raise ValueError(
            f"Cannot attach labels: train.csv has {len(train_df)} rows but the "
            f"tokenized dataset has {len(train_dataset)}."
        )

    # Drop only the raw-text columns that are actually present; remove_columns
    # raises if asked to drop a column the dataset does not have.
    raw_text_columns = [
        name for name in ("prompt", "response_a", "response_b")
        if name in train_dataset.column_names
    ]
    if raw_text_columns:
        train_dataset = train_dataset.remove_columns(raw_text_columns)
    train_dataset = train_dataset.add_column("label", train_df["label"].tolist())

# --- Set correct format
# Expose only the model inputs and the label, as torch tensors.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# --- Load a lightweight model
# Three output classes: model A wins / model B wins / tie.
# NOTE(review): the dataset loaded above was tokenized in an earlier cell with
# the "distilbert-base-uncased" tokenizer; confirm its vocabulary matches this
# checkpoint.
model_checkpoint = "prajjwal1/bert-tiny"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

# --- Training arguments
# `use_cpu=True` replaces the deprecated `no_cuda=True` (same effect: training
# is forced onto the CPU). `overwrite_output_dir` is no longer passed: like
# `no_cuda` it is not accepted by newer Transformers releases, and this notebook
# upgrades to the latest release before running.
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    learning_rate=2e-4,
    logging_steps=10,
    report_to="none",   # no external experiment trackers
    use_cpu=True,
    disable_tqdm=False,
)

# --- Trainer
# No eval dataset or metrics are configured: this run only fits the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# --- Start Training
# Status messages: the emoji had been stored as mis-decoded UTF-8 bytes and
# printed as garbage; they are restored here.
print("🚀 Starting Optimized Training...")
trainer.train()
print("🏁 Training Complete.")


In [None]:
import torch
import pandas as pd
from datasets import load_from_disk

# 1. Load the tokenized test set and expose only the model inputs as tensors
test_dataset = load_from_disk("/kaggle/working/tokenized_test_dataset")
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# 2. Run predictions
# Relies on `trainer` from the training cell still being alive in this kernel.
# (The status message's emoji/ellipsis were mis-decoded UTF-8; restored here.)
print("🚀 Predicting on test set…")
preds = trainer.predict(test_dataset)
logits = preds.predictions                     # one row per example, one column per class
# Softmax over the last axis turns each row of logits into class probabilities.
probs  = torch.softmax(torch.tensor(logits), -1).numpy()

# 3. Build submission DataFrame
# The ids are re-read from the raw CSV and paired with the probabilities by
# position, so both must have the same number of rows.
test_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
if len(test_df) != len(probs):
    raise ValueError(
        f"test.csv has {len(test_df)} rows but {len(probs)} predictions were produced."
    )
submission = pd.DataFrame({
    "id":             test_df["id"],
    "winner_model_a": probs[:, 0],
    "winner_model_b": probs[:, 1],
    "winner_tie":     probs[:, 2],
})

# 4. Save CSV
submission.to_csv("/kaggle/working/submission.csv", index=False)
# (The check-mark emoji had been stored as mis-decoded UTF-8; restored here.)
print("✅ submission.csv saved at /kaggle/working/submission.csv")
