In [None]:
# %pip install evaluate
# %pip download evaluate
# %pip download evaluate -d wheels/
# from evaluate import load
# load('accuracy')
# accuracy = load("/kaggle/input/import-evaluate/wheels/evaluate/metrics/accuracy")

In [None]:
# %pip install --no-index --find-links=/kaggle/input/import-evaluate/wheels evaluate

In [None]:
# from sklearn.metrics import accuracy_score

# y_true = [0, 1, 0, 1]
# y_pred = [0, 0, 0, 1]

# print("Accuracy:", accuracy_score(y_true, y_pred))


In [None]:
import pandas as pd
# from evaluate import load
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch

# -----------------------
# 1. Load Data
# -----------------------
# The competition CSVs are read from the Kaggle input mount; the submission
# file is written to the Kaggle working directory.
out_path = "/kaggle/working/submission.csv"

train = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
test = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")

# For a quick smoke test, subsample here (e.g. train = train[:75]).

# Class index assigned to each of the three winner indicator columns.
mapping = {"winner_model_a": 0, "winner_model_b": 1, "winner_tie": 2}

# Collapse the three winner columns into a single target column: first the
# name of the column holding the row-wise maximum, then its class index.
train["y"] = train[list(mapping)].idxmax(axis=1)
print(train["y"])
train["y"] = train["y"].map(mapping)

TARGET = "y"
ID_COL = "id"
# Columns that must never be used as model input: the winner indicators and
# the model_a / model_b columns.
label_list = ["winner_model_a", "winner_model_b", "winner_tie", "model_a", "model_b"]

# Every remaining train column (in original column order) is a feature.
_non_feature_columns = {ID_COL, TARGET, *label_list}
FEATURES = [col for col in train.columns if col not in _non_feature_columns]

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Features:", FEATURES)

# -----------------------
# 2. Prepare Text Feature
# -----------------------
# Each row's feature columns are cast to str and joined with single spaces
# into one "text" column, which is what the tokenizer consumes.
train["text"] = train[FEATURES].astype(str).agg(" ".join, axis=1)
test["text"] = test[FEATURES].astype(str).agg(" ".join, axis=1)

# Hold out 20% of the training rows for validation, stratified on the target
# with a fixed seed so the split is reproducible.
train_df, valid_df = train_test_split(train, test_size=0.2, stratify=train[TARGET], random_state=42)

# Convert to HuggingFace Datasets. preserve_index=False matters for the two
# split frames: they carry a shuffled, non-default index that from_pandas
# would otherwise store as an extra "__index_level_0__" column.
dataset = DatasetDict({
    "train": Dataset.from_pandas(
        train_df[["text", TARGET]], preserve_index=False
    ).rename_column(TARGET, "labels"),
    "validation": Dataset.from_pandas(
        valid_df[["text", TARGET]], preserve_index=False
    ).rename_column(TARGET, "labels"),
    "test": Dataset.from_pandas(test[["text", ID_COL]], preserve_index=False),
})
# Inspect one example with: print(dataset["train"][0])
# -----------------------
# 3. Tokenizer
# -----------------------
# MODEL is a local checkpoint directory on the Kaggle input mount; the
# tokenizer is loaded from it here and the classifier is loaded from the same
# path in the model section. Hub checkpoints tried previously:
#   distilbert-base-uncased
#   huawei-noah/TinyBERT_General_4L_312D
#   ginnigarg/binary-classification-kaggle-tiny-bert
MODEL = "/kaggle/input/tiny-bert/transformers/default/1/results/checkpoint-1080"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenize(batch):
    """Tokenize the "text" entries of a batch with the module-level tokenizer.

    Every sequence is truncated to at most 128 tokens and padded up to that
    same fixed length. Returns whatever the tokenizer call returns.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    return encoded

# Tokenize every split in batches; the raw "text" column is dropped once it
# has been encoded.
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

# The two labelled splits expose only the model inputs and the labels, in
# "torch" format, for the Trainer.
for split_name in ("train", "validation"):
    dataset[split_name].set_format(
        "torch",
        columns=["input_ids", "attention_mask", "labels"],
        output_all_columns=False,
    )


# -----------------------
# 4. Model
# -----------------------
# One output class per entry in `mapping` (model A wins / model B wins / tie).
# The count comes from the mapping rather than from the labels observed in
# `train`, so a subsample that happens to miss a class still yields a head
# that matches the three probability columns of the submission.
num_labels = len(mapping)

# NOTE(review): if the checkpoint's classification head was saved with a
# different number of labels, from_pretrained is expected to fail with a size
# mismatch - confirm the checkpoint has a 3-way head (or pass
# ignore_mismatched_sizes=True to re-initialise the head).
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

# Batch size shared by training, evaluation and test-time inference.
BATCH_SIZE = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Place the model on the selected device (GPU when one is available).
model.to(device)
# Start in eval mode; this only matters if the model is used before training.
model.eval()

# -----------------------
# 5. Training Setup
# -----------------------
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best "accuracy" (the key returned by compute_metrics).
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Metrics: accuracy is computed with scikit-learn's accuracy_score; the
# `evaluate` package is not used.
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class predictions.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each example is the index of its largest logit along the last axis.

    Returns a dict with a single ``"accuracy"`` entry.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {"accuracy": accuracy_score(references, predicted)}

# -----------------------
# 6. Trainer
# -----------------------
# NOTE(review): newer transformers releases rename the `tokenizer` argument
# to `processing_class` - confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=dataset["validation"],
    train_dataset=dataset["train"],
)

# -----------------------
# 7. Train
# -----------------------
# Fine-tune on the train split; the validation split is the eval dataset.
trainer.train()

# -----------------------
# 8. Predictions on Test
# -----------------------
# Run the model over the raw test texts in mini-batches and collect one row
# of class probabilities per test example.
test_texts = test["text"].tolist()
prob_batches = []

# Switch to eval mode explicitly so inference does not depend on whichever
# mode training left the model in.
model.eval()

with torch.no_grad():
    for start in range(0, len(test_texts), BATCH_SIZE):
        batch_texts = test_texts[start:start + BATCH_SIZE]
        batch_encodings = tokenizer(
            batch_texts,
            truncation=True,
            padding=True,  # pad only to the longest sequence in this batch
            max_length=128,
            return_tensors="pt",
        )

        # Move batch tensors to the same device as the model.
        batch_encodings = {k: v.to(device) for k, v in batch_encodings.items()}

        outputs = model(**batch_encodings)
        # Softmax over the class axis -> one probability per class.
        probs = torch.softmax(outputs.logits, dim=-1)
        prob_batches.append(probs.cpu().numpy())  # back to CPU for storage

# Stack the per-batch arrays into a single (n_examples, n_classes) array.
# An empty test set yields an empty array instead of an error.
if prob_batches:
    all_probs = np.concatenate(prob_batches, axis=0)
else:
    all_probs = np.empty((0, model.config.num_labels), dtype=np.float32)

# Summarise rather than dumping every row to the notebook output.
print("Test probabilities shape:", all_probs.shape)
if len(all_probs):
    print(all_probs[0])
        
# -----------------------
# 9. Save Submission
# -----------------------
# Probability column i holds the score of class index i, so the column names
# are taken from `mapping` ordered by class index
# (winner_model_a, winner_model_b, winner_tie).
prob_columns = sorted(mapping, key=mapping.get)
submission = pd.DataFrame(all_probs, columns=prob_columns)

# Put the id first and attach it by position rather than by index label, so
# a length mismatch raises instead of silently producing NaN ids.
submission.insert(0, ID_COL, test[ID_COL].to_numpy())
submission.to_csv(out_path, index=False)

print("✅ Saved submission.csv")
