In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input mount.
# os.walk yields nothing when the directory does not exist, so this is a no-op off-Kaggle.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Kaggle Notebook: LLM Classification Finetuning — working baseline
# File: llm_classification_finetuning_kaggle_notebook.py
# Instructions: Put this file into a Kaggle Notebook (Python) and run all cells.
# Make sure the competition data (train.csv, test.csv) is attached to the notebook
# (the Kaggle "Data" panel mounts it under /kaggle/input/llm-classification-finetuning/ by default).

# =========================
# Cell 1 — Imports
# =========================
import os

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer

# =========================
# Cell 2 — Paths & helper
# =========================
DEFAULT_INPUT_DIR = '/kaggle/input/llm-classification-finetuning'


def resolve_input_dir(default=DEFAULT_INPUT_DIR, search_root='/kaggle/input',
                      marker='train.csv', fallback_dir=os.curdir):
    """Return the directory that holds the competition CSV files.

    Candidates are tried in order; the first one that contains ``marker`` wins:

    1. ``default``;
    2. any directory at or below ``search_root`` (covers datasets mounted
       under a different folder name);
    3. ``fallback_dir`` (the current working directory unless overridden).

    Parameters
    ----------
    default : str
        Preferred location; also the value returned when nothing is found.
    search_root : str
        Directory tree to scan when ``default`` does not contain ``marker``.
        A non-existent root is simply skipped.
    marker : str
        File name whose presence identifies the data directory.
    fallback_dir : str
        Last location to try before giving up.

    Returns
    -------
    str
        The first matching directory, or ``default`` unchanged when no
        candidate contains ``marker`` (so the existence report below still
        points at the expected location).
    """
    if os.path.isfile(os.path.join(default, marker)):
        return default
    for dirpath, dirnames, filenames in os.walk(search_root):
        # Sorting in place steers os.walk's descent, making the pick deterministic.
        dirnames.sort()
        if marker in filenames:
            return dirpath
    if os.path.isfile(os.path.join(fallback_dir, marker)):
        return fallback_dir
    return default


INPUT_DIR = resolve_input_dir()

TRAIN_CSV = os.path.join(INPUT_DIR, 'train.csv')
TEST_CSV = os.path.join(INPUT_DIR, 'test.csv')
SAMPLE_SUB = os.path.join(INPUT_DIR, 'sample_submission.csv')

print('Looking for files in:', INPUT_DIR)
print('Exists train:', os.path.exists(TRAIN_CSV))
print('Exists test:', os.path.exists(TEST_CSV))

# =========================
# Cell 3 — Load data
# =========================
# Read the two competition tables (train first, then test) from the resolved paths.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Sanity report: dimensions of each frame, followed by the training column names.
for caption, frame in (('train shape', train), ('test shape', test)):
    print(caption, frame.shape)
print('Train columns:', train.columns.tolist())

# =========================
# Label extraction
# =========================
# Build the integer target vector `y` with the encoding 0 = model A wins,
# 1 = model B wins, 2 = tie. Two label layouts are supported.
ONE_HOT_LABEL_COLUMNS = ['winner_model_a', 'winner_model_b', 'winner_tie']

if all(c in train.columns for c in ONE_HOT_LABEL_COLUMNS):
    # Convert one-hot to single label. Rows start as class 0 and are
    # overwritten where the B / tie flags are set (tie is applied last).
    y = np.zeros(len(train), dtype=int)
    y[train['winner_model_b'] == 1] = 1
    y[train['winner_tie'] == 1] = 2

elif 'winner' in train.columns:
    # Map string labels to integers
    mapping = {'a': 0, 'b': 1, 'tie': 2}
    mapped = train['winner'].map(mapping)
    # Series.map leaves NaN for anything outside `mapping` (including missing
    # values), which would silently turn `y` into a float array. Fail here,
    # with the offending values, rather than deep inside model fitting.
    unmapped = mapped.isna()
    if unmapped.any():
        unknown = sorted(train.loc[unmapped, 'winner'].astype(str).unique())
        raise ValueError(
            f"Unrecognised values in 'winner' column: {unknown}; "
            f"expected one of {sorted(mapping)}"
        )
    y = mapped.to_numpy(dtype=int)

else:
    raise ValueError(
        f"Could not find winner columns. Got: {list(train.columns)}"
    )

# =========================
# Cell 4 — Simple feature engineering
# =========================
# Add one "prompt + ' ' + response" column per side to both frames. Missing
# prompts/responses are replaced by empty strings before concatenation.
for frame in (train, test):
    prompt_text = frame['prompt'].fillna('')
    for text_col, response_col in (('text_a', 'response_a'), ('text_b', 'response_b')):
        frame[text_col] = prompt_text + ' ' + frame[response_col].fillna('')

# Column-extracting transformers intended for an sklearn pipeline.
# NOTE(review): none of these is referenced by the rest of the notebook as shown.
get_a = FunctionTransformer(
    func=lambda df: df['text_a'].values,
    validate=False,
)
get_b = FunctionTransformer(
    func=lambda df: df['text_b'].values,
    validate=False,
)
get_prompt = FunctionTransformer(
    func=lambda df: df['prompt'].fillna('').values,
    validate=False,
)

# =========================
# Cell 5 — Vectorizers & pipeline
# =========================
MAX_FEATURES = 20000
vec = TfidfVectorizer(min_df=3, max_features=MAX_FEATURES, ngram_range=(1, 2), sublinear_tf=True)


def build_features(df, vectorizer):
    """Turn a frame with `text_a`, `text_b` and `prompt` columns into a sparse matrix.

    Each of the three text columns is transformed with the already-fitted
    ``vectorizer``. The result is ``[tfidf(text_a) - tfidf(text_b), tfidf(prompt)]``
    stacked horizontally in CSR format, so it has twice as many columns as the
    vectorizer has features.

    Missing prompts are replaced by ``''`` before the string cast; a plain
    ``astype(str)`` would turn them into the literal token ``"nan"``, which is
    inconsistent with how `text_a` / `text_b` treat missing values.
    """
    side_a = vectorizer.transform(df['text_a'].astype(str))
    side_b = vectorizer.transform(df['text_b'].astype(str))
    prompt = vectorizer.transform(df['prompt'].fillna('').astype(str))
    return sparse.hstack([side_a - side_b, prompt], format='csr')


# Fit vectorizer on combined corpus (training rows only, so no test vocabulary leaks in)
all_text = pd.concat([train['text_a'], train['text_b'], train['prompt'].fillna('')]).astype(str)
vec.fit(all_text)

# Same transformation for both splits, via the shared helper
X_train = build_features(train, vec)
X_test = build_features(test, vec)

print('X_train shape', X_train.shape)
print('X_test shape', X_test.shape)

# =========================
# Cell 6 — Train/validation split and model
# =========================
# Hold out 12% of the rows, stratified on the label, for a quick validation score.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y, test_size=0.12, random_state=42, stratify=y
)

# One set of hyperparameters shared by the validation model and the final model,
# so the reported validation score describes the model that is actually submitted.
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn, and with the lbfgs solver a 3-class target is fit as
# multinomial by default.
LOGREG_PARAMS = dict(solver='lbfgs', max_iter=400, C=1.0)

model = LogisticRegression(**LOGREG_PARAMS)
model.fit(X_tr, y_tr)

probs_val = model.predict_proba(X_val)
# Pass the label order explicitly so the probability columns are matched to
# model.classes_ rather than to whatever labels happen to occur in y_val.
vl = log_loss(y_val, probs_val, labels=model.classes_)
print(f'Validation log loss: {vl:.6f}')

# =========================
# Cell 7 — Train on full training data
# =========================
model_full = LogisticRegression(**LOGREG_PARAMS)
model_full.fit(X_train, y)

# =========================
# Cell 8 — Predict on test and create submission.csv
# =========================
probs_test = model_full.predict_proba(X_test)

# predict_proba orders its columns by model_full.classes_. Scatter them into a
# fixed (a, b, tie) layout by class id instead of assuming classes_ == [0, 1, 2];
# a class the model never saw then gets probability 0 rather than causing an
# IndexError or shifting the remaining columns.
probs_by_label = np.zeros((probs_test.shape[0], 3))
probs_by_label[:, model_full.classes_.astype(int)] = probs_test

submission = pd.DataFrame({
    'id': test['id'],
    'winner_model_a': probs_by_label[:, 0],
    'winner_model_b': probs_by_label[:, 1],
    'winner_tie': probs_by_label[:, 2]
})

submission.to_csv('submission.csv', index=False)
print('Wrote submission.csv — shape:', submission.shape)
# Each row is a probability distribution, so the first row should sum to 1 (up to rounding).
print('Row 0 sum:', submission[['winner_model_a','winner_model_b','winner_tie']].iloc[0].sum())
print(submission.head())

# End of notebook
