In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Directory holding the competition CSV files.
data_path = "/kaggle/input/llm-classification-finetuning/"

train_df = pd.read_csv(data_path + "train.csv")
test_df = pd.read_csv(data_path + "test.csv")
sample_submission = pd.read_csv(data_path + "sample_submission.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Sample submission shape:", sample_submission.shape)

# Peek at first few rows.
# A bare `train_df.head()` is only rendered when it is the last expression of
# a cell; in the middle of a cell it is silently discarded, so print it.
print(train_df.head())

# Check label distribution (each winner_* column is a 0/1 indicator, so the
# column sum is the number of rows with that outcome).
print("\nLabel distribution:")
print(train_df['winner_model_a'].sum(), "A wins")
print(train_df['winner_model_b'].sum(), "B wins")
print(train_df['winner_tie'].sum(), "ties")

# Count NaNs
print("\nMissing values in train:")
print(train_df.isnull().sum())

# Check basic stats for response lengths.
# Measure the response text columns; model_a / model_b hold the model names,
# so their string length says nothing about the responses.
train_df['len_a'] = train_df['response_a'].str.len()
train_df['len_b'] = train_df['response_b'].str.len()
print("\nLength stats for A and B:")
print(train_df[['len_a', 'len_b']].describe())


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb

# Paths
# Paths
data_path = "/kaggle/input/llm-classification-finetuning/"

# Load (re-read from disk so this cell does not depend on columns added to
# the frames by earlier cells)
train_df = pd.read_csv(data_path + "train.csv")
test_df = pd.read_csv(data_path + "test.csv")
sample_submission = pd.read_csv(data_path + "sample_submission.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Encode model_a / model_b -- but only the columns the test set really has.
# Padding a missing column with a placeholder and encoding it with
# handle_unknown='ignore' gives every test row an all-zero one-hot vector, a
# pattern that never occurs in training (each training row has exactly one
# active category per column). The classifier would then be fitted on
# features that are unavailable at prediction time.
model_cols = [col for col in ['model_a', 'model_b'] if col in test_df.columns]
if model_cols:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    model_features_train = ohe.fit_transform(train_df[model_cols])
    model_features_test = ohe.transform(test_df[model_cols])
else:
    ohe = None
    # Zero-width blocks keep the names defined and stack cleanly with the
    # other (n, 1) feature columns in np.hstack.
    model_features_train = np.empty((len(train_df), 0))
    model_features_test = np.empty((len(test_df), 0))

# TF-IDF similarity between prompt and responses
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on all text fields (train and test) to build one shared vocabulary
vectorizer.fit(pd.concat([
    train_df['prompt'], train_df['response_a'], train_df['response_b'],
    test_df['prompt'], test_df['response_a'], test_df['response_b']
]))

# Compute cosine similarity features
def _rowwise_cosine(left, right):
    """Cosine similarity between matching rows of two sparse matrices.

    Both inputs must have the same shape. A row pair where either row has
    zero norm gets a similarity of 0.

    Returns a 1-D float array with one value per row.
    """
    dots = np.asarray(left.multiply(right).sum(axis=1), dtype=float).ravel()
    left_norm = np.sqrt(np.asarray(left.multiply(left).sum(axis=1), dtype=float).ravel())
    right_norm = np.sqrt(np.asarray(right.multiply(right).sum(axis=1), dtype=float).ravel())
    denom = left_norm * right_norm
    # Leave 0.0 wherever the denominator is zero instead of dividing by zero.
    return np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)


def compute_similarity(df, text_vectorizer=None):
    """Cosine similarity between each prompt and its two responses.

    The similarity is computed for all rows at once with sparse element-wise
    products rather than one `cosine_similarity` call per row.

    Parameters
    ----------
    df : DataFrame with 'prompt', 'response_a' and 'response_b' text columns.
    text_vectorizer : optional object exposing `transform(texts)` that returns
        a sparse matrix with one row per text. Defaults to the module-level
        `vectorizer`.

    Returns
    -------
    (sim_a, sim_b) : two arrays of shape (len(df), 1) holding the
        prompt/response_a and prompt/response_b similarities.
    """
    if text_vectorizer is None:
        text_vectorizer = vectorizer
    prompt_vec = text_vectorizer.transform(df['prompt'])
    resp_a_vec = text_vectorizer.transform(df['response_a'])
    resp_b_vec = text_vectorizer.transform(df['response_b'])
    sim_a = _rowwise_cosine(prompt_vec, resp_a_vec)
    sim_b = _rowwise_cosine(prompt_vec, resp_b_vec)
    return sim_a.reshape(-1, 1), sim_b.reshape(-1, 1)

# Prompt/response similarity columns for both splits
sim_a_train, sim_b_train = compute_similarity(train_df)
sim_a_test, sim_b_test = compute_similarity(test_df)


def _char_length_column(text_series):
    """Return the per-row string length of `text_series` as an (n, 1) array."""
    return text_series.str.len().values.reshape(-1, 1)


# Length features
len_a_train = _char_length_column(train_df['response_a'])
len_b_train = _char_length_column(train_df['response_b'])
len_a_test = _char_length_column(test_df['response_a'])
len_b_test = _char_length_column(test_df['response_b'])

# Final feature matrices: model one-hot block, then similarities, then lengths
train_blocks = [model_features_train, sim_a_train, sim_b_train, len_a_train, len_b_train]
test_blocks = [model_features_test, sim_a_test, sim_b_test, len_a_test, len_b_test]
X_train = np.hstack(train_blocks)
X_test = np.hstack(test_blocks)

# Target: collapse the three 0/1 indicator columns into one class label.
# The column order fixes the mapping: 0=A, 1=B, 2=tie.
label_cols = ['winner_model_a', 'winner_model_b', 'winner_tie']
y_train = train_df[label_cols].values.argmax(axis=1)

# Train LightGBM multiclass model
params = dict(
    objective='multiclass',
    num_class=3,
    metric='multi_logloss',
    learning_rate=0.05,
    num_leaves=31,
    verbose=-1,
)

dtrain = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params, dtrain, num_boost_round=200)

# Predict probabilities; the prediction columns are labelled in the same
# order as the class mapping above
preds = model.predict(X_test)

# Format submission: id first, then the three probability columns
submission = pd.DataFrame(preds, columns=label_cols)
submission.insert(loc=0, column='id', value=test_df['id'])
submission.to_csv('submission.csv', index=False)

print("Submission saved to submission.csv")
