In [None]:
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Read the competition's training split into a DataFrame.
train_df = pd.read_csv(
    "/kaggle/input/llm-classification-finetuning/train.csv"
)

In [None]:
# 1. Basic Feature Engineering: Text Length
# Character counts (len_*) and whitespace-separated token counts (words_*)
# for each response, computed with pandas' vectorized string accessor rather
# than a per-row Python call.
# Missing responses are replaced by '' first so they count as 0 characters /
# 0 words; calling len() or .split() on a NaN would raise instead.
train_df['len_a'] = train_df['response_a'].fillna('').str.len()
train_df['len_b'] = train_df['response_b'].fillna('').str.len()
# .str.split() with no pattern splits on runs of whitespace, like str.split().
train_df['words_a'] = train_df['response_a'].fillna('').str.split().str.len()
train_df['words_b'] = train_df['response_b'].fillna('').str.split().str.len()

In [None]:
# 2. Create a combined feature set (using difference here)
# Signed A-minus-B gaps: a positive value means response A is the longer one.
train_df['len_diff'] = train_df['len_a'].sub(train_df['len_b'])
train_df['words_diff'] = train_df['words_a'].sub(train_df['words_b'])

In [None]:
# Columns handed to the model; the same list is reused for the test set.
# NOTE(review): only the two character-length columns are selected -- the
# words_* and *_diff columns are not part of this list; confirm that is intended.
feature_cols = ['len_a', 'len_b']
X = train_df.loc[:, feature_cols]

In [None]:
# 3. Target Variable Transformation
def get_target(row):
    """Collapse a row's winner indicator columns into a single class id.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by
            'winner_model_a' and 'winner_model_b'.

    Returns:
        0 if 'winner_model_a' equals 1, otherwise 1 if 'winner_model_b'
        equals 1, otherwise 2 (neither model is flagged as the winner).
    """
    # Columns are checked in order, so model A takes precedence if both are set.
    for class_id, column in enumerate(('winner_model_a', 'winner_model_b')):
        if row[column] == 1:
            return class_id
    return 2

In [None]:
# Build the target vector by applying get_target to every training row.
y = train_df.apply(get_target, axis="columns")

In [None]:
# 4. Split Data
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# mix comparable in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# 5. Model Selection and Training
# One-vs-rest logistic regression using the liblinear solver: one binary
# classifier is fitted per class and predict_proba renormalises the per-class
# scores so each row sums to 1.
# The scheme is expressed with OneVsRestClassifier because the `multi_class`
# argument of LogisticRegression is deprecated since scikit-learn 1.5 and
# scheduled for removal; the wrapper is the replacement the scikit-learn
# documentation recommends for keeping a one-vs-rest fit.
model = OneVsRestClassifier(
    LogisticRegression(random_state=42, solver='liblinear')
)
model.fit(X_train, y_train)

In [None]:
# 6. Make Predictions and Evaluate
# Class probabilities for the held-out rows; columns follow model.classes_.
y_pred_proba = model.predict_proba(X_val)
# The `eps` argument is no longer passed: it was deprecated in scikit-learn 1.3
# and removed in 1.5 (passing it raises TypeError there); log_loss clips the
# probabilities itself. `labels` is pinned to model.classes_ so the probability
# columns are matched to the right classes even if y_val lacks one of them.
logloss = log_loss(y_val, y_pred_proba, labels=model.classes_)
print(f"Validation Log Loss: {logloss}")

In [None]:
# Read the competition's test split into a DataFrame.
test_df = pd.read_csv(
    "/kaggle/input/llm-classification-finetuning/test.csv"
)

In [None]:
# Apply the same feature engineering to the test data
# Character counts (len_*) and whitespace-separated token counts (words_*),
# computed with pandas' vectorized string accessor. Missing responses are
# replaced by '' first so they count as 0 characters / 0 words; calling len()
# or .split() on a NaN would raise instead.
test_df['len_a'] = test_df['response_a'].fillna('').str.len()
test_df['len_b'] = test_df['response_b'].fillna('').str.len()
# .str.split() with no pattern splits on runs of whitespace, like str.split().
test_df['words_a'] = test_df['response_a'].fillna('').str.split().str.len()
test_df['words_b'] = test_df['response_b'].fillna('').str.split().str.len()
# Signed A-minus-B gaps, matching the columns built for the training frame.
test_df['len_diff'] = test_df['len_a'] - test_df['len_b']
test_df['words_diff'] = test_df['words_a'] - test_df['words_b']

In [None]:
# Pick the same feature columns from the test frame that the model was fitted on.
X_test_numerical = test_df.loc[:, feature_cols]

# Per-class probabilities for every test row, from the fitted model.
test_pred_proba_numerical = model.predict_proba(X_test_numerical)

In [None]:
# Format the submission file
# One row per test id plus one probability column per outcome. Probability
# columns are taken by position: column 0 -> model A, 1 -> model B, 2 -> tie.
submission_df_numerical = pd.DataFrame({
    'id': test_df['id'],
    **{
        column: test_pred_proba_numerical[:, class_idx]
        for class_idx, column in enumerate(
            ('winner_model_a', 'winner_model_b', 'winner_tie')
        )
    },
})

In [None]:
# Quick visual check of the predicted probabilities for the test set.
print(test_pred_proba_numerical)

In [None]:
# Save the submission file (named 'submission.csv' here; another name would work too)
# Write the frame without the index column so the file holds only the
# submission columns.
submission_df_numerical.to_csv(path_or_buf="submission.csv", index=False)

# Confirmation message for the notebook log.
print("Submission file (numerical features only) created: submission.csv")
