In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
import re

train = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
sample_submission = pd.read_csv('/kaggle/input/llm-classification-finetuning/sample_submission.csv')
train.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [3]:
# Step 3: Data Cleaning and Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

train['prompt'] = train['prompt'].apply(preprocess_text)
train['response_a'] = train['response_a'].apply(preprocess_text)
train['response_b'] = train['response_b'].apply(preprocess_text)

test['prompt'] = test['prompt'].apply(preprocess_text)
test['response_a'] = test['response_a'].apply(preprocess_text)
test['response_b'] = test['response_b'].apply(preprocess_text)

In [4]:
# Step 4: Feature Engineering
# Using TF-IDF (Term Frequency-Inverse Document Frequency) to transform textual data into numerical features.
tfidf = TfidfVectorizer(max_features=1000)
train['combined_text_a'] = train['prompt'] + " " + train['response_a']
train['combined_text_b'] = train['prompt'] + " " + train['response_b']

tfidf_train_a = tfidf.fit_transform(train['combined_text_a'])
tfidf_train_b = tfidf.transform(train['combined_text_b'])
X_train = np.hstack((tfidf_train_a.toarray(), tfidf_train_b.toarray()))
y_train = train[['winner_model_a', 'winner_model_b', 'winner_tie']].values
test['combined_text_a'] = test['prompt'] + " " + test['response_a']
test['combined_text_b'] = test['prompt'] + " " + test['response_b']

tfidf_test_a = tfidf.transform(test['combined_text_a'])
tfidf_test_b = tfidf.transform(test['combined_text_b'])

X_test = np.hstack((tfidf_test_a.toarray(), tfidf_test_b.toarray()))

In [5]:
train['label'] = train.apply(lambda x: 1 if x['winner_model_a'] == 1 else (0 if x['winner_model_b'] == 1 else -1), axis=1)
y_train = train['label'].values 
print("Unique labels in y_train:", np.unique(y_train))

Unique labels in y_train: [-1  0  1]


In [6]:
# Step 5: Model Selection and Training
# I’ll use a Logistic Regression model, as it’s effective for binary/multiclass classification with probability outputs.
model = LogisticRegression(max_iter=1000, multi_class='multinomial')
model.fit(X_train, y_train)

In [7]:
# Step 6: Model Evaluation - Cross-Validation and Log Loss Calculation
# Evaluating the model using log loss on a validation set.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
val_preds = model.predict_proba(X_val_split)
log_loss_score = log_loss(y_val_split, val_preds)
print(f'Log Loss on Validation Set: {log_loss_score}')

Log Loss on Validation Set: 1.0094023279049487


In [8]:
# Step 7: Predictions on Test Set
test_preds = model.predict_proba(X_test)
submission = pd.DataFrame({
    'id': test['id'],
    'winner_model_a': test_preds[:, 0],
    'winner_model_b': test_preds[:, 1],
    'winner_tie': test_preds[:, 2]
})
submission.to_csv('submission.csv', index=False)