In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input directory, then echo
# each full path so the available datasets show up in the cell output.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# ==============================
# Load data
# ==============================
# Read the training table first and the test table second; both paths are
# kept verbatim so the notebook keeps pointing at the same Kaggle dataset.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/llm-classification-finetuning/train.csv",
        "/kaggle/input/llm-classification-finetuning/test.csv",
    )
)

# ==============================
# Preprocess text
# ==============================
def _combine_text(frame):
    """Join the prompt and both responses of each row into one string.

    Missing cells are replaced with an empty string before the string
    conversion. A bare ``astype(str)`` would turn NaN into the literal word
    "nan", which would then be fed to the vectorizer as if it were real text.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the columns ``prompt``, ``response_a`` and ``response_b``.

    Returns
    -------
    pd.Series
        One space-separated string per row, aligned with ``frame.index``.
    """
    prompt, response_a, response_b = (
        frame[column].fillna("").astype(str)
        for column in ("prompt", "response_a", "response_b")
    )
    return prompt + " " + response_a + " " + response_b


# Apply the identical preprocessing to both splits.
train["text"] = _combine_text(train)
test["text"] = _combine_text(test)

# ==============================
# Targets
# ==============================
# Collapse the three winner columns into one integer label per row: the
# position (0, 1 or 2) of the largest value, in the column order listed here.
label_columns = ["winner_model_a", "winner_model_b", "winner_tie"]
y = train[label_columns].to_numpy().argmax(axis=1)

# ==============================
# Vectorizer
# ==============================
# TF-IDF over unigrams and bigrams, capped at 5000 features, with the
# built-in English stop-word list.
tfidf_options = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_options)

# ==============================
# Models
# ==============================
# Two base learners; the voting ensemble uses soft voting over their
# predicted class probabilities.
log_reg = LogisticRegression(C=2.0, class_weight="balanced", max_iter=2000)
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
)

base_estimators = [("lr", log_reg), ("rf", rf)]
clf = VotingClassifier(estimators=base_estimators, voting="soft")

# ==============================
# Train
# ==============================
# Fit the vectorizer on the training text and encode it in the same call,
# then fit the ensemble on the resulting feature matrix.
train_corpus = train["text"]
X = vectorizer.fit_transform(train_corpus)
clf.fit(X, y)

# ==============================
# Predict on test
# ==============================
# Encode the test text with the already-fitted vectorizer (transform only,
# no refit) and ask the ensemble for per-class probabilities.
test_corpus = test["text"]
X_test = vectorizer.transform(test_corpus)
probs = clf.predict_proba(X_test)

# ==============================
# Submission
# ==============================
# Look up each probability column by class label rather than by position.
# predict_proba orders its columns like clf.classes_, so if a label never
# occurred in y a hard-coded probs[:, 2] would raise IndexError.
target_columns = ["winner_model_a", "winner_model_b", "winner_tie"]
column_of_label = {int(label): idx for idx, label in enumerate(clf.classes_)}

submission = pd.DataFrame({"id": test["id"]})
for label, column_name in enumerate(target_columns):
    idx = column_of_label.get(label)
    # A label the ensemble never saw gets probability 0 for every row.
    submission[column_name] = probs[:, idx] if idx is not None else 0.0


# ==============================
# Show preview
# ==============================
print(submission.head())


In [None]:
from IPython.display import display
# Render the first rows of the submission frame through IPython's display.
submission_preview = submission.head()
display(submission_preview)


In [None]:
# Write the submission to a CSV in the current working directory, leaving
# out the DataFrame index column.
output_path = "submission.csv"
submission.to_csv(output_path, index=False)
