In [1]:
TRAIN_PATH = r"C:\Users\USER\Downloads\llm-classification-finetuning\train.csv"
TEST_PATH = r"C:\Users\USER\Downloads\llm-classification-finetuning\test.csv"
SUB_PATH = r"C:\Users\USER\Downloads\llm-classification-finetuning\sample_submission.csv"

TEXT_TEMPLATE = """[PROMPT]
{prompt}

[RESPONSE A]
{response_a}

[RESPONSE B]
{response_b}
"""

EMBEDDING_TEMPLATE = "sentence-transformers/all-MiniLM-L6-v2"
RANDOM_STATE = 42

In [2]:
import pandas as pd

def load_data(path):
    return pd.read_csv(path)

def build_text(df):
    return df.apply(
        lambda x: TEXT_TEMPLATE.format(
            prompt=x["prompt"], 
            response_a=x["response_a"],
            response_b=x["response_b"]
        ),
        axis=1
    )

def build_labels(df):
    return df[
        ["winner_model_a", "winner_model_b", "winner_tie"]
    ].values.argmax(axis=1)

In [3]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

def embed_texts(texts, model_name):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True,
        normalize_embeddings=True
    )
    return np.array(embeddings)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

df = load_data(TRAIN_PATH)

texts = build_text(df)
labels = build_labels(df)

X = embed_texts(texts, EMBEDDING_TEMPLATE)

X_train, X_val, y_train, y_val = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

clf = LogisticRegression(
    multi_class="multinomial",
    max_iter=1000
)
clf.fit(X_train, y_train)

val_preds = clf.predict_proba(X_val)
print("Validation Log Loss:", log_loss(y_val, val_preds))

Batches:   0%|          | 0/1797 [00:00<?, ?it/s]



Validation Log Loss: 1.0848973441481768


In [5]:
from joblib import dump, load

from joblib import dump
dump(clf, "model.joblib")

['model.joblib']

In [None]:
from joblib import load

clf = load("model.joblib")

test_df = load_data(TEST_PATH)
texts = build_text(test_df)

X_test = embed_texts(texts, EMBEDDING_TEMPLATE)

probs = clf.predict_proba(X_test)

sub = pd.read_csv(SUB_PATH)
sub[["winner_model_a", "winner_model_b", "winner_tie"]] = probs

sub.to_csv("submission.csv", index=False)

print("Predictions saved to submission.csv")

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 9f085e8d-864a-44b6-a8c3-725e771d5c19)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

KeyError: "None of [Index(['winner_model_a', 'winner_model_b', 'winner_tie'], dtype='object')] are in the [columns]"