In [None]:
! ls /kaggle/input/dependencies/scikitlearn/default/1
! pip install sentence_transformers --no-index --find-links=file:///kaggle/input/dependencies/scikitlearn/default/1

In [None]:
import pickle, os
import numpy as np
import pandas as pd
from tqdm import tqdm
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from torch.utils.data import DataLoader

# Sentence-embedding model loaded once from a local directory at import time.
# It is read as a module-level global by extract_features().
model = SentenceTransformer('/kaggle/input/all-minilm-l6-v2/scikitlearn/default/1/all-MiniLM-L6-v2')

def load_csv(train=True):
    """Read one of the two competition CSV files into a DataFrame.

    Args:
        train: When truthy, the training CSV is read; otherwise the test CSV.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` for the chosen file.
    """
    csv_path = (
        "/kaggle/input/llm-classification-finetuning/train.csv"
        if train
        else "/kaggle/input/llm-classification-finetuning/test.csv"
    )
    return pd.read_csv(csv_path)

class EmbeddingDataset:
    """Thin index-addressable wrapper around a sequence of texts.

    Exposes ``len()`` and ``[]`` by delegating straight to the wrapped
    sequence, which is stored on ``self.texts``.
    """

    def __init__(self, texts):
        # Keep a reference to the caller's sequence; no copy is made.
        self.texts = texts

    def __getitem__(self, idx):
        """Return whatever the wrapped sequence yields for ``idx``."""
        item = self.texts[idx]
        return item

    def __len__(self):
        """Return the number of items in the wrapped sequence."""
        count = len(self.texts)
        return count

def batch_encode_texts(texts, model, batch_size=32):
    """Encode ``texts`` batch by batch and return all embeddings as one array.

    Args:
        texts: Sequence of texts; wrapped in an ``EmbeddingDataset`` so a
            ``DataLoader`` can serve it in order (``shuffle=False``).
        model: Object whose ``encode(batch, convert_to_tensor=True)`` returns
            a tensor supporting ``.cpu().numpy()``.
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        The per-batch embedding arrays stacked vertically with ``np.vstack``,
        in the same order as ``texts``.
    """
    loader = DataLoader(EmbeddingDataset(texts), batch_size=batch_size, shuffle=False)
    progress = tqdm(loader, desc="Batch Encoding", unit="batch")

    # Each chunk is moved to host memory right away so only NumPy arrays
    # are accumulated across batches.
    chunks = [
        model.encode(chunk, convert_to_tensor=True).cpu().numpy()
        for chunk in progress
    ]

    return np.vstack(chunks)

def _paired_cosine_similarity(left, right):
    """Return the cosine similarity between matching rows of two 2-D arrays.

    Row ``i`` of the result compares ``left[i]`` with ``right[i]`` only, so
    the cost is linear in the number of rows instead of building the full
    rows-by-rows similarity matrix and discarding everything off-diagonal.
    """
    left_norms = np.linalg.norm(left, axis=1)
    right_norms = np.linalg.norm(right, axis=1)
    # An all-zero row has no direction; dividing by 1 instead of 0 makes its
    # similarity 0 rather than NaN.
    left_norms[left_norms == 0] = 1.0
    right_norms[right_norms == 0] = 1.0
    row_dots = np.einsum("ij,ij->i", left, right)
    return row_dots / (left_norms * right_norms)


def extract_features(df, train=False):
    """Build the per-row feature matrix for prompt/response pairs.

    Args:
        df: DataFrame with ``prompt``, ``response_a`` and ``response_b``
            columns.
        train: Currently unused; kept so existing callers keep working.

    Returns:
        A 2-D array with one row per row of ``df`` and seven columns, in
        this order: prompt/response_a similarity, prompt/response_b
        similarity, response_a/response_b similarity, sentiment polarity of
        response_a, sentiment polarity of response_b, whitespace-token count
        of response_a, whitespace-token count of response_b.
    """
    prompts = df["prompt"].tolist()
    responses_a = df["response_a"].tolist()
    responses_b = df["response_b"].tolist()

    # Batch embedding extraction (uses the module-level ``model``)
    prompt_embeddings = batch_encode_texts(prompts, model)
    response_a_embeddings = batch_encode_texts(responses_a, model)
    response_b_embeddings = batch_encode_texts(responses_b, model)

    # Sentiment analysis and verbosity
    sentiment_a = [TextBlob(text).sentiment.polarity for text in responses_a]
    sentiment_b = [TextBlob(text).sentiment.polarity for text in responses_b]
    verbosity_a = [len(text.split()) for text in responses_a]
    verbosity_b = [len(text.split()) for text in responses_b]

    # Row-wise cosine similarities. Only the pairing (row i, row i) is
    # needed, so the full N x N matrix is never materialised; values agree
    # with the diagonal of a full similarity matrix up to float rounding.
    similarity_a = _paired_cosine_similarity(prompt_embeddings, response_a_embeddings)
    similarity_b = _paired_cosine_similarity(prompt_embeddings, response_b_embeddings)
    similarity_a_b = _paired_cosine_similarity(response_a_embeddings, response_b_embeddings)

    # Stack features
    features = np.column_stack([
        similarity_a, similarity_b, similarity_a_b,
        sentiment_a, sentiment_b,
        verbosity_a, verbosity_b,
    ])

    return features

def test_model():
    """Score the test CSV with the pickled classifier and write ``submission.csv``.

    Loads the test set, unpickles the classifier and label encoder, extracts
    features, takes the classifier's class probabilities and writes them with
    the row ``id`` to ``submission.csv`` in the current working directory.

    Returns:
        None. The only output is the CSV file.
    """
    test = load_csv(train=False)

    # NOTE: unpickling executes arbitrary code from the file. This is only
    # acceptable because these artifacts are assumed to be trusted outputs of
    # this project's own training run.
    # The files are opened via ``with`` so the handles are closed promptly.
    with open("/kaggle/input/model/scikitlearn/default/3/model.pkl", "rb") as model_file:
        clf: LGBMClassifier = pickle.load(model_file)
    # NOTE(review): the label encoder is loaded but never used below; the
    # code assumes probability columns 0/1/2 mean model_a/model_b/tie.
    # Confirm against the encoder's class order.
    with open("/kaggle/input/model/scikitlearn/default/3/label_encoder.pkl", "rb") as encoder_file:
        label_encoder: LabelEncoder = pickle.load(encoder_file)

    X_test = extract_features(test)
    test_predictions = clf.predict_proba(X_test)

    test["winner_model_a"] = test_predictions[:, 0]
    test["winner_model_b"] = test_predictions[:, 1]
    test["winner_tie"] = test_predictions[:, 2]

    submission = test[["id", "winner_model_a", "winner_model_b", "winner_tie"]]
    # NOTE(review): the last column is renamed from ``winner_tie`` to
    # ``winner_model_tie`` before writing; confirm this header is what the
    # consumer of submission.csv expects.
    submission.columns = ["id", "winner_model_a", "winner_model_b", "winner_model_tie"]
    submission.to_csv("submission.csv", index=False)

# Script entry: announce start (flushed immediately so it appears in the log
# before the long-running work begins), then run inference end to end.
print("Starting", flush=True)
test_model()