In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# in the order os.walk yields the directories.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Load

In [None]:
# Load train.csv from the llm-classification-finetuning input dataset.
df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')

In [None]:
# Preview the first rows of the training frame.
df.head()

# Pre-processing

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
def target_col(row):
    """Collapse the three winner columns of one row into a single class id.

    Encoding: 0 = A wins, 1 = B wins, 2 = tie.

    The columns are checked in that order and the first one equal to 1 decides
    the result. Returns None when none of the three columns equals 1.
    """
    # Position in this tuple is the class id returned for that column.
    winner_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    for class_id, column in enumerate(winner_columns):
        if row[column] == 1:
            return class_id
    return None

# Run target_col on every row (axis=1) and store the class id in a new 'label' column.
df['label'] = df.apply(target_col, axis=1)

In [None]:
# Preview the frame again, now including the 'label' column.
df.head()

## Text encoding using sentence transformer (all-MiniLM-L6-v2)

In [None]:
from sentence_transformers import SentenceTransformer
# Load the all-MiniLM-L6-v2 sentence-transformer; every encode() call in this notebook uses it.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Encode the prompt, response_a and response_b columns (one cell per column).
# This cell embeds every response_a text.
emb_a = model.encode(df['response_a'].tolist())

In [None]:
# Embed every response_b text.
emb_b = model.encode(df['response_b'].tolist())

In [None]:
# Embed every prompt text. Note: `prompt` holds the embeddings, not the raw strings.
prompt = model.encode(df['prompt'].tolist())

# Method 1: sentence transformer -> cosine similarity -> ML 

## Sample Demonstration 

In [None]:
# Consider the 3rd row (positional index 2): show its response_a text.
df['response_a'].iloc[2]

In [None]:
# Show the response_b text of the row at positional index 2.
df['response_b'].iloc[2]

In [None]:
# Take the two demo responses straight from the 3rd row (positional index 2)
# instead of keeping pasted copies of them in the notebook. The demo then always
# matches the loaded data and uses exactly the strings that the bulk encoding
# cells pass to model.encode.
a = df['response_a'].iloc[2]
b = df['response_b'].iloc[2]

In [None]:
# Embed each demo text on its own; encode() is given a one-element list,
# so [0] picks out the single resulting vector.
encode_a = model.encode([a])[0]
encode_b = model.encode([b])[0]

In [None]:
from scipy.spatial import distance
# distance.cosine returns a cosine distance, so 1 - distance is the cosine similarity.
similarity_score = 1 - distance.cosine(encode_b, encode_a)
print(f'similarity score of a and b is {similarity_score}')

In [None]:
# Show the prompt text of the row at positional index 2.
df['prompt'].iloc[2]

In [None]:
# Use the prompt of the 3rd row (positional index 2) directly from the frame
# rather than a pasted copy, so the demo cannot drift from the loaded data.
p = df['prompt'].iloc[2]
# encode() is given a one-element list; [0] picks out the single embedding vector.
encode_p = model.encode([p])[0]

In [None]:
# Cosine similarity (1 - cosine distance) of the demo prompt to response A (ss)
# and to response B (s3).
ss = 1 - distance.cosine(encode_p, encode_a)
s3 = 1 - distance.cosine(encode_p, encode_b)
print(ss, s3)

In [None]:
# Embedding of the first row's prompt (note: named `a1` although it is the prompt).
a1 = model.encode([df['prompt'].iloc[0]])[0]

In [None]:
# Embedding of the first row's response_a (note: named `p1` although it is a response).
p1 = model.encode([df['response_a'].iloc[0]])[0]

In [None]:
# Cosine similarity between a1 (first-row prompt embedding) and p1 (first-row
# response_a embedding); the bare `s4` displays the value as the cell output.
s4 = 1 - distance.cosine(a1, p1)
s4

## Model

In [None]:
from scipy.spatial.distance import cosine
# Row-wise cosine similarity between the three embedding matrices, computed with
# vectorised NumPy instead of a per-row Python loop. Besides being faster, this
# no longer rebinds the notebook-level names `p`, `a` and `b` (the demo texts)
# as loop variables.
# Assumes prompt, emb_a and emb_b are 2-D arrays of shape (n_rows, dim) with no
# all-zero rows (model.encode output for a list of texts) - TODO confirm.
prompt_norm = np.linalg.norm(prompt, axis=1)
emb_a_norm = np.linalg.norm(emb_a, axis=1)
emb_b_norm = np.linalg.norm(emb_b, axis=1)

# cosine similarity = <u, v> / (|u| * |v|); einsum 'ij,ij->i' is the per-row dot product.
# The similarities_* names are kept, but now hold 1-D arrays rather than lists.
similarities_prompt_a = np.einsum('ij,ij->i', prompt, emb_a) / (prompt_norm * emb_a_norm)
similarities_prompt_b = np.einsum('ij,ij->i', prompt, emb_b) / (prompt_norm * emb_b_norm)
similarities_a_b = np.einsum('ij,ij->i', emb_a, emb_b) / (emb_a_norm * emb_b_norm)

# Store results in DataFrame
df['sim_prompt_a'] = similarities_prompt_a
df['sim_prompt_b'] = similarities_prompt_b
df['sim_a_b'] = similarities_a_b

In [None]:
# Preview the frame with the three similarity columns added.
df.head()

### Splitting the data 

In [None]:
from sklearn.model_selection import train_test_split
# Method 1 feature matrix: the three cosine-similarity columns.
X = df[['sim_prompt_a', 'sim_prompt_b', 'sim_a_b']]
# Hold out 20% of the rows for evaluation. random_state is fixed so that the
# split, and therefore every log loss computed from it, is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, df['label'], test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the Method 1 features. n_jobs=-1 builds the trees on all
# available cores; with random_state fixed the fitted forest is the same as
# with the single-threaded default.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import log_loss

# Per-class probabilities from the random forest on the held-out rows,
# scored with log loss.
y_probs_rf = clf.predict_proba(X_test)
rf_log_loss = log_loss(y_test, y_probs_rf)
print("Log Loss:", rf_log_loss)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the same Method 1 features, allowing up to 1000 solver iterations.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# Score the logistic-regression probabilities on the held-out rows with log loss.
y_lr_probs = lr.predict_proba(X_test)
lr_log_loss = log_loss(y_test, y_lr_probs)
print(lr_log_loss)

In [None]:
import lightgbm as lgb
# LightGBM on the Method 1 features; hyper-parameters are collected in one dict
# so they are easy to read and tweak.
lgb_params = {
    'num_leaves': 31,
    'max_depth': -1,
    'learning_rate': 0.05,
    'n_estimators': 100,
}
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train)

# Score the predicted class probabilities on the held-out rows with log loss.
y_lgb_probs = lgb_model.predict_proba(X_test)
lgb_log_loss = log_loss(y_test, y_lgb_probs)
print(lgb_log_loss)

# Method 2: sentence transformer -> differences -> ML

In [None]:
# Method 2 features: element-wise absolute differences between each pair of
# embeddings, stacked side by side in the order (A vs B, prompt vs A, prompt vs B).
diff_a_b = np.abs(emb_a - emb_b)
diff_prompt_a = np.abs(prompt - emb_a)
diff_prompt_b = np.abs(prompt - emb_b)
features = np.hstack([diff_a_b, diff_prompt_a, diff_prompt_b])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Method 2 feature matrix: the stacked absolute embedding differences.
X2 = features
# Hold out 20% of the rows for evaluation. random_state is fixed so that the
# split, and therefore every log loss computed from it, is the same on each
# Restart & Run All.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, df['label'], test_size=0.2, random_state=42
)

In [None]:
# Random forest on the Method 2 features. n_jobs=-1 builds the trees on all
# available cores; with random_state fixed the fitted forest is the same as
# with the single-threaded default.
clf2 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf2.fit(X2_train, y2_train)

In [None]:
from sklearn.metrics import log_loss

# Per-class probabilities from the Method 2 random forest on the held-out rows,
# scored with log loss.
y2_probs = clf2.predict_proba(X2_test)
rf2_log_loss = log_loss(y2_test, y2_probs)
print("Log Loss:", rf2_log_loss)

In [None]:
import lightgbm as lgb
# LightGBM on the Method 2 (embedding-difference) features.
lgb_model2 = lgb.LGBMClassifier(num_leaves=31, max_depth=-1, learning_rate=0.05, n_estimators=100)

# Train the model
lgb_model2.fit(X2_train, y2_train)

# Stored under its own name (y2_ prefix, like the other Method 2 variables) so the
# Method 1 probabilities in `y_lgb_probs` are not overwritten.
y2_lgb_probs = lgb_model2.predict_proba(X2_test)
print(log_loss(y2_test, y2_lgb_probs))

### Best model: difference function with LGBM classifier

# Test Data

In [None]:
# Load test.csv from the llm-classification-finetuning input dataset.
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Count missing values per column in the test frame.
test.isnull().sum()

In [None]:
# Embed the three test text columns with the same sentence-transformer used for training.
test_emb_a = model.encode(test['response_a'].tolist())
test_emb_b = model.encode(test['response_b'].tolist())
test_prompt = model.encode(test['prompt'].tolist())

In [None]:
#cosine similarity
from scipy.spatial.distance import cosine
# Row-wise cosine similarity between the three test embedding matrices, computed
# with vectorised NumPy instead of a per-row Python loop. Besides being faster,
# this no longer rebinds the notebook-level names `p`, `a` and `b` as loop variables.
# Assumes test_prompt, test_emb_a and test_emb_b are 2-D arrays of shape
# (n_rows, dim) with no all-zero rows (model.encode output) - TODO confirm.
test_prompt_norm = np.linalg.norm(test_prompt, axis=1)
test_emb_a_norm = np.linalg.norm(test_emb_a, axis=1)
test_emb_b_norm = np.linalg.norm(test_emb_b, axis=1)

# cosine similarity = <u, v> / (|u| * |v|); einsum 'ij,ij->i' is the per-row dot product.
# The test_similarities_* names are kept, but now hold 1-D arrays rather than lists.
test_similarities_prompt_a = (
    np.einsum('ij,ij->i', test_prompt, test_emb_a) / (test_prompt_norm * test_emb_a_norm)
)
test_similarities_prompt_b = (
    np.einsum('ij,ij->i', test_prompt, test_emb_b) / (test_prompt_norm * test_emb_b_norm)
)
test_similarities_a_b = (
    np.einsum('ij,ij->i', test_emb_a, test_emb_b) / (test_emb_a_norm * test_emb_b_norm)
)

# Store results in DataFrame
test['sim_prompt_a'] = test_similarities_prompt_a
test['sim_prompt_b'] = test_similarities_prompt_b
test['sim_a_b'] = test_similarities_a_b

In [None]:
# Preview the test frame with the three similarity columns added.
test.head()

In [None]:
# Difference features for the test set: element-wise absolute differences between
# each pair of embeddings, stacked in the order (A vs B, prompt vs A, prompt vs B).
test_diff_a_b = np.abs(test_emb_a - test_emb_b)
test_diff_prompt_a = np.abs(test_prompt - test_emb_a)
test_diff_prompt_b = np.abs(test_prompt - test_emb_b)
test_features = np.hstack([test_diff_a_b, test_diff_prompt_a, test_diff_prompt_b])

In [None]:
# Predict class probabilities with lgb_model2, which was fitted on the
# embedding-difference features; the sim_* columns are not used for the prediction.
test_probs = lgb_model2.predict_proba(test_features)

In [None]:
# Display the predicted probability array.
test_probs

In [None]:
# Build the submission frame: the test ids followed by one probability column
# per class, taken from test_probs columns 0, 1 and 2 in that order.
probability_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
submission_data = {'id': test['id']}
for class_index, column_name in enumerate(probability_columns):
    submission_data[column_name] = test_probs[:, class_index]
submission = pd.DataFrame(submission_data)

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)