In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    paths_in_folder = [os.path.join(folder, name) for name in names]
    for path in paths_in_folder:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import libraries helpful for cleaning text data

import re     #regular expression operations
import string #simplifies working with strings
import nltk   #Natural Language Toolkit
nltk.download('stopwords') #download stopwords list from NLTK
from nltk.corpus import stopwords

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

# kaggle 'Competitions': LLM Classification Finetuning
***Finetune LLMs to Predict Human Preference using Chatbot Arena conversations***

This notebook contains a solution for the [LLM Classification Finetuning competition on Kaggle](http://www.kaggle.com/competitions/llm-classification-finetuning/overview).


**Main objective:** Predict which response users will prefer in a head-to-head battle between chatbots powered by large language models (LLMs).

**Data**

train.csv

- id - A unique identifier for the row.
- model_a/b - The identity of model_a/b. Included in train.csv but not test.csv.
- prompt - The prompt that was given as an input (to both models).
- response_a/b - The response from model_a/b to the given prompt.
- winner_model_a/b/tie - Binary columns marking the judge's selection. The ground truth target column.

test.csv

- id
- prompt
- response_a/b

sample_submission.csv A submission file in the correct format.
- id
- winner_model_a/b/tie - This is what is predicted from the test set.


**Main steps**

1. [Understand the original data](#section1) <a href='#section1'></a>
2. [Data Cleaning](#section2) <a href='#section2'></a>
3. [Data Exploration](#section3) <a href='#section3'></a>
4. [Feature Engineering](#section4) <a href='#section4'></a>
5. [Model Selection and Training](#section5) <a href='#section5'></a>
6. [Model Evaluation](#section6) <a href='#section6'></a>
7. [Submission File](#section7) <a href='#section7'></a>

<a id='section1'></a>

## **1. Understand the original data**

> The first analysis is always made on the training set. Only after this analysis comes the final step: making predictions on the test set.

The first thing to do is to import the raw data.

In [None]:
# import the data
# Each frame gets a 'train_test' marker column (1 = training rows, 0 = test rows)
# so the rows can still be told apart in the stacked frame `all_data`.
training = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv').assign(train_test=1)
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv').assign(train_test=0)
all_data = pd.concat((training, test))

print("Import Data Complete")

Next, we look at the data to get familiar with it: its data types, its consistency and its null counts, so that we can decide how to handle each piece of information.

In [None]:
# Summary of the raw training frame as loaded from train.csv (plus the 'train_test' marker column).
training.info()

In [None]:
# First 10 rows of the raw training frame.
training.head(10)

In [None]:
# Last 10 rows of the raw training frame.
training.tail(10)

<a id='section2'></a>

## **2. Data Cleaning**

Once we are familiar with the data and its data types, we check data consistency and null counts to decide how to handle that information.

In [None]:
#check for duplicate id's: compare the number of rows with the number of distinct ids
id_values = training["id"]
total_id = id_values.shape[0]
total_unique_id = id_values.unique().size

print("Total number of 'id' duplicates:")
print(total_id - total_unique_id)

In [None]:
#check for null or empty cells
# Both counts are taken over every cell of the frame (isna and isnull are aliases in pandas).
nan_count = int(training.isna().to_numpy().sum())
null_count = int(training.isnull().to_numpy().sum())

print('Number of NaN values:', nan_count)
print('Number of null values:', null_count)

In [None]:
#check for consistency of model_a, model_b and LLMs identification
LLM_a = training["model_a"].unique()
LLM_b = training["model_b"].unique()
total_unique_model_a = len(LLM_a)
total_unique_model_b = len(LLM_b)

print("Total number of 'model_a' and 'model_b' unique values:")
print('model_a =', total_unique_model_a)
print('model_b =', total_unique_model_b)

# Every model that took part in at least one battle, on either side. A union is used
# (not an intersection) so a model that only ever appears as model_a or only as model_b
# is still counted. Sorted (by string form) so the printed list is stable between runs.
LLM = sorted(set(LLM_a) | set(LLM_b), key=str)

# The actual consistency check: names present in one column but missing from the other.
# An empty list means model_a and model_b use exactly the same set of model names.
LLM_one_side_only = sorted(set(LLM_a) ^ set(LLM_b), key=str)

print("total number of LLMs =", len(LLM))
print('LLMs utilized:', LLM)
print('LLMs appearing on one side only:', LLM_one_side_only)



In [None]:
#check for duplicate prompts -  the same prompt could have been given to more than two different LLMs
prompt_values = training["prompt"]
total_prompt = prompt_values.shape[0]
total_unique_prompt = prompt_values.unique().size

print("Total number of 'prompt' duplicates:")
print(total_prompt - total_unique_prompt)

#Recorded result: 57477 observations, 5743 prompt duplicates and no id duplicates.
#Repeated prompts are therefore expected (the same prompt belongs to different battles),
#so no further data cleaning is needed for them.

Now that the data has been checked for duplicates, NaN and null values, and consistency, the next step is to clean the cells that contain text data (columns _prompt_, _response_a_ and _response_b_).

In [None]:
#clean cells with text data
def preprocess_text(text):
    """Normalise a raw string and split it into tokens.

    The string is lower-cased, runs of digits are deleted, then every character
    that is neither a word character nor whitespace is deleted, and the result
    is tokenized with ``nltk.word_tokenize``.

    Parameters
    ----------
    text : str
        Raw text of one cell.

    Returns
    -------
    list of str
        The tokens of the normalised text.
    """
    normalised = text.lower()
    # Order matters: digits are removed first, punctuation/special characters second.
    for pattern in (r'\d+', r'[^\w\s]'):
        normalised = re.sub(pattern, '', normalised)

    return nltk.word_tokenize(normalised)

# English stop-word set, filled on first use. remove_stopwords() is called once per
# text cell, so loading the NLTK word list on every call would repeat the same corpus
# read for every row of every text column.
_STOP_WORDS_CACHE = {}


def remove_stopwords(text):
    """Drop English stop words from a list of tokens.

    Parameters
    ----------
    text : list of str
        Tokens, e.g. the output of ``preprocess_text``.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    """
    stop_words = _STOP_WORDS_CACHE.get('english')
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        _STOP_WORDS_CACHE['english'] = stop_words

    return [word for word in text if word not in stop_words]


def lemmatization(text):
    """Lemmatize every token of a token list.

    Parameters
    ----------
    text : list of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        Each token passed through ``nltk.WordNetLemmatizer().lemmatize``,
        in the original order.
    """
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))

    return lemmas


def clean_text(text):
    """Run the full text-cleaning pipeline on one string.

    Chains ``preprocess_text`` (lower-casing, digit/special-character removal,
    tokenization), ``remove_stopwords`` and ``lemmatization``, then joins the
    remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text of one cell.

    Returns
    -------
    str
        The cleaned text.
    """
    tokens = preprocess_text(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatization(tokens)

    return ' '.join(tokens)

# Replace the three raw text columns of the training frame by their cleaned versions.
for text_column in ("prompt", "response_a", "response_b"):
    training[text_column] = training[text_column].apply(clean_text)


print("Clean Cells with Text Data Complete")

In [None]:
# First 10 rows of the training frame after the text columns were cleaned.
training.head(10)


In [None]:
# Last 10 rows of the training frame after the text columns were cleaned.
training.tail(10)

Now the raw text in columns _prompt_, _response_a_ and _response_b_ is clean and ready for the next steps.

<a id='section3'></a>

## **3. Data Exploration**

The next thing to do is to get to know the data in more detail and examine it for initial patterns and interesting points.

In [None]:
#barchart - model a
# How often each LLM appears in the model_a column.
result_model_a = training["model_a"].value_counts()
print("model a:", result_model_a)

## Matplotlib barchart:
print("-----")
print("Matplotlib barchart, model a:")

barWidth = 0.45
bold_15 = {'fontweight': 'bold', 'fontsize': 15}  # shared styling of the title and axis labels

plt.figure(figsize=(15, 7))
plt.bar(x=result_model_a.index, height=result_model_a.values, width=barWidth, color='r')

plt.title('LLMs Value Counts - model a', **bold_15)
plt.xlabel('LLMs', **bold_15)
plt.ylabel('Counts', **bold_15)

# Model names are long: print them vertically.
plt.xticks(rotation=90)

plt.show()

In [None]:
#barchart - model b
# How often each LLM appears in the model_b column.
result_model_b = training["model_b"].value_counts()
print("model b:", result_model_b)

## Matplotlib barchart:
print("-----")
print("Matplotlib barchart, model b:")

barWidth = 0.45
bold_15 = {'fontweight': 'bold', 'fontsize': 15}  # shared styling of the title and axis labels

plt.figure(figsize=(15, 7))
plt.bar(x=result_model_b.index, height=result_model_b.values, width=barWidth, color='g')

plt.title('LLMs Value Counts - model b', **bold_15)
plt.xlabel('LLMs', **bold_15)
plt.ylabel('Counts', **bold_15)

# Model names are long: print them vertically.
plt.xticks(rotation=90)

plt.show()

In [None]:
#barchart - model winner: winner model a, winner model b or winner tie
def which_winner(value):
    """Encode the three binary winner columns of one row as a single class label.

    Parameters
    ----------
    value : mapping-like (e.g. a DataFrame row)
        Must provide the keys "winner_model_a", "winner_model_b" and "winner_tie".

    Returns
    -------
    int or None
        0 if model a won, 1 if model b won, 2 for a tie; None when none of the
        three flags equals 1.
    """
    # The flags are checked in the order a -> b -> tie and the first one equal to 1 wins.
    # The row is only read, never written: DataFrame.apply may hand the function a copy
    # or a view of the data, so writing to it is either lost or an unintended side effect.
    for label, column in enumerate(("winner_model_a", "winner_model_b", "winner_tie")):
        if value[column] == 1:
            return label
    return None

# Numeric class label per row (see which_winner).
training["winner"] = training.apply(which_winner, axis=1)

# Human-readable copy of the label, used only for the chart below.
training["winner_model"] = training["winner"].astype(str)
for code, description in (("0", "model a"), ("1", "model b"), ("2", "winner tie")):
    training.loc[training["winner_model"] == code, "winner_model"] = description

result_model_winner = training["winner_model"].value_counts()
print("model winner:", result_model_winner)

print("-----")
print("Matplotlib barchart, model winner:")

barWidth = 0.75
bold_15 = {'fontweight': 'bold', 'fontsize': 15}  # shared styling of the title and axis labels

plt.figure(figsize=(8, 7))
plt.bar(x=result_model_winner.index, height=result_model_winner.values, width=barWidth, color='b')

plt.title('LLMs Value Counts - model winner', **bold_15)
plt.xlabel('Model winner', **bold_15)
plt.ylabel('Counts', **bold_15)

plt.show()

<a id='section4'></a>

## **4. Feature Engineering**

After cleaning the raw data and understanding it more clearly, the next step is to extract meaningful information from the data so that it can be used by machine learning models.

In [None]:
# Summary of the training frame again, now including the derived 'winner' and 'winner_model' columns.
training.info()

In [None]:
#transform text data into numerical form
from sklearn.feature_extraction.text import TfidfVectorizer

# A single TfidfVectorizer instance is reused for all three text columns. Every
# fit_transform() call below re-learns the vocabulary and IDF weights, replacing
# whatever the previous call learned, so the order of the statements matters.
vectorizer = TfidfVectorizer(max_features = 150) #without max_features it crashes due to memory limit
# First pass: fit on each column in turn and print the learned IDF weights and terms.
# Despite their names, vectorizer_prompt / vectorizer_response_a / vectorizer_response_b
# hold the resulting TF-IDF matrices (not vectorizer objects); the rest of this cell
# does not use them.
vectorizer_prompt = vectorizer.fit_transform(training["prompt"])
print(vectorizer.idf_)
print(vectorizer.get_feature_names_out())
vectorizer_response_a = vectorizer.fit_transform(training["response_a"])
print(vectorizer.idf_)
print(vectorizer.get_feature_names_out())
vectorizer_response_b = vectorizer.fit_transform(training["response_b"])
print(vectorizer.idf_)
print(vectorizer.get_feature_names_out())



# Second pass: build the matrices that become the model features.
# NOTE(review): the vectorizer was last fitted on response_b (previous fit_transform),
# so transform() encodes the prompts with the response_b vocabulary and IDF weights,
# not with the prompt vocabulary printed above. Presumably unintended - confirm.
# If this is changed, the test-set encoding in the submission section must be
# changed in the same way, otherwise train and test features stop matching.
temp_prompt = vectorizer.transform(training["prompt"])
# These two calls refit the vectorizer again: first on response_a, then on response_b.
temp_response_a = vectorizer.fit_transform(training["response_a"])
temp_response_b = vectorizer.fit_transform(training["response_b"])

# Dense view of each sparse TF-IDF matrix (numpy abbreviates the printout).
print("vectorizer representation for 'prompt':\n", temp_prompt.toarray())
print("vectorizer representation for 'response a':\n", temp_response_a.toarray())
print("vectorizer representation for 'response b':\n", temp_response_b.toarray())

# Shape of each matrix: (number of rows, number of TF-IDF features).
print("Number of elements for the vectorizer representation for 'prompt':\n", temp_prompt.shape)
print("Number of elements for the vectorizer representation for 'response a':\n", temp_response_a.shape)
print("Number of elements for the vectorizer representation for 'response b':\n", temp_response_b.shape)


In [None]:
#selecting the prediction target: the numeric winner label of every training row
train_y = training["winner"].to_numpy()

#choosing "features": the three dense TF-IDF blocks placed side by side (prompt | response a | response b)
dense_blocks = [block.toarray() for block in (temp_prompt, temp_response_a, temp_response_b)]
train_X = np.hstack(dense_blocks)

print("Selecting The Prediction Target and Choosing Features Complete")

<a id='section5'></a>

## **5. Model Selection and Training**

The next step is model selection and training. I will use **Logistic Regression** to predict the winner, because the input features are numeric (*temp_prompt*, *temp_response_a* and *temp_response_b*) and each row has a corresponding label _winner_ (0 for winner model a, 1 for winner model b, and 2 for winner tie). Moreover, Logistic Regression predicts a probability for each target class, as required for the Submission File.

In [None]:
#use Logistic Regression
from sklearn.linear_model import LogisticRegression
from datetime import datetime

#record start time to calculate the execution time
start = datetime.now()

#Logistic Regression
# - solver='saga': for large datasets the "saga" solver is usually faster [scikit-learn documentation]
# - random_state: saga shuffles the data, so a fixed seed is needed to get the same model on every run
# - multi_class is no longer passed: the argument is deprecated in recent scikit-learn releases, and
#   with saga and more than two classes the default already fits a multinomial model
model = LogisticRegression(max_iter=500, solver='saga', random_state=42)
model.fit(train_X, train_y)

#record end time
end = datetime.now()

#calculate the execution time
execution_time = (end - start).total_seconds() / 60
print(f"The time of execution is: {execution_time} minutes")


print("Model Training Complete")

 

#Note: model = LogisticRegression() without futher improvements is giving a ConvergenceWarning:
#/opt/conda/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
#STOP: TOTAL NO. of ITERATIONS REACHED LIMIT
#Increase the number of iterations (max_iter) or scale the data as shown in:
#    https://scikit-learn.org/stable/modules/preprocessing.html
#Please also refer to the documentation for alternative solver options:
#    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

<a id='section6'></a>

## **6. Model Evaluation**

In this step, the trained Logistic Regression model is evaluated to check how well it predicts the winner (winner model a, winner model b or winner tie).

In [None]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score

#split into validation and training data
train_X_train, train_X_val, train_y_train, train_y_val = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

#record start time to calculate the execution time
start = datetime.now()

# `model` was fitted on ALL of train_X, which contains the validation rows. Scoring it on
# train_X_val would measure performance on data it has already seen and overstate its quality.
# An unfitted copy with the same hyper-parameters is therefore trained on the 80% split only
# and evaluated on the held-out 20%. `model` itself is left untouched, so the submission
# still comes from the model trained on the full training set.
eval_model = clone(model).fit(train_X_train, train_y_train)

#think about results - comparing predictions (value_y_predict) to the actual winner model (train_y_val)
value_y_predict = eval_model.predict(train_X_val)
print('Model winner prediction', value_y_predict)
print('Model winner real value', train_y_val)

value_y_probabilities = eval_model.predict_proba(train_X_val)
print('Model winner prediction, probability', value_y_probabilities) #winner model a | winner model b | winner tie

#confusion matrix
cm = confusion_matrix(train_y_val, value_y_predict)
print("Confusion Matrix:\n", cm)

#model accuracy
score = eval_model.score(train_X_val, train_y_val)
print('Model Accuracy Score', score)

#macro and micro averaged Precision and Recall
macro_precision = precision_score(train_y_val, value_y_predict, average='macro') #calculate precision for all classes individually and then average them
macro_recall = recall_score(train_y_val, value_y_predict, average='macro')
micro_precision = precision_score(train_y_val, value_y_predict, average='micro') #calculate class wise true positive and false positive and then use that to calculate overall precision
micro_recall = recall_score(train_y_val, value_y_predict, average='micro')
print("Macro Precision:", macro_precision)
print("Macro Recall:", macro_recall)
print("Micro Precision:", micro_precision)
print("Micro Recall:", micro_recall)

#record end time
end = datetime.now()

#calculate the execution time (now includes fitting the evaluation model)
execution_time = (end - start).total_seconds()
print(f"The time of execution is: {execution_time} seconds")


In [None]:
#model log loss - https://www.kaggle.com/competitions/llm-classification-finetuning/discussion/552103
from sklearn.metrics import log_loss

# Log loss of the predicted class probabilities against the true validation labels (lower is better).
model_log_loss = log_loss(train_y_val, value_y_probabilities)

print('Model Log loss:', model_log_loss) 

# Reference point: with 3 classes, always predicting 1/3 for each class gives
# a log loss of -ln(1/3) = 1.10, so only values below 1.10 beat uninformed guessing.
# The author's recorded run gave a log loss of 1.05 and considered that acceptable for this
# project; note that it is only slightly below the 1.10 baseline.

<a id='section7'></a>

## **7. Submission File**

The last step is to generate a CSV file with model predictions on test data to submit to the competition.

In [None]:
#clean cells with text data
# Same cleaning pipeline as for the training set, applied to the three text columns of the test set.
for text_column in ("prompt", "response_a", "response_b"):
    test[text_column] = test[text_column].apply(clean_text)


print("Clean Cells with Text Data Complete")

In [None]:
# Summary of the test frame after its text columns were cleaned.
test.info()

In [None]:
#transform text data into numerical form
#
# The test text has to be encoded with the vocabularies and IDF weights learned from the
# TRAINING text, because `model` was fitted on exactly those feature columns. Fitting a new
# TfidfVectorizer on the test set gives every column a different meaning (different terms and
# weights, and fewer than 150 columns when the test vocabulary is small), so the predictions
# would no longer correspond to what the model learned.
#
# Each block is therefore rebuilt the way the feature-engineering cell built the training block:
#   prompt     -> vectorizer fitted on training["response_b"] (that cell transforms the prompts
#                 with the vectorizer as last fitted, i.e. on response_b)
#   response_a -> vectorizer fitted on training["response_a"]
#   response_b -> vectorizer fitted on training["response_b"]
# training[...] holds the cleaned text at this point, i.e. the same input the training features used.
# Fresh vectorizers with the same settings are fitted here (instead of reusing `vectorizer`)
# so the cell does not depend on which column that shared object was fitted on last.

tfidf_response_a = TfidfVectorizer(max_features = 150).fit(training["response_a"]) #without max_features it crashes due to memory limit
tfidf_response_b = TfidfVectorizer(max_features = 150).fit(training["response_b"])

print(tfidf_response_a.idf_)
print(tfidf_response_a.get_feature_names_out())
print(tfidf_response_b.idf_)
print(tfidf_response_b.get_feature_names_out())

# transform() only - nothing is learned from the test set.
temp_test_prompt = tfidf_response_b.transform(test["prompt"])
temp_test_response_a = tfidf_response_a.transform(test["response_a"])
temp_test_response_b = tfidf_response_b.transform(test["response_b"])

# Each test block must have as many columns as the training block it corresponds to.
assert temp_test_prompt.shape[1] == temp_prompt.shape[1]
assert temp_test_response_a.shape[1] == temp_response_a.shape[1]
assert temp_test_response_b.shape[1] == temp_response_b.shape[1]

print("vectorizer representation for 'prompt':\n", temp_test_prompt.toarray())
print("vectorizer representation for 'response a':\n", temp_test_response_a.toarray())
print("vectorizer representation for 'response b':\n", temp_test_response_b.toarray())

print("Number of elements for the vectorizer representation for 'prompt':\n", temp_test_prompt.shape)
print("Number of elements for the vectorizer representation for 'response a':\n", temp_test_response_a.shape)
print("Number of elements for the vectorizer representation for 'response b':\n", temp_test_response_b.shape)

In [None]:
#model prediction
# Same column layout as the training features: prompt | response a | response b.
dense_test_blocks = [block.toarray() for block in (temp_test_prompt, temp_test_response_a, temp_test_response_b)]
test_X = np.hstack(dense_test_blocks)

# One row per test sample, one probability per class.
value_test_y_probabilities = model.predict_proba(test_X)
print('Model winner prediction, probability', value_test_y_probabilities) #winner model a | winner model b | winner tie

In [None]:
# Submission frame: the test ids followed by one probability column per class.
# Probability column 0/1/2 belongs to label 0 (model a) / 1 (model b) / 2 (tie).
output = pd.DataFrame({'id': test['id']})
for position, column in enumerate(('winner_model_a', 'winner_model_b', 'winner_tie')):
    output[column] = value_test_y_probabilities[:, position]
output.to_csv('submission.csv', index=False)