In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(root_dir, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training split into a DataFrame and preview its first rows.
train_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/llm-classification-finetuning/train.csv",
)
train_df.head()

In [None]:
# Read the test split into a DataFrame and preview its first rows.
test_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/llm-classification-finetuning/test.csv",
)
test_df.head()

In [None]:
import unicodedata
import re

# Precompiled regex patterns used by text_cleaning (compiled once for speed).
NEWLINE_TAB_PATTERN = re.compile(r"[\r\n\t]+")
MULTIPLE_SPACES_PATTERN = re.compile(r"\s+")
# Everything that is NOT an ASCII letter/digit, whitespace, or one of . , ! ? ( )
ALLOWED_CHARS_PATTERN = re.compile(r"[^a-zA-Z0-9\s.,!?()]+")
NON_ASCII_PATTERN = re.compile(r'[^\x00-\x7F]+')
# Unicode-aware counterpart of ALLOWED_CHARS_PATTERN, used when non-ASCII text is kept:
# letters/digits of any script survive, while "_" (part of \w) is still removed so that
# ASCII input is cleaned exactly as with ALLOWED_CHARS_PATTERN.
ALLOWED_UNICODE_CHARS_PATTERN = re.compile(r"(?:[^\w\s.,!?()]|_)+")


def text_cleaning(text, remove_non_ascii=True):
    """
    Clean and standardize a string for text processing.

    Steps, in order: NFKD unicode normalization, lowercasing, newline/tab
    replacement, optional non-ASCII removal, removal of every character that is
    not a letter, digit, whitespace or one of ``. , ! ? ( )``, whitespace
    collapsing and trimming.

    Parameters:
        text (str): The input string to be cleaned. ``None`` and float NaN
                    (missing values) yield an empty string; any other
                    non-string value is converted with ``str()`` first.
        remove_non_ascii (bool): When True (default), all non-ASCII characters
                                 are dropped. When False, letters and digits of
                                 other scripts are kept; combining marks split
                                 off by NFKD are still removed.

    Returns:
        str: The cleaned and normalized string.
    """
    # Missing values would otherwise raise TypeError inside unicodedata.normalize.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # NFKD splits accented letters into base letter + combining mark
    # (e.g. "é" -> "e" + U+0301); the mark itself is stripped by the filters below.
    text = unicodedata.normalize("NFKD", text)

    # Convert text to lowercase.
    text = text.lower()

    # Replace newline and tab characters with a space.
    text = NEWLINE_TAB_PATTERN.sub(" ", text)

    # Unescape "\/" to "/". Note that "/" is not in the allowed character set,
    # so the slash is dropped again by the filter below.
    text = text.replace("\\/", "/")

    if remove_non_ascii:
        # Drop non-ASCII characters, then anything outside the ASCII allow-list.
        text = NON_ASCII_PATTERN.sub("", text)
        text = ALLOWED_CHARS_PATTERN.sub("", text)
    else:
        # Keep non-ASCII letters/digits. The ASCII-only allow-list must not be
        # applied here, otherwise it would strip them and make the flag a no-op.
        text = ALLOWED_UNICODE_CHARS_PATTERN.sub("", text)

    # Replace multiple spaces with a single space.
    text = MULTIPLE_SPACES_PATTERN.sub(" ", text)

    # Remove any leading and trailing spaces.
    return text.strip()



In [None]:
# Run text_cleaning over every free-text column of the training split.
for train_text_column in ('prompt', 'response_a', 'response_b'):
    train_df[train_text_column] = train_df[train_text_column].apply(text_cleaning)

train_df.head()

In [None]:
# Clean the test split exactly like the training split. The prompt column must be
# cleaned too: the prompt vectorizer is fitted on cleaned training prompts, so leaving
# test prompts raw would feed it differently preprocessed text at prediction time.
test_df['prompt'] = test_df['prompt'].apply(text_cleaning)
test_df['response_a'] = test_df['response_a'].apply(text_cleaning)
test_df['response_b'] = test_df['response_b'].apply(text_cleaning)
test_df.head()

In [None]:
# train_df['response_a_len'] = train_df['response_a'].apply(lambda x: len(x))
# train_df['response_b_len'] = train_df['response_b'].apply(lambda x: len(x))

In [None]:
# test_df['response_a_len'] = test_df['response_a'].apply(lambda x: len(x))
# test_df['response_b_len'] = test_df['response_b'].apply(lambda x: len(x))

In [None]:
# train_df['response_a_un'] = train_df['response_a'].apply(lambda x: len(set(x.split(" "))))
# train_df['response_b_un'] = train_df['response_b'].apply(lambda x: len(set(x.split(" "))))
# train_df['prompt_un'] = train_df['prompt'].apply(lambda x: len(set(x.split(" "))))

In [None]:
# test_df['response_a_un'] = test_df['response_a'].apply(lambda x: len(set(x.split(" "))))
# test_df['response_b_un'] = test_df['response_b'].apply(lambda x: len(set(x.split(" "))))
# test_df['prompt_un'] = test_df['prompt'].apply(lambda x: len(set(x.split(" "))))

In [None]:
# def combine_p_r(row): 
#     return (
#         f"{row['prompt']}: {row['response_a']}",
#         f"{row['prompt']}: {row['response_b']}"
#     )

In [None]:
# train_df[['combined_a', 'combined_b']] = train_df.apply(combine_p_r, axis=1, result_type='expand')
# train_df.head()

In [None]:
# test_df[['combined_a', 'combined_b']] = test_df.apply(combine_p_r, axis=1, result_type='expand')
# test_df.head()

In [None]:
# Collapse the three one-hot winner columns into a single integer label:
# 0 = model A wins, 1 = model B wins, 2 = tie.
winner_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
winner_codes = {'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2}

# from_dummies returns a one-column frame holding the name of the active dummy column.
winner_names = pd.from_dummies(train_df[winner_columns]).iloc[:, 0]
train_df['winner'] = winner_names
train_df['winner'] = train_df['winner'].map(winner_codes)

# The one-hot columns are no longer needed once the label exists.
train_df.drop(columns=winner_columns, inplace=True)
train_df.head()

In [None]:
# Model inputs: the three cleaned text columns. Target: the encoded winner label,
# kept as a single-column DataFrame.
X = train_df.loc[:, ["prompt", "response_a", "response_b"]]
y = train_df.loc[:, ['winner']]

In [None]:
# X = train_df[["prompt", "response_a", "response_b", "response_a_len", "response_b_len", "response_a_un", "response_b_un", 
#               "prompt_un", "combined_a", "combined_b"]]
# y = train_df[['winner']]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import log_loss





In [None]:
# Column-name groups for the engineered-feature variant of the pipeline.
# NOTE(review): the length / unique-word / combined columns are only produced by
# feature cells that are currently commented out, so these lists are not used by
# the active TF-IDF cells.
numeric_features = ['response_a_len', 'response_b_len', 'response_a_un', 'response_b_un', 'prompt_un']
text_features = ['prompt', 'response_a', 'response_b', 'combined_a', 'combined_b']

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Prompts get their own vocabulary; both responses share a single vectorizer so that
# a given column index means the same term for response_a and response_b.
tfidf_prompt = TfidfVectorizer(max_features=1500, stop_words='english')
tfidf_responses = TfidfVectorizer(max_features=1500, stop_words='english')

prompt_tf = tfidf_prompt.fit_transform(X_train['prompt'])

# Fit the shared response vectorizer exactly once, on both response columns.
# Calling fit_transform on response_a and then again on response_b refits the
# vocabulary on the second call, which leaves the response_a training columns
# misaligned with every later tfidf_responses.transform(...) output.
tfidf_responses.fit(
    pd.concat([X_train['response_a'], X_train['response_b']], ignore_index=True)
)
response_a_tf = tfidf_responses.transform(X_train['response_a'])
response_b_tf = tfidf_responses.transform(X_train['response_b'])

# Training matrix, column order: prompt | response_a | response_b.
X_train_tf_combined = hstack([
    prompt_tf,
    response_a_tf,
    response_b_tf,
])

# Hold-out split: transform only, reusing the vocabularies fitted on the training rows.
prompt_tf_test = tfidf_prompt.transform(X_test['prompt'])
response_a_tf_test = tfidf_responses.transform(X_test['response_a'])
response_b_tf_test = tfidf_responses.transform(X_test['response_b'])

# Same column order as X_train_tf_combined.
X_test_tf_combined = hstack([
    prompt_tf_test,
    response_a_tf_test,
    response_b_tf_test,
])

In [None]:
# -------------------------------
# TF-IDF Vectorization for Text Features
# -------------------------------
# tfidf_prompt = TfidfVectorizer(max_features=1500, stop_words='english')
# tfidf_responses = TfidfVectorizer(max_features=1500, stop_words='english')

# # Fit and transform the training set for prompts and various response fields.
# prompt_tf = tfidf_prompt.fit_transform(X_train['prompt'])
# response_a_tf = tfidf_responses.fit_transform(X_train['response_a'])
# response_b_tf = tfidf_responses.fit_transform(X_train['response_b'])
# combined_a_tf = tfidf_responses.fit_transform(X_train['combined_a'])
# combined_b_tf = tfidf_responses.fit_transform(X_train['combined_b'])

# # -------------------------------
# # Scaling Numeric Features
# # -------------------------------
# scaler = StandardScaler()
# X_train_numeric = scaler.fit_transform(X_train[numeric_features])
# X_test_numeric = scaler.transform(X_test[numeric_features])

# # Convert numeric features to sparse format so they match the type of TF-IDF matrices.
# X_train_numeric_sparse = csr_matrix(X_train_numeric)
# X_test_numeric_sparse = csr_matrix(X_test_numeric)

# # -------------------------------
# # Combine TF-IDF and Numeric Features for Training Set
# # -------------------------------
# X_train_tf_combined = hstack([
#     prompt_tf, 
#     response_a_tf, 
#     response_b_tf, 
#     combined_a_tf, 
#     combined_b_tf, 
#     X_train_numeric_sparse
# ])

# # -------------------------------
# # Transform Test Set Using the Same Vectorizers and Scaler
# # -------------------------------
# prompt_tf_test = tfidf_prompt.transform(X_test['prompt'])
# response_a_tf_test = tfidf_responses.transform(X_test['response_a'])
# response_b_tf_test = tfidf_responses.transform(X_test['response_b'])
# combined_a_tf_test = tfidf_responses.transform(X_test['combined_a'])
# combined_b_tf_test = tfidf_responses.transform(X_test['combined_b'])

# X_test_tf_combined = hstack([
#     prompt_tf_test, 
#     response_a_tf_test, 
#     response_b_tf_test, 
#     combined_a_tf_test, 
#     combined_b_tf_test, 
#     X_test_numeric_sparse
# ])

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier

In [None]:
# Hyper-parameters of the gradient-boosted tree classifier.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=10,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_tf_combined, y_train)

# Score hard class predictions on the hold-out split.
y_pred = xgb_model.predict(X_test_tf_combined)
holdout_accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {holdout_accuracy}")

In [None]:
# Vectorize the competition test set with the already-fitted vectorizers (transform only).
# Dedicated *_submit names are used so the training matrices prompt_tf / response_a_tf /
# response_b_tf are not overwritten; otherwise re-running the training hstack cell would
# silently stack test rows instead of training rows.
prompt_tf_submit = tfidf_prompt.transform(test_df['prompt'])
response_a_tf_submit = tfidf_responses.transform(test_df['response_a'])
response_b_tf_submit = tfidf_responses.transform(test_df['response_b'])

# NOTE(review): if the numeric features are re-enabled, reuse the scaler fitted on the
# training split here; a freshly constructed StandardScaler() cannot transform.

# Combine the TF-IDF blocks in the same column order used for training:
# prompt | response_a | response_b.
X_test_tf = hstack([
    prompt_tf_submit,
    response_a_tf_submit,
    response_b_tf_submit,
])

In [None]:
# Class probabilities for the test set; the label encoding 0/1/2 corresponds to
# model A / model B / tie, which is the column order used below.
submission_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
prediction = xgb_model.predict_proba(X_test_tf)
df_sample = pd.DataFrame(prediction, columns=submission_columns)

In [None]:
# Put the test ids next to their predicted probabilities (aligned on the row index)
# and write the result to submission.csv without the index column.
submission_parts = [test_df.loc[:, ['id']], df_sample]
submission = pd.concat(submission_parts, axis="columns")
submission.to_csv('submission.csv', index=False)
submission