In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, printing one
# full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

In [None]:
# Load the three competition CSV files (train, test, sample submission).
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'/kaggle/input/llm-classification-finetuning/train.csv',
        r'/kaggle/input/llm-classification-finetuning/test.csv',
        r'/kaggle/input/llm-classification-finetuning/sample_submission.csv',
    )
)

# Report (rows, columns) of each frame as a quick sanity check.
for label, frame in (
    ('train data shape :', train),
    ('test data shape :', test),
    ('sample_submission data shape :', sample_submission),
):
    print(label, frame.shape)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Frequency of each value in the `model_a` column, most frequent first
# (value_counts sorts in descending order by default).
value_counts = train['model_a'].value_counts()

# Bar chart of those frequencies: one bar per distinct `model_a` value.
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
ax.set_xlabel('Category')
ax.set_ylabel('Counts')
ax.set_title('Value Counts in Descending Order')
plt.xticks(rotation=90)  # draw the tick labels vertically
plt.show()

In [None]:
# Frequency of each value in the `model_b` column, most frequent first.
model_b_counts = train['model_b'].value_counts()
print(model_b_counts)

# Plot the counts with bars ordered from most to least frequent and with
# explicit axis labels and a title, so the figure is readable on its own and
# is laid out the same way as the `model_a` bar chart.
plt.figure(figsize=(15, 6))
sns.countplot(x='model_b', data=train, order=model_b_counts.index)
plt.xlabel('Category')
plt.ylabel('Counts')
plt.xticks(rotation=90)  # draw the tick labels vertically
plt.title('model_b Value Counts in Descending Order')
plt.show()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence and the original
# index labels, then show the resulting (rows, columns).
train = train.drop_duplicates(keep='first', ignore_index=False)
train.shape

In [None]:
# Count missing values per column and list the columns with the most gaps first.
missing_per_column = train.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Histograms of the training columns, with the `id` column left out.
train_without_id = train.drop(columns='id')
train_without_id.hist(figsize=(10, 5), color='skyblue', edgecolor='black')
plt.show()

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Preview the first five rows of the sample submission to see the expected layout.
sample_submission.head(n=5)

In [None]:
# Histograms of the sample-submission columns, with the `id` column left out.
sample_submission_without_id = sample_submission.drop(columns='id')
sample_submission_without_id.hist(figsize=(10, 5), color='skyblue', edgecolor='black')
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack  # to concatenate sparse matrices

# One TF-IDF vectorizer per text column, each with its vocabulary capped at
# 500 terms.
tfidf_prompt = TfidfVectorizer(max_features=500)   # Adjust max_features as needed
tfidf_response_a = TfidfVectorizer(max_features=500)
tfidf_response_b = TfidfVectorizer(max_features=500)

# Learn each vocabulary on the training text and vectorize that text in one step.
X_prompt_tfidf = tfidf_prompt.fit_transform(train['prompt'])
X_response_a_tfidf = tfidf_response_a.fit_transform(train['response_a'])
X_response_b_tfidf = tfidf_response_b.fit_transform(train['response_b'])

# Concatenate the three TF-IDF blocks column-wise (prompt, response_a,
# response_b). CSR is requested explicitly because the matrix is later sliced
# with arrays of row positions during cross-validation; without `format`,
# hstack picks the output format itself, and COO does not support that indexing.
X_tfidf = hstack([X_prompt_tfidf, X_response_a_tfidf, X_response_b_tfidf], format='csr')

X = X_tfidf

In [None]:
# Vectorize the test text with the vectorizers that were already fitted on the
# training text (transform only, no refitting).
test_prompt_tfidf, test_response_a_tfidf, test_response_b_tfidf = (
    vectorizer.transform(test[column])
    for vectorizer, column in (
        (tfidf_prompt, 'prompt'),
        (tfidf_response_a, 'response_a'),
        (tfidf_response_b, 'response_b'),
    )
)

# Concatenate the three sparse blocks column-wise in the order
# prompt, response_a, response_b.
test_tfidf = hstack([test_prompt_tfidf, test_response_a_tfidf, test_response_b_tfidf])

# NOTE(review): this rebinds `test` from the raw DataFrame to the sparse
# feature matrix, so the lines above cannot be re-run without reloading test.csv.
test = test_tfidf

In [None]:
from sklearn.model_selection import train_test_split
# Modelling inputs: TF-IDF features for train and test, plus the three
# `winner_*` target columns taken from the training frame.
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
X, test = X_tfidf, test_tfidf
y = train[target_columns]

In [None]:
# Keyword arguments passed to every XGBClassifier built in this notebook.
# NOTE(review): the non-round values look like the output of a
# hyper-parameter search — confirm where they came from.
params = dict(
    n_estimators=229,
    max_depth=9,
    learning_rate=0.028184526290102357,
    subsample=0.6607687169383815,
    colsample_bytree=0.642663510005148,
)

In [None]:
# Wrap XGBClassifier in MultiOutputClassifier so one classifier is fitted per
# target column of `y`.
multi_target_model = MultiOutputClassifier(XGBClassifier(**params))

# Define K-Fold cross-validation (shuffled, fixed seed so folds are repeatable)
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Row-sliceable copy/view of the features. `X` comes from scipy.sparse.hstack,
# which may return a COO matrix; COO cannot be indexed with an array of row
# positions, CSR can.
X_rows = X.tocsr()

# Per-fold results
fold_probs = []        # validation probabilities of each fold, (n_val, n_targets)
fold_log_losses = []   # mean per-target log loss of each fold
preds = []             # test probabilities of each fold: list of n_targets vectors

# Out-of-fold probabilities written at each sample's own row position, so that
# row i of the final table belongs to row i of the training data (simply
# stacking the folds would leave the rows in shuffled fold order).
oof_probs = np.zeros((X_rows.shape[0], y.shape[1]))

# Perform K-Fold cross-validation
for fold, (train_index, val_index) in enumerate(kf.split(X_rows)):
    # Split the data into train and validation sets for this fold
    X_train, X_val = X_rows[train_index], X_rows[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit the model on the training data of this fold
    multi_target_model.fit(X_train, y_train)

    # Positive-class probability from each per-target estimator, stacked to
    # (n_samples, n_targets)
    y_val_proba = np.column_stack(
        [estimator.predict_proba(X_val)[:, 1] for estimator in multi_target_model.estimators_]
    )
    fold_probs.append(y_val_proba)
    oof_probs[val_index] = y_val_proba

    # Test-set probabilities from this fold's models (one vector per target)
    pred = [estimator.predict_proba(test)[:, 1] for estimator in multi_target_model.estimators_]
    preds.append(pred)

    # Binary log loss of each target, averaged into one score for the fold
    log_losses = [log_loss(y_val.iloc[:, i], y_val_proba[:, i]) for i in range(y_val.shape[1])]
    mean_log_loss = np.mean(log_losses)
    fold_log_losses.append(mean_log_loss)

    print(f"Fold {fold + 1} Mean Log Loss: {mean_log_loss}")

# Calculate the average log loss across all folds
avg_log_loss = np.mean(fold_log_losses)
print(f"\nAverage Log Loss across all folds: {avg_log_loss}")

# Out-of-fold probabilities as a DataFrame aligned with the rows of `y`
all_probs = oof_probs
probs_df = pd.DataFrame(all_probs, columns=[f"{col}_proba" for col in y.columns], index=y.index)
print("\nProbability predictions for each target:\n", probs_df)

In [None]:
# `preds` has one entry per CV fold, and each entry holds one probability
# vector per target, i.e. (n_folds, n_targets, n_test) once converted to an
# array. Average over the fold axis first; indexing `preds[k]` directly would
# select fold k (and then average its three targets together), not target k.
fold_mean_preds = np.mean(np.asarray(preds), axis=0)  # (n_targets, n_test)

submission = pd.DataFrame({
    'winner_model_a': fold_mean_preds[0],
    'winner_model_b': fold_mean_preds[1],
    'winner_tie': fold_mean_preds[2]
})

In [None]:
# `preds` is ordered fold-first: preds[fold][target] is a vector of test
# probabilities. Convert to an (n_folds, n_targets, n_test) array and average
# over the folds, so that each column below is one target's mean across folds
# (np.mean(preds[k], axis=0) would instead average the three targets of fold k).
fold_mean_preds = np.mean(np.asarray(preds), axis=0)  # (n_targets, n_test)

# The three columns come from independent binary classifiers, so a row is not
# guaranteed to sum to 1.
submission = pd.DataFrame({
    'id': sample_submission.id,
    'winner_model_a': fold_mean_preds[0],
    'winner_model_b': fold_mean_preds[1],
    'winner_tie': fold_mean_preds[2],
})
submission.to_csv('submission.csv', index=False)
submission

from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One XGBClassifier per target column, fitted on the training part of the split.
multi_target_model = MultiOutputClassifier(XGBClassifier(**params))
multi_target_model.fit(X_train, y_train)

# Positive-class probability on the held-out rows, one vector per target.
fitted_estimators = multi_target_model.estimators_
y_proba = [fitted.predict_proba(X_test)[:, 1] for fitted in fitted_estimators]

# Show the held-out probabilities target by target.
for i, col in enumerate(y.columns):
    print(f"Probability predictions for {col}:\n {y_proba[i]}")

# Positive-class probability on the competition test features, one vector per
# target (taken from each fitted estimator rather than from
# multi_target_model.predict_proba).
pred = [fitted.predict_proba(test)[:, 1] for fitted in multi_target_model.estimators_]
pred

# Test-set probabilities of the single-split model, keyed by submission column.
# `pred` holds one probability vector per target, in this column order.
probability_columns = {
    column: pred[position].flatten()
    for position, column in enumerate(('winner_model_a', 'winner_model_b', 'winner_tie'))
}

# Probabilities only (immediately superseded by the frame with ids below).
submission = pd.DataFrame(probability_columns)

# Final frame: ids from the sample submission followed by the three probabilities.
# NOTE(review): this writes the same 'submission.csv' path as the K-fold
# submission earlier in the notebook and therefore overwrites it — confirm
# which of the two files is meant to be submitted.
submission = pd.DataFrame({'id': sample_submission.id, **probability_columns})
submission.to_csv('submission.csv', index=False)
submission

# Separate each target column individually
y_model_a = train['winner_model_a']
y_model_b = train['winner_model_b']
y_tie = train['winner_tie']

# Split the features and all three targets in one call, so every array shares
# exactly the same train/hold-out row partition.
(X_train, X_test,
 y_a_train, y_a_test,
 y_b_train, y_b_test,
 y_tie_train, y_tie_test) = train_test_split(
    X, y_model_a, y_model_b, y_tie, test_size=0.2, random_state=42
)

# Initialize one binary classifier per target
model_a = xgb.XGBClassifier(**params)
model_b = xgb.XGBClassifier(**params)
model_tie = xgb.XGBClassifier(**params)

# Fit on the training part of the split only. Fitting on all of X would put
# the X_test rows into the training data, so the hold-out scores computed from
# X_test would be measured on rows the models had already seen.
model_a.fit(X_train, y_a_train)
model_b.fit(X_train, y_b_train)
model_tie.fit(X_train, y_tie_train)

# Predict class labels on the held-out rows for each model
y_a_pred = model_a.predict(X_test)
y_b_pred = model_b.predict(X_test)
y_tie_pred = model_tie.predict(X_test)

from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
# Accuracy is computed from the hard class predictions.
accuracy_a = accuracy_score(y_a_test, y_a_pred)
accuracy_b = accuracy_score(y_b_test, y_b_pred)
accuracy_tie = accuracy_score(y_tie_test, y_tie_pred)

# Log loss is a probability metric: feed it the predicted positive-class
# probability. Passing hard class labels would only measure a clipped,
# fixed-size penalty per misclassified row and says nothing about calibration.
log_loss_a = log_loss(y_a_test, model_a.predict_proba(X_test)[:, 1])
log_loss_b = log_loss(y_b_test, model_b.predict_proba(X_test)[:, 1])
log_loss_tie = log_loss(y_tie_test, model_tie.predict_proba(X_test)[:, 1])

print(f"Accuracy for winner_model_a: {accuracy_a}")
print(f"Accuracy for winner_model_b: {accuracy_b}")
print(f"Accuracy for winner_tie: {accuracy_tie}")

print()

print(f"log_loss for winner_model_a: {log_loss_a}")
print(f"log_loss for winner_model_b: {log_loss_b}")
print(f"log_loss for winner_tie: {log_loss_tie}")