In [None]:
# This is a template for a Kaggle Notebook for the LLM Classification Finetuning competition.
# You will need to fill in the specifics related to your data and chosen model.

# 1. Setting up the environment
import pandas as pd
import numpy as np
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

# Example imports for a basic text classification model.
# You might need more advanced NLP libraries like transformers, torch, tensorflow, etc.
# based on your chosen approach (e.g., fine-tuning a pre-trained LLM, using embeddings).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully.")


In [None]:
# 2. Loading the data
# Assuming the data files are in '/kaggle/input/llm-classification-finetuning/'
# You'll need to check the actual file names and paths once you join the competition.

# Directory the competition files are read from (the dataset must be attached to the notebook).
DATA_DIR = '/kaggle/input/llm-classification-finetuning'

try:
    train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
    test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
    # The sample submission is loaded as a reference for the expected output format.
    sample_submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
except FileNotFoundError:
    print("Error: Data files not found. Please ensure the dataset is added to your Kaggle Notebook.")
    print("Go to 'Add Data' on the right sidebar of your notebook and search for 'llm-classification-finetuning'.")
    # Re-raise instead of calling exit(): inside a notebook exit() tears down the
    # kernel session, whereas the original exception stops the run with a traceback
    # that names the missing file.
    raise

# Only reached when all three files were read.
print("Data loaded successfully.")
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Sample Submission data shape: {sample_submission_df.shape}")

# Display the first few rows of the dataframes to understand their structure
print("\nTrain DataFrame Head:")
print(train_df.head())
print("\nTest DataFrame Head:")
print(test_df.head())
print("\nSample Submission DataFrame Head:")
print(sample_submission_df.head())


In [None]:
# 3. Exploratory Data Analysis (EDA) - Initial Checks
# Both splits get the same two reports: first the schema summaries, then the
# per-column null counts (train before test in each report).
_eda_frames = (("Train", train_df), ("Test", test_df))

for _split_name, _split_df in _eda_frames:
    print(f"\n{_split_name} DataFrame Info:")
    _split_df.info()

for _split_name, _split_df in _eda_frames:
    print(f"\nMissing values in {_split_name} DataFrame:")
    print(_split_df.isnull().sum())

# Check the distribution of the target variable (winner_model_a, winner_model_b, winner_tie)
# The competition description implies these are one-hot encoded, so we'll collapse them into a single label to find the true winner.
# Assuming the 'winner' columns are directly provided in the train.csv as the target.
# If the target is a single column indicating the winner, you'll need to adjust this.
# Let's create a 'winner' column for easier analysis if it's not already a single categorical column.

# This part is highly dependent on the exact structure of your target variable in train.csv
# If the target is already a single column like 'winner' with values 'model_a', 'model_b', 'tie',
# then skip the below reconstruction and directly use that column.
if 'winner_model_a' in train_df.columns and 'winner_model_b' in train_df.columns and 'winner_tie' in train_df.columns:
    # Collapse the three one-hot flags into a single categorical label in one
    # vectorized pass. np.select uses the first condition that matches, so the
    # precedence is model_a, then model_b, then tie; rows where no flag equals 1
    # fall through to 'unknown'.
    train_df['winner'] = np.select(
        [
            train_df['winner_model_a'] == 1,
            train_df['winner_model_b'] == 1,
            train_df['winner_tie'] == 1,
        ],
        ['model_a', 'model_b', 'tie'],
        default='unknown',
    )

    # Surface rows without a winner explicitly: left unnoticed, 'unknown' is
    # encoded as an extra class, and the submission mapping only knows the three
    # real outcomes.
    unknown_rows = int((train_df['winner'] == 'unknown').sum())
    if unknown_rows:
        print(f"\nWarning: {unknown_rows} training rows have no winner flag set and are labelled 'unknown'.")

    print("\nDistribution of 'winner' in Train DataFrame:")
    print(train_df['winner'].value_counts(normalize=True))
else:
    print("\nWarning: 'winner_model_a', 'winner_model_b', or 'winner_tie' columns not found as expected.")
    print("Please inspect your 'train.csv' to understand the target variable's structure.")
    # If your target is a single categorical column, e.g., 'target_winner', then use:
    # print(train_df['target_winner'].value_counts(normalize=True))


In [None]:
# 4. Feature Engineering
# This is a critical step and will heavily depend on the content of your 'prompt', 'response_a', and 'response_b' columns.
# You'll likely need to extract meaningful features from the text.

def preprocess_text(text):
    """Normalize one raw text value for vectorization.

    Missing values (as reported by ``pd.isna``) become an empty string.
    Anything else is converted to ``str``, lowercased, stripped of every
    character that is neither a word character nor whitespace, and has its
    whitespace runs collapsed to single spaces with the ends trimmed.

    Args:
        text: A scalar cell value (string, number, or missing marker).

    Returns:
        The cleaned string.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # Keep only word characters and whitespace (this drops punctuation).
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    # Squeeze runs of whitespace (including newlines/tabs) into one space.
    return re.sub(r'\s+', ' ', without_punctuation).strip()

print("\nApplying text preprocessing...")
# Clean every raw text column on both splits; each result is stored next to the
# original under a 'processed_' prefix.
# Assuming 'prompt', 'response_a', 'response_b' are the text columns.
# You might have other columns like 'conversation_id', 'turn', etc. which could also be useful.
_raw_text_columns = ('prompt', 'response_a', 'response_b')
for _split_df in (train_df, test_df):
    for _column in _raw_text_columns:
        _split_df[f'processed_{_column}'] = _split_df[_column].apply(preprocess_text)

# Example basic feature engineering: combining prompt and responses
# You could create features like:
# - Length of prompt, response_a, response_b
# - Word count of prompt, response_a, response_b
# - Readability scores (Flesch-Kincaid)
# - Sentiment scores
# - Difference in length/word count between response_a and response_b
# - Incorporating LLM specific metrics if available (e.g., perplexity, toxicity scores from external models)

# One "prompt + response" string per side (a and b), on both splits.
for _split_df in (train_df, test_df):
    for _side in ('a', 'b'):
        _split_df[f'combined_text_{_side}'] = (
            _split_df['processed_prompt'] + " " + _split_df[f'processed_response_{_side}']
        )

print("Text preprocessing and basic combination complete.")



In [None]:
# 5. Model Training
# For a Getting Started competition, a simple yet effective approach is often a good start.
# Logistic Regression with TF-IDF features is a common baseline for text classification.
# For more advanced solutions, consider:
# - Embeddings (Word2Vec, GloVe, FastText)
# - Pre-trained language models (BERT, RoBERTa, etc.) and fine-tuning them for classification
# - Siamese networks or other architectures that compare two texts

# Define features (X) and target (y)
# We need to predict probabilities for winner_model_a, winner_model_b, winner_tie
# This means we're essentially doing a multi-class classification problem.

# For simplicity, let's start with a single model that predicts the 'winner' category.
# Then, we'll convert the predictions to the required probability format.
# Per-side "prompt + response" text for the training split, plus the target.
X_train_text_a, X_train_text_b = train_df['combined_text_a'], train_df['combined_text_b']
y_train = train_df['winner']  # Requires the 'winner' column to have been created earlier

# The same per-side text for the test split.
X_test_text_a, X_test_text_b = test_df['combined_text_a'], test_df['combined_text_b']


# Turn the string labels into integer codes; the fitted encoder is kept so the
# class order can be read back from `le.classes_` at prediction time.
le = LabelEncoder()
le.fit(y_train)
y_train_encoded = le.transform(y_train)

# We need to build a model that can compare response A and response B.
# One approach is to treat this as two separate binary classification problems (A vs not A, B vs not B)
# or a multi-class classification.
# Given the competition asks for probabilities for A, B, and tie, a multi-class approach seems direct.

# Let's try a simple pipeline: TF-IDF + Logistic Regression
# We'll create features that represent the "difference" or "comparison" between A and B.
# This is a simplified approach; more robust models would likely involve twin networks or more complex feature engineering.

# A common strategy is to create pairs of (prompt, response_A) and (prompt, response_B) and learn a preference.
# Another is to feed both into a model and have it output preference.

# Let's simplify and just use the combined text 'combined_text_a' and 'combined_text_b'
# and try to build features from their difference or concatenation.
# For a multi-class classification, we can concatenate features from both responses.

# Create a 'comparison' feature or represent both responses in a way the model can compare.
# For a simple TF-IDF + Logistic Regression, we can create separate TF-IDF features and concatenate them.
# Or, build a model where the input is structured for comparison.

# Let's try a simpler approach: build a classifier for each response's quality, then combine.
# This is not ideal for 'preference' but a starting point.
# A better approach for preference: input (prompt, response_A, response_B) -> output (A_win, B_win, tie)

# For a direct multi-class classification:
# We need a unified input feature set that represents the comparison.
# Let's create a single feature set by concatenating `combined_text_a` and `combined_text_b` with a separator.

# Single model input per row: cleaned prompt, response A and response B joined
# with a " [SEP] " marker, built identically for the train and test splits.
for _split_df in (train_df, test_df):
    _split_df['comparison_text'] = (
        _split_df['processed_prompt']
        + " [SEP] " + _split_df['processed_response_a']
        + " [SEP] " + _split_df['processed_response_b']
    )


# Define the model pipeline
# Use TfidfVectorizer to convert text into numerical features
# Then, use LogisticRegression for multi-class classification
# TF-IDF features (unigrams and bigrams, capped at 5000 terms) feeding a
# logistic-regression classifier over the encoded winner labels.
# `multi_class` is intentionally not passed: 'auto' is already the estimator's
# default, and the parameter is deprecated in recent scikit-learn releases, so
# spelling it out adds nothing and will eventually fail once it is removed.
# NOTE(review): newer scikit-learn versions also warn about 'liblinear' on
# multiclass targets; confirm against the installed version before switching
# solvers, since that changes the fitted model.
model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))), # Experiment with max_features, ngram_range
    ('classifier', LogisticRegression(random_state=42, solver='liblinear')) # You can try 'saga', 'lbfgs'
])

print("\nTraining the model...")
model_pipeline.fit(train_df['comparison_text'], y_train_encoded)
print("Model training complete.")



In [None]:
# 6. Prediction
print("\nGenerating predictions on the test set...")
# One probability column per encoded class; column i corresponds to
# le.classes_[i], so the labels must be mapped back to the submission columns.
predicted_probabilities = model_pipeline.predict_proba(test_df['comparison_text'])

# Class labels in the order the encoder assigned their integer codes
encoded_classes = le.classes_
print(f"Encoded classes order: {encoded_classes}")

# Winner label -> submission column it fills.
_label_to_column = {
    'model_a': 'winner_model_a',
    'model_b': 'winner_model_b',
    'tie': 'winner_tie',
}

# Start from an all-zero frame indexed by test id, then fill each column from
# the matching probability column. Labels without an entry in the table are
# skipped, leaving their probability out of the frame.
submission_preds = pd.DataFrame(0.0, index=test_df['id'], columns=list(_label_to_column.values()))

for _position, _label in enumerate(encoded_classes):
    _target_column = _label_to_column.get(_label)
    if _target_column is not None:
        submission_preds[_target_column] = predicted_probabilities[:, _position]

print("Prediction generation complete.")
print(submission_preds.head())

# Post-processing: Ensure probabilities sum to 1 (they should from predict_proba)
# and handle potential edge cases if needed (e.g., very small probabilities).
# For LogLoss, very small probabilities (close to 0) can cause issues.
# The competition description mentions "eps=auto", which handles this.

In [None]:
# 7. Submission File Creation
# Attach the predicted probabilities to the test ids (matching the 'id' column
# against the prediction frame's index) and fix the column order expected by
# the competition.
submission_df = (
    test_df[['id']]
    .copy()
    .merge(submission_preds, left_on='id', right_index=True)
    [['id', 'winner_model_a', 'winner_model_b', 'winner_tie']]
)

# Write the submission next to the notebook, without the DataFrame index.
submission_file_path = 'submission.csv'
submission_df.to_csv(submission_file_path, index=False)

print(f"\nSubmission file created at: {submission_file_path}")
print("Submission DataFrame Head:")
print(submission_df.head())

print("\nKaggle Notebook script complete. You can now 'Save Version' and 'Submit' your notebook.")

In [None]:
# An earlier attempt passed a bare, unquoted local Windows folder path as the
# first argument to `to_csv`, which is what caused the error.
# On Kaggle the target is simply a quoted file name relative to the working
# directory:
submission_file_path = 'submission.csv'
submission_df.to_csv(path_or_buf=submission_file_path, index=False)