In [1]:
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# --- File Paths ---
# Assumes the files are in the same directory as the script
LEAKY_SUB_PATH = 'submission-3.csv'  # The one that scored 0.69
ROBUST_SUB_PATH = 'submission.csv'    # The one you want to evaluate


def align_predictions(df_leaky, df_robust):
    """Join two submission frames on 'id' so predictions line up row by row.

    Args:
        df_leaky: Submission frame with 'id' and 'purchaseValue' columns,
            used as the reference ("proxy truth").
        df_robust: Submission frame with the same columns, to be evaluated.

    Returns:
        A frame with one row per shared id and the columns
        'purchaseValue_leaky_0.69' and 'purchaseValue_robust_unknown'.

    Raises:
        pandas.errors.MergeError: If 'id' is duplicated in either frame
            (a duplicate would silently multiply rows and skew the metrics).
        ValueError: If the two frames share no ids at all.
    """
    df_comparison = pd.merge(
        df_leaky,
        df_robust,
        on='id',
        suffixes=('_leaky_0.69', '_robust_unknown'),
        validate='one_to_one',
    )
    if df_comparison.empty:
        # Fail here with a clear message instead of deep inside the metric call.
        raise ValueError("The two submissions share no 'id' values; nothing to compare.")
    return df_comparison


def interpret_r2(r2, submission_name=ROBUST_SUB_PATH):
    """Return the human-readable interpretation lines for an R² score.

    The closing outlook line depends on the score, so a high score is no
    longer followed by a "will almost certainly be very low" conclusion.

    Args:
        r2: R² of the evaluated predictions against the proxy truth.
        submission_name: File name quoted in the closing outlook line.

    Returns:
        A list of strings, to be printed one per line.
    """
    # NaN fails every comparison below and would otherwise be reported as
    # "moderate to strong", so it is handled first.
    if np.isnan(r2):
        return [
            "The calculated R² score is undefined (NaN).",
            "\nNo conclusion about the Kaggle score can be drawn from this comparison.",
        ]

    low_outlook = (
        f"\nBased on this analysis, the Kaggle score for '{submission_name}' will almost certainly "
        "be very low, likely close to zero or negative, just as your validation score predicted."
    )
    # NOTE: R² here is the coefficient of determination, not a correlation
    # coefficient; the word "correlation" in the messages is used loosely.
    if r2 < 0:
        return [
            f"The calculated R² score is {r2:.4f}, which is NEGATIVE.",
            "This indicates that the robust model's predictions are extremely poor compared to the leaky model.",
            "It performs WORSE than a naive model that simply predicts the average purchase value of the leaky model.",
            low_outlook,
        ]
    if r2 < 0.5:
        return [
            f"The calculated R² score is {r2:.4f}, which is very low.",
            "This indicates a very weak correlation between the two models' predictions.",
            low_outlook,
        ]
    return [
        f"The calculated R² score is {r2:.4f}, indicating a moderate to strong correlation.",
        f"\nThe predictions in '{submission_name}' broadly agree with the leaky submission, "
        "so this comparison alone gives no reason to expect a very low Kaggle score.",
    ]


# The guard keeps the helpers importable; when run as a script or as a
# notebook cell the names assigned below are still created as globals.
if __name__ == "__main__":
    # --- Load the submission files ---
    try:
        df_leaky = pd.read_csv(LEAKY_SUB_PATH)
        df_robust = pd.read_csv(ROBUST_SUB_PATH)
    except FileNotFoundError as e:
        # `exit()` is an interactive helper (status 0, shuts down a Jupyter
        # kernel); SystemExit with a message reports on stderr with status 1.
        raise SystemExit(
            f"Error: {e}. Make sure both submission files are in the same directory as this script."
        )

    # --- Merge the dataframes to align predictions by 'id' ---
    # This is crucial to ensure we compare the correct rows.
    df_comparison = align_predictions(df_leaky, df_robust)

    # The inner join drops ids found in only one file; say so instead of
    # silently evaluating on a subset.
    unmatched = len(df_leaky) + len(df_robust) - 2 * len(df_comparison)
    if unmatched:
        print(f"Warning: {unmatched} row(s) had an 'id' present in only one file and were excluded.")

    # --- Extract the prediction columns to act as y_true and y_pred ---
    # We use the high-scoring leaky submission as a proxy for the true values.
    y_true_proxy = df_comparison['purchaseValue_leaky_0.69']
    y_pred_robust = df_comparison['purchaseValue_robust_unknown']

    # --- Calculate the metrics ---
    # R² Score
    # Share of the variance in the leaky predictions that the robust
    # predictions account for. A negative score means the robust predictions
    # are worse than always predicting the leaky model's mean.
    r2 = r2_score(y_true_proxy, y_pred_robust)

    # Mean Squared Error (MSE)
    # Can be enormous because MSE penalizes large errors quadratically.
    mse = mean_squared_error(y_true_proxy, y_pred_robust)

    # Mean Absolute Error (MAE)
    # Average absolute gap between the two predictions, in the units of
    # 'purchaseValue'.
    mae = mean_absolute_error(y_true_proxy, y_pred_robust)

    # --- Print the results ---
    print("=======================================================================")
    print(f"Comparing Robust Model ('{ROBUST_SUB_PATH}') vs. Leaky Model ('{LEAKY_SUB_PATH}')")
    print("Assuming the Leaky Model's predictions are the 'Ground Truth'...\n")

    print(f"R² Score: {r2:.4f}")
    print(f"Mean Squared Error (MSE): {mse:,.2f}")
    print(f"Mean Absolute Error (MAE): {mae:,.2f}")
    print("=======================================================================\n")

    # --- Interpretation of the Results ---
    print("What does this R² score mean?")
    for line in interpret_r2(r2):
        print(line)

Comparing Robust Model ('submission.csv') vs. Leaky Model ('submission-3.csv')
Assuming the Leaky Model's predictions are the 'Ground Truth'...

R² Score: 0.3932
Mean Squared Error (MSE): 12,569,394,226,424,738.00
Mean Absolute Error (MAE): 13,856,449.45

What does this R² score mean?
The calculated R² score is 0.3932, which is very low.
This indicates a very weak correlation between the two models' predictions.

Based on this analysis, the Kaggle score for 'submission.csv' will almost certainly be very low, likely close to zero or negative, just as your validation score predicted.
