# Loop 1 LB Feedback Analysis

**Critical Issue:** CV score 0.0111 vs LB score 0.0982 - roughly a 9x gap (0.0982 / 0.0111 ≈ 8.8)!

Possible causes:
1. Notebook structure non-compliance (evaluator's concern)
2. CV methodology mismatch with LB evaluation
3. Data leakage in local CV
4. Different evaluation metric on LB
5. Distribution shift between train/test

In [None]:
import pandas as pd
import numpy as np

# Pull the submitted predictions into memory and show their basic layout.
submission = pd.read_csv('/home/submission/submission.csv')

# Shape and column names share the same "label value" print form.
for label, value in (
    ('Submission shape:', submission.shape),
    ('\nColumns:', submission.columns.tolist()),
):
    print(label, value)

# Peek at the leading rows to eyeball the values.
print('\nFirst 10 rows:')
print(submission.head(10))

In [None]:
# Summarise how the submission rows are spread across tasks and folds.
print('Task distribution:')
print(submission['task'].value_counts())

# Same per-fold row count for each task, listed in ascending fold order.
for task_id, heading in (
    (0, '\nFold distribution for task 0 (single solvent):'),
    (1, '\nFold distribution for task 1 (full data):'),
):
    task_folds = submission.loc[submission['task'] == task_id, 'fold']
    print(heading)
    print(task_folds.value_counts().sort_index())

In [None]:
# Report min / max / mean / std for every predicted target column.
print('Prediction statistics:')
for target in ('target_1', 'target_2', 'target_3'):
    values = submission[target]
    print(f'\n{target}:')
    print(f'  Min: {values.min():.4f}')
    print(f'  Max: {values.max():.4f}')
    print(f'  Mean: {values.mean():.4f}')
    print(f'  Std: {values.std():.4f}')

In [None]:
# Read both yield tables so the submission can be compared against them.
DATA_PATH = '/home/data'

single_data = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
full_data = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')

print('Single solvent data shape:', single_data.shape)
print('Full data shape:', full_data.shape)

# List whichever of the three target columns each table actually contains,
# in the table's own column order.
wanted = ['SM', 'Product 2', 'Product 3']
print('\nTarget columns in single data:', [name for name in single_data.columns if name in wanted])
print('Target columns in full data:', [name for name in full_data.columns if name in wanted])

In [None]:
# Target order matters: the submission columns are target_1..target_3, and
# the template builds Y = df[["Product 2", "Product 3", "SM"]].
for line in (
    'Target order in our submission:',
    'target_1 = Product 2',
    'target_2 = Product 3',
    'target_3 = SM',
):
    print(line)

# Print the real per-target mean/std from the single-solvent table so the
# assumed mapping can be checked against the prediction statistics.
print('\nActual target statistics from single solvent data:')
for name in ('Product 2', 'Product 3', 'SM'):
    series = single_data[name]
    print(f'{name}: mean={series.mean():.4f}, std={series.std():.4f}')

In [None]:
# Sanity check of the target mapping. The template loads
# Y = df[["Product 2", "Product 3", "SM"]], i.e. target_1 = Product 2,
# target_2 = Product 3, target_3 = SM, and the baseline used the same order,
# so the mapping should be correct.
#
# Compare mean prediction against mean actual for two of the targets.
# NOTE: the prediction mean is taken over the whole submission (both tasks),
# while the actual mean comes from the single-solvent table only.
print('Checking if our predictions make sense...')
for pred_col, actual_col in (('target_3', 'SM'), ('target_1', 'Product 2')):
    print(f'\nOur predictions ({pred_col} = {actual_col}):')
    print(f'  Mean: {submission[pred_col].mean():.4f}')
    print(f'  Actual {actual_col} mean: {single_data[actual_col].mean():.4f}')

In [None]:
# The template writes predictions as target_1, target_2, target_3, following
# the order of the Y columns. utils.py is expected to define TARGET_LABELS,
# so scan it for the relevant lines.
import sys
sys.path.insert(0, '/home/data')

# Slurp the helper module's source, then release the file handle.
with open('/home/data/utils.py', 'r') as f:
    content = f.read()

# Echo every line that mentions either keyword.
print('Looking for TARGET_LABELS in utils.py...')
keywords = ('TARGET', 'LABEL')
for source_line in content.split('\n'):
    if any(word in source_line for word in keywords):
        print(source_line)

In [None]:
# Working theory: the LB either runs a different CV procedure or expects a
# different submission format than the one produced here.
#
# Local metric so far: MSE = mean((actuals - predictions)^2) over everything.
# The LB could instead compute MSE per fold and average the fold scores,
# so the MSE is worth recomputing in more than one way.
report_lines = (
    'Recalculating MSE in different ways...',
    # Method 1: overall MSE (the numbers from the original run)
    '\nMethod 1: Overall MSE (our method)',
    'Single: 0.010429, Full: 0.011429, Overall: 0.011081',
    # Method 2: mean of per-fold MSEs (not computed in this cell)
    '\nMethod 2: Would need to recalculate per-fold MSE and average',
)
for text in report_lines:
    print(text)

In [None]:
# HYPOTHESIS: the reference kernel (arrhenius-kinetics-tta) scored 0.09831 on
# the LB and this submission scored 0.0982 - almost identical. That points to
# the submission being evaluated correctly on the LB, with the LOCAL CV number
# being the wrong (too optimistic) one, most likely because the MSE is
# computed on the wrong data or in the wrong way.
insight_lines = (
    'CRITICAL INSIGHT:',
    'LB score 0.0982 ≈ Reference kernel score 0.09831',
    'This suggests our model is working correctly!',
    'The issue is our LOCAL CV calculation is too optimistic.',
    '',
    'Possible causes:',
    '1. We might be computing MSE on training data instead of test data',
    '2. We might have data leakage in our CV',
    '3. The CV methodology might be different',
)
for text in insight_lines:
    print(text)

In [None]:
# The baseline notebook appears to store actuals and predictions correctly,
# so the MSE computation itself (or the metric the LB uses) is the suspect.
#
# The competition description states:
#   "Submissions will be evaluated according to a cross-validation procedure"
# which is read here as: the LB re-runs the ENTIRE CV procedure rather than
# scoring the uploaded predictions.
summary_lines = (
    'KEY INSIGHT:',
    'The competition evaluates by RUNNING the CV procedure on Kaggle!',
    'This means our local CV predictions are not directly comparable.',
    '',
    'The LB score of 0.0982 is the ACTUAL CV score from running our model.',
    'Our local CV score of 0.0111 might be computed incorrectly.',
)
for text in summary_lines:
    print(text)

In [None]:
# Quick plausibility check: good predictions should sit close to the actuals.
# Rows are compared purely by position here (no matching by fold).

# Ground-truth targets from the single-solvent table.
single_actuals = single_data.loc[:, ['Product 2', 'Product 3', 'SM']].values
print('Single solvent actuals shape:', single_actuals.shape)

# Task-0 rows of the submission hold the single-solvent predictions.
is_single_task = submission['task'] == 0
single_preds = submission.loc[is_single_task, ['target_1', 'target_2', 'target_3']].values
print('Single solvent predictions shape:', single_preds.shape)

# Mean squared error over every row and target.
single_residuals = single_actuals - single_preds
mse = np.mean(single_residuals ** 2)
print(f'\nRecalculated Single Solvent MSE: {mse:.6f}')

In [None]:
# Repeat the position-based comparison for the full-data task.
full_actuals = full_data.loc[:, ['Product 2', 'Product 3', 'SM']].values
print('Full data actuals shape:', full_actuals.shape)

# Task-1 rows of the submission hold the full-data predictions.
is_full_task = submission['task'] == 1
full_preds = submission.loc[is_full_task, ['target_1', 'target_2', 'target_3']].values
print('Full data predictions shape:', full_preds.shape)

# Mean squared error over every row and target.
full_residuals = full_actuals - full_preds
mse_full = np.mean(full_residuals ** 2)
print(f'\nRecalculated Full Data MSE: {mse_full:.6f}')

# Combine both tasks, weighting each task's MSE by its number of rows.
n_single, n_full = len(single_actuals), len(full_actuals)
total_rows = n_single + n_full
overall_mse = (mse * n_single + mse_full * n_full) / total_rows
print(f'\nOverall MSE: {overall_mse:.6f}')

In [None]:
# If the recalculated MSE disagrees with the original figure, the original
# code has a bug. The row order is also worth checking: the submission is
# expected to list predictions fold by fold.

print('Checking prediction order...')
print('\nTask 0 (single solvent):')
single_task = submission[submission['task'] == 0]
for fold in range(24):
    n_rows = len(single_task[single_task['fold'] == fold])
    print(f'  Fold {fold}: {n_rows} rows')

In [None]:
# The likely problem: the submission stores predictions PER CV FOLD, whereas
# the actuals above were taken from the full dataset in file order.
# A correct MSE has to pair predictions with actuals fold by fold, since each
# fold's predictions belong to that fold's held-out test set.
#
# Under leave-one-solvent-out CV for the single-solvent task this would mean
# fold 0 <-> solvent 0's rows, fold 1 <-> solvent 1's rows, and so on.
# Comparing row counts per solvent with row counts per fold tests that idea.
print('Verifying fold structure...')
print('\nSingle solvent data by solvent:')
solvent_counts = single_data['SOLVENT NAME'].value_counts().sort_index()
print(solvent_counts)

print('\nSubmission rows per fold (task 0):')
task0_fold_ids = submission.loc[submission['task'] == 0, 'fold']
for fold in range(24):
    print(f'  Fold {fold}: {(task0_fold_ids == fold).sum()} rows')

In [None]:
# Fold sizes should equal the per-solvent row counts if fold i really is the
# i-th solvent in sorted-name order; mark each fold with a tick or a cross.

solvents = sorted(single_data['SOLVENT NAME'].unique())
print('Solvents in order:', solvents[:5], '...')

task0_folds = submission.loc[submission['task'] == 0, 'fold']
for i, solvent in enumerate(solvents):
    solvent_count = int((single_data['SOLVENT NAME'] == solvent).sum())
    fold_count = int((task0_folds == i).sum())
    if solvent_count == fold_count:
        match = '✓'
    else:
        match = '✗'
    print(f'Fold {i} ({solvent}): solvent={solvent_count}, submission={fold_count} {match}')

In [None]:
# Now let me properly compute the MSE by matching predictions to actuals
# For each fold, the predictions correspond to the test set (one solvent)

def compute_cv_mse_single():
    """Compute the single-solvent MSE with predictions matched to actuals by fold.

    Reads the module-level ``single_data`` and ``submission`` frames. Fold
    ``i`` of task 0 is paired with the ``i``-th solvent name in sorted order;
    that pairing is this notebook's assumption about the CV split
    (TODO confirm against the actual splitter).

    A warning is printed for every fold whose number of predictions differs
    from the number of rows of its solvent, because such a fold makes the
    stacked arrays misaligned (or makes the final subtraction fail).

    Returns:
        tuple: ``(mse, all_preds, all_actuals)`` where ``mse`` is the mean
        squared error over all stacked rows and targets, and the two arrays
        are the row-stacked per-fold predictions and actuals.
    """
    actual_cols = ['Product 2', 'Product 3', 'SM']
    pred_cols = ['target_1', 'target_2', 'target_3']

    solvents = sorted(single_data['SOLVENT NAME'].unique())
    # Filter the task-0 rows once instead of re-scanning on every fold.
    task_rows = submission[submission['task'] == 0]

    all_preds = []
    all_actuals = []

    for fold_idx, solvent in enumerate(solvents):
        # Ground truth: every row measured in this solvent.
        actuals = single_data.loc[single_data['SOLVENT NAME'] == solvent, actual_cols].values

        # Predictions: the submission rows tagged with this fold index.
        preds = task_rows.loc[task_rows['fold'] == fold_idx, pred_cols].values

        # Same diagnostic as compute_cv_mse_full: surface folds whose sizes
        # disagree rather than silently comparing shifted rows.
        if len(actuals) != len(preds):
            print(f'WARNING: Fold {fold_idx} mismatch: actuals={len(actuals)}, preds={len(preds)}')

        all_preds.append(preds)
        all_actuals.append(actuals)

    all_preds = np.vstack(all_preds)
    all_actuals = np.vstack(all_actuals)

    mse = np.mean((all_actuals - all_preds) ** 2)
    return mse, all_preds, all_actuals

# Run the fold-matched single-solvent evaluation; the stacked predictions and
# actuals are kept so later cells can reuse them (e.g. for row counts).
mse_single, preds_single, actuals_single = compute_cv_mse_single()
print(f'Properly computed Single Solvent MSE: {mse_single:.6f}')

In [None]:
# Now for full data (leave-one-ramp-out)
def compute_cv_mse_full():
    """Compute the full-data MSE with predictions matched to actuals by fold.

    Reads the module-level ``full_data`` and ``submission`` frames. Each
    distinct (SOLVENT A NAME, SOLVENT B NAME) pair is treated as one ramp;
    the pairs are sorted and the ``i``-th pair is matched with fold ``i`` of
    task 1. A warning is printed for any fold whose prediction count differs
    from the ramp's row count; processing continues regardless.

    Returns:
        tuple: ``(mse, all_preds, all_actuals)`` - the mean squared error over
        all stacked rows and targets, plus the row-stacked per-fold
        predictions and actuals.
    """
    key_cols = ['SOLVENT A NAME', 'SOLVENT B NAME']
    actual_cols = ['Product 2', 'Product 3', 'SM']
    pred_cols = ['target_1', 'target_2', 'target_3']

    # One (solvent A, solvent B) tuple per ramp, in sorted order.
    ramp_pairs = (
        full_data[key_cols]
        .drop_duplicates()
        .sort_values(key_cols)
        .itertuples(index=False, name=None)
    )
    task_rows = submission[submission['task'] == 1]

    stacked_preds = []
    stacked_actuals = []

    for fold_idx, (solvent_a, solvent_b) in enumerate(ramp_pairs):
        # Ground truth: every row measured on this ramp.
        on_ramp = (full_data['SOLVENT A NAME'] == solvent_a) & \
                  (full_data['SOLVENT B NAME'] == solvent_b)
        actuals = full_data.loc[on_ramp, actual_cols].values

        # Predictions: the submission rows tagged with this fold index.
        preds = task_rows.loc[task_rows['fold'] == fold_idx, pred_cols].values

        if len(actuals) != len(preds):
            print(f'WARNING: Fold {fold_idx} mismatch: actuals={len(actuals)}, preds={len(preds)}')

        stacked_preds.append(preds)
        stacked_actuals.append(actuals)

    all_preds = np.vstack(stacked_preds)
    all_actuals = np.vstack(stacked_actuals)

    mse = np.mean((all_actuals - all_preds) ** 2)
    return mse, all_preds, all_actuals

# Run the fold-matched full-data evaluation. This rebinds `mse_full`, which an
# earlier cell had set from the position-based (not fold-matched) comparison.
mse_full, preds_full, actuals_full = compute_cv_mse_full()
print(f'Properly computed Full Data MSE: {mse_full:.6f}')

In [None]:
# Combine the two fold-matched MSEs, weighting each task by its row count,
# and set the result against the leaderboard score.
n_single, n_full = len(actuals_single), len(actuals_full)
weighted_total = mse_single * n_single + mse_full * n_full
overall_mse = weighted_total / (n_single + n_full)

print('\n=== FINAL VERIFICATION ===')
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Overall MSE: {overall_mse:.6f}')
print('\nLB Score: 0.0982')
lb_gap = abs(overall_mse - 0.0982)
print(f'Gap: {lb_gap:.6f}')

In [None]:
# CONCLUSION:
# - fold-matched MSE near the original local figure (~0.011): the local
#   calculation holds and the LB must be evaluating differently;
# - fold-matched MSE near the LB score (0.0982): the original local
#   calculation was wrong.
# "Near" means within 0.01 of the respective reference value.

print('\nCONCLUSION:')
if abs(overall_mse - 0.0982) < 0.01:
    verdict = ['Our properly computed MSE matches LB - original calculation was wrong!']
elif abs(overall_mse - 0.011) < 0.01:
    verdict = [
        'Our MSE calculation is correct - LB evaluates differently!',
        'This could be due to:',
        '1. Different random seeds on Kaggle',
        '2. Different PyTorch/NumPy versions',
        '3. Different GPU behavior',
    ]
else:
    verdict = [f'MSE is {overall_mse:.6f} - need to investigate further']

for text in verdict:
    print(text)