# Loop 55 Analysis: Submission Failure Investigation

**Problem:** The last 5 submissions (exp_049 through exp_054) all failed with "Evaluation metric raised an unexpected error"

**Goal:** Understand why submissions are failing and fix the issue

In [1]:
import pandas as pd
import numpy as np
import json

# Load session state
with open('/home/code/session_state.json', 'r') as f:
    state = json.load(f)

# Get successful submissions
submissions = state.get('submissions', [])
print('All submissions:')
for s in submissions:
    lb = s.get('lb_score', '')
    cv = s.get('cv_score', '')
    print(f"  {s.get('experiment_id')}: CV={cv}, LB={lb}")

# Filter successful submissions (those with LB scores)
successful = [s for s in submissions if s.get('lb_score')]
print(f'\nSuccessful submissions: {len(successful)}')
print(f'Failed submissions: {len(submissions) - len(successful)}')

# Failed submissions
failed = [s for s in submissions if not s.get('lb_score')]
print(f'\nFailed submission IDs: {[s.get("experiment_id") for s in failed]}')

All submissions:
  exp_000: CV=0.011081, LB=0.09816
  exp_001: CV=0.012297, LB=0.10649
  exp_003: CV=0.010501, LB=0.09719
  exp_005: CV=0.01043, LB=0.09691
  exp_006: CV=0.009749, LB=0.09457
  exp_007: CV=0.009262, LB=0.09316
  exp_009: CV=0.009192, LB=0.09364
  exp_012: CV=0.009004, LB=0.09134
  exp_024: CV=0.008689, LB=0.08929
  exp_026: CV=0.008465, LB=0.08875
  exp_030: CV=0.008298, LB=0.08772
  exp_035: CV=0.009825, LB=0.09696
  exp_049: CV=0.008092, LB=
  exp_050: CV=0.008092, LB=
  exp_052: CV=0.01088, LB=
  exp_053: CV=0.008092, LB=
  exp_054: CV=0.008504, LB=

Successful submissions: 12
Failed submissions: 5

Failed submission IDs: ['exp_049', 'exp_050', 'exp_052', 'exp_053', 'exp_054']


In [2]:
# CV-LB relationship analysis
import matplotlib.pyplot as plt
from scipy import stats

cv_scores = [s['cv_score'] for s in successful]
lb_scores = [s['lb_score'] for s in successful]

# Linear regression
slope, intercept, r_value, p_value, std_err = stats.linregress(cv_scores, lb_scores)

print('CV-LB Relationship Analysis')
print('=' * 50)
print(f'Linear fit: LB = {slope:.4f} * CV + {intercept:.4f}')
print(f'R-squared = {r_value**2:.4f}')
print(f'Intercept = {intercept:.4f}')
print(f'Target = 0.0347')
print()
print(f'CRITICAL: Intercept ({intercept:.4f}) > Target (0.0347)')
print(f'Required CV to hit target: (0.0347 - {intercept:.4f}) / {slope:.4f} = {(0.0347 - intercept) / slope:.6f}')

CV-LB Relationship Analysis
Linear fit: LB = 4.2876 * CV + 0.0528
R-squared = 0.9523
Intercept = 0.0528
Target = 0.0347

CRITICAL: Intercept (0.0528) > Target (0.0347)
Required CV to hit target: (0.0347 - 0.0528) / 4.2876 = -0.004218


In [3]:
# Check current submission format
df = pd.read_csv('/home/submission/submission.csv')

print('Current Submission Analysis')
print('=' * 50)
print(f'Total rows: {len(df)}')
print(f'Columns: {df.columns.tolist()}')
print(f'\nTask distribution:')
print(df['task'].value_counts().sort_index())
print(f'\nFolds per task:')
print(df.groupby('task')['fold'].nunique())
print(f'\nTarget statistics:')
for col in ['target_1', 'target_2', 'target_3']:
    print(f'  {col}: min={df[col].min():.6f}, max={df[col].max():.6f}')
    print(f'    Values > 1: {(df[col] > 1).sum()}')
    print(f'    Values < 0: {(df[col] < 0).sum()}')
    print(f'    NaN: {df[col].isna().sum()}')

Current Submission Analysis
Total rows: 1883
Columns: ['id', 'index', 'task', 'fold', 'row', 'target_1', 'target_2', 'target_3']

Task distribution:
task
0     656
1    1227
Name: count, dtype: int64

Folds per task:
task
0    24
1    13
Name: fold, dtype: int64

Target statistics:
  target_1: min=0.000000, max=0.434598
    Values > 1: 0
    Values < 0: 0
    NaN: 0
  target_2: min=0.000000, max=0.435151
    Values > 1: 0
    Values < 0: 0
    NaN: 0
  target_3: min=0.000000, max=1.000000
    Values > 1: 0
    Values < 0: 0
    NaN: 0


In [4]:
# Check the fold/row structure more carefully
print('Fold/Row structure analysis:')
print('=' * 50)

for task in [0, 1]:
    task_df = df[df['task'] == task]
    print(f'\nTask {task}:')
    print(f'  Total rows: {len(task_df)}')
    print(f'  Unique folds: {task_df["fold"].nunique()}')
    print(f'  Fold range: {task_df["fold"].min()} to {task_df["fold"].max()}')
    
    # Check rows per fold
    rows_per_fold = task_df.groupby('fold').size()
    print(f'  Rows per fold: min={rows_per_fold.min()}, max={rows_per_fold.max()}')
    
    # Check row indices within each fold
    for fold in sorted(task_df['fold'].unique())[:3]:  # Check first 3 folds
        fold_df = task_df[task_df['fold'] == fold]
        print(f'    Fold {fold}: rows {fold_df["row"].min()} to {fold_df["row"].max()}, count={len(fold_df)}')

Fold/Row structure analysis:

Task 0:
  Total rows: 656
  Unique folds: 24
  Fold range: 0 to 23
  Rows per fold: min=5, max=59
    Fold 0: rows 0 to 36, count=37
    Fold 1: rows 0 to 36, count=37
    Fold 2: rows 0 to 57, count=58

Task 1:
  Total rows: 1227
  Unique folds: 13
  Fold range: 0 to 12
  Rows per fold: min=34, max=127
    Fold 0: rows 0 to 123, count=124
    Fold 1: rows 0 to 124, count=125
    Fold 2: rows 0 to 123, count=124


In [5]:
# Load the actual data to compare
DATA_PATH = '/home/data'

# Load single solvent data
single_df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
print('Single Solvent Data:')
print(f'  Total rows: {len(single_df)}')
print(f'  Unique solvents: {single_df["SOLVENT NAME"].nunique()}')

# Load full data
full_df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
print('\nFull Data:')
print(f'  Total rows: {len(full_df)}')
print(f'  Unique solvent pairs: {full_df.groupby(["SOLVENT A NAME", "SOLVENT B NAME"]).ngroups}')

Single Solvent Data:
  Total rows: 656
  Unique solvents: 24

Full Data:
  Total rows: 1227
  Unique solvent pairs: 13


In [6]:
# Analysis Summary
print('Analysis Summary:')
print('=' * 50)
print()
print('1. Submission format appears correct:')
print('   - 1883 rows (656 single + 1227 full)')
print('   - Correct columns: id, index, task, fold, row, target_1, target_2, target_3')
print('   - Task 0: 24 folds, Task 1: 13 folds')
print('   - All targets in [0, 1] range')
print('   - No NaN or Inf values')
print()
print('2. CV-LB Relationship:')
print(f'   - LB = {slope:.4f} * CV + {intercept:.4f} (R-squared = {r_value**2:.4f})')
print(f'   - Intercept ({intercept:.4f}) > Target (0.0347)')
print(f'   - This means even CV=0 would give LB={intercept:.4f}')
print()
print('3. Best CV achieved: 0.008092 (exp_050/051/053)')
print(f'   - Predicted LB: {slope * 0.008092 + intercept:.4f}')
print(f'   - Best actual LB: 0.0877 (exp_030)')
print()
print('4. Submission failures:')
print('   - exp_049, exp_050, exp_052, exp_053, exp_054 all failed')
print('   - Error: "Evaluation metric raised an unexpected error"')
print('   - The format looks correct, so this may be a Kaggle-side issue')
print('   - OR there is something subtle about the evaluation we are missing')

Analysis Summary:

1. Submission format appears correct:
   - 1883 rows (656 single + 1227 full)
   - Correct columns: id, index, task, fold, row, target_1, target_2, target_3
   - Task 0: 24 folds, Task 1: 13 folds
   - All targets in [0, 1] range
   - No NaN or Inf values

2. CV-LB Relationship:
   - LB = 4.2876 * CV + 0.0528 (R-squared = 0.9523)
   - Intercept (0.0528) > Target (0.0347)
   - This means even CV=0 would give LB=0.0528

3. Best CV achieved: 0.008092 (exp_050/051/053)
   - Predicted LB: 0.0875
   - Best actual LB: 0.0877 (exp_030)

4. Submission failures:
   - exp_049, exp_050, exp_052, exp_053, exp_054 all failed
   - Error: "Evaluation metric raised an unexpected error"
   - The format looks correct, so this may be a Kaggle-side issue
   - OR there is something subtle about the evaluation we are missing
