# Loop 102 Analysis: Submission Failure Investigation

**Issue**: exp_101 (Mixall Kernel with GroupKFold) failed with "Evaluation metric raised an unexpected error"

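**Root Cause Investigation** (hypotheses to check):
1. Negative predictions in submission.csv
2. SolventB% scaling difference (exp_101 divides by 100; the Mixall kernel does not)
3. If neither of the above explains it, the actual cause of the evaluation error still needs to be identified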

In [8]:
import pandas as pd
import numpy as np

# Read the failed submission from disk and preview its shape, columns and first rows
submission = pd.read_csv('/home/submission/submission.csv')
print(f"Submission shape: {submission.shape}")
print(f"\nColumn names: {list(submission.columns)}")
print("\nFirst few rows:")
print(submission.head(10))

Submission shape: (1883, 8)

Column names: ['id', 'index', 'task', 'fold', 'row', 'target_1', 'target_2', 'target_3']

First few rows:
   id  index  task  fold  row  target_1  target_2  target_3
0   0      0     0     0    0  0.013813  0.008498  0.866816
1   1      1     0     0    1  0.014356  0.008836  0.863481
2   2      2     0     0    2  0.321047  0.221328  0.051202
3   3      3     0     0    3  0.325469  0.223192  0.042331
4   4      4     0     0    4  0.322876  0.223271  0.045700
5   5      5     0     0    5  0.021825  0.026440  0.865749
6   6      6     0     0    6  0.021825  0.026440  0.865749
7   7      7     0     0    7  0.021825  0.026440  0.865749
8   8      8     0     0    8  0.021825  0.026440  0.865749
9   9      9     0     0    9 -0.005979 -0.004412  0.950159


In [9]:
# Count negative predictions per target column, with their share of all rows
n_rows = len(submission)
for col in ['target_1', 'target_2', 'target_3']:
    neg_count = (submission[col] < 0).sum()
    pct_negative = 100 * neg_count / n_rows
    print(f"{col}: {neg_count} negative values ({pct_negative:.2f}%)")

# A row is flagged when at least one of its three target values is below zero
any_negative = (submission[['target_1', 'target_2', 'target_3']] < 0).any(axis=1)
print(f"\nTotal rows with any negative: {any_negative.sum()}")

target_1: 56 negative values (2.97%)
target_2: 63 negative values (3.35%)
target_3: 19 negative values (1.01%)

Total rows with any negative: 92


In [10]:
# Count predictions above 1 per target column (also invalid for yields)
total_rows = len(submission)
for col in ['target_1', 'target_2', 'target_3']:
    exceeds_one = submission[col] > 1
    over_count = exceeds_one.sum()
    print(f"{col}: {over_count} values > 1 ({100 * over_count / total_rows:.2f}%)")

target_1: 0 values > 1 (0.00%)
target_2: 0 values > 1 (0.00%)
target_3: 3 values > 1 (0.16%)


In [11]:
# Summary statistics (count, mean, std, quantiles, min/max) of the three predicted targets
print("\nPrediction statistics:")
prediction_stats = submission[['target_1', 'target_2', 'target_3']].describe()
print(prediction_stats)


Prediction statistics:
          target_1     target_2     target_3
count  1883.000000  1883.000000  1883.000000
mean      0.146767     0.129071     0.514198
std       0.122193     0.107345     0.341795
min      -0.019779    -0.024991    -0.017362
25%       0.035375     0.033334     0.136847
50%       0.102743     0.095480     0.644871
75%       0.268811     0.225374     0.845212
max       0.407124     0.371803     1.004625


In [12]:
# Add the per-row total of the three predicted yields as a new column;
# a total above 1 is physically impossible
yield_total = submission['target_1'] + submission['target_2'] + submission['target_3']
submission['yield_sum'] = yield_total
print("\nYield sum statistics:")
print(submission['yield_sum'].describe())

over_unity = submission['yield_sum'] > 1
print(f"\nRows where yield_sum > 1: {over_unity.sum()}")


Yield sum statistics:
count    1883.000000
mean        0.790037
std         0.165299
min         0.158422
25%         0.712481
50%         0.830684
75%         0.909277
max         1.052307
Name: yield_sum, dtype: float64

Rows where yield_sum > 1: 31


In [None]:
# Analyze the CV-LB relationship from submission history
import json

# Read the persisted session state (JSON) from disk
with open('/home/code/session_state.json') as state_file:
    state = json.load(state_file)

# Pull out the submission history; fall back to an empty list when the key is absent
submissions = state.get('submissions', [])
print(f"Total submissions: {len(submissions)}")
print("\nSubmission history:")
for record in submissions:
    exp_id = record.get('experiment_id', 'N/A')
    cv_text = record.get('cv_score', 'N/A')
    lb_text = record.get('lb_score', 'N/A')
    print(f"  {exp_id}: CV={cv_text}, LB={lb_text}")

In [None]:
# Analyze CV-LB relationship
#
# Collect (CV, LB) pairs from the submission history and, when at least three
# pairs are available, fit LB = slope * CV + intercept to estimate the CV score
# needed to reach the target LB.
TARGET_LB = 0.0347

cv_scores = []
lb_scores = []
for s in submissions:
    cv = s.get('cv_score')
    lb = s.get('lb_score')
    if cv is None or lb is None or lb == 'pending':
        continue
    # Convert BOTH values before appending either one: appending the CV first
    # and then failing on the LB would leave the two lists misaligned.
    try:
        cv_val, lb_val = float(cv), float(lb)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric score (e.g. an error marker instead of a number): skip it.
        continue
    # NaN/inf scores cannot be used in the regression, so drop the pair.
    if not (np.isfinite(cv_val) and np.isfinite(lb_val)):
        continue
    cv_scores.append(cv_val)
    lb_scores.append(lb_val)

print(f"\nValid CV-LB pairs: {len(cv_scores)}")
if len(cv_scores) >= 3:
    from sklearn.linear_model import LinearRegression
    X = np.array(cv_scores).reshape(-1, 1)
    y = np.array(lb_scores)
    reg = LinearRegression().fit(X, y)
    r2 = reg.score(X, y)
    print(f"\nLinear fit: LB = {reg.coef_[0]:.4f} * CV + {reg.intercept_:.4f}")
    print(f"R² = {r2:.4f}")
    print(f"\nTarget LB: {TARGET_LB}")
    print(f"Intercept: {reg.intercept_:.4f}")
    if reg.coef_[0] != 0:
        print(f"Required CV for target: {(TARGET_LB - reg.intercept_) / reg.coef_[0]:.6f}")
    else:
        # A flat fit means LB does not vary with CV, so no CV value maps to the target.
        print("Required CV for target: undefined (fitted slope is 0)")

In [None]:
# Key insight: the submission likely failed because of negative predictions;
# the evaluation metric expects yields in the [0, 1] range.

divider = "=" * 60
# Rows in which at least one of the three target columns is below zero
negative_rows = (submission[['target_1', 'target_2', 'target_3']] < 0).any(axis=1).sum()

print("\n" + divider)
print("ROOT CAUSE ANALYSIS")
print(divider)
print("\n1. NEGATIVE PREDICTIONS:")
print(f"   - {negative_rows} rows have negative values")
print("   - This is physically impossible for yields")
print("   - The evaluation metric likely raises an error for invalid predictions")

print("\n2. FIX REQUIRED:")
print("   - Clip predictions to [0, 1] range")
print("   - Add: final_preds = np.clip(final_preds, 0, 1)")

print("\n3. ADDITIONAL ISSUE:")
print("   - SolventB% scaling: exp_101 divides by 100, mixall kernel doesn't")
print("   - This affects mixture interpolation")

In [None]:
# Find the experiment with the lowest recorded score (lower is treated as better)
rule = "=" * 60
print("\n" + rule)
print("BEST EXPERIMENTS SUMMARY")
print(rule)

experiments = state.get('experiments', [])
best_cv, best_exp = float('inf'), None
for candidate in experiments:
    candidate_score = candidate.get('score')
    if candidate_score is None:
        # Experiments without a score cannot be ranked
        continue
    if candidate_score < best_cv:
        best_cv, best_exp = candidate_score, candidate

if best_exp:
    print(f"\nBest CV: {best_cv:.6f}")
    print(f"Experiment: {best_exp.get('name', 'N/A')}")
    print(f"Model: {best_exp.get('model_type', 'N/A')}")

In [None]:
# Check what the best LB submission was
#
# Scans the submission history for the lowest numeric leaderboard score and
# reports its gap to the target LB.
TARGET_LB = 0.0347

print("\n" + "="*60)
print("BEST LB SUBMISSIONS")
print("="*60)

best_lb = float('inf')
best_lb_sub = None
for s in submissions:
    lb = s.get('lb_score')
    if lb is None or lb == 'pending':
        continue
    # Only conversion failures are expected here; catching them explicitly
    # (rather than a bare `except`) keeps interrupts and real bugs visible.
    try:
        lb_val = float(lb)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric LB entry: not comparable, skip it.
        continue
    if lb_val < best_lb:
        best_lb = lb_val
        best_lb_sub = s

if best_lb_sub:
    print(f"\nBest LB: {best_lb:.6f}")
    print(f"Experiment: {best_lb_sub.get('experiment_id', 'N/A')}")
    print(f"CV: {best_lb_sub.get('cv_score', 'N/A')}")
    print(f"\nTarget: {TARGET_LB}")
    print(f"Gap: {best_lb - TARGET_LB:.6f} ({100*(best_lb - TARGET_LB)/TARGET_LB:.1f}%)")

In [None]:
# Print the written summary of findings for the next experiment.
# NOTE(review): the fit coefficients quoted in the text below (4.3, 0.053) are
# literal text, not values computed in this cell — confirm they match the fit.
banner = "=" * 60
print("\n" + banner, "KEY FINDINGS FOR NEXT EXPERIMENT", banner, sep="\n")

print("""
1. SUBMISSION FAILURE ROOT CAUSE:
   - Negative predictions in submission.csv
   - Need to clip predictions to [0, 1] range

2. CV-LB RELATIONSHIP:
   - All tabular approaches fall on same line: LB ≈ 4.3 × CV + 0.053
   - Intercept (0.053) > Target (0.0347)
   - This means NO amount of CV improvement can reach target
   - Need to CHANGE the relationship, not just improve CV

3. WHAT TO TRY NEXT:
   a) Fix the submission by clipping predictions
   b) Try approaches that could change the CV-LB relationship:
      - Extrapolation detection + conservative predictions
      - Domain constraints (yields sum to ≤1)
      - Different representation (GNN, ChemBERTa)
   
4. WHAT NOT TO DO:
   - More tabular model variants (exhaustively tested)
   - Multi-seed optimization (too far from target)
   - Hyperparameter tuning (won't change intercept)
""")

In [None]:
# Report the recorded CV/LB of exp_073, whose LB (0.1451) was very different
divider = "=" * 60
print("\n" + divider)
print("OUTLIER ANALYSIS: exp_073")
print(divider)

for entry in submissions:
    if entry.get('experiment_id') != 'exp_073':
        continue
    print("\nexp_073:")
    print(f"  CV: {entry.get('cv_score')}")
    print(f"  LB: {entry.get('lb_score')}")
    print("  Notes: This had a much worse LB than expected from CV")
    print("  Likely cause: Model class mismatch or submission error")