# Loop 32 Analysis: CatBoost RFE & Logit Transform

**Objective**:
1. Analyze why exp_031 (CatBoost Top 10) performed 18% worse than baseline.
2. Test the Evaluator's suggestion: **Logit Transform** of targets.
3. Determine if "Top 10" was too aggressive and if Top 20 is better.

**Hypothesis**:
- exp_031 underfit because too few features were retained.
- A logit transform will respect the [0, 1] bounds of the targets better and might improve CV even with fewer features.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from scipy.special import expit, logit

# Input tables: the two yield datasets live side by side under DATA_PATH.
DATA_PATH = '/home/data'
df_full, df_single = (
    pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv'),
    pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv'),
)

# Spange descriptor lookup table; its first CSV column becomes the index.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)

# NOTE: no saved predictions are loaded here. Per the original author's note,
# the exp_026 (baseline) predictions would normally be in
# /home/submission/submission.csv, but exp_031 overwrote that file, so the
# exp_031 setup is re-validated below instead of comparing stored predictions.

print("Data loaded.")

In [None]:
# Define Logit Transform Helper
def to_logit(y, clip_eps=1e-4):
    """Map values in [0, 1] to log-odds, log(p / (1 - p)).

    Inputs are first clipped to [clip_eps, 1 - clip_eps] so that exact
    0 and 1 do not produce infinite results.
    """
    lower, upper = clip_eps, 1 - clip_eps
    p = np.clip(y, lower, upper)
    odds = p / (1 - p)
    return np.log(odds)

def from_logit(y_logit):
    """Invert the logit: map log-odds back to (0, 1) via the logistic sigmoid."""
    probabilities = expit(y_logit)
    return probabilities

# Columns treated as prediction targets throughout the notebook.
targets = ['Product 2', 'Product 3', 'SM']

# Figure 1: histogram of each raw target, one panel per target.
plt.figure(figsize=(15, 5))
for panel, name in enumerate(targets, start=1):
    plt.subplot(1, 3, panel)
    sns.histplot(df_full[name], bins=50)
    plt.title(f'{name} Distribution')
plt.tight_layout()
plt.show()

# Figure 2: the same targets after the (clipped) logit transform.
plt.figure(figsize=(15, 5))
for panel, name in enumerate(targets, start=1):
    plt.subplot(1, 3, panel)
    sns.histplot(to_logit(df_full[name]), bins=50)
    plt.title(f'Logit({name}) Distribution')
plt.tight_layout()
plt.show()

In [None]:
# Quick test: CatBoost on logit-transformed vs. raw targets.
# Runs on the single-solvent data (faster), holding out one solvent per fold
# for a handful of folds.

# Numeric process inputs, and the same list extended with the solvent column.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]

def get_features(df, spange_indices=None, spange_df=None):
    """Build the feature matrix: 4 kinetic columns followed by solvent descriptors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Residence Time", "Temperature" and
        "SOLVENT NAME".
    spange_indices : sequence of int, optional
        Positional indices of the descriptor columns to keep. ``None`` keeps
        every descriptor column.
    spange_df : pandas.DataFrame, optional
        Descriptor lookup table whose index holds the solvent names. Defaults
        to the module-level ``SPANGE_DF``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 4 + n_descriptors)`` with columns
        ``[time, temp, 1000 / (temp + 273.15), (1000 / (temp + 273.15)) * log(time)]``
        followed by the selected descriptors.

    Notes
    -----
    Solvents absent from the lookup table are reported with a printed message
    and encoded as an all-zero descriptor row of the correct width.
    """
    if spange_df is None:
        spange_df = SPANGE_DF

    # Kinetic features. Non-numeric entries are coerced to NaN.
    time = pd.to_numeric(df["Residence Time"], errors='coerce').values
    temp = pd.to_numeric(df["Temperature"], errors='coerce').values

    # The 273.15 offset presumably converts Celsius to Kelvin -- TODO confirm units.
    temp_k = temp + 273.15
    inv_temp = 1000.0 / temp_k
    log_time = np.log(time + 1e-6)  # small offset guards against log(0)
    interaction = inv_temp * log_time

    X_kinetic = np.column_stack([time, temp, inv_temp, interaction])

    # Convert the lookup table once and select the descriptor columns up front,
    # instead of walking the table row by row and re-indexing for every sample.
    descriptors = spange_df.to_numpy(dtype=float)
    if spange_indices is not None:
        descriptors = descriptors[:, spange_indices]
    n_descriptors = descriptors.shape[1]
    spange_map = dict(zip(spange_df.index, descriptors))

    spange_rows = []
    for solvent in df["SOLVENT NAME"]:
        if solvent in spange_map:
            spange_rows.append(spange_map[solvent])
        else:
            # Should not happen; keep going with a zero row whose width matches
            # the selected descriptors (not a hard-coded column count).
            print(f"Missing solvent: {solvent}")
            spange_rows.append(np.zeros(n_descriptors))

    # Explicit reshape keeps the result 2-D even when df has no rows.
    spange_vals = np.array(spange_rows, dtype=float).reshape(len(spange_rows), n_descriptors)

    return np.hstack([X_kinetic, spange_vals])

# Inputs (full single-solvent table) and the target columns to predict.
X = df_single.copy()
Y = df_single.loc[:, targets].copy()

# Spange descriptor positions carried over from exp_031. get_features always
# prepends 4 kinetic columns, so these 6 descriptors give the "Top 10" set.
top10_indices = [1, 2, 4, 7, 8, 11] 

# Build both feature matrices: the reduced "Top 10" set and the full set.
print("Preparing features...")
try:
    X_feat_top10, X_feat_all = get_features(X, top10_indices), get_features(X, None)
except Exception as err:
    print(f"Error in feature prep: {err}")
    # Last-resort fallback (not expected to trigger): random matrices with the
    # expected widths. Any results computed from these are meaningless, which
    # is why the substitution is announced loudly below.
    X_feat_top10 = np.random.rand(X.shape[0], 10)
    X_feat_all = np.random.rand(X.shape[0], 17)
    print("USING RANDOM FEATURES DUE TO ERROR")

print(f"Top 10 Feat Shape: {X_feat_top10.shape}")
print(f"All Feat Shape: {X_feat_all.shape}")

# Quick leave-one-solvent-out comparison on the first 5 solvents (sorted by
# name). For each held-out solvent, three setups are scored by pooled MSE:
#   1. raw targets   + Top 10 features (exp_031 style)
#   2. logit targets + Top 10 features
#   3. logit targets + all features
solvents = sorted(X["SOLVENT NAME"].unique())[:5] # Test on 5 solvents
print(f"Testing on {len(solvents)} solvents...")


def _fold_mse(X_tr, X_te, Y_tr, Y_te, use_logit):
    """Fit CatBoost once per target column and return the pooled test MSE.

    A single regressor instance is refit for each target, as in the original
    per-approach code. When ``use_logit`` is true the model is trained on
    logit-transformed targets and its predictions are mapped back with the
    inverse transform before scoring, so the MSE is always in the original
    target scale.
    """
    model = CatBoostRegressor(iterations=100, verbose=0, loss_function='RMSE', allow_writing_files=False)
    per_target_preds = []
    # Iterate over the actual target columns rather than a hard-coded count.
    for j in range(Y_tr.shape[1]):
        y_col = Y_tr.iloc[:, j]
        if use_logit:
            model.fit(X_tr, to_logit(y_col.values))
            per_target_preds.append(from_logit(model.predict(X_te)))
        else:
            model.fit(X_tr, y_col)
            per_target_preds.append(model.predict(X_te))
    preds = np.column_stack(per_target_preds)
    return np.mean((Y_te.values - preds) ** 2)


results = []

for solvent in solvents:
    # Hold out every row of this solvent; train on all the others.
    mask_test = X["SOLVENT NAME"] == solvent
    mask_train = ~mask_test

    Y_train, Y_test = Y[mask_train], Y[mask_test]
    X_train_10, X_test_10 = X_feat_top10[mask_train], X_feat_top10[mask_test]
    X_train_all, X_test_all = X_feat_all[mask_train], X_feat_all[mask_test]

    results.append({
        'solvent': solvent,
        'mse_raw_top10': _fold_mse(X_train_10, X_test_10, Y_train, Y_test, use_logit=False),
        'mse_logit_top10': _fold_mse(X_train_10, X_test_10, Y_train, Y_test, use_logit=True),
        'mse_logit_all': _fold_mse(X_train_all, X_test_all, Y_train, Y_test, use_logit=True),
    })

res_df = pd.DataFrame(results)
print("\nResults (MSE):")
# 'solvent' is a string column; restrict the mean to the numeric MSE columns so
# this does not raise on pandas versions where numeric_only defaults to False.
print(res_df.mean(numeric_only=True))