# Experiment 103: Mixall Kernel with SolventB% Bug Fix

**Goal**: Fix the critical SolventB% scaling bug from exp_102.

**Bug in exp_102**:
```python
sb_pct = X['SolventB%'].values.reshape(-1, 1) / 100.0  # BUG!
```

**The Problem**:
- SolventB% is ALREADY in [0, 1] range (verified: min=0.0, max=1.0)
- Dividing by 100 makes a 50% mixture (0.5) become 0.005
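- This shrinks the interpolation weight 100-fold, so every mixture is featurized as almost pure Solvent A, which completely breaks mixture predictions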

**Fix**:
```python
sb_pct = X['SolventB%'].values.reshape(-1, 1)  # Already in [0, 1]
```

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Data loading functions
# Directory read by load_data (yield CSVs) and load_features (*_lookup.csv tables).
DATA_PATH = '/home/data'

# Process-condition columns only (not referenced elsewhere in the visible notebook).
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns selected by load_data for the single-solvent CSV.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns selected by load_data for the full (solvent-mixture) CSV.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the two yield CSVs and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the mixture dataset; any other value selects
            the single-solvent dataset.

    Returns:
        A ``(X, Y)`` pair of DataFrames: ``X`` holds the input columns for the
        chosen dataset, ``Y`` holds the "Product 2", "Product 3" and "SM" columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    table = pd.read_csv(csv_path)
    inputs = table[input_columns]
    targets = table[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def load_features(name):
    """Load ``<DATA_PATH>/<name>_lookup.csv``, using its first column as the index."""
    lookup_csv = f'{DATA_PATH}/{name}_lookup.csv'
    lookup_table = pd.read_csv(lookup_csv, index_col=0)
    return lookup_table

print('Data loading functions defined')

In [None]:
# GroupKFold validation (5 splits)
from sklearn.model_selection import GroupKFold

def generate_leave_one_out_splits(X, Y):
    """Yield grouped train/test folds for the single-solvent data.

    Despite the name, this is a GroupKFold split (at most 5 folds) keyed on
    ``X["SOLVENT NAME"]``, so all rows of one solvent land in the same fold.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` for each fold.
    """
    solvent_groups = X["SOLVENT NAME"]
    # Cap the fold count at the number of distinct solvents.
    fold_count = min(5, solvent_groups.nunique(dropna=False))
    splitter = GroupKFold(n_splits=fold_count)

    for fit_rows, held_out_rows in splitter.split(X, Y, solvent_groups):
        train_part = (X.iloc[fit_rows], Y.iloc[fit_rows])
        test_part = (X.iloc[held_out_rows], Y.iloc[held_out_rows])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield grouped train/test folds for the solvent-mixture data.

    Rows are grouped by the string ``"<SOLVENT A NAME>_<SOLVENT B NAME>"`` and
    split with GroupKFold (at most 5 folds). The key is order-sensitive, so
    "A_B" and "B_A" count as different groups.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` for each fold.
    """
    first_solvent = X["SOLVENT A NAME"].astype(str)
    ramp_ids = first_solvent + "_" + X["SOLVENT B NAME"].astype(str)

    # Cap the fold count at the number of distinct solvent pairs.
    fold_count = min(5, ramp_ids.nunique(dropna=False))
    splitter = GroupKFold(n_splits=fold_count)

    for fit_rows, held_out_rows in splitter.split(X, Y, ramp_ids):
        train_part = (X.iloc[fit_rows], Y.iloc[fit_rows])
        test_part = (X.iloc[held_out_rows], Y.iloc[held_out_rows])
        yield train_part, test_part

print('GroupKFold validation functions defined (5 splits)')

In [None]:
# Base classes
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Interface for objects that turn an input table into model features.

    The base class is not usable directly: both the constructor and
    ``featurize`` raise ``NotImplementedError``, so subclasses must override
    both.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """Convert the input table ``X`` into features.

        The signature matches the subclasses in this notebook, which all
        implement ``featurize(self, X)``.

        Raises:
            NotImplementedError: always; subclasses provide the implementation.
        """
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal train/predict interface shared by the models in this notebook."""

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets.

        Raises:
            NotImplementedError: always; subclasses provide the implementation.
        """
        raise NotImplementedError

    def predict(self, test_X=None):
        """Return predictions for ``test_X``.

        The parameter mirrors the concrete ``predict(self, test_X)``
        implementation; it defaults to ``None`` so existing zero-argument
        calls keep raising ``NotImplementedError`` rather than ``TypeError``.

        Raises:
            NotImplementedError: always; subclasses provide the implementation.
        """
        raise NotImplementedError

print('Base classes defined')

In [None]:
# Featurizers with BUG FIX
class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows backed by a descriptor lookup table."""

    def __init__(self, features='spange_descriptors'):
        """Load the lookup table called ``features`` via ``load_features``."""
        lookup = load_features(features)
        self.features = lookup
        # Two process columns (time, temperature) sit in front of the descriptors.
        self.feats_dim = 2 + lookup.shape[1]

    def featurize(self, X):
        """Build ``[Residence Time, Temperature, descriptors...]`` per row.

        Descriptors are looked up by ``X['SOLVENT NAME']`` in the table index.

        Returns:
            A float32 ``torch.Tensor`` with one row per row of ``X``.
        """
        descriptors = self.features.loc[X['SOLVENT NAME']].values
        # column_stack turns the two 1-D process columns into (n, 1) columns.
        stacked = np.column_stack([
            X['Residence Time'].values,
            X['Temperature'].values,
            descriptors,
        ])
        return torch.tensor(stacked, dtype=torch.float32)

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for two-solvent mixtures backed by a descriptor lookup table."""

    def __init__(self, features='spange_descriptors'):
        """Load the lookup table called ``features`` via ``load_features``."""
        lookup = load_features(features)
        self.features = lookup
        # Three leading columns: time, temperature and the solvent-B fraction.
        self.feats_dim = 3 + lookup.shape[1]

    def featurize(self, X):
        """Build ``[Residence Time, Temperature, SolventB%, blended descriptors...]``.

        The blended descriptors are the linear interpolation
        ``(1 - b) * desc_A + b * desc_B`` with ``b = X['SolventB%']``.

        Returns:
            A float32 ``torch.Tensor`` with one row per row of ``X``.
        """
        # Used as-is: per this notebook's data check the column is already a
        # fraction in [0, 1], so it must NOT be divided by 100.
        frac_b = X['SolventB%'].values.reshape(-1, 1)

        desc_a = self.features.loc[X['SOLVENT A NAME']].values
        desc_b = self.features.loc[X['SOLVENT B NAME']].values
        blended = (1 - frac_b) * desc_a + frac_b * desc_b

        stacked = np.column_stack([
            X['Residence Time'].values,
            X['Temperature'].values,
            frac_b,
            blended,
        ])
        return torch.tensor(stacked, dtype=torch.float32)

print('Featurizers defined with BUG FIX')
print('SolventB% is now used directly (already in [0, 1] range)')

In [None]:
# Enhanced MLP
class EnhancedMLP(nn.Module):
    """Feed-forward regressor: repeated Linear -> BatchNorm1d -> ReLU -> Dropout
    stages followed by a final Linear output layer.

    Args:
        input_dim: number of input features.
        output_dim: number of regression outputs.
        hidden_dims: width of each hidden stage, in order.
        dropout: dropout probability used after every hidden stage.
    """

    def __init__(self, input_dim, output_dim=3, hidden_dims=[128, 64, 32], dropout=0.2):
        super().__init__()
        # Consecutive pairs of this list are the (in, out) sizes of each stage.
        widths = [input_dim, *hidden_dims]
        stages = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        stages.append(nn.Linear(widths[-1], output_dim))
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the raw outputs."""
        return self.net(x)

print('EnhancedMLP defined')

In [None]:
# EnsembleModel with clipping and renormalization
class EnsembleModel(BaseModel):
    """Weighted blend of an MLP, XGBoost, a random forest and LightGBM.

    All four members are trained on the same standardized features produced by
    the featurizer chosen from ``data``. Blended predictions are clipped to
    [0, 1] and rows whose outputs sum to more than 1 are scaled down to sum to 1.

    Args:
        data: ``'single'`` selects ``PrecomputedFeaturizer``; any other value
            selects ``PrecomputedFeaturizerMixed``.
        hidden_dims: hidden layer widths passed to ``EnhancedMLP``.
        dropout: dropout probability passed to ``EnhancedMLP``.
        use_tta: stored on the instance; not read anywhere in this class.
        weights: blend weights in the order MLP, XGBoost, random forest, LightGBM.
    """

    def __init__(self, data='single', hidden_dims=[128, 64, 32], dropout=0.2, use_tta=False, 
                 weights=[0.25, 0.25, 0.25, 0.25]):
        self.data = data
        self.use_tta = use_tta
        self.weights = weights

        featurizer_cls = PrecomputedFeaturizer if data == 'single' else PrecomputedFeaturizerMixed
        self.smiles_featurizer = featurizer_cls()

        self.scaler = StandardScaler()
        self.mlp = EnhancedMLP(
            self.smiles_featurizer.feats_dim,
            hidden_dims=hidden_dims,
            dropout=dropout,
        )

        # Hyper-parameters of the three tree-based members.
        self.xgb_params = {
            'n_estimators': 100,
            'max_depth': 5,
            'learning_rate': 0.1,
            'random_state': 42,
        }
        self.rf_params = {
            'n_estimators': 100,
            'max_depth': 10,
            'random_state': 42,
        }
        self.lgb_params = {
            'n_estimators': 100,
            'num_leaves': 31,
            'learning_rate': 0.1,
            'random_state': 42,
            'verbose': -1,
        }

        # Each base regressor is single-output, so wrap it to fit one per target.
        self.xgb = MultiOutputRegressor(xgb.XGBRegressor(**self.xgb_params))
        self.rf = MultiOutputRegressor(RandomForestRegressor(**self.rf_params))
        self.lgbm = MultiOutputRegressor(lgb.LGBMRegressor(**self.lgb_params))

    @staticmethod
    def _as_named_frame(matrix):
        """Wrap a 2-D array in a DataFrame whose columns are "0", "1", ...

        String column names keep LightGBM happy and stay identical between
        fit and predict.
        """
        column_names = list(map(str, range(matrix.shape[1])))
        return pd.DataFrame(matrix, columns=column_names)

    def train_model(self, train_X, train_Y, num_epochs=100, batch_size=32, lr=0.001, 
                    optimizer=torch.optim.Adam, criterion=nn.MSELoss, device=None, verbose=False):
        """Fit the scaler, the three tree ensembles and the MLP.

        Args:
            train_X: input table understood by the featurizer.
            train_Y: target table; its ``.values`` are used as the regression targets.
            num_epochs: number of passes over the training data for the MLP.
            batch_size: MLP mini-batch size.
            lr: learning rate handed to ``optimizer``.
            optimizer: optimizer class, called as ``optimizer(params, lr=lr)``.
            criterion: loss class, instantiated with no arguments.
            device: torch device for the MLP; CUDA if available, else CPU, when ``None``.
            verbose: accepted but not used by this method.
        """
        raw_features = self.smiles_featurizer.featurize(train_X).numpy()
        targets_np = train_Y.values

        scaled = self.scaler.fit_transform(raw_features)
        scaled_frame = self._as_named_frame(scaled)

        # Tree-based members, fitted in the same order as before.
        for tree_member in (self.xgb, self.rf, self.lgbm):
            tree_member.fit(scaled_frame, targets_np)

        # MLP member.
        dataset = TensorDataset(
            torch.tensor(scaled, dtype=torch.float32),
            torch.tensor(targets_np, dtype=torch.float32),
        )

        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.mlp.to(device)

        # The optimizer is created after the move so it tracks the on-device parameters.
        opt = optimizer(self.mlp.parameters(), lr=lr)
        # drop_last=True discards the final partial batch of every epoch.
        batches = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        loss_fn = criterion()

        for _ in range(num_epochs):
            self.mlp.train()
            for batch_inputs, batch_targets in batches:
                batch_inputs = batch_inputs.to(device)
                batch_targets = batch_targets.to(device)
                opt.zero_grad()
                batch_loss = loss_fn(self.mlp(batch_inputs), batch_targets)
                batch_loss.backward()
                opt.step()

    def predict(self, test_X):
        """Return the blended, clipped and renormalized predictions.

        Args:
            test_X: input table understood by the featurizer.

        Returns:
            A ``torch.Tensor`` built from the final NumPy prediction array,
            one row per row of ``test_X``.
        """
        raw_features = self.smiles_featurizer.featurize(test_X).numpy()
        scaled = self.scaler.transform(raw_features)
        scaled_frame = self._as_named_frame(scaled)

        # MLP member: evaluate on whatever device its parameters live on.
        self.mlp.eval()
        mlp_device = next(self.mlp.parameters()).device
        with torch.no_grad():
            mlp_inputs = torch.tensor(scaled, dtype=torch.float32).to(mlp_device)
            mlp_preds = self.mlp(mlp_inputs).cpu().numpy()

        # Member predictions in the same order as self.weights.
        member_preds = (
            mlp_preds,
            self.xgb.predict(scaled_frame),
            self.rf.predict(scaled_frame),
            self.lgbm.predict(scaled_frame),
        )

        # Weighted sum, accumulated left to right.
        weighted_terms = [self.weights[i] * preds for i, preds in enumerate(member_preds)]
        blended = weighted_terms[0]
        for term in weighted_terms[1:]:
            blended = blended + term

        # Keep every output inside [0, 1].
        blended = np.clip(blended, 0.0, 1.0)

        # Mass balance: only rows summing to more than 1 are scaled down.
        row_totals = blended.sum(axis=1, keepdims=True)
        blended = blended / np.maximum(row_totals, 1.0)

        return torch.tensor(blended)

print('EnsembleModel defined with clipping and renormalization')

In [None]:
# Run CV to compute local score
import tqdm

def _mean_fold_mse(X, Y, split_fn, data, fold_label, summary_label, verbose):
    """Train a fresh EnsembleModel per fold from ``split_fn`` and return the mean fold MSE."""
    fold_mses = []
    for fold_idx, split in enumerate(split_fn(X, Y)):
        (train_X, train_Y), (test_X, test_Y) = split

        # A new model per fold so nothing learned on one fold leaks into the next.
        model = EnsembleModel(data=data)
        model.train_model(train_X, train_Y)

        predictions_np = model.predict(test_X).detach().cpu().numpy()

        # MSE averaged over all rows and all target columns of the fold.
        mse = np.mean((predictions_np - test_Y.values) ** 2)
        fold_mses.append(mse)
        if verbose:
            print(f"{fold_label} Fold {fold_idx}: MSE = {mse:.6f}")

    mean_mse = np.mean(fold_mses)
    if verbose:
        print(f"\n{summary_label} CV MSE (5-fold): {mean_mse:.6f}")
    return mean_mse


def compute_cv_score(verbose=True):
    """Compute the local CV score with GroupKFold (at most 5 splits per task).

    The single-solvent task is evaluated first, then the full (mixture) task;
    the combined score is the unweighted mean of the two task means.

    Args:
        verbose: when true, print per-fold, per-task and combined MSE values.

    Returns:
        ``(single_cv, full_cv, combined_cv)``.
    """
    # Single solvent CV
    X_single, Y_single = load_data("single_solvent")
    single_cv = _mean_fold_mse(
        X_single, Y_single, generate_leave_one_out_splits,
        data='single', fold_label="Single", summary_label="Single Solvent",
        verbose=verbose,
    )

    # Full data CV
    X_full, Y_full = load_data("full")
    full_cv = _mean_fold_mse(
        X_full, Y_full, generate_leave_one_ramp_out_splits,
        data='full', fold_label="Full", summary_label="Full Data",
        verbose=verbose,
    )

    combined_cv = (single_cv + full_cv) / 2
    if verbose:
        print(f"\n=== Combined CV MSE: {combined_cv:.6f} ===")

    return single_cv, full_cv, combined_cv

# Runs both CV tasks when the cell executes; the three scores stay at module
# level because the results and submission cells below read them.
print("Running CV with FIXED SolventB% handling...")
single_cv, full_cv, combined_cv = compute_cv_score()

In [None]:
# Save results
import json
import os

# Combined CV reported for exp_102 (the run with the SolventB% bug); baseline
# for the improvement figure printed below.
EXP_102_CV = 0.013542
METRICS_PATH = '/home/code/experiments/103_mixall_fixed/metrics.json'

results = {
    'cv_score': float(combined_cv),
    'single_cv': float(single_cv),
    'full_cv': float(full_cv),
    'model': 'EnsembleModel (MLP + XGB + RF + LGB) with FIXED SolventB%',
    'validation': 'GroupKFold (5 splits)',
    'fix': 'Removed / 100.0 from SolventB% - it is already in [0, 1] range'
}

# Create the experiment folder first so open() does not fail on a fresh tree
# after the whole CV run has already been paid for.
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
with open(METRICS_PATH, 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved")
print(f"Combined CV (with fix): {combined_cv:.6f}")
print(f"exp_102 CV (with bug): {EXP_102_CV:.6f}")
print(f"Improvement: {(EXP_102_CV - combined_cv) / EXP_102_CV * 100:.2f}%")

## Generate Submission

The following cells follow the official template structure.

**CRITICAL**: The model class in submission cells MUST match the CV computation class (`EnsembleModel`).

In [None]:
# Generate submission
print(f"CV (with fix): {combined_cv:.6f}")
print("Generating submission...")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

# Task 0: single-solvent data, split with the same generator compute_cv_score uses.
X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "row" is the position inside this fold's test set, not the dataframe index;
    # target_1..3 follow the Y column order from load_data (Product 2, Product 3, SM).
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Task 1: full (solvent-mixture) data, split with the same generator compute_cv_score uses.
X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "row" is the position inside this fold's test set, not the dataframe index;
    # target_1..3 follow the Y column order from load_data (Product 2, Product 3, SM).
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Stack task 0 on top of task 1. reset_index() keeps the old per-task row
# numbers as an "index" column and gives the frame a fresh 0..N-1 index,
# which is then named "id" and written out with the CSV.
submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to standard location
import shutil
import os
os.makedirs('/home/submission', exist_ok=True)
shutil.copy("submission.csv", "/home/submission/submission.csv")
print("Submission saved!")
print(f"Shape: {submission.shape}")
print(submission.head())

# Verify predictions are valid
# Minima below 0 or a max row sum above 1 would mean the clipping and
# renormalization in EnsembleModel.predict did not take effect.
print(f"\nMin target_1: {submission['target_1'].min():.6f}")
print(f"Min target_2: {submission['target_2'].min():.6f}")
print(f"Min target_3: {submission['target_3'].min():.6f}")
print(f"Max sum: {(submission['target_1'] + submission['target_2'] + submission['target_3']).max():.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################