# Experiment 078: MixAll Kernel Approach with GroupKFold(5)

**Rationale**: The mixall kernel uses GroupKFold(5) instead of Leave-One-Out CV. This is a fundamentally different CV scheme, so it may have a different CV-LB relationship. The key insight is that the intercept of our current CV-LB relationship (0.052) exceeds the target (0.0347): under that fit, even a perfect CV score of 0 would map to an LB score of about 0.052, so improving CV alone cannot reach the target. We therefore need to try approaches that might change the relationship itself.

**Key Changes**:
1. Override `generate_leave_one_out_splits` to use GroupKFold(5)
2. Override `generate_leave_one_ramp_out_splits` to use GroupKFold(5)
3. Use ensemble: MLP + XGBoost + RF + LightGBM

In [1]:
import sys
sys.path.append('/home/data/')

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Define local load functions
def load_data(data_type):
    """Load one of the local yield tables and split it into inputs/targets.

    Args:
        data_type: ``"single_solvent"`` for the single-solvent table or
            ``"full"`` for the solvent-mixture table.

    Returns:
        A ``(X, Y)`` tuple of DataFrames. ``X`` holds the process conditions
        and solvent name column(s); ``Y`` holds the ``SM``, ``Product 2`` and
        ``Product 3`` columns.

    Raises:
        ValueError: If ``data_type`` is not one of the two supported names.
    """
    target_cols = ['SM', 'Product 2', 'Product 3']

    if data_type == "single_solvent":
        csv_path = '/home/data/catechol_single_solvent_yields.csv'
        input_cols = ['Residence Time', 'Temperature', 'SOLVENT NAME']
    elif data_type == "full":
        csv_path = '/home/data/catechol_full_data_yields.csv'
        input_cols = ['Residence Time', 'Temperature', 'SOLVENT A NAME',
                      'SOLVENT B NAME', 'SolventB%']
    else:
        # Fail loudly here instead of hitting an UnboundLocalError on return.
        raise ValueError(
            f"Unknown data_type {data_type!r}; "
            "expected 'single_solvent' or 'full'"
        )

    df = pd.read_csv(csv_path)
    return df[input_cols], df[target_cols]

def load_features(feature_type):
    """Load a precomputed solvent feature lookup table.

    Args:
        feature_type: One of ``'spange_descriptors'``, ``'drfps'``,
            ``'fragprints'`` or ``'acs_pca_descriptors'``.

    Returns:
        A DataFrame read from the matching CSV, using the file's first
        column as the index.

    Raises:
        ValueError: If ``feature_type`` is not a supported name. (An unknown
            name used to return ``None`` silently and only fail later.)
    """
    lookup_files = {
        'spange_descriptors': '/home/data/spange_descriptors_lookup.csv',
        'drfps': '/home/data/drfps_catechol_lookup.csv',
        'fragprints': '/home/data/fragprints_lookup.csv',
        'acs_pca_descriptors': '/home/data/acs_pca_descriptors_lookup.csv',
    }
    try:
        csv_path = lookup_files[feature_type]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown feature_type {feature_type!r}; "
            f"expected one of {sorted(lookup_files)}"
        ) from None
    return pd.read_csv(csv_path, index_col=0)

# Progress marker: printed once the imports and loader definitions above have run.
print('Imports and local data functions done')

Imports done


In [2]:
# Override the CV split functions to use GroupKFold(5) instead of Leave-One-Out
from typing import Any, Generator

def generate_leave_one_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield grouped K-fold train/test splits keyed on the solvent name.

    Despite the name (kept so existing callers keep working), this uses
    ``GroupKFold`` with up to five folds rather than leaving one solvent out
    at a time. Rows sharing a ``"SOLVENT NAME"`` always land in the same fold.

    Args:
        X: Input frame; must contain a ``"SOLVENT NAME"`` column.
        Y: Target frame, row-aligned with ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.
    """
    solvent_labels = X["SOLVENT NAME"]
    # Never ask for more folds than there are distinct solvents.
    fold_count = min(5, len(solvent_labels.unique()))
    splitter = GroupKFold(n_splits=fold_count)

    for fit_rows, held_out_rows in splitter.split(X, Y, solvent_labels):
        train_part = (X.iloc[fit_rows], Y.iloc[fit_rows])
        test_part = (X.iloc[held_out_rows], Y.iloc[held_out_rows])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield grouped K-fold train/test splits keyed on the solvent pair.

    Despite the name (kept so existing callers keep working), this uses
    ``GroupKFold`` with up to five folds. A group is the ordered pair
    ``"<SOLVENT A NAME>_<SOLVENT B NAME>"``, so all rows of one pair stay in
    the same fold.

    Args:
        X: Input frame; must contain ``"SOLVENT A NAME"`` and
            ``"SOLVENT B NAME"`` columns.
        Y: Target frame, row-aligned with ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.
    """
    solvent_a = X["SOLVENT A NAME"].astype(str)
    solvent_b = X["SOLVENT B NAME"].astype(str)
    ramp_labels = solvent_a + "_" + solvent_b

    # Never ask for more folds than there are distinct solvent pairs.
    fold_count = min(5, len(ramp_labels.unique()))
    splitter = GroupKFold(n_splits=fold_count)

    for fit_rows, held_out_rows in splitter.split(X, Y, ramp_labels):
        train_part = (X.iloc[fit_rows], Y.iloc[fit_rows])
        test_part = (X.iloc[held_out_rows], Y.iloc[held_out_rows])
        yield train_part, test_part

# Progress marker: printed once both split generators above are defined.
print('GroupKFold(5) CV functions defined')

GroupKFold(5) CV functions defined


In [3]:
# Featurizers
class PrecomputedFeaturizer:
    """Build model inputs for single-solvent rows from a feature lookup table.

    Each output row is ``[Residence Time, Temperature, *solvent features]``.
    """

    def __init__(self, features='spange_descriptors'):
        """Load the lookup table named by ``features`` via ``load_features``."""
        self.features = load_features(features)
        # Two process variables (time, temperature) precede the descriptors.
        self.feats_dim = self.features.shape[1] + 2  # +2 for Time, Temp

    def featurize(self, X):
        """Return a float32 tensor with one feature row per row of ``X``.

        ``X`` must provide ``'Residence Time'``, ``'Temperature'`` and
        ``'SOLVENT NAME'``; every solvent name must be a label in the
        lookup table's index.
        """
        process_columns = [
            X[column].values.reshape(-1, 1)
            for column in ('Residence Time', 'Temperature')
        ]
        # Label-based lookup: one descriptor row per sample, in sample order.
        descriptor_rows = self.features.loc[X['SOLVENT NAME']].values
        stacked = np.hstack(process_columns + [descriptor_rows])
        return torch.tensor(stacked, dtype=torch.float32)

class PrecomputedFeaturizerMixed:
    """Build model inputs for two-solvent mixture rows.

    Each output row is ``[Residence Time, Temperature, fraction of B,
    *mixed features]``, where the mixed features are a linear blend of the
    two solvents' lookup rows weighted by the fraction of solvent B.
    """

    def __init__(self, features='spange_descriptors'):
        """Load the lookup table named by ``features`` via ``load_features``."""
        self.features = load_features(features)
        # Three process variables precede the blended descriptors.
        self.feats_dim = self.features.shape[1] + 3  # +3 for Time, Temp, %B

    def featurize(self, X):
        """Return a float32 tensor with one feature row per row of ``X``.

        ``X`` must provide ``'Residence Time'``, ``'Temperature'``,
        ``'SolventB%'``, ``'SOLVENT A NAME'`` and ``'SOLVENT B NAME'``.
        ``'SolventB%'`` is divided by 100 before use.
        """
        def as_column(name):
            return X[name].values.reshape(-1, 1)

        # Percentage -> fraction; the (n, 1) shape broadcasts over descriptors.
        sb_pct = as_column('SolventB%') / 100.0

        desc_a = self.features.loc[X['SOLVENT A NAME']].values
        desc_b = self.features.loc[X['SOLVENT B NAME']].values
        blended = (1 - sb_pct) * desc_a + sb_pct * desc_b

        columns = [as_column('Residence Time'), as_column('Temperature'),
                   sb_pct, blended]
        return torch.tensor(np.hstack(columns), dtype=torch.float32)

# Progress marker: printed once both featurizer classes above are defined.
print('Featurizers defined')

Featurizers defined


In [4]:
# MLP Model
class EnhancedMLP(nn.Module):
    """Feed-forward regressor with sigmoid-bounded outputs.

    Every hidden stage is ``Linear -> BatchNorm1d -> ReLU -> Dropout``; the
    head is ``Linear -> Sigmoid``, so each output lies in [0, 1].

    Args:
        input_dim: Number of input features.
        output_dim: Number of regression targets.
        hidden_dims: Width of each hidden stage, in order.
        dropout: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, output_dim=3, hidden_dims=[128, 64, 32], dropout=0.2):
        super().__init__()
        # widths[i] -> widths[i + 1] gives the in/out size of hidden stage i.
        widths = [input_dim, *hidden_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        stack += [nn.Linear(widths[-1], output_dim), nn.Sigmoid()]  # Outputs in [0,1]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the whole stack and return the predictions."""
        return self.network(x)

# Progress marker: printed once the MLP class above is defined.
print('MLP defined')

MLP defined


In [5]:
# Ensemble Model (MLP + XGBoost + RF + LightGBM)
from sklearn.multioutput import MultiOutputRegressor

class EnsembleModel:
    """Weighted blend of an MLP, XGBoost, a random forest and LightGBM.

    All four models are trained on the same standardized features, and
    their predictions are combined as a weighted sum.

    Args:
        data: ``'single'`` selects the single-solvent featurizer; any other
            value selects the mixture featurizer.
        hidden_dims: Hidden-layer widths for the MLP.
        dropout: Dropout probability for the MLP.
        weights: Four blend weights in the order
            ``(MLP, XGBoost, random forest, LightGBM)``.
    """

    def __init__(self, data='single', hidden_dims=(128, 64, 32), dropout=0.2,
                 weights=(0.25, 0.25, 0.25, 0.25)):
        self.data = data
        # Store a private copy so instances never alias the default (or a
        # caller-owned) sequence; mutating one model's weights must not
        # leak into another model.
        self.weights = list(weights)

        if data == 'single':
            self.smiles_featurizer = PrecomputedFeaturizer('spange_descriptors')
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed('spange_descriptors')

        self.scaler = StandardScaler()
        self.mlp = EnhancedMLP(self.smiles_featurizer.feats_dim, 3,
                               list(hidden_dims), dropout)

        # XGBoost template; wrapped in MultiOutputRegressor at training time.
        self.xgb_base = xgb.XGBRegressor(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1
        )

        # Random forest keyword arguments; the estimator is built at training time.
        self.rf_params = {
            'n_estimators': 100,
            'max_depth': 10,
            'random_state': 42,
            'n_jobs': -1
        }

        # LightGBM template; wrapped in MultiOutputRegressor at training time.
        self.lgb_base = lgb.LGBMRegressor(
            n_estimators=100,
            num_leaves=31,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )

    def train_model(self, train_X, train_Y, num_epochs=100, lr=1e-3, batch_size=32, verbose=False):
        """Fit the scaler and all four models on the given training split.

        Args:
            train_X: Raw input frame accepted by the configured featurizer.
            train_Y: Target frame, row-aligned with ``train_X``.
            num_epochs: Number of passes over the data for the MLP.
            lr: Adam learning rate for the MLP.
            batch_size: Mini-batch size for the MLP.
            verbose: Currently unused; kept for interface compatibility.
        """
        # Featurize, then standardize using statistics from this split only.
        X_np = self.smiles_featurizer.featurize(train_X).numpy()
        train_Y_np = train_Y.values
        X_scaled = self.scaler.fit_transform(X_np)

        # Tree models: one regressor per target for XGBoost and LightGBM;
        # the random forest is fitted on all target columns at once.
        self.xgb = MultiOutputRegressor(self.xgb_base)
        self.xgb.fit(X_scaled, train_Y_np)

        self.rf = RandomForestRegressor(**self.rf_params)
        self.rf.fit(X_scaled, train_Y_np)

        self.lgbm = MultiOutputRegressor(self.lgb_base)
        self.lgbm.fit(X_scaled, train_Y_np)

        # MLP
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.mlp.to(device)

        dataset = TensorDataset(
            torch.tensor(X_scaled, dtype=torch.float32),
            torch.tensor(train_Y_np, dtype=torch.float32),
        )

        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=lr)
        criterion = nn.MSELoss()

        # A ragged final batch is dropped so BatchNorm never sees a batch of
        # size 1. When the whole split is smaller than one batch, dropping
        # would leave no batches at all and the MLP would silently stay
        # untrained, so in that case keep the single short batch (as long as
        # it has at least two rows).
        n_samples = len(dataset)
        drop_last = not (2 <= n_samples < batch_size)
        train_loader = DataLoader(dataset, batch_size=batch_size,
                                  shuffle=True, drop_last=drop_last)

        for _ in range(num_epochs):
            self.mlp.train()
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                loss = criterion(self.mlp(inputs), targets)
                loss.backward()
                optimizer.step()

    def predict(self, test_X):
        """Return the weighted ensemble prediction for ``test_X``.

        Must be called after ``train_model``. The tree-model predictions are
        clipped to [0, 1] before blending; the blend itself is not clipped
        or renormalized.

        Args:
            test_X: Raw input frame accepted by the configured featurizer.

        Returns:
            A tensor with one row per row of ``test_X``.
        """
        X_np = self.smiles_featurizer.featurize(test_X).numpy()
        X_scaled = self.scaler.transform(X_np)

        # Run the MLP on whatever device its parameters live on, so inputs
        # and model always agree.
        device = next(self.mlp.parameters()).device
        self.mlp.eval()
        with torch.no_grad():
            X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            mlp_preds = self.mlp(X_tensor_scaled).cpu().numpy()

        # Tree-model predictions, clipped to [0, 1].
        xgb_preds = np.clip(self.xgb.predict(X_scaled), 0, 1)
        rf_preds = np.clip(self.rf.predict(X_scaled), 0, 1)
        lgb_preds = np.clip(self.lgbm.predict(X_scaled), 0, 1)

        # Weighted ensemble
        final_preds = (self.weights[0] * mlp_preds +
                       self.weights[1] * xgb_preds +
                       self.weights[2] * rf_preds +
                       self.weights[3] * lgb_preds)

        return torch.tensor(final_preds)

# Progress marker: printed once the ensemble class above is defined.
print('EnsembleModel defined')

EnsembleModel defined


In [None]:
# Run CV for single solvent data with GroupKFold(5):
# train a fresh ensemble per fold, score it on the held-out solvents, and
# collect the per-row predictions for the submission file.
import tqdm

X, Y = load_data("single_solvent")
print(f"Single solvent data: {len(X)} samples, {len(X['SOLVENT NAME'].unique())} solvents")

all_predictions = []
fold_mses = []
split_generator = generate_leave_one_out_splits(X, Y)

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in tqdm.tqdm(
    enumerate(split_generator), total=5
):
    print(f"\nFold {fold_idx}: Train {len(train_X)}, Test {len(test_X)}")
    print(f"  Test solvents: {test_X['SOLVENT NAME'].unique()[:3]}...")

    # A new model per fold so nothing carries over between folds.
    model = EnsembleModel(data='single')
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # Fold score: mean squared error over every row and target.
    squared_errors = (predictions_np - test_Y.values) ** 2
    fold_mse = np.mean(squared_errors)
    fold_mses.append(fold_mse)
    print(f"  Fold MSE: {fold_mse:.6f}")

    # "row" is the position within this fold's test set.
    all_predictions.extend(
        {
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(predictions_np)
    )

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"\nSingle solvent CV MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

In [None]:
# Run CV for full (mixture) data with GroupKFold(5):
# train a fresh ensemble per fold, score it on the held-out solvent pairs, and
# collect the per-row predictions for the submission file.
X, Y = load_data("full")
print(f"Full data: {len(X)} samples")

# Ramp groups, rebuilt here only to report how many distinct pairs exist.
groups = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
print(f"Number of unique ramps: {len(groups.unique())}")

all_predictions = []
fold_mses = []
split_generator = generate_leave_one_ramp_out_splits(X, Y)

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in tqdm.tqdm(
    enumerate(split_generator), total=5
):
    print(f"\nFold {fold_idx}: Train {len(train_X)}, Test {len(test_X)}")

    # A new model per fold so nothing carries over between folds.
    model = EnsembleModel(data='full')
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # Fold score: mean squared error over every row and target.
    squared_errors = (predictions_np - test_Y.values) ** 2
    fold_mse = np.mean(squared_errors)
    fold_mses.append(fold_mse)
    print(f"  Fold MSE: {fold_mse:.6f}")

    # "row" is the position within this fold's test set.
    all_predictions.extend(
        {
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(predictions_np)
    )

submission_full_data = pd.DataFrame(all_predictions)
print(f"\nFull data CV MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

In [None]:
# Combine and save submission
# Stack the single-solvent rows (task 0) on top of the mixture rows (task 1).
submission = pd.concat([submission_single_solvent, submission_full_data])
# reset_index() keeps the old per-frame row labels as a regular "index" column
# and installs a fresh 0..N-1 index, which is then named "id".
# NOTE(review): the extra "index" column is presumably part of the expected
# submission layout - confirm before removing it.
submission = submission.reset_index()
submission.index.name = "id"

print(f"Submission shape: {submission.shape}")
print(f"Columns: {submission.columns.tolist()}")
print(f"\nFirst few rows:")
print(submission.head())

# Save
# index=True writes the "id" index as the first CSV column.
submission.to_csv("/home/submission/submission.csv", index=True)
print(f"\nSubmission saved to /home/submission/submission.csv")

In [None]:
# Sanity-check the saved submission file.
# NOTE: no overall CV score is computed in this cell; it only reports the row
# count and the min/max of each prediction column.

# Reload from disk so the check covers what was actually written.
submission_check = pd.read_csv("/home/submission/submission.csv")
print(f"Submission rows: {len(submission_check)}")
print("Expected: 656 (single) + 1227 (full) = 1883")

# Report the value range of each target column.
target_cols = ['target_1', 'target_2', 'target_3']
for col in target_cols:
    col_values = submission_check[col]
    print(f"{col}: min={col_values.min():.4f}, max={col_values.max():.4f}")

banner = "=" * 50
print("\n" + banner)
print("EXPERIMENT 078 COMPLETE")
print(banner)