# Mixall Kernel Approach with GroupKFold(5)

**Hypothesis**: GroupKFold(5) validation may give a different CV-LB relationship than Leave-One-Out.

**Key differences from our current approach**:
1. GroupKFold(5) instead of Leave-One-Out (24 folds → 5 folds)
2. Ensemble: MLP + XGBoost + RandomForest + LightGBM with weighted average
3. Spange descriptors only (simpler features)

**Why this might help**:
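- Larger held-out set per fold (each fold trains on ~80% of the data vs ~96% for Leave-One-Out), so every fold tests generalization to several unseen solvents at once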
- Fewer folds = less variance in CV estimate
- May better simulate the actual test distribution
- The kernel claims "good CV-LB" correlation

In [None]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Fix the NumPy and PyTorch global seeds to the same value.
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(42)

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f'Using device: {device}')

In [None]:
# Data loading functions
# Directory that holds the CSV inputs read by this notebook.
DATA_PATH = '/home/data'

# Process-condition columns shared by both datasets.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns of the single-solvent dataset.
INPUT_LABELS_SINGLE_SOLVENT = INPUT_LABELS_NUMERIC + ["SOLVENT NAME"]
# Input columns of the full (solvent-mixture) dataset.
INPUT_LABELS_FULL_SOLVENT = INPUT_LABELS_NUMERIC + [
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

def load_data(name="full"):
    """Read one of the catechol yield CSVs and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the solvent-mixture file; any other value
            selects the single-solvent file.

    Returns:
        A pair ``(X, Y)`` of DataFrames: ``X`` holds the input columns of the
        selected dataset, ``Y`` holds the ``"Product 2"``, ``"Product 3"``
        and ``"SM"`` columns.
    """
    use_full = name == "full"
    if use_full:
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    input_columns = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(csv_path)
    inputs = frame[input_columns]
    targets = frame[["Product 2", "Product 3", "SM"]]
    return inputs, targets

print('Data loading functions defined')

In [None]:
# GroupKFold(5) split generators - KEY DIFFERENCE from Leave-One-Out
def generate_groupkfold_splits_single(X, Y):
    """Yield GroupKFold train/test splits of the single-solvent data.

    Rows are grouped by ``"SOLVENT NAME"``, so a solvent never appears on both
    sides of a split. Up to 5 folds are used, fewer when there are fewer
    distinct solvents.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.
    """
    solvent_groups = X["SOLVENT NAME"]
    # Never request more folds than there are distinct groups.
    fold_count = min(5, solvent_groups.nunique(dropna=False))
    splitter = GroupKFold(n_splits=fold_count)

    for train_rows, test_rows in splitter.split(X, Y, solvent_groups):
        train_part = (X.iloc[train_rows], Y.iloc[train_rows])
        test_part = (X.iloc[test_rows], Y.iloc[test_rows])
        yield train_part, test_part

def generate_groupkfold_splits_full(X, Y):
    """Yield GroupKFold train/test splits of the full (mixture) data.

    The group key of a row is its solvent pair, built as
    ``"<SOLVENT A NAME>_<SOLVENT B NAME>"``, so all rows of one pair stay on
    the same side of a split. Up to 5 folds are used, fewer when there are
    fewer distinct pairs.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.
    """
    solvent_a = X["SOLVENT A NAME"].astype(str)
    solvent_b = X["SOLVENT B NAME"].astype(str)
    pair_groups = solvent_a + "_" + solvent_b
    # Never request more folds than there are distinct groups.
    fold_count = min(5, pair_groups.nunique(dropna=False))
    splitter = GroupKFold(n_splits=fold_count)

    for train_rows, test_rows in splitter.split(X, Y, pair_groups):
        train_part = (X.iloc[train_rows], Y.iloc[train_rows])
        test_part = (X.iloc[test_rows], Y.iloc[test_rows])
        yield train_part, test_part

# Also keep the original Leave-One-Out for comparison
def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of ``"SOLVENT NAME"``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_train = solvent_names != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per solvent pair, holding that pair out.

    A "ramp" is a distinct combination of ``"SOLVENT A NAME"`` and
    ``"SOLVENT B NAME"``; ramps are visited in order of first appearance.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        exactly the rows of the held-out pair.
    """
    col_a = X["SOLVENT A NAME"]
    col_b = X["SOLVENT B NAME"]
    distinct_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for name_a, name_b in distinct_pairs.itertuples(index=False, name=None):
        in_ramp = (col_a == name_a) & (col_b == name_b)
        is_train = ~in_ramp
        is_test = ~is_train
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

print('Split generators defined')

In [None]:
# Load Spange descriptors (as used in mixall kernel)
# Descriptor lookup table; the first CSV column becomes the row index, which
# the featurizers use to look rows up by solvent name.
_spange_csv = f'{DATA_PATH}/spange_descriptors_lookup.csv'
SPANGE_DF = pd.read_csv(_spange_csv, index_col=0)
print(f'Spange descriptors shape: {SPANGE_DF.shape}')
print(SPANGE_DF.head())

In [None]:
# Featurizers (matching mixall kernel)
class PrecomputedFeaturizer:
    """Turn single-solvent rows into feature tensors via a descriptor lookup.

    Each row becomes ``[Residence Time, Temperature, <descriptor columns>]``,
    where the descriptors are the ``SPANGE_DF`` row of the sample's solvent.
    """

    def __init__(self):
        self.features = SPANGE_DF
        n_descriptors = self.features.shape[1]
        # Two extra columns: residence time and temperature.
        self.feats_dim = n_descriptors + 2

    def featurize(self, X):
        """Return a float32 tensor with one feature row per row of ``X``.

        ``X`` must provide the ``'Residence Time'``, ``'Temperature'`` and
        ``'SOLVENT NAME'`` columns; every solvent name must be present in the
        index of the descriptor table.
        """
        time_col, temp_col = (
            X[name].values.reshape(-1, 1)
            for name in ('Residence Time', 'Temperature')
        )
        descriptors = self.features.loc[X['SOLVENT NAME']].values
        stacked = np.concatenate([time_col, temp_col, descriptors], axis=1)
        return torch.tensor(stacked, dtype=torch.float32)

class PrecomputedFeaturizerMixed:
    """Turn solvent-mixture rows into feature tensors via a descriptor lookup.

    Each row becomes ``[Residence Time, Temperature, SolventB%, <blended
    descriptors>]``, where the blend is a linear interpolation between the
    ``SPANGE_DF`` rows of solvent A and solvent B.
    """

    def __init__(self):
        self.features = SPANGE_DF
        n_descriptors = self.features.shape[1]
        # Three extra columns: residence time, temperature and solvent-B share.
        self.feats_dim = n_descriptors + 3

    def featurize(self, X):
        """Return a float32 tensor with one feature row per row of ``X``.

        ``X`` must provide ``'Residence Time'``, ``'Temperature'``,
        ``'SolventB%'``, ``'SOLVENT A NAME'`` and ``'SOLVENT B NAME'``.
        """
        time_col, temp_col, share_b = (
            X[name].values.reshape(-1, 1)
            for name in ('Residence Time', 'Temperature', 'SolventB%')
        )
        solvent_a = self.features.loc[X['SOLVENT A NAME']].values
        solvent_b = self.features.loc[X['SOLVENT B NAME']].values
        # NOTE(review): the blend treats 'SolventB%' as a fraction in [0, 1],
        # not a 0-100 percentage -- confirm against the dataset.
        blended = (1 - share_b) * solvent_a + share_b * solvent_b
        stacked = np.concatenate([time_col, temp_col, share_b, blended], axis=1)
        return torch.tensor(stacked, dtype=torch.float32)

print(f'Single featurizer dim: {PrecomputedFeaturizer().feats_dim}')
print(f'Mixed featurizer dim: {PrecomputedFeaturizerMixed().feats_dim}')

In [None]:
# MLP Model (matching mixall kernel)
class EnhancedMLP(nn.Module):
    """Feed-forward regressor built from Linear -> BatchNorm1d -> ReLU -> Dropout stages.

    One stage is created per entry of ``hidden_dims``; a final ``nn.Linear``
    maps the last hidden width to ``output_dim``. All layers live in
    ``self.network`` (an ``nn.Sequential``), in that order.

    Args:
        input_dim: Number of input features.
        output_dim: Number of regression outputs.
        hidden_dims: Widths of the hidden stages, in order. Any sequence of
            ints is accepted; the default is an immutable tuple so it cannot
            be shared and mutated across instances.
        dropout: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, output_dim=3, hidden_dims=(128, 64, 32), dropout=0.2):
        super().__init__()
        # Consecutive pairs of `widths` are the (in, out) sizes of each stage.
        widths = [input_dim, *hidden_dims]
        layers = []
        for in_dim, out_dim in zip(widths, widths[1:]):
            layers.extend([
                nn.Linear(in_dim, out_dim),
                nn.BatchNorm1d(out_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
        layers.append(nn.Linear(widths[-1], output_dim))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to ``x`` and return the raw outputs."""
        return self.network(x)

print('MLP defined')

In [None]:
# Ensemble Model (matching mixall kernel)
class EnsembleModel:
    """Weighted average of an MLP, XGBoost, random-forest and LightGBM regressor.

    All four models are trained on the same standardized features produced by
    the featurizer that matches ``data``.

    Args:
        data: ``'single'`` selects ``PrecomputedFeaturizer``; any other value
            selects ``PrecomputedFeaturizerMixed``.
        hidden_dims: Hidden-layer widths forwarded to ``EnhancedMLP``.
        dropout: Dropout probability forwarded to ``EnhancedMLP``.
        weights: Four blend weights, in the order MLP, XGBoost, random forest,
            LightGBM.
    """

    def __init__(self, data='single', hidden_dims=(128, 64, 32), dropout=0.2,
                 weights=(0.25, 0.25, 0.25, 0.25)):
        # Tuple defaults: immutable, so no state is shared between instances.
        self.data = data
        self.weights = weights

        if data == 'single':
            self.smiles_featurizer = PrecomputedFeaturizer()
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed()

        input_dim = self.smiles_featurizer.feats_dim
        self.mlp = EnhancedMLP(input_dim, hidden_dims=hidden_dims, dropout=dropout)
        self.scaler = StandardScaler()

        # XGBoost
        self.xgb_params = {
            'n_estimators': 100,
            'max_depth': 5,
            'learning_rate': 0.1,
            'random_state': 42
        }

        # Random Forest
        self.rf_params = {
            'n_estimators': 100,
            'max_depth': 10,
            'random_state': 42
        }

        # LightGBM
        self.lgb_params = {
            'n_estimators': 100,
            'num_leaves': 31,
            'learning_rate': 0.1,
            'random_state': 42,
            'verbose': -1
        }

    @staticmethod
    def _as_named_frame(X_scaled):
        """Wrap a 2-D array in a DataFrame whose columns are named "0", "1", ..."""
        feature_names = [str(i) for i in range(X_scaled.shape[1])]
        return pd.DataFrame(X_scaled, columns=feature_names)

    def train_model(self, train_X, train_Y, num_epochs=100, lr=1e-3, batch_size=32):
        """Fit the scaler and all four base models on the training split.

        Args:
            train_X: Input rows understood by the selected featurizer.
            train_Y: Target DataFrame, one column per target.
            num_epochs: Number of passes over the data for the MLP.
            lr: Adam learning rate for the MLP.
            batch_size: Mini-batch size for the MLP.
        """
        X_np = self.smiles_featurizer.featurize(train_X).numpy()
        train_Y_np = train_Y.values

        # Fit scaler
        X_scaled = self.scaler.fit_transform(X_np)

        # Train XGBoost
        self.xgb = xgb.XGBRegressor(**self.xgb_params)
        self.xgb.fit(X_scaled, train_Y_np)

        # Train Random Forest
        self.rf = RandomForestRegressor(**self.rf_params)
        self.rf.fit(X_scaled, train_Y_np)

        # Train LightGBM. LGBMRegressor fits a single 1-D target, so a
        # multi-column target needs one model per column; `self.lgbm` is the
        # list of those models in target-column order.
        X_scaled_df = self._as_named_frame(X_scaled)
        lgb_targets = train_Y_np.reshape(len(train_Y_np), -1)
        self.lgbm = [
            lgb.LGBMRegressor(**self.lgb_params).fit(X_scaled_df, lgb_targets[:, target_idx])
            for target_idx in range(lgb_targets.shape[1])
        ]

        # Train MLP
        X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32)
        train_Y_tensor = torch.tensor(train_Y_np, dtype=torch.float32)

        self.mlp.to(device)
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=lr)
        criterion = nn.MSELoss()

        # Dropping the trailing partial batch keeps BatchNorm1d from seeing a
        # batch of one sample in train mode. When the whole training set fits
        # in one (short) batch, dropping it would leave zero batches and an
        # untrained MLP, so that batch is kept as long as it has >= 2 samples.
        n_samples = len(X_tensor_scaled)
        single_short_batch = 2 <= n_samples <= batch_size
        train_loader = DataLoader(
            TensorDataset(X_tensor_scaled, train_Y_tensor),
            batch_size=batch_size, shuffle=True, drop_last=not single_short_batch
        )

        for epoch in range(num_epochs):
            self.mlp.train()
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                loss = criterion(self.mlp(inputs), targets)
                loss.backward()
                optimizer.step()

    def predict(self, test_X):
        """Return the weighted-average prediction for ``test_X`` as a tensor.

        ``train_model`` must have been called first. The result has one row
        per input row and one column per target.
        """
        X_np = self.smiles_featurizer.featurize(test_X).numpy()
        X_scaled = self.scaler.transform(X_np)

        # MLP predictions
        self.mlp.eval()
        with torch.no_grad():
            X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            mlp_preds = self.mlp(X_tensor_scaled).cpu().numpy()

        # Tree-model predictions. XGBoost and the random forest were fitted on
        # plain arrays, so they are queried with plain arrays as well.
        xgb_preds = self.xgb.predict(X_scaled)
        rf_preds = self.rf.predict(X_scaled)

        # LightGBM was fitted on a named DataFrame, one model per target.
        X_scaled_df = self._as_named_frame(X_scaled)
        lgb_preds = np.column_stack([model.predict(X_scaled_df) for model in self.lgbm])

        # Weighted ensemble
        final_preds = (self.weights[0] * mlp_preds +
                       self.weights[1] * xgb_preds +
                       self.weights[2] * rf_preds +
                       self.weights[3] * lgb_preds)

        return torch.tensor(final_preds)

print('EnsembleModel defined')

In [None]:
# Test GroupKFold(5) on single solvent data
print("="*60)
print("Testing GroupKFold(5) on Single Solvent Data")
print("="*60)

X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: X={X_single.shape}, Y={Y_single.shape}")
print(f"Number of unique solvents: {X_single['SOLVENT NAME'].nunique()}")

# GroupKFold(5): train a fresh ensemble per fold and collect its test MSE.
all_mse_gkf = []
gkf_single_splits = generate_groupkfold_splits_single(X_single, Y_single)
for fold_idx, (train_split, test_split) in enumerate(gkf_single_splits):
    train_X, train_Y = train_split
    test_X, test_Y = test_split
    print(f"Fold {fold_idx}: train={len(train_X)}, test={len(test_X)}, test_solvents={test_X['SOLVENT NAME'].nunique()}")

    model = EnsembleModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()

    # Mean over all rows and all targets of the fold.
    squared_errors = (preds - test_Y.values) ** 2
    mse = np.mean(squared_errors)
    all_mse_gkf.append(mse)
    print(f"  Fold MSE: {mse:.6f}")

# Unweighted mean of the per-fold MSEs.
mse_gkf_single = np.mean(all_mse_gkf)
print(f"\nGroupKFold(5) Single Solvent MSE: {mse_gkf_single:.6f} (+/- {np.std(all_mse_gkf):.6f})")

In [None]:
# Compare with Leave-One-Out on single solvent data
print("="*60)
print("Comparing with Leave-One-Out on Single Solvent Data")
print("="*60)

# Leave-one-out yields one fold per distinct solvent, so the progress-bar
# total is taken from the data instead of a hard-coded solvent count.
n_loo_folds = X_single["SOLVENT NAME"].nunique()

all_mse_loo = []
for (train_X, train_Y), (test_X, test_Y) in tqdm.tqdm(generate_leave_one_out_splits(X_single, Y_single), total=n_loo_folds):
    model = EnsembleModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Mean over all rows and all targets of the fold.
    mse = np.mean((preds - test_Y.values) ** 2)
    all_mse_loo.append(mse)

# Unweighted mean of the per-fold MSEs.
mse_loo_single = np.mean(all_mse_loo)
print(f"\nLeave-One-Out Single Solvent MSE: {mse_loo_single:.6f} (+/- {np.std(all_mse_loo):.6f})")
print(f"GroupKFold(5) Single Solvent MSE: {mse_gkf_single:.6f}")
print(f"Difference: {(mse_gkf_single - mse_loo_single) / mse_loo_single * 100:.2f}%")

In [None]:
# Test GroupKFold(5) on full data
print("="*60)
print("Testing GroupKFold(5) on Full Data")
print("="*60)

X_full, Y_full = load_data("full")
print(f"Full data: X={X_full.shape}, Y={Y_full.shape}")
groups = X_full["SOLVENT A NAME"].astype(str) + "_" + X_full["SOLVENT B NAME"].astype(str)
print(f"Number of unique ramps: {groups.nunique()}")

# GroupKFold(5): train a fresh ensemble per fold and collect its test MSE.
all_mse_gkf_full = []
gkf_full_splits = generate_groupkfold_splits_full(X_full, Y_full)
for fold_idx, (train_split, test_split) in enumerate(gkf_full_splits):
    train_X, train_Y = train_split
    test_X, test_Y = test_split
    # Solvent-pair key of every test row, used only for the log line below.
    groups_test = test_X["SOLVENT A NAME"].astype(str) + "_" + test_X["SOLVENT B NAME"].astype(str)
    print(f"Fold {fold_idx}: train={len(train_X)}, test={len(test_X)}, test_ramps={groups_test.nunique()}")

    model = EnsembleModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()

    # Mean over all rows and all targets of the fold.
    squared_errors = (preds - test_Y.values) ** 2
    mse = np.mean(squared_errors)
    all_mse_gkf_full.append(mse)
    print(f"  Fold MSE: {mse:.6f}")

# Unweighted mean of the per-fold MSEs.
mse_gkf_full = np.mean(all_mse_gkf_full)
print(f"\nGroupKFold(5) Full Data MSE: {mse_gkf_full:.6f} (+/- {np.std(all_mse_gkf_full):.6f})")

In [None]:
# Compare with Leave-One-Ramp-Out on full data
print("="*60)
print("Comparing with Leave-One-Ramp-Out on Full Data")
print("="*60)

# Leave-one-ramp-out yields one fold per distinct (solvent A, solvent B)
# pair, so the progress-bar total is taken from the data instead of a
# hard-coded ramp count.
n_ramp_folds = len(X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates())

all_mse_loro = []
for (train_X, train_Y), (test_X, test_Y) in tqdm.tqdm(generate_leave_one_ramp_out_splits(X_full, Y_full), total=n_ramp_folds):
    model = EnsembleModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Mean over all rows and all targets of the fold.
    mse = np.mean((preds - test_Y.values) ** 2)
    all_mse_loro.append(mse)

# Unweighted mean of the per-fold MSEs.
mse_loro_full = np.mean(all_mse_loro)
print(f"\nLeave-One-Ramp-Out Full Data MSE: {mse_loro_full:.6f} (+/- {np.std(all_mse_loro):.6f})")
print(f"GroupKFold(5) Full Data MSE: {mse_gkf_full:.6f}")
print(f"Difference: {(mse_gkf_full - mse_loro_full) / mse_loro_full * 100:.2f}%")

In [None]:
# Calculate overall MSE
N_single = len(X_single)
N_full = len(X_full)
N_total = N_single + N_full


def _size_weighted_mse(mse_single, mse_full):
    """Combine the two dataset MSEs, weighting each by its number of rows."""
    return (mse_single * N_single + mse_full * N_full) / N_total


overall_mse_gkf = _size_weighted_mse(mse_gkf_single, mse_gkf_full)
overall_mse_loo = _size_weighted_mse(mse_loo_single, mse_loro_full)

# Reference score quoted in the report line below (exp_030).
_best_reference_mse = 0.008298

print("="*60)
print("SUMMARY")
print("="*60)
print(f"\nGroupKFold(5) Validation:")
print(f"  Single Solvent MSE: {mse_gkf_single:.6f}")
print(f"  Full Data MSE: {mse_gkf_full:.6f}")
print(f"  Overall MSE: {overall_mse_gkf:.6f}")

print(f"\nLeave-One-Out Validation:")
print(f"  Single Solvent MSE: {mse_loo_single:.6f}")
print(f"  Full Data MSE: {mse_loro_full:.6f}")
print(f"  Overall MSE: {overall_mse_loo:.6f}")

print(f"\nComparison:")
print(f"  GroupKFold(5) vs Leave-One-Out: {(overall_mse_gkf - overall_mse_loo) / overall_mse_loo * 100:.2f}%")
print(f"  Best GP+MLP+LGBM ensemble (exp_030): 0.008298")
print(f"  This ensemble (LOO) vs Best: {(overall_mse_loo - _best_reference_mse) / _best_reference_mse * 100:.2f}%")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

# Use GroupKFold(5) as in mixall kernel
def generate_leave_one_out_splits(X, Y):
    """Yield GroupKFold train/test splits grouped by ``"SOLVENT NAME"``.

    NOTE: despite the name, this is NOT leave-one-out. It uses at most 5
    folds, and defining it here rebinds the name, replacing the
    leave-one-out generator declared earlier in the notebook for any code
    that runs after this cell.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.
    """
    groups = X["SOLVENT NAME"]
    # Never request more folds than there are distinct solvents.
    n_splits = min(5, len(groups.unique()))
    gkf = GroupKFold(n_splits=n_splits)
    for train_idx, test_idx in gkf.split(X, Y, groups):
        yield (X.iloc[train_idx], Y.iloc[train_idx]), (X.iloc[test_idx], Y.iloc[test_idx])

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

# One freshly trained model per fold; only the held-out rows are predicted.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "row" is the position of the sample inside this fold's test set, not
    # its index in the original data.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Use GroupKFold(5) as in mixall kernel
def generate_leave_one_ramp_out_splits(X, Y):
    """Yield GroupKFold train/test splits grouped by solvent pair.

    NOTE: despite the name, this is NOT leave-one-ramp-out. Rows are grouped
    by ``"<SOLVENT A NAME>_<SOLVENT B NAME>"`` into at most 5 folds, and
    defining it here rebinds the name, replacing the leave-one-ramp-out
    generator declared earlier in the notebook for any code that runs after
    this cell.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.
    """
    groups = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
    # Never request more folds than there are distinct solvent pairs.
    n_splits = min(5, len(groups.unique()))
    gkf = GroupKFold(n_splits=n_splits)
    for train_idx, test_idx in gkf.split(X, Y, groups):
        yield (X.iloc[train_idx], Y.iloc[train_idx]), (X.iloc[test_idx], Y.iloc[test_idx])

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

# One freshly trained model per fold; only the held-out rows are predicted.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "row" is the position of the sample inside this fold's test set, not
    # its index in the original data.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Stack the single-solvent (task 0) rows on top of the full-data (task 1) rows.
submission = pd.concat([submission_single_solvent, submission_full_data])
# reset_index() without drop=True keeps each frame's old row number as an
# "index" column and installs a fresh 0..N-1 index.
submission = submission.reset_index()
# The fresh index is written out as the leading "id" column.
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################