# Experiment 011: GroupKFold + Top Kernel Architecture

**CRITICAL CHANGE**: Overwrite utility functions with GroupKFold (5-fold) instead of Leave-One-Out.

This is the SINGLE MOST IMPORTANT change identified by the evaluator. The top kernel (lishellliang) uses GroupKFold, which gives more realistic CV estimates.

**Key implementation:**
1. GroupKFold (5-fold) validation
2. MLP: [128, 64, 32], NO Sigmoid, 100 epochs, lr=1e-3, dropout=0.1
3. GBDT: n_estimators=300, max_depth=15 for RF
4. Ensemble weights: [0.4, 0.2, 0.2, 0.2]

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GroupKFold
import xgboost as xgb
import lightgbm as lgb
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

DATA_PATH = '/home/data'
torch.set_default_dtype(torch.double)
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# --- UTILITY FUNCTIONS (ORIGINAL) ---
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    assert name in ["full", "single_solvent"]
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT NAME"]]
    Y = df[TARGET_LABELS]
    return X, Y

# Load Spange descriptors (correct filename)
Spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv')
print(f"Spange: {Spange.shape}")
print(f"Columns: {Spange.columns.tolist()}")
Spange_dict = {row['SOLVENT NAME']: row.drop('SOLVENT NAME').values for _, row in Spange.iterrows()}

Spange: (26, 14)
Columns: ['SOLVENT NAME', 'dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta']


In [3]:
# --- CRITICAL: OVERWRITE UTILITY FUNCTIONS WITH GROUPKFOLD ---
# This is what the top kernel (lishellliang) does!
# GroupKFold gives more realistic CV estimates than Leave-One-Out

def generate_leave_one_out_splits(X, Y):
    """GroupKFold (5-fold) instead of Leave-One-Out for single solvent data."""
    groups = X["SOLVENT NAME"]
    n_splits = min(5, len(groups.unique()))
    gkf = GroupKFold(n_splits=n_splits)
    for train_idx, test_idx in gkf.split(X, Y, groups):
        yield ((X.iloc[train_idx], Y.iloc[train_idx]), (X.iloc[test_idx], Y.iloc[test_idx]))

def generate_leave_one_ramp_out_splits(X, Y):
    """GroupKFold (5-fold) instead of Leave-One-Out for full data."""
    groups = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
    n_splits = min(5, len(groups.unique()))
    gkf = GroupKFold(n_splits=n_splits)
    for train_idx, test_idx in gkf.split(X, Y, groups):
        yield ((X.iloc[train_idx], Y.iloc[train_idx]), (X.iloc[test_idx], Y.iloc[test_idx]))

print("GroupKFold utility functions defined (5-fold instead of LOO)")

GroupKFold utility functions defined (5-fold instead of LOO)


In [4]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    def __init__(self): raise NotImplementedError
    def featurize(self, X): raise NotImplementedError

class BaseModel(ABC):
    def __init__(self): pass
    def train_model(self, X_train, y_train): raise NotImplementedError
    def predict(self): raise NotImplementedError

In [5]:
# --- MLP ARCHITECTURE (TOP KERNEL - NO SIGMOID) ---
class TopKernelMLP(nn.Module):
    """MLP with BatchNorm + ReLU + Dropout, LINEAR output (no Sigmoid).
    
    This matches the top kernel architecture exactly.
    Uses eval mode for BatchNorm during inference to handle batch_size=1.
    """
    def __init__(self, input_dim, hidden_dims=[128, 64, 32], output_dim=3, dropout=0.1):
        super().__init__()
        layers = []
        
        # Input BatchNorm
        layers.append(nn.BatchNorm1d(input_dim))
        
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, h_dim))
            layers.append(nn.BatchNorm1d(h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = h_dim
        
        # Output layer - NO SIGMOID (linear output)
        layers.append(nn.Linear(prev_dim, output_dim))
        
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        # Handle batch_size=1 by using eval mode for BatchNorm
        if x.size(0) == 1 and self.training:
            self.eval()
            out = self.network(x)
            self.train()
            return out
        return self.network(x)

print("TopKernelMLP defined (NO Sigmoid output, handles batch_size=1)")

TopKernelMLP defined (NO Sigmoid output, handles batch_size=1)


In [6]:
# --- MLP + GBDT ENSEMBLE MODEL (TOP KERNEL ARCHITECTURE) ---
class TopKernelEnsemble(BaseModel):
    """Ensemble of MLP + XGBoost + RandomForest + LightGBM.
    
    Matches top kernel (lishellliang) architecture exactly:
    - MLP: [128, 64, 32], NO Sigmoid, 100 epochs, lr=1e-3, dropout=0.1
    - XGBoost: n_estimators=300, max_depth=6, learning_rate=0.05
    - RandomForest: n_estimators=300, max_depth=15
    - LightGBM: n_estimators=300, learning_rate=0.05
    - Weights: [0.4, 0.2, 0.2, 0.2] for MLP, XGB, RF, LGB
    """
    
    def __init__(self, data='single'):
        super().__init__()
        self.data = data
        self.scaler = StandardScaler()
        self.mlp = None
        self.xgb_models = []
        self.rf_model = None
        self.lgb_models = []
        
        # Top kernel weights
        self.weights = [0.4, 0.2, 0.2, 0.2]  # MLP, XGB, RF, LGB
    
    def _get_features(self, X):
        """Extract Spange features only (like top kernel)."""
        features = []
        
        if self.data == 'single':
            for _, row in X.iterrows():
                solvent = row['SOLVENT NAME']
                spange = Spange_dict.get(solvent, np.zeros(12))
                feat = np.concatenate([
                    [row['Residence Time'], row['Temperature']],
                    spange
                ])
                features.append(feat)
        else:
            for _, row in X.iterrows():
                solvent_a = row['SOLVENT A NAME']
                solvent_b = row['SOLVENT B NAME']
                pct_b = row['SolventB%'] / 100.0
                
                spange_a = Spange_dict.get(solvent_a, np.zeros(12))
                spange_b = Spange_dict.get(solvent_b, np.zeros(12))
                
                # Linear interpolation of Spange features
                spange_mix = (1 - pct_b) * spange_a + pct_b * spange_b
                
                feat = np.concatenate([
                    [row['Residence Time'], row['Temperature'], pct_b],
                    spange_mix
                ])
                features.append(feat)
        
        return np.array(features)
    
    def train_model(self, X_train, y_train):
        # Get features
        X_feat = self._get_features(X_train)
        y_np = y_train.values
        
        # Scale features
        X_scaled = self.scaler.fit_transform(X_feat)
        
        # 1. Train MLP (100 epochs, lr=1e-3, dropout=0.1)
        input_dim = X_scaled.shape[1]
        self.mlp = TopKernelMLP(input_dim, hidden_dims=[128, 64, 32], output_dim=3, dropout=0.1).to(device)
        
        X_tensor = torch.tensor(X_scaled, dtype=torch.double).to(device)
        y_tensor = torch.tensor(y_np, dtype=torch.double).to(device)
        
        dataset = TensorDataset(X_tensor, y_tensor)
        loader = DataLoader(dataset, batch_size=32, shuffle=True)
        
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=1e-3)
        criterion = nn.MSELoss()
        
        self.mlp.train()
        for epoch in range(100):  # 100 epochs like top kernel
            for batch_X, batch_y in loader:
                optimizer.zero_grad()
                pred = self.mlp(batch_X)
                loss = criterion(pred, batch_y)
                loss.backward()
                optimizer.step()
        
        # 2. Train XGBoost (per target, n_estimators=300)
        self.xgb_models = []
        for i in range(3):
            model = xgb.XGBRegressor(
                n_estimators=300,
                max_depth=6,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                verbosity=0
            )
            model.fit(X_scaled, y_np[:, i])
            self.xgb_models.append(model)
        
        # 3. Train RandomForest (n_estimators=300, max_depth=15)
        self.rf_model = MultiOutputRegressor(
            RandomForestRegressor(
                n_estimators=300,
                max_depth=15,
                random_state=42,
                n_jobs=-1
            )
        )
        self.rf_model.fit(X_scaled, y_np)
        
        # 4. Train LightGBM (per target, n_estimators=300)
        self.lgb_models = []
        for i in range(3):
            model = lgb.LGBMRegressor(
                n_estimators=300,
                learning_rate=0.05,
                num_leaves=31,
                max_depth=-1,
                random_state=42,
                verbosity=-1
            )
            model.fit(X_scaled, y_np[:, i])
            self.lgb_models.append(model)
    
    def predict(self, X_test):
        X_feat = self._get_features(X_test)
        X_scaled = self.scaler.transform(X_feat)
        
        # MLP prediction
        self.mlp.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_scaled, dtype=torch.double).to(device)
            mlp_pred = self.mlp(X_tensor).cpu().numpy()
        
        # XGBoost prediction
        xgb_pred = np.column_stack([m.predict(X_scaled) for m in self.xgb_models])
        
        # RandomForest prediction
        rf_pred = self.rf_model.predict(X_scaled)
        
        # LightGBM prediction
        lgb_pred = np.column_stack([m.predict(X_scaled) for m in self.lgb_models])
        
        # Weighted ensemble
        final_pred = (
            self.weights[0] * mlp_pred +
            self.weights[1] * xgb_pred +
            self.weights[2] * rf_pred +
            self.weights[3] * lgb_pred
        )
        
        # Clip to [0, 1]
        final_pred = np.clip(final_pred, 0, 1)
        
        return torch.tensor(final_pred)

print("TopKernelEnsemble defined")

TopKernelEnsemble defined


In [9]:
# --- QUICK VALIDATION TEST ---
print("Testing TopKernelEnsemble with GroupKFold...")
X_test, Y_test = load_data("single_solvent")

# Quick test on first 2 folds
errors = []
split_gen = generate_leave_one_out_splits(X_test, Y_test)
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(split_gen):
    if i >= 2: break
    print(f"\nFold {i}: Train={len(train_X)}, Test={len(test_X)}")
    print(f"  Test solvents: {test_X['SOLVENT NAME'].unique()[:3]}...")
    model = TopKernelEnsemble(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    print(f"  MAE = {mae:.4f}")

print(f"\nQuick test MAE (2 folds): {np.mean(errors):.4f}")
print(f"Note: GroupKFold gives ~20% test data per fold (vs 4% for LOO)")

Testing TopKernelEnsemble with GroupKFold...

Fold 0: Train=531, Test=125
  Test solvents: ['IPA [Propan-2-ol]' 'Acetonitrile' 'Diethyl Ether [Ether]']...


  MAE = 0.0667

Fold 1: Train=526, Test=130
  Test solvents: ['2-Methyltetrahydrofuran [2-MeTHF]' 'Cyclohexane' 'Decanol']...


  MAE = 0.0638

Quick test MAE (2 folds): 0.0653
Note: GroupKFold gives ~20% test data per fold (vs 4% for LOO)


In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = TopKernelEnsemble(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)

    predictions_df = pd.DataFrame(predictions.numpy(), columns=["target_1", "target_2", "target_3"])
    predictions_df["fold"] = fold_idx
    predictions_df["task"] = 0
    all_predictions.append(predictions_df)

submission_single_solvent = pd.concat(all_predictions)
submission_single_solvent = submission_single_solvent.reset_index(drop=True)

# Calculate CV MAE
X_single, Y_single = load_data("single_solvent")
single_errors = []
for fold_idx, ((_, _), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_single, Y_single)):
    fold_preds = submission_single_solvent[submission_single_solvent['fold'] == fold_idx]
    pred_vals = fold_preds[['target_1', 'target_2', 'target_3']].values
    mae = np.mean(np.abs(pred_vals - test_Y.values))
    single_errors.append(mae)
    print(f"Fold {fold_idx}: MAE = {mae:.4f}")

print(f"\nSingle Solvent CV MAE: {np.mean(single_errors):.6f} +/- {np.std(single_errors):.6f}")

0it [00:00, ?it/s]

1it [00:04,  4.36s/it]

2it [00:07,  3.93s/it]

3it [00:11,  3.80s/it]

4it [00:15,  3.75s/it]

5it [00:18,  3.72s/it]

5it [00:18,  3.79s/it]

Fold 0: MAE = 0.0668
Fold 1: MAE = 0.0612
Fold 2: MAE = 0.0552
Fold 3: MAE = 0.0970
Fold 4: MAE = 0.0864

Single Solvent CV MAE: 0.073326 +/- 0.015822





In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = TopKernelEnsemble(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)

    predictions_df = pd.DataFrame(predictions.numpy(), columns=["target_1", "target_2", "target_3"])
    predictions_df["fold"] = fold_idx
    predictions_df["task"] = 1
    all_predictions.append(predictions_df)

submission_full_data = pd.concat(all_predictions)
submission_full_data = submission_full_data.reset_index(drop=True)

# Calculate CV MAE
X_full, Y_full = load_data("full")
full_errors = []
for fold_idx, ((_, _), (test_X, test_Y)) in enumerate(generate_leave_one_ramp_out_splits(X_full, Y_full)):
    fold_preds = submission_full_data[submission_full_data['fold'] == fold_idx]
    pred_vals = fold_preds[['target_1', 'target_2', 'target_3']].values
    mae = np.mean(np.abs(pred_vals - test_Y.values))
    full_errors.append(mae)
    print(f"Fold {fold_idx}: MAE = {mae:.4f}")

print(f"\nFull Data CV MAE: {np.mean(full_errors):.6f} +/- {np.std(full_errors):.6f}")

0it [00:00, ?it/s]

1it [00:06,  6.14s/it]

2it [00:12,  6.01s/it]

3it [00:18,  6.02s/it]

4it [00:24,  5.99s/it]

5it [00:30,  6.00s/it]

5it [00:30,  6.01s/it]

Fold 0: MAE = 0.0631
Fold 1: MAE = 0.0882
Fold 2: MAE = 0.0901
Fold 3: MAE = 0.1151
Fold 4: MAE = 0.0931

Full Data CV MAE: 0.089923 +/- 0.016541





In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Final summary
total_single = len(submission_single_solvent)
total_full = len(submission_full_data)
total = total_single + total_full
combined_cv = (np.mean(single_errors) * total_single + np.mean(full_errors) * total_full) / total

print(f"\n=== FINAL RESULTS (GroupKFold 5-fold) ===")
print(f"Single Solvent CV MAE: {np.mean(single_errors):.6f} +/- {np.std(single_errors):.6f}")
print(f"Full Data CV MAE: {np.mean(full_errors):.6f} +/- {np.std(full_errors):.6f}")
print(f"Combined CV MAE: {combined_cv:.6f}")
print(f"\n=== COMPARISON ===")
print(f"Best LOO CV (exp_004): 0.0623")
print(f"Best LB (exp_004): 0.0956")
print(f"This experiment (GroupKFold): {combined_cv:.6f}")
print(f"\nNote: GroupKFold CV should be MORE REALISTIC (closer to LB)")
print(f"Expected CV-LB gap: ~10-20% (vs 50% for LOO)")


=== FINAL RESULTS (GroupKFold 5-fold) ===
Single Solvent CV MAE: 0.073326 +/- 0.015822
Full Data CV MAE: 0.089923 +/- 0.016541
Combined CV MAE: 0.084141

=== COMPARISON ===
Best LOO CV (exp_004): 0.0623
Best LB (exp_004): 0.0956
This experiment (GroupKFold): 0.084141

Note: GroupKFold CV should be MORE REALISTIC (closer to LB)
Expected CV-LB gap: ~10-20% (vs 50% for LOO)
