# MLP + XGBoost + RandomForest + LightGBM Ensemble

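**Problem**: The intercept of the CV-LB relationship (0.0525) is larger than the target score (0.0347), so lowering CV alone cannot reach the target. We need to change the CV-LB relationship itself.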

**Approach**: Implement the lishellliang kernel's ensemble with RandomForest.
- MLP + XGBoost + RandomForest + LightGBM
- Use Leave-One-Out validation (NOT GroupKFold)
- Test if RF adds diversity that changes CV-LB relationship

**Key**: This notebook has EXACTLY 3 submission cells at the end.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Reproducibility and numeric configuration.
# These settings are independent of each other, so they are grouped by library.
torch.set_default_dtype(torch.float32)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.manual_seed(42)
np.random.seed(42)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory that holds the yield CSVs and the descriptor lookup tables.
DATA_PATH = '/home/data'

# Column names of the model inputs. The two process-condition columns are
# shared by both datasets; each dataset adds its own solvent columns.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = [
    *INPUT_LABELS_NUMERIC,
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

def load_data(name="full"):
    """Read one of the two yield CSVs and split it into inputs and targets.

    Args:
        name: "full" reads the mixed-solvent file; any other value reads the
            single-solvent file.

    Returns:
        (X, Y): X holds the input columns for the chosen dataset, Y holds the
        "Product 2", "Product 3" and "SM" columns.
    """
    if name != "full":
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT

    frame = pd.read_csv(csv_path)
    inputs = frame[input_columns]
    targets = frame[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the distinct "SOLVENT NAME"
    values. Each item is ``((train_X, train_Y), (test_X, test_Y))`` where the
    test part contains exactly the rows of the held-out solvent.
    """
    held_out_order = list(X["SOLVENT NAME"].unique())
    held_out_order.sort()
    for held_out in held_out_order:
        is_test = X["SOLVENT NAME"] == held_out
        is_train = ~is_test
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows whose solvent pair matches the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        # A row is in the training set when it differs from the held-out pair
        # in at least one of the two solvent columns.
        is_train = (X["SOLVENT A NAME"] != solvent_a) | (X["SOLVENT B NAME"] != solvent_b)
        is_test = ~is_train
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load feature lookups
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Drop DRFP columns that carry no information: keep only the columns whose
# variance across the lookup rows is strictly greater than zero.
drfp_variance = DRFP_DF.var()
has_variance = (drfp_variance > 0).to_numpy()
nonzero_variance_cols = drfp_variance.index[has_variance].tolist()
DRFP_FILTERED = DRFP_DF.loc[:, nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Featurizer class
class Featurizer:
    """Turn raw experiment rows into a flat float32 feature matrix.

    Each row is described by five kinetic features (residence time,
    temperature, 1000 / T[K], log(time), and their product) followed by the
    Spange, filtered-DRFP and ACS-PCA descriptor rows of its solvent. In mixed
    mode the descriptors of solvents A and B are linearly blended with the
    "SolventB%" column as the weight of solvent B.
    """

    def __init__(self, mixed=False):
        """Store the lookup tables and pre-compute the feature width.

        Args:
            mixed: True to featurize two-solvent rows, False for rows with a
                single "SOLVENT NAME" column.
        """
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF

        # 2 raw numeric inputs + 3 derived kinetic terms, then one column per
        # descriptor in each lookup table.
        n_kinetic = 2 + 3
        n_descriptors = sum(
            table.shape[1]
            for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )
        self.feats_dim = n_kinetic + n_descriptors

    def featurize(self, X):
        """Build the feature matrix for the rows of ``X``.

        Args:
            X: DataFrame with the numeric input columns plus the solvent
                column(s) matching the mode chosen at construction.

        Returns:
            2-D float32 numpy array with one row per row of ``X``.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float32)
        time_m = numeric[:, 0:1]
        temp_c = numeric[:, 1:2]

        # Kinetic terms: inverse absolute temperature (scaled by 1000) and
        # log residence time (offset avoids log(0)), plus their interaction.
        inv_temp = 1000.0 / (temp_c + 273.15)
        log_time = np.log(time_m + 1e-6)
        kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if not self.mixed:
            descriptor_blocks = [
                table.loc[X["SOLVENT NAME"]].values for table in tables
            ]
        else:
            # Look up both solvents in every table first, then blend.
            looked_up = [
                (table.loc[X["SOLVENT A NAME"]].values,
                 table.loc[X["SOLVENT B NAME"]].values)
                for table in tables
            ]
            pct = X["SolventB%"].values.reshape(-1, 1)
            descriptor_blocks = [
                desc_a * (1 - pct) + desc_b * pct
                for desc_a, desc_b in looked_up
            ]

        return np.hstack([kinetic, *descriptor_blocks]).astype(np.float32)

print(f'Feature dimension: {Featurizer().feats_dim}')

Feature dimension: 145


In [5]:
# MLP Model
class EnhancedMLP(nn.Module):
    """Feed-forward regressor with sigmoid-bounded outputs.

    Architecture: BatchNorm1d on the inputs, then one
    Linear -> ReLU -> Dropout stage per entry of ``hidden_dims``, then a final
    Linear layer followed by a Sigmoid.
    """

    def __init__(self, input_dim, hidden_dims=[128, 64, 32], output_dim=3, dropout=0.2):
        """
        Args:
            input_dim: number of input features.
            hidden_dims: width of each hidden stage, in order.
            output_dim: number of outputs.
            dropout: dropout probability used after every hidden stage.
        """
        super().__init__()
        # Consecutive pairs of `widths` give the (in, out) size of each hidden
        # Linear layer; the last entry feeds the output layer.
        widths = [input_dim, *hidden_dims]

        modules = [nn.BatchNorm1d(input_dim)]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        modules += [nn.Linear(widths[-1], output_dim), nn.Sigmoid()]

        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Run the full layer stack on ``x`` and return the result."""
        return self.net(x)

print('EnhancedMLP defined')

EnhancedMLP defined


In [6]:
# MLP + XGBoost + RandomForest + LightGBM Ensemble
class RFEnsembleModel:
    """Weighted-average ensemble of an MLP, XGBoost, RandomForest and LightGBM.

    All four learners are trained on the same standardized feature matrix
    produced by ``Featurizer``. The three tree-based learners are fitted as one
    independent regressor per target column. ``predict`` blends the four
    prediction matrices with ``weights`` and clips the blend to [0, 1].
    """

    def __init__(self, data='single', weights=[0.25, 0.25, 0.25, 0.25]):
        """
        Args:
            data: 'full' selects mixed-solvent featurization; any other value
                selects single-solvent featurization.
            weights: four blend weights ordered [mlp, xgb, rf, lgbm].

        Raises:
            ValueError: if ``weights`` does not contain exactly four values.
        """
        self.data = data
        self.mixed = (data == 'full')

        # Copy the weights so instances never share (or accidentally mutate)
        # the default list object, and fail before training rather than at
        # prediction time if the length is wrong.
        self.weights = list(weights)  # [mlp, xgb, rf, lgbm]
        if len(self.weights) != 4:
            raise ValueError(
                f'weights must contain 4 values [mlp, xgb, rf, lgbm], got {len(self.weights)}'
            )

        self.featurizer = Featurizer(mixed=self.mixed)
        self.scaler = StandardScaler()

    def train_model(self, X, Y, epochs=150, batch_size=32):
        """Fit the scaler and all four learners on ``(X, Y)``.

        Args:
            X: raw input rows accepted by ``Featurizer.featurize``.
            Y: targets as a DataFrame or 2-D array, one column per target.
            epochs: number of passes over the data for the MLP.
            batch_size: mini-batch size for the MLP.
        """
        Y_np = Y.values if hasattr(Y, 'values') else Y
        n_targets = Y_np.shape[1]

        # Featurize and scale
        X_features = self.featurizer.featurize(X)
        X_scaled = self.scaler.fit_transform(X_features)

        # Train MLP
        X_torch = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        Y_torch = torch.tensor(Y_np, dtype=torch.float32).to(device)

        self.mlp = EnhancedMLP(input_dim=X_scaled.shape[1], output_dim=n_targets).to(device)
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=5e-4, weight_decay=1e-5)
        criterion = nn.HuberLoss()

        dataset = TensorDataset(X_torch, Y_torch)
        # The MLP starts with BatchNorm1d, which raises in training mode when
        # it receives a batch of a single row. That happens whenever the
        # sample count leaves a remainder of exactly 1, so drop the trailing
        # batch in that case only. Because the data is reshuffled every
        # epoch, a different row is dropped each time.
        n_samples = len(dataset)
        drop_last = n_samples > 1 and n_samples % batch_size == 1
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

        self.mlp.train()
        for epoch in range(epochs):
            for xb, yb in loader:
                optimizer.zero_grad()
                pred = self.mlp(xb)
                loss = criterion(pred, yb)
                loss.backward()
                optimizer.step()

        # Train XGBoost (per-target)
        self.xgb_models = []
        for i in range(n_targets):
            model = xgb.XGBRegressor(
                n_estimators=200, max_depth=6, learning_rate=0.05,
                subsample=0.8, colsample_bytree=0.8,
                random_state=42, verbosity=0
            )
            model.fit(X_scaled, Y_np[:, i])
            self.xgb_models.append(model)

        # Train RandomForest (per-target)
        self.rf_models = []
        for i in range(n_targets):
            model = RandomForestRegressor(
                n_estimators=100, max_depth=10, min_samples_split=5,
                random_state=42, n_jobs=-1
            )
            model.fit(X_scaled, Y_np[:, i])
            self.rf_models.append(model)

        # Train LightGBM (per-target)
        self.lgbm_models = []
        for i in range(n_targets):
            model = lgb.LGBMRegressor(
                n_estimators=200, learning_rate=0.05, max_depth=6,
                num_leaves=31, reg_alpha=0.1, reg_lambda=0.1,
                random_state=42, verbose=-1
            )
            model.fit(X_scaled, Y_np[:, i])
            self.lgbm_models.append(model)

    def predict(self, X):
        """Predict all targets for the rows of ``X``.

        ``train_model`` must have been called first.

        Returns:
            float32 torch tensor with one row per input row and one column
            per target, clipped to [0, 1].
        """
        X_features = self.featurizer.featurize(X)
        X_scaled = self.scaler.transform(X_features)

        # MLP predictions (eval mode disables dropout and uses the running
        # BatchNorm statistics)
        self.mlp.eval()
        with torch.no_grad():
            X_torch = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            mlp_preds = self.mlp(X_torch).cpu().numpy()

        # XGBoost predictions
        xgb_preds = np.column_stack([m.predict(X_scaled) for m in self.xgb_models])

        # RandomForest predictions
        rf_preds = np.column_stack([m.predict(X_scaled) for m in self.rf_models])

        # LightGBM predictions
        lgbm_preds = np.column_stack([m.predict(X_scaled) for m in self.lgbm_models])

        # Weighted ensemble
        final_preds = (self.weights[0] * mlp_preds +
                       self.weights[1] * xgb_preds +
                       self.weights[2] * rf_preds +
                       self.weights[3] * lgbm_preds)

        # Clip to valid range [0, 1]
        final_preds = np.clip(final_preds, 0, 1)

        return torch.tensor(final_preds, dtype=torch.float32)

print('RFEnsembleModel defined')

RFEnsembleModel defined


In [7]:
# Smoke test: hold out the alphabetically first solvent and score one fold
X_single, Y_single = load_data("single_solvent")
solvent_names = X_single["SOLVENT NAME"]
test_solvent = sorted(solvent_names.unique())[0]
mask = solvent_names != test_solvent  # True for training rows

print(f"Test solvent: {test_solvent}")
print(f"Training samples: {mask.sum()}, Test samples: {(~mask).sum()}")

# Fewer epochs than the full CV run to keep this check quick.
model = RFEnsembleModel(data='single')
model.train_model(X_single[mask], Y_single[mask], epochs=100)

held_out_X = X_single[~mask]
preds = model.predict(held_out_X)
actuals = Y_single[~mask].values

squared_error = (actuals - preds.numpy()) ** 2
mse = squared_error.mean()
print(f'Test fold MSE: {mse:.6f}')

Test solvent: 1,1,1,3,3,3-Hexafluoropropan-2-ol
Training samples: 619, Test samples: 37


Test fold MSE: 0.045170


In [8]:
# Leave-one-solvent-out cross-validation on the single-solvent data
print('Running CV for single solvent data...')

X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)
all_preds, all_actuals = [], []

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in tqdm.tqdm(enumerate(split_generator)):
    # A fresh ensemble per fold so nothing leaks between folds.
    model = RFEnsembleModel(data='single')
    model.train_model(train_X, train_Y, epochs=150)

    predictions_np = model.predict(test_X).detach().cpu().numpy()

    all_preds.append(predictions_np)
    all_actuals.append(test_Y.values)

# Stack the per-fold arrays and score every held-out row together.
all_preds = np.vstack(all_preds)
all_actuals = np.vstack(all_actuals)
mse_single = np.square(all_preds - all_actuals).mean()
print(f'\nSingle Solvent MSE: {mse_single:.6f}')

Running CV for single solvent data...


0it [00:00, ?it/s]

1it [00:04,  4.44s/it]

2it [00:08,  4.48s/it]

3it [00:13,  4.38s/it]

4it [00:17,  4.33s/it]

5it [00:22,  4.41s/it]

6it [00:26,  4.43s/it]

7it [00:30,  4.42s/it]

8it [00:35,  4.44s/it]

9it [00:39,  4.45s/it]

10it [00:44,  4.46s/it]

11it [00:48,  4.47s/it]

12it [00:53,  4.45s/it]

13it [00:57,  4.45s/it]

14it [01:02,  4.47s/it]

15it [01:06,  4.49s/it]

16it [01:11,  4.51s/it]

17it [01:15,  4.55s/it]

18it [01:20,  4.50s/it]

19it [01:24,  4.52s/it]

20it [01:29,  4.49s/it]

21it [01:33,  4.49s/it]

22it [01:38,  4.51s/it]

23it [01:42,  4.51s/it]

24it [01:47,  4.47s/it]

24it [01:47,  4.47s/it]


Single Solvent MSE: 0.010213





In [9]:
# Leave-one-solvent-pair-out cross-validation on the mixed-solvent data
print('Running CV for full data...')

X, Y = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_preds_full, all_actuals_full = [], []

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in tqdm.tqdm(enumerate(split_generator)):
    # A fresh ensemble per fold so nothing leaks between folds.
    model = RFEnsembleModel(data='full')
    model.train_model(train_X, train_Y, epochs=150)

    predictions_np = model.predict(test_X).detach().cpu().numpy()

    all_preds_full.append(predictions_np)
    all_actuals_full.append(test_Y.values)

# Stack the per-fold arrays and score every held-out row together.
all_preds_full = np.vstack(all_preds_full)
all_actuals_full = np.vstack(all_actuals_full)
mse_full = np.square(all_preds_full - all_actuals_full).mean()
print(f'\nFull Data MSE: {mse_full:.6f}')

# Overall MSE: row-count-weighted mean of the two task scores.
# `mse_single` and `all_actuals` come from the single-solvent CV cell.
n_single = len(all_actuals)
n_full = len(all_actuals_full)
overall_mse = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)
print(f'\nOverall MSE: {overall_mse:.6f}')
print(f'Baseline (exp_030 GP+MLP+LGBM): CV 0.008298')

Running CV for full data...


0it [00:00, ?it/s]

1it [00:07,  7.72s/it]

2it [00:15,  7.70s/it]

3it [00:23,  7.80s/it]

4it [00:31,  7.79s/it]

5it [00:38,  7.80s/it]

6it [00:46,  7.77s/it]

7it [00:54,  7.75s/it]

8it [01:01,  7.72s/it]

9it [01:09,  7.71s/it]

10it [01:18,  7.91s/it]

11it [01:26,  8.06s/it]

12it [01:34,  8.18s/it]

13it [01:43,  8.24s/it]

13it [01:43,  7.94s/it]


Full Data MSE: 0.009643

Overall MSE: 0.009842
Baseline (exp_030 GP+MLP+LGBM): CV 0.008298





In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = RFEnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = RFEnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################