# Expanded Ensemble: MLP + Trees + LightGBM + XGBoost

Based on strategy recommendations:
1. Add LightGBM and XGBoost for more diversity
2. Increase MLP bagging to 5 models and training to 250 epochs (up from 3 models / 200 epochs)
3. Use weighted ensemble (optimize weights)

Target: beat the current best CV score (0.0103) in order to improve the leaderboard (LB) score from 0.0949

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC, abstractmethod
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set seeds for reproducibility
# The same fixed seed (42) is applied to Python's `random`, NumPy and PyTorch.
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    # Seed every visible CUDA device and request deterministic cuDNN kernels.
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.deterministic = True

# Use float64 as the default floating-point dtype for tensors and module
# parameters created after this point.
torch.set_default_dtype(torch.double)

# Report the accelerator that the MLP component will pick up, if any.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# Data loading utilities
# Directory that holds the yield CSVs and the `*_lookup.csv` descriptor tables.
DATA_PATH = '/home/data'

# Input columns selected by `load_data("full")` (two-solvent mixtures).
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Input columns selected by `load_data("single_solvent")`.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Numeric process inputs shared by both datasets; the featurizers rely on this
# order (column 0 = residence time, column 1 = temperature).
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Regression targets, in the column order used for predictions.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield datasets and split it into inputs and targets.

    Args:
        name: ``"full"`` for the mixed-solvent table or ``"single_solvent"``
            for the single-solvent table. Any other value fails the assertion.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: ``X`` holds the input columns for the
        chosen dataset, ``Y`` holds the ``TARGET_LABELS`` columns.
    """
    assert name in ["full", "single_solvent"]
    use_full = name == "full"
    if use_full:
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    table = pd.read_csv(csv_path)
    inputs = table[INPUT_LABELS_FULL_SOLVENT] if use_full else table[INPUT_LABELS_SINGLE_SOLVENT]
    targets = table[TARGET_LABELS]
    return inputs, targets

def load_features(name="spange_descriptors"):
    """Load a descriptor lookup table from ``{DATA_PATH}/{name}_lookup.csv``.

    The first CSV column becomes the DataFrame index.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    lookup_table = pd.read_csv(lookup_path, index_col=0)
    return lookup_table

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds over the ``"SOLVENT NAME"`` column.

    Solvents are visited in sorted order. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows of the held-out solvent.
    """
    held_out_order = sorted(X["SOLVENT NAME"].unique())
    for held_out in held_out_order:
        in_train = X["SOLVENT NAME"] != held_out
        in_test = ~in_train
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield folds that hold out one (solvent A, solvent B) pair at a time.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    every row whose A/B names both match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in unique_pairs.itertuples(index=False, name=None):
        matches_pair = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        keep = ~matches_pair
        yield (X[keep], Y[keep]), (X[~keep], Y[~keep])

# Load feature lookup tables
# Both tables are read once at module level and shared by every featurizer;
# they are indexed by the first CSV column and looked up with `.loc[...]`
# using solvent names.
SPANGE_DF = load_features('spange_descriptors')
ACS_PCA_DF = load_features('acs_pca_descriptors')
print(f"Spange: {SPANGE_DF.shape}, ACS PCA: {ACS_PCA_DF.shape}")

Spange: (26, 13), ACS PCA: (24, 5)


In [3]:
# Base class
class BaseModel(ABC):
    """Minimal interface shared by the ensemble components.

    Subclasses override ``train_model`` and ``predict``; the base versions
    only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets."""
        raise NotImplementedError

    def predict(self, X_test=None):
        """Return predictions for ``X_test``.

        The parameter mirrors the subclasses' ``predict(self, X_test)`` so
        calling the base method with data raises ``NotImplementedError``
        rather than ``TypeError``. It is optional so existing zero-argument
        calls keep working.
        """
        raise NotImplementedError

In [4]:
# ============ MLP COMPONENT (Enhanced) ============
class KineticMixingFeaturizer:
    """Turn raw inputs into the MLP feature tensor.

    The output concatenates five kinetic columns (the two raw numeric inputs,
    ``1000 / (T + 273.15)``, ``log(time + 1e-6)`` and their product) with the
    Spange descriptors of the solvent. For mixtures the descriptors of
    solvents A and B are blended linearly by ``SolventB%``.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.featurizer = SPANGE_DF
        # Descriptor columns + 2 raw numeric inputs + 3 derived kinetic terms.
        self.feats_dim = self.featurizer.shape[1] + 2 + 3

    def featurize(self, X, flip=False):
        """Build the feature tensor for the rows of ``X``.

        Args:
            X: DataFrame with the ``INPUT_LABELS_NUMERIC`` columns plus either
                ``"SOLVENT NAME"`` (``mixed=False``) or the A/B name columns
                and ``"SolventB%"`` (``mixed=True``).
            flip: Only used when ``mixed`` is true. Swaps the roles of A and B
                together with the fraction; algebraically this is the same
                blend as the non-flipped case (up to floating-point rounding).

        Returns:
            A 2-D tensor: kinetic columns first, descriptor columns after.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        # Column order follows INPUT_LABELS_NUMERIC: 0 = time, 1 = temperature.
        residence_time = numeric[:, 0:1]
        temperature = numeric[:, 1:2]
        shifted_temp = temperature + 273.15
        inv_temp = 1000.0 / shifted_temp
        log_time = np.log(residence_time + 1e-6)
        cross_term = inv_temp * log_time

        kinetic = torch.tensor(np.hstack([numeric, inv_temp, log_time, cross_term]))

        if not self.mixed:
            descriptors = torch.tensor(self.featurizer.loc[X["SOLVENT NAME"]].values)
            return torch.cat([kinetic, descriptors], dim=1)

        desc_a = torch.tensor(self.featurizer.loc[X["SOLVENT A NAME"]].values)
        desc_b = torch.tensor(self.featurizer.loc[X["SOLVENT B NAME"]].values)
        frac_b = torch.tensor(X["SolventB%"].values.reshape(-1, 1))
        if flip:
            descriptors = desc_b * (1 - (1 - frac_b)) + desc_a * (1 - frac_b)
        else:
            descriptors = desc_a * (1 - frac_b) + desc_b * frac_b

        return torch.cat([kinetic, descriptors], dim=1)

class MLPInternal(nn.Module):
    """Feed-forward network mapping ``input_dim`` features to 3 outputs.

    Layout: input BatchNorm, then three hidden stages of
    Linear -> BatchNorm -> ReLU -> Dropout(0.2) with widths 128, 128 and 64,
    then a Linear(64, 3) head followed by a Sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        stack = [nn.BatchNorm1d(input_dim)]
        fan_in = input_dim
        for width in (128, 128, 64):
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.BatchNorm1d(width))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(0.2))
            fan_in = width
        stack.append(nn.Linear(fan_in, 3))
        stack.append(nn.Sigmoid())
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        return self.net(x)

class MLPWithArrhenius(nn.Module):
    """Bagged ensemble of ``MLPInternal`` networks on kinetic + solvent features.

    Args:
        data: ``'full'`` for mixed-solvent data (features are built for both
            ``flip=False`` and ``flip=True``), anything else for single-solvent
            data.
        n_models: Number of independently initialised networks to train.
        epochs: Training epochs per network.
    """

    def __init__(self, data='single', n_models=5, epochs=250):  # Increased from 3/200
        super().__init__()
        self.data_type = data
        self.featurizer = KineticMixingFeaturizer(mixed=(data == 'full'))
        self.n_models = n_models
        self.epochs = epochs
        self.models = nn.ModuleList()

    def train_model(self, X_train, y_train):
        """Train ``n_models`` networks from scratch on the given data.

        Any networks left over from a previous call are discarded first, so
        calling this method repeatedly never grows the ensemble.

        Args:
            X_train: Input DataFrame accepted by the featurizer.
            y_train: Target DataFrame; its ``.values`` become the regression
                targets.
        """
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = torch.tensor(y_train.values)

        if self.data_type == 'full':
            # Augment with the flipped featurization (targets duplicated).
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = torch.cat([X_std, X_flip], dim=0)
            y_all = torch.cat([y_vals, y_vals], dim=0)
        else:
            X_all = X_std
            y_all = y_vals

        input_dim = X_all.shape[1]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        batch_size = 32
        dataset = TensorDataset(X_all, y_all)
        # BatchNorm1d cannot compute batch statistics from a single sample in
        # training mode, so drop the trailing batch only when it would contain
        # exactly one row. All other dataset sizes are batched as before.
        drop_last = len(dataset) > batch_size and len(dataset) % batch_size == 1

        # Start from an empty ensemble so retraining does not accumulate models.
        self.models = nn.ModuleList()

        for _ in range(self.n_models):
            model = MLPInternal(input_dim).to(device)
            model.train()
            self.models.append(model)

            loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

            optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
            criterion = nn.HuberLoss()
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20)

            for _epoch in range(self.epochs):
                epoch_loss = 0.0
                n_seen = 0
                for inputs, targets in loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    epoch_loss += loss.item() * inputs.size(0)
                    n_seen += inputs.size(0)
                # Mean training loss over the samples actually seen this epoch
                # (equals len(dataset) unless a size-1 batch was dropped).
                scheduler.step(epoch_loss / n_seen)

    def predict(self, X):
        """Average the predictions of all trained networks.

        For ``data='full'`` each network's output is the mean of its
        predictions on the standard and flipped featurizations.

        Returns:
            CPU tensor with one row per row of ``X`` and 3 columns.
        """
        device = next(self.models[0].parameters()).device

        X_std = self.featurizer.featurize(X, flip=False).to(device)
        X_flip = None
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X, flip=True).to(device)

        pred_sum = torch.zeros((len(X), 3)).to(device)
        with torch.no_grad():
            for model in self.models:
                model.eval()
                if X_flip is None:
                    pred_sum += model(X_std)
                else:
                    pred_sum += (model(X_std) + model(X_flip)) * 0.5

        # Divide by the number of networks that were actually summed.
        avg_pred = pred_sum / len(self.models)
        return avg_pred.cpu()

print("MLP component defined (5 models, 250 epochs)")

MLP component defined (5 models, 250 epochs)


In [5]:
# ============ TREE COMPONENTS ============
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb

class TreeFeaturizer:
    """Shared featurizer for tree-based models.

    Produces a NumPy matrix whose columns are, in order: residence time,
    temperature, ``1000 / (T + 273.15)``, ``log(time + 1e-6)``, the product of
    the previous two, the Spange descriptors and the ACS-PCA descriptors.
    """

    def __init__(self, data_type='single'):
        self.data_type = data_type
        self.spange_df = SPANGE_DF
        self.acs_pca_df = ACS_PCA_DF

    def create_features(self, X, flip=False):
        """Build the feature matrix for the rows of ``X``.

        Args:
            X: DataFrame with ``"Residence Time"`` and ``"Temperature"`` plus
                either ``"SOLVENT NAME"`` or, when ``data_type == 'full'``,
                the A/B name columns and ``"SolventB%"``.
            flip: Only used for ``data_type == 'full'``. Swaps A and B and
                replaces the fraction by ``1 - fraction``, which describes
                the same linear blend as the non-flipped case.

        Returns:
            2-D ``numpy.ndarray`` with one row per row of ``X``.
        """
        residence = X["Residence Time"].values.reshape(-1, 1)
        temperature = X["Temperature"].values.reshape(-1, 1)

        shifted_temp = temperature + 273.15
        inv_temp = 1000.0 / shifted_temp
        log_time = np.log(residence + 1e-6)
        cross_term = inv_temp * log_time
        kinetic_cols = [residence, temperature, inv_temp, log_time, cross_term]

        if self.data_type != 'full':
            solvent = X["SOLVENT NAME"]
            descriptor_cols = [
                self.spange_df.loc[solvent].values,
                self.acs_pca_df.loc[solvent].values,
            ]
            return np.hstack(kinetic_cols + descriptor_cols)

        frac = X["SolventB%"].values.reshape(-1, 1)
        first, second = X["SOLVENT A NAME"], X["SOLVENT B NAME"]
        if flip:
            first, second = second, first
            frac = 1 - frac

        descriptor_cols = []
        for table in (self.spange_df, self.acs_pca_df):
            from_first = table.loc[first].values
            from_second = table.loc[second].values
            descriptor_cols.append(from_first * (1 - frac) + from_second * frac)

        return np.hstack(kinetic_cols + descriptor_cols)

print("TreeFeaturizer defined")

TreeFeaturizer defined


In [6]:
# ============ LIGHTGBM COMPONENT ============
class LightGBMModel(BaseModel):
    """One LightGBM regressor per target on standardised tree features.

    For ``data='full'`` the training set is doubled with the flipped
    featurization and predictions are averaged over both featurizations.
    """

    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.featurizer = TreeFeaturizer(data_type=data)
        self.scaler = StandardScaler()
        self.models = {}

    def train_model(self, X_train, y_train):
        """Fit the scaler and one regressor per entry of ``TARGET_LABELS``."""
        features = self.featurizer.create_features(X_train, flip=False)
        targets = y_train

        if self.data_type == 'full':
            flipped = self.featurizer.create_features(X_train, flip=True)
            features = np.vstack([features, flipped])
            # Targets are repeated in the same order as the stacked features.
            targets = pd.concat([y_train, y_train], ignore_index=True)

        scaled = self.scaler.fit_transform(features)

        for target in TARGET_LABELS:
            regressor = lgb.LGBMRegressor(
                n_estimators=500,
                learning_rate=0.03,
                max_depth=6,
                num_leaves=31,
                random_state=42,
                verbose=-1,
                n_jobs=-1,
            )
            self.models[target] = regressor
            regressor.fit(scaled, targets[target].values)

    def predict(self, X_test):
        """Return a tensor with columns Product 2, Product 3, SM clipped to [0, 1]."""
        scaled = self.scaler.transform(
            self.featurizer.create_features(X_test, flip=False)
        )
        preds = {target: self.models[target].predict(scaled) for target in TARGET_LABELS}

        if self.data_type == 'full':
            flipped_scaled = self.scaler.transform(
                self.featurizer.create_features(X_test, flip=True)
            )
            for target in TARGET_LABELS:
                flipped_pred = self.models[target].predict(flipped_scaled)
                preds[target] = (preds[target] + flipped_pred) / 2

        for target in TARGET_LABELS:
            preds[target] = np.clip(preds[target], 0, 1)

        stacked = np.column_stack([preds['Product 2'], preds['Product 3'], preds['SM']])
        return torch.tensor(stacked)

print("LightGBM component defined")

LightGBM component defined


In [7]:
# ============ XGBOOST COMPONENT ============
class XGBoostModel(BaseModel):
    """One XGBoost regressor per target on standardised tree features.

    For ``data='full'`` the training set is doubled with the flipped
    featurization and predictions are averaged over both featurizations.
    """

    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.featurizer = TreeFeaturizer(data_type=data)
        self.scaler = StandardScaler()
        self.models = {}

    def _scaled_features(self, X, flip):
        """Featurize ``X`` and apply the already-fitted scaler."""
        return self.scaler.transform(self.featurizer.create_features(X, flip=flip))

    def train_model(self, X_train, y_train):
        """Fit the scaler and one regressor per entry of ``TARGET_LABELS``."""
        design = self.featurizer.create_features(X_train, flip=False)
        is_full = self.data_type == 'full'

        if is_full:
            mirrored = self.featurizer.create_features(X_train, flip=True)
            design = np.vstack([design, mirrored])
        # Targets are duplicated to line up with the stacked feature rows.
        labels = pd.concat([y_train, y_train], ignore_index=True) if is_full else y_train

        design_scaled = self.scaler.fit_transform(design)

        for target in TARGET_LABELS:
            booster = xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.03,
                max_depth=6,
                random_state=42,
                verbosity=0,
                n_jobs=-1,
            )
            self.models[target] = booster
            booster.fit(design_scaled, labels[target].values)

    def predict(self, X_test):
        """Return a tensor with columns Product 2, Product 3, SM clipped to [0, 1]."""
        plain = self._scaled_features(X_test, flip=False)
        preds = {}
        for target in TARGET_LABELS:
            preds[target] = self.models[target].predict(plain)

        if self.data_type == 'full':
            mirrored = self._scaled_features(X_test, flip=True)
            for target in TARGET_LABELS:
                preds[target] = (preds[target] + self.models[target].predict(mirrored)) / 2

        clipped = {target: np.clip(values, 0, 1) for target, values in preds.items()}

        return torch.tensor(
            np.column_stack([clipped['Product 2'], clipped['Product 3'], clipped['SM']])
        )

print("XGBoost component defined")

XGBoost component defined


In [8]:
# ============ SKLEARN TREES COMPONENT ============
class SklearnTreesModel(BaseModel):
    """scikit-learn tree models: HistGradientBoosting for SM, ExtraTrees for products.

    For ``data='full'`` the training set is doubled with the flipped
    featurization and predictions are averaged over both featurizations.
    """

    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.featurizer = TreeFeaturizer(data_type=data)
        self.scaler = StandardScaler()
        self.models = {}

    def train_model(self, X_train, y_train):
        """Fit the scaler, then the SM, Product 2 and Product 3 models in that order."""
        feature_blocks = [self.featurizer.create_features(X_train, flip=False)]
        labels = y_train

        if self.data_type == 'full':
            feature_blocks.append(self.featurizer.create_features(X_train, flip=True))
            labels = pd.concat([y_train, y_train], ignore_index=True)

        if len(feature_blocks) > 1:
            raw = np.vstack(feature_blocks)
        else:
            raw = feature_blocks[0]
        scaled = self.scaler.fit_transform(raw)

        # SM: HistGradientBoosting (better for smooth targets)
        sm_model = HistGradientBoostingRegressor(
            max_depth=7, max_iter=700, learning_rate=0.04, random_state=42
        )
        self.models['SM'] = sm_model
        sm_model.fit(scaled, labels['SM'].values)

        # Products: ExtraTrees
        for product in ('Product 2', 'Product 3'):
            forest = ExtraTreesRegressor(
                n_estimators=500, min_samples_leaf=2, random_state=42, n_jobs=-1
            )
            self.models[product] = forest
            forest.fit(scaled, labels[product].values)

    def predict(self, X_test):
        """Return a tensor with columns Product 2, Product 3, SM clipped to [0, 1]."""
        base_scaled = self.scaler.transform(
            self.featurizer.create_features(X_test, flip=False)
        )
        preds = {name: self.models[name].predict(base_scaled) for name in TARGET_LABELS}

        if self.data_type == 'full':
            flip_scaled = self.scaler.transform(
                self.featurizer.create_features(X_test, flip=True)
            )
            for name in TARGET_LABELS:
                preds[name] = (preds[name] + self.models[name].predict(flip_scaled)) / 2

        for name in TARGET_LABELS:
            preds[name] = np.clip(preds[name], 0, 1)

        ordered = [preds['Product 2'], preds['Product 3'], preds['SM']]
        return torch.tensor(np.column_stack(ordered))

print("Sklearn Trees component defined")

Sklearn Trees component defined


In [9]:
# ============ EXPANDED ENSEMBLE ============
class ExpandedEnsembleModel(BaseModel):
    """4-model ensemble: MLP + LightGBM + XGBoost + SklearnTrees.

    Args:
        data: ``'single'`` or ``'full'``; forwarded to every component.
        weights: Optional sequence of four blend weights in the order
            (MLP, LightGBM, XGBoost, SklearnTrees). Lists, tuples and NumPy
            arrays are accepted. ``None`` or an empty sequence selects equal
            weights of 0.25.
    """

    def __init__(self, data='single', weights=None):
        super().__init__()
        self.data_type = data
        # Explicit None/empty check: the truth value of a multi-element NumPy
        # array (e.g. optimised weights) is ambiguous and would raise.
        if weights is None or len(weights) == 0:
            self.weights = [0.25, 0.25, 0.25, 0.25]
        else:
            self.weights = weights

        # Initialize components
        self.mlp = MLPWithArrhenius(data=data, n_models=5, epochs=250)
        self.lgbm = LightGBMModel(data=data)
        self.xgb = XGBoostModel(data=data)
        self.trees = SklearnTreesModel(data=data)

    def train_model(self, X_train, y_train):
        """Train all 4 components on the same training data."""
        self.mlp.train_model(X_train, y_train)
        self.lgbm.train_model(X_train, y_train)
        self.xgb.train_model(X_train, y_train)
        self.trees.train_model(X_train, y_train)

    def predict(self, X_test):
        """Weighted sum of the 4 component predictions, clamped to [0, 1]."""
        pred_mlp = self.mlp.predict(X_test)
        pred_lgbm = self.lgbm.predict(X_test)
        pred_xgb = self.xgb.predict(X_test)
        pred_trees = self.trees.predict(X_test)

        # Weighted average (weights are applied as given, not renormalised)
        combined = (self.weights[0] * pred_mlp +
                    self.weights[1] * pred_lgbm +
                    self.weights[2] * pred_xgb +
                    self.weights[3] * pred_trees)

        return torch.clamp(combined, 0, 1)

print("ExpandedEnsembleModel defined (4 models)")

ExpandedEnsembleModel defined (4 models)


In [10]:
# Quick test
# Smoke test: train the full ensemble on one leave-one-solvent-out fold of the
# single-solvent data and print the prediction shape and a few sample rows.
print("Testing expanded ensemble...")
X, Y = load_data("single_solvent")
print(f"Single solvent: {X.shape}, {Y.shape}")

# Test on first fold
# `next` takes only the first fold from the generator (the first solvent in
# sorted order is held out).
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

model = ExpandedEnsembleModel(data='single')
model.train_model(train_X, train_Y)
preds = model.predict(test_X)
print(f"Predictions shape: {preds.shape}")
print(f"Sample predictions: {preds[:3]}")

Testing expanded ensemble...
Single solvent: (656, 3), (656, 3)


Predictions shape: torch.Size([37, 3])
Sample predictions: tensor([[0.0110, 0.0146, 0.8716],
        [0.0200, 0.0152, 0.8811],
        [0.0446, 0.0439, 0.7923]])


In [11]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
all_actuals = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=24):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ExpandedEnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()
    all_actuals.append(test_Y.values)

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

# Calculate CV score
all_actuals_np = np.vstack(all_actuals)
all_preds_np = np.array([[p['target_1'], p['target_2'], p['target_3']] for p in all_predictions])
single_mse = np.mean((all_actuals_np - all_preds_np) ** 2)
print(f"\nSingle Solvent CV MSE: {single_mse:.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:55<21:08, 55.17s/it]

  8%|▊         | 2/24 [01:49<19:59, 54.53s/it]

 12%|█▎        | 3/24 [02:41<18:39, 53.32s/it]

 17%|█▋        | 4/24 [03:33<17:35, 52.75s/it]

 21%|██        | 5/24 [04:27<16:55, 53.43s/it]

 25%|██▌       | 6/24 [05:22<16:08, 53.78s/it]

 29%|██▉       | 7/24 [06:16<15:17, 53.99s/it]

 33%|███▎      | 8/24 [07:10<14:26, 54.14s/it]

 38%|███▊      | 9/24 [08:05<13:34, 54.27s/it]

 42%|████▏     | 10/24 [09:00<12:42, 54.46s/it]

 46%|████▌     | 11/24 [09:55<11:50, 54.62s/it]

 50%|█████     | 12/24 [10:50<10:55, 54.64s/it]

 54%|█████▍    | 13/24 [11:44<10:00, 54.63s/it]

 58%|█████▊    | 14/24 [12:39<09:06, 54.63s/it]

 62%|██████▎   | 15/24 [13:34<08:12, 54.71s/it]

 67%|██████▋   | 16/24 [14:29<07:19, 54.95s/it]

 71%|███████   | 17/24 [15:27<06:30, 55.78s/it]

 75%|███████▌  | 18/24 [16:22<05:34, 55.70s/it]

 79%|███████▉  | 19/24 [17:17<04:36, 55.28s/it]

 83%|████████▎ | 20/24 [18:11<03:40, 55.10s/it]

 88%|████████▊ | 21/24 [19:06<02:44, 54.99s/it]

 92%|█████████▏| 22/24 [20:03<01:51, 55.53s/it]

 96%|█████████▌| 23/24 [20:57<00:55, 55.21s/it]

100%|██████████| 24/24 [21:53<00:00, 55.17s/it]

100%|██████████| 24/24 [21:53<00:00, 54.71s/it]


Single Solvent CV MSE: 0.010684





In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions_full = []
all_actuals_full = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=13):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ExpandedEnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()
    all_actuals_full.append(test_Y.values)

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions_full.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions_full)

# Calculate CV score
all_actuals_full_np = np.vstack(all_actuals_full)
all_preds_full_np = np.array([[p['target_1'], p['target_2'], p['target_3']] for p in all_predictions_full])
full_mse = np.mean((all_actuals_full_np - all_preds_full_np) ** 2)
print(f"\nFull Data CV MSE: {full_mse:.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

# Calculate overall CV score
total_samples = len(all_actuals_np) + len(all_actuals_full_np)
overall_mse = (single_mse * len(all_actuals_np) + full_mse * len(all_actuals_full_np)) / total_samples

print(f"\n=== FINAL RESULTS ===")
print(f"Single Solvent MSE: {single_mse:.6f}")
print(f"Full Data MSE: {full_mse:.6f}")
print(f"Overall CV MSE: {overall_mse:.6f}")
print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"Submission shape: {submission.shape}")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################