In [1]:
# Experiment 063: CORRECT Final Cell Structure
# THE FINAL CELL MUST BE EXACTLY THE TEMPLATE - NO EXTRA CODE!
# CV calculation is in a SEPARATE cell AFTER the final cell

import sys
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

# Define constants
INPUT_LABELS_FULL_SOLVENT = [
    "Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"
]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

DATA_PATH = "/home/data"

def load_data(name="full"):
    """Read one catechol yield CSV from DATA_PATH and split inputs from targets.

    Args:
        name: "full" reads catechol_full_data_yields.csv and selects the
            INPUT_LABELS_FULL_SOLVENT columns; "single_solvent" reads
            catechol_single_solvent_yields.csv and selects the
            INPUT_LABELS_SINGLE_SOLVENT columns.

    Returns:
        Tuple (X, Y): X is the frame of selected input columns and Y the
        frame of TARGET_LABELS columns, both taken from the same CSV.

    Raises:
        AssertionError: if ``name`` is not one of the two supported values.
    """
    assert name in ["full", "single_solvent"]
    wants_full = name == "full"
    csv_name = (
        "catechol_full_data_yields.csv"
        if wants_full
        else "catechol_single_solvent_yields.csv"
    )
    input_labels = INPUT_LABELS_FULL_SOLVENT if wants_full else INPUT_LABELS_SINGLE_SOLVENT
    frame = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return frame[input_labels], frame[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Load the `<name>_lookup.csv` descriptor table from DATA_PATH.

    Args:
        name: one of "spange_descriptors", "acs_pca_descriptors",
            "drfps_catechol", "fragprints" or "smiles".

    Returns:
        DataFrame read with the CSV's first column used as the index.

    Raises:
        AssertionError: if ``name`` is not a known lookup table.
    """
    known_tables = (
        "spange_descriptors",
        "acs_pca_descriptors",
        "drfps_catechol",
        "fragprints",
        "smiles",
    )
    assert name in known_tables
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds, one per unique "SOLVENT NAME".

    Solvents are visited in sorted order. Each fold is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds the
    rows of the held-out solvent and the train part holds every other row.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out in sorted(solvent_column.unique()):
        is_test = solvent_column == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out folds over unique (solvent A, solvent B) pairs.

    Pairs are visited sorted by "SOLVENT A NAME" then "SOLVENT B NAME". For
    each held-out pair a row is kept for training only when BOTH of its
    solvent names differ from the pair; every remaining row (it matches the
    pair in at least one of the two columns) goes to the test part. Each fold
    is ``((train_X, train_Y), (test_X, test_Y))``.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    solvent_pairs = X[pair_columns]
    ramps = solvent_pairs.drop_duplicates().sort_values(by=pair_columns)
    for _, held_out_pair in ramps.iterrows():
        # Column-aligned comparison: True where a row's solvent differs from the pair's.
        in_train = (solvent_pairs != held_out_pair).all(axis=1)
        in_test = ~in_train
        yield (X[in_train], Y[in_train]), (X[in_test], Y[in_test])

print('Imports and data loading functions defined')

Imports and data loading functions defined


In [2]:
# Feature engineering functions

def feature_priority(name: str) -> int:
    """Rank a feature column by the descriptor family its name prefix denotes.

    Returns 5 for "spange_", 4 for "acs_", 3 for "drfps_", 2 for "frag_",
    1 for "smiles_" and 0 for any other name. A higher rank means the column
    is preferred when one of two correlated features has to be dropped.
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    return next(
        (rank for prefix, rank in ranked_prefixes if name.startswith(prefix)),
        0,
    )

def filter_correlated_features(df: pd.DataFrame, threshold: float = 0.90):
    """Drop zero-variance numeric columns and one column of each highly correlated pair.

    Args:
        df: feature table; non-numeric columns are ignored by the analysis
            and always kept.
        threshold: a pair of numeric columns is treated as redundant when
            its absolute correlation is strictly greater than this value.

    Returns:
        Tuple (filtered_df, dropped): ``filtered_df`` is ``df`` without the
        dropped columns and ``dropped`` is the list of removed column names
        (in no particular order). When ``df`` has no numeric columns it is
        returned unchanged together with an empty list.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        return df, []

    # Zero-variance columns are removed before computing correlations.
    spread = numeric.std(axis=0)
    zero_variance = [column for column, value in spread.items() if value == 0]
    if zero_variance:
        numeric = numeric.drop(columns=zero_variance)

    # Keep only the strict upper triangle; undefined correlations count as 0.
    abs_corr = numeric.corr().abs()
    strictly_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(strictly_upper).fillna(0.0)
    names = upper.columns.tolist()
    upper_values = upper.to_numpy()

    # triu_indices walks the (i < j) pairs in row-major order.
    first_idx, second_idx = np.triu_indices(len(names), k=1)
    candidate_pairs = [
        (names[i], names[j])
        for i, j in zip(first_idx, second_idx)
        if upper_values[i, j] > threshold
    ]

    discarded = set()
    for first, second in candidate_pairs:
        # A pair is already resolved once either member has been dropped.
        if first in discarded or second in discarded:
            continue
        rank_first = feature_priority(first)
        rank_second = feature_priority(second)
        if rank_first != rank_second:
            # The lower-priority descriptor family loses.
            loser = second if rank_first > rank_second else first
        else:
            # Same family: drop whichever column sits later in df.
            position_first = df.columns.get_loc(first)
            position_second = df.columns.get_loc(second)
            loser = first if position_first > position_second else second
        discarded.add(loser)

    removed = list(set(zero_variance).union(discarded))
    return df.drop(columns=removed, errors="ignore"), removed

print('Feature engineering functions defined')

Feature engineering functions defined


In [3]:
# Build combined solvent feature table

def build_solvent_feature_table(threshold: float = 0.90):
    """Merge the solvent descriptor lookups into one table and prune redundant columns.

    The spange, ACS-PCA, DRFP and fragprint lookups are loaded with
    ``load_features``, their feature columns are prefixed with "spange_",
    "acs_", "drfps_" and "frag_" respectively, and the tables are outer-merged
    on "SOLVENT NAME". The merged table is then passed through
    ``filter_correlated_features``.

    For the two fingerprint sources ("drfps_catechol", "fragprints") columns
    are removed before merging when every value is 0, every value is 1, or
    the column sums to exactly 1.

    Args:
        threshold: absolute-correlation cutoff forwarded to
            ``filter_correlated_features``.

    Returns:
        The filtered solvent feature table (one "SOLVENT NAME" column plus
        the surviving prefixed feature columns).
    """
    from functools import reduce

    # Insertion order fixes the merge order and therefore the column order.
    source_prefixes = {
        "spange_descriptors": "spange",
        "acs_pca_descriptors": "acs",
        "drfps_catechol": "drfps",
        "fragprints": "frag",
    }
    fingerprint_sources = {"drfps_catechol", "fragprints"}

    dfs = []
    for src, prefix in source_prefixes.items():
        df_src = load_features(src).copy()

        if "SOLVENT NAME" not in df_src.columns:
            # load_features puts the solvent names in the index. Name the index
            # explicitly so the merge key exists whatever header the CSV used
            # for that column (unnamed, "SOLVENT NAME", or anything else).
            df_src = df_src.rename_axis("SOLVENT NAME").reset_index()

        if src in fingerprint_sources:
            # Drop uninformative bits: all-zero, all-one, or summing to exactly 1.
            df_src = df_src.loc[:, (df_src != 0).any(axis=0)]
            df_src = df_src.loc[:, (df_src != 1).any(axis=0)]
            values = df_src.drop(columns=["SOLVENT NAME"])
            count = values.sum(axis=0)
            drop_cols = count[count == 1].index
            df_src = df_src.drop(columns=drop_cols)

        df_src = df_src.rename(
            columns={c: f"{prefix}_{c}" for c in df_src.columns if c != "SOLVENT NAME"}
        )
        dfs.append(df_src)

    merged = reduce(
        lambda left, right: pd.merge(left, right, on="SOLVENT NAME", how="outer"),
        dfs,
    )

    merged_filtered, dropped = filter_correlated_features(merged, threshold=threshold)
    print(f"Combined features: {merged.shape} -> {merged_filtered.shape} (dropped {len(dropped)})")

    return merged_filtered

# Build the combined solvent lookup once at notebook scope.
# CatBoostXGBEnsemble.__init__ reads this global `solvent_table` when it
# constructs its CombinedFeaturizer, so this cell must run before any model is created.
solvent_table = build_solvent_feature_table(threshold=0.90)
print(f"Solvent table shape: {solvent_table.shape}")

Combined features: (26, 114) -> (26, 67) (dropped 47)
Solvent table shape: (26, 67)


In [4]:
# Featurizer and Model classes

class BaseModel(ABC):
    """Interface for the models plugged into the cross-validation cells.

    Subclasses are expected to override ``train_model`` and ``predict``; the
    base implementations only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for the inputs ``X``.

        ``X`` defaults to ``None`` only so that existing zero-argument calls
        keep working; implementations take the inputs to predict on.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError

class CombinedFeaturizer:
    """Join solvent descriptors onto experiment rows and standardize the result.

    Args:
        solvent_table: descriptor table with a "SOLVENT NAME" column.
        data: 'single' joins once on "SOLVENT NAME"; any other value joins
            twice, on "SOLVENT A NAME" and "SOLVENT B NAME", suffixing the
            descriptor columns with "_A" and "_B".
    """

    def __init__(self, solvent_table, data='single'):
        self.solvent_table = solvent_table
        self.data_mode = data
        self.scaler = None        # StandardScaler, created on the first fit
        self.feature_cols = None  # numeric column names seen at the last fit

    def featurize(self, X, fit_scaler=False):
        """Return the standardized numeric feature matrix for ``X``.

        Args:
            X: experiment rows containing the solvent name column(s) for the
                configured mode.
            fit_scaler: when True, a new scaler is fitted on these rows.
                When False the existing scaler is reused; if none has been
                fitted yet, one is fitted on these rows instead.

        Returns:
            ``torch.double`` tensor with one row per row of ``X`` and one
            column per numeric column of the joined table.
        """
        frame = X.copy()
        accepted_dtypes = (np.float64, np.int64, np.float32, np.int32)

        if self.data_mode == 'single':
            name_cols = ['SOLVENT NAME']
            merged = frame.merge(self.solvent_table, on='SOLVENT NAME', how='left')
        else:
            name_cols = ['SOLVENT A NAME', 'SOLVENT B NAME']
            merged = frame
            for name_col, suffix in zip(name_cols, ('A', 'B')):
                # Same descriptors, relabelled for this side of the mixture.
                side_table = self.solvent_table.rename(
                    columns=lambda c: name_col if c == 'SOLVENT NAME' else f'{c}_{suffix}'
                )
                merged = merged.merge(side_table, on=name_col, how='left')

        numeric_cols = [
            c for c in merged.columns
            if c not in name_cols and merged[c].dtype in accepted_dtypes
        ]
        # Solvents missing from the table produce NaNs in the left join; zero-fill them.
        numeric = merged[numeric_cols].copy().fillna(0.0)

        if fit_scaler or self.scaler is None:
            self.scaler = StandardScaler()
            scaled = self.scaler.fit_transform(numeric)
            self.feature_cols = numeric_cols
        else:
            scaled = self.scaler.transform(numeric)

        return torch.tensor(scaled, dtype=torch.double)

print('Featurizer class defined')

Featurizer class defined


In [5]:
# CatBoost + XGBoost Ensemble Model

class CatBoostXGBEnsemble(BaseModel):
    """Weighted blend of one multi-target CatBoost model and per-target XGBoost models.

    Args:
        data: 'single' selects the single-solvent weights and CatBoost
            parameters; any other value selects the mixture ("full") ones.
            The value is also forwarded to the CombinedFeaturizer.
        verbose: stored on the instance; not otherwise used by this class.

    Note:
        The featurizer is built from the notebook-level ``solvent_table``.
    """

    def __init__(self, data='single', verbose=False):
        self.data_mode = data
        self.verbose = verbose
        self.featurizer = CombinedFeaturizer(solvent_table, data=data)

        single_mode = data == 'single'

        # Ensemble weights from ens-model kernel, normalized to sum to 1.
        raw_cat, raw_xgb = (7.0, 6.0) if single_mode else (1.0, 2.0)
        weight_total = raw_cat + raw_xgb
        self.cat_weight = raw_cat / weight_total
        self.xgb_weight = raw_xgb / weight_total

        # CatBoost parameters; only these three values depend on the data mode.
        learning_rate, n_estimators, l2_leaf_reg = (
            (0.07, 1050, 3.5) if single_mode else (0.06, 1100, 2.5)
        )
        self.cat_params = dict(
            random_seed=42,
            loss_function="MultiRMSE",
            depth=3,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            l2_leaf_reg=l2_leaf_reg,
            verbose=False,
        )

        # XGBoost parameters are shared by both data modes.
        self.xgb_params = dict(
            random_state=42,
            n_estimators=1000,
            max_depth=4,
            learning_rate=0.02,
            subsample=0.5,
            colsample_bytree=0.8,
            verbosity=0,
        )

        self.cat_model = None
        self.xgb_models = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit the scaler, the CatBoost model and one XGBoost model per target column.

        ``device`` and ``verbose`` are accepted but not used.
        """
        features = self.featurizer.featurize(train_X, fit_scaler=True).numpy()
        targets = train_Y.values

        # CatBoost learns all target columns jointly.
        self.cat_model = CatBoostRegressor(**self.cat_params)
        self.cat_model.fit(features, targets)

        # XGBoost gets an independent regressor for each target column.
        self.xgb_models = []
        for target_idx in range(targets.shape[1]):
            regressor = XGBRegressor(**self.xgb_params)
            regressor.fit(features, targets[:, target_idx])
            self.xgb_models.append(regressor)

    def predict(self, X):
        """Return blended predictions for ``X`` as a ``torch.double`` tensor.

        Each model's output is clipped at 0 from below before blending. Rows
        of the blend whose total exceeds 1 are divided by that total; other
        rows are returned as they are.
        """
        features = self.featurizer.featurize(X, fit_scaler=False).numpy()

        cat_part = np.clip(self.cat_model.predict(features), 0.0, None)

        per_target = [regressor.predict(features) for regressor in self.xgb_models]
        xgb_part = np.clip(np.column_stack(per_target), 0.0, None)

        blended = self.cat_weight * cat_part + self.xgb_weight * xgb_part

        # Dividing by max(row_sum, 1) rescales only rows that sum to more than 1.
        row_sums = blended.sum(axis=1, keepdims=True)
        blended = blended / np.maximum(row_sums, 1.0)

        return torch.tensor(blended, dtype=torch.double)

print('CatBoostXGBEnsemble class defined')

CatBoostXGBEnsemble class defined


In [6]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = CatBoostXGBEnsemble() # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:01,  1.27s/it]

2it [00:02,  1.08s/it]

3it [00:03,  1.01s/it]

4it [00:04,  1.01it/s]

5it [00:05,  1.03it/s]

6it [00:05,  1.04it/s]

7it [00:06,  1.03it/s]

8it [00:07,  1.03it/s]

9it [00:08,  1.03it/s]

10it [00:09,  1.04it/s]

11it [00:10,  1.05it/s]

12it [00:11,  1.03it/s]

13it [00:12,  1.04it/s]

14it [00:13,  1.04it/s]

15it [00:14,  1.05it/s]

16it [00:15,  1.02s/it]

17it [00:16,  1.01s/it]

18it [00:17,  1.00s/it]

19it [00:18,  1.01s/it]

20it [00:19,  1.01s/it]

21it [00:20,  1.00it/s]

22it [00:21,  1.00it/s]

23it [00:22,  1.01it/s]

24it [00:23,  1.02it/s]

24it [00:23,  1.01it/s]




In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = CatBoostXGBEnsemble(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:01,  1.24s/it]

2it [00:02,  1.25s/it]

3it [00:03,  1.26s/it]

4it [00:05,  1.26s/it]

5it [00:06,  1.30s/it]

6it [00:07,  1.30s/it]

7it [00:08,  1.29s/it]

8it [00:10,  1.29s/it]

9it [00:11,  1.29s/it]

10it [00:12,  1.29s/it]

11it [00:14,  1.31s/it]

12it [00:15,  1.33s/it]

13it [00:16,  1.35s/it]

13it [00:16,  1.31s/it]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [9]:
# CV CALCULATION - This cell is AFTER the final submission cell
# It will be ignored by Kaggle but useful for local evaluation

import os
from sklearn.metrics import mean_squared_error

# Save to /home/submission for local use
os.makedirs('/home/submission', exist_ok=True)
submission.to_csv('/home/submission/submission.csv', index=True)


def _cv_mse(splits, fold_predictions):
    """Score stored per-fold predictions against the held-out targets.

    Args:
        splits: iterable of ``((train_X, train_Y), (test_X, test_Y))`` folds,
            in the same order that produced ``fold_predictions``.
        fold_predictions: DataFrame with a 'fold' column and the prediction
            columns 'target_1', 'target_2', 'target_3'.

    Returns:
        Mean squared error over all folds and all three targets.

    Raises:
        ValueError: if a fold has a different number of prediction rows than
            held-out rows. Without this check, offsetting mismatches in two
            folds would still stack to equal lengths and silently produce a
            misaligned score.
    """
    target_cols = ['target_1', 'target_2', 'target_3']
    y_true, y_pred = [], []
    for fold_idx, (_, (_, test_Y)) in enumerate(splits):
        fold_rows = fold_predictions[fold_predictions['fold'] == fold_idx]
        if len(fold_rows) != len(test_Y):
            raise ValueError(
                f"fold {fold_idx}: {len(fold_rows)} prediction rows "
                f"but {len(test_Y)} held-out rows"
            )
        y_true.append(test_Y.values)
        y_pred.append(fold_rows[target_cols].values)
    return mean_squared_error(np.vstack(y_true), np.vstack(y_pred))


# Single solvent CV
X_single, Y_single = load_data("single_solvent")
mse_single = _cv_mse(
    generate_leave_one_out_splits(X_single, Y_single),
    submission_single_solvent,
)

# Full data CV
X_full, Y_full = load_data("full")
mse_full = _cv_mse(
    generate_leave_one_ramp_out_splits(X_full, Y_full),
    submission_full_data,
)

print(f'Single Solvent CV MSE: {mse_single:.6f}')
print(f'Full Data CV MSE: {mse_full:.6f}')
print(f'Submission saved with {len(submission)} rows')

Single Solvent CV MSE: 0.008811
Full Data CV MSE: 0.015203
Submission saved with 1883 rows
