# Ens-Model Kernel Replica

**Hypothesis**: The "ens-model" kernel has sophisticated feature engineering that might change the CV-LB relationship.

**Key techniques from ens-model kernel**:
1. Combine ALL feature sources (spange, acs_pca, drfps, fragprints, smiles)
2. Filter correlated features with priority (spange > acs > drfps > frag > smiles)
3. Add numeric features (T_inv, T_x_RT, RT_log, RT_scaled)
4. CatBoost + XGBoost ensemble with task-specific weights
   - Single: CatBoost=7, XGBoost=6 (0.538, 0.462)
   - Full: CatBoost=1, XGBoost=2 (0.333, 0.667)
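5. Output normalization: clip predictions to [0, ∞), then divide each row by max(row sum, 1) so the three targets sum to at most 1 (rows already summing to ≤ 1 are left unchanged)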

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
import xgboost as xgb
import tqdm
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
torch.set_default_dtype(torch.double)

print('Imports complete')

Imports complete


In [2]:
# Data loading functions
DATA_PATH = '/home/data'

INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one catechol yield CSV and split it into inputs and targets.

    ``name == "full"`` selects the mixed-solvent file and its five input
    columns; every other value selects the single-solvent file and its three
    input columns.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns and ``Y`` holds the
        "Product 2", "Product 3" and "SM" columns of the same rows.
    """
    use_full = name == "full"
    csv_name = 'catechol_full_data_yields.csv' if use_full else 'catechol_single_solvent_yields.csv'
    input_cols = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return frame[input_cols], frame[["Product 2", "Product 3", "SM"]]

def load_features(name):
    """Load ``<DATA_PATH>/<name>_lookup.csv`` with its first column as the index."""
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the "SOLVENT NAME" column. Each
    item is ``((train_X, train_Y), (test_X, test_Y))`` where the test part
    contains exactly the rows of the held-out solvent.
    """
    solvent_col = X["SOLVENT NAME"]
    for held_out in sorted(solvent_col.unique()):
        is_test = solvent_col == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows whose two solvent names match the held-out pair.
    """
    pair_cols = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_cols].drop_duplicates()
    for solvent_a, solvent_b in unique_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Feature engineering functions from ens-model kernel

def feature_priority(name):
    """Return the priority score of a feature column, decided by its name prefix.

    Higher scores win when two correlated features compete:
    ``spange_`` (5) > ``acs_`` (4) > ``drfps_`` (3) > ``frag_`` (2) >
    ``smiles_`` (1). Any other name scores 0.
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    for prefix, score in ranked_prefixes:
        if name.startswith(prefix):
            return score
    return 0

def filter_correlated_features(df, threshold=0.90):
    """Drop constant and highly correlated numeric columns, keeping higher-priority ones.

    Only numeric columns are examined; non-numeric columns are always kept.
    Columns with zero standard deviation are dropped first. Then every pair of
    remaining columns whose absolute Pearson correlation exceeds ``threshold``
    is visited in column order, and one member of the pair is dropped: the one
    with the lower ``feature_priority``, or the later column on a tie. A pair
    is skipped once either member has already been dropped.

    Args:
        df: Input frame (may contain non-numeric columns).
        threshold: Absolute-correlation cut-off; pairs strictly above it are
            treated as redundant.

    Returns:
        ``(df_filtered, dropped)`` where ``dropped`` lists the removed column
        names in the order they appear in ``df`` (deterministic across runs).
    """
    numeric_df = df.select_dtypes(include=[np.number])

    if numeric_df.shape[1] == 0:
        return df, []

    # Constant columns carry no signal and have undefined correlation.
    std = numeric_df.std(axis=0)
    constant_cols = std[std == 0].index.tolist()
    if constant_cols:
        numeric_df = numeric_df.drop(columns=constant_cols)

    # Undefined correlations (NaN) are treated as 0, i.e. never "too correlated".
    corr = numeric_df.corr().abs().fillna(0.0)
    cols = corr.columns.tolist()

    # Strict upper triangle: each unordered pair once, in row-major (i, then j) order.
    above_threshold = np.triu(corr.to_numpy() > threshold, k=1)
    first_idx, second_idx = np.nonzero(above_threshold)

    to_drop = set()
    for i, j in zip(first_idx, second_idx):
        col_i, col_j = cols[i], cols[j]
        if col_i in to_drop or col_j in to_drop:
            continue

        # col_j always comes after col_i, so "drop the later one" on a tie
        # means dropping col_j; col_i is only dropped when col_j outranks it.
        if feature_priority(col_j) > feature_priority(col_i):
            to_drop.add(col_i)
        else:
            to_drop.add(col_j)

    # Report in column order rather than hash-dependent set order.
    drop_set = set(constant_cols) | to_drop
    all_to_drop = list(dict.fromkeys(c for c in df.columns if c in drop_set))
    df_filtered = df.drop(columns=all_to_drop, errors="ignore")

    return df_filtered, all_to_drop

def add_numeric_features(X_numeric, rt_mean=None):
    """Return a copy of ``X_numeric`` with engineered process features appended.

    When both "Temperature" and "Residence Time" are present, the copy gets:

    * "Temperature" shifted by +273.15 (the input is treated as Celsius),
    * "T_x_RT"    - shifted temperature times residence time,
    * "RT_log"    - ``log(residence time + 1e-6)``,
    * "T_inv"     - reciprocal of the shifted temperature,
    * "RT_scaled" - residence time divided by a reference mean.

    If either column is missing the copy is returned unchanged.

    Args:
        X_numeric: Frame holding the raw numeric inputs; it is not modified.
        rt_mean: Reference value for "RT_scaled". ``None`` (the default) uses
            the mean residence time of ``X_numeric`` itself, which makes the
            feature depend on the batch passed in. Pass the training-set mean
            when featurizing held-out rows to scale both consistently.

    Returns:
        A new DataFrame with the columns described above.
    """
    X_num = X_numeric.copy()

    if not {"Temperature", "Residence Time"} <= set(X_num.columns):
        return X_num

    # Convert Temperature to Kelvin
    X_num["Temperature"] = X_num["Temperature"] + 273.15

    T = X_num["Temperature"]
    rt = X_num["Residence Time"]

    X_num["T_x_RT"] = T * rt
    # Small offset keeps the log finite for a zero residence time.
    X_num["RT_log"] = np.log(rt + 1e-6)
    X_num["T_inv"] = 1 / T

    reference = rt.mean() if rt_mean is None else rt_mean
    X_num["RT_scaled"] = rt / reference

    return X_num

print('Feature engineering functions defined')

Feature engineering functions defined


In [4]:
# Build combined solvent feature table
# Built tables keyed by the correlation threshold they were filtered with, so a
# call with a different threshold never receives a table built for another one.
_SOLVENT_TABLE_CACHE = {}

def build_solvent_feature_table(threshold=0.90):
    """Build (or fetch from cache) the combined per-solvent descriptor table.

    Loads the five lookup tables (spange, acs_pca, drfps, fragprints, smiles),
    prefixes their columns by source, outer-merges them on "SOLVENT NAME" and
    removes constant / highly correlated columns via
    ``filter_correlated_features``. For the two fingerprint sources, columns
    that are all zero, all one, or sum to exactly 1 are removed before merging.

    Args:
        threshold: Absolute-correlation cut-off forwarded to
            ``filter_correlated_features``.

    Returns:
        A DataFrame indexed by "SOLVENT NAME". The same object is returned on
        repeated calls with the same ``threshold``.
    """
    cached = _SOLVENT_TABLE_CACHE.get(threshold)
    if cached is not None:
        return cached

    print(">>> Building solvent feature table...")

    # Lookup-table name -> column prefix understood by feature_priority.
    source_prefixes = {
        "spange_descriptors": "spange",
        "acs_pca_descriptors": "acs",
        "drfps_catechol": "drfps",
        "fragprints": "frag",
        "smiles": "smiles",
    }
    fingerprint_sources = {"drfps_catechol", "fragprints"}

    dfs = []

    for src, prefix in source_prefixes.items():
        df_src = load_features(src).copy()

        if "SOLVENT NAME" not in df_src.columns:
            # Promote the index to a "SOLVENT NAME" column whatever the index
            # was called in the CSV, so the merge key always exists.
            df_src = df_src.rename_axis("SOLVENT NAME").reset_index()

        # Bit-table filtering for binary fingerprints
        if src in fingerprint_sources:
            # Drop all-zero and all-one columns
            df_src = df_src.loc[:, (df_src != 0).any(axis=0)]
            df_src = df_src.loc[:, (df_src != 1).any(axis=0)]

            # Drop columns whose values sum to exactly 1
            # (a single set bit when the table is binary).
            values = df_src.drop(columns=["SOLVENT NAME"], errors="ignore")
            count = values.sum(axis=0)
            drop_cols = count[count == 1].index
            df_src = df_src.drop(columns=drop_cols, errors="ignore")

        # Rename every feature column with its source prefix
        df_src = df_src.rename(
            columns={c: f"{prefix}_{c}" for c in df_src.columns if c != "SOLVENT NAME"}
        )

        dfs.append(df_src)
        print(f"  Loaded {src}: {df_src.shape}")

    # Merge all dataframes on SOLVENT NAME
    from functools import reduce
    merged = reduce(
        lambda left, right: pd.merge(left, right, on="SOLVENT NAME", how="outer"),
        dfs
    )

    print(f"  Merged shape before filtering: {merged.shape}")

    # Filter correlated features
    merged, dropped = filter_correlated_features(merged, threshold=threshold)
    print(f"  Dropped {len(dropped)} correlated features")
    print(f"  Final shape: {merged.shape}")

    merged = merged.set_index("SOLVENT NAME")
    _SOLVENT_TABLE_CACHE[threshold] = merged

    return merged

# Build the table once at cell execution; the featurizer classes read this
# module-level SOLVENT_TABLE when they are constructed.
SOLVENT_TABLE = build_solvent_feature_table(threshold=0.90)
print(f"\nSolvent table shape: {SOLVENT_TABLE.shape}")
# Only the first ten column names are shown to keep the output short.
print(f"Columns: {list(SOLVENT_TABLE.columns)[:10]}...")

>>> Building solvent feature table...
  Loaded spange_descriptors: (26, 14)
  Loaded acs_pca_descriptors: (24, 6)
  Loaded drfps_catechol: (24, 41)
  Loaded fragprints: (24, 56)
  Loaded smiles: (26, 2)
  Merged shape before filtering: (26, 115)
  Dropped 47 correlated features
  Final shape: (26, 68)

Solvent table shape: (26, 67)
Columns: ['spange_dielectric constant', 'spange_ET(30)', 'spange_beta', 'spange_pi*', 'spange_SB', 'spange_SP', 'spange_SdP', 'spange_N', 'spange_n', 'acs_PC1']...


In [10]:
# Featurizers
class PrecomputedFeaturizer:
    """Featurizer for single-solvent rows.

    Each row becomes the engineered process features followed by the numeric
    descriptors of its solvent, looked up in ``SOLVENT_TABLE``.
    """

    def __init__(self):
        # Non-numeric descriptor columns cannot be fed to the models.
        numeric_table = SOLVENT_TABLE.select_dtypes(include=[np.number])
        self.solvent_table = numeric_table
        # Two raw inputs plus the four engineered ones.
        self.feats_dim = numeric_table.shape[1] + 6

    def featurize(self, X):
        """Return a double tensor of [process features | solvent descriptors] per row."""
        process_feats = add_numeric_features(X[["Residence Time", "Temperature"]].copy())

        descriptor_rows = self.solvent_table.loc[X["SOLVENT NAME"]]

        stacked = np.hstack([
            process_feats.values.astype(np.float64),
            descriptor_rows.values.astype(np.float64),
        ])
        return torch.tensor(stacked, dtype=torch.double)

class PrecomputedFeaturizerMixed:
    """Featurizer for solvent-mixture rows.

    Each row becomes the engineered process features, the "SolventB%" value,
    and a blend of the two solvents' numeric descriptors weighted by
    "SolventB%".
    """

    def __init__(self):
        # Non-numeric descriptor columns cannot be blended or fed to the models.
        numeric_table = SOLVENT_TABLE.select_dtypes(include=[np.number])
        self.solvent_table = numeric_table
        # Two raw inputs, four engineered ones, plus SolventB%.
        self.feats_dim = numeric_table.shape[1] + 7

    def featurize(self, X):
        """Return a double tensor of [process features | SolventB% | blended descriptors]."""
        process_feats = add_numeric_features(X[["Residence Time", "Temperature"]].copy())

        # Column vector so it broadcasts across every descriptor column.
        sb_pct = X["SolventB%"].values.reshape(-1, 1).astype(np.float64)
        desc_a = self.solvent_table.loc[X["SOLVENT A NAME"]].values.astype(np.float64)
        desc_b = self.solvent_table.loc[X["SOLVENT B NAME"]].values.astype(np.float64)
        blended = (1 - sb_pct) * desc_a + sb_pct * desc_b

        stacked = np.hstack([process_feats.values.astype(np.float64), sb_pct, blended])
        return torch.tensor(stacked, dtype=torch.double)

# Report the feature-vector width of each featurizer; each line constructs a
# throwaway instance just to read its feats_dim.
print(f"Single featurizer dim: {PrecomputedFeaturizer().feats_dim}")
print(f"Mixed featurizer dim: {PrecomputedFeaturizerMixed().feats_dim}")

Single featurizer dim: 72
Mixed featurizer dim: 73


In [11]:
# CatBoost Model (from ens-model kernel)
class CatBoostModel:
    """One CatBoost regressor per target column, with clipped/renormalized output.

    ``data='single'`` uses the single-solvent featurizer and hyper-parameters;
    any other value uses the mixture featurizer and hyper-parameters.
    """

    def __init__(self, data='single', random_state=42):
        self.data_mode = data
        self.random_state = random_state

        is_single = data == 'single'
        self.smiles_featurizer = (
            PrecomputedFeaturizer() if is_single else PrecomputedFeaturizerMixed()
        )
        self.cat_params = dict(
            random_seed=random_state,
            iterations=500 if is_single else 400,
            learning_rate=0.05,
            depth=6 if is_single else 5,
            l2_leaf_reg=3.0 if is_single else 5.0,
            verbose=False,
        )

        # Populated by train_model.
        self.models = None
        self.n_targets = None

    def train_model(self, train_X, train_Y):
        """Fit an independent regressor for every column of ``train_Y``."""
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values
        self.n_targets = targets.shape[1]

        self.models = []
        for target_idx in range(self.n_targets):
            regressor = CatBoostRegressor(**self.cat_params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, X):
        """Predict all targets for ``X`` as a double tensor of shape (rows, targets).

        Negative predictions are clipped to 0; with more than one target, rows
        whose sum exceeds 1 are scaled down so they sum to 1.
        """
        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()

        stacked = np.column_stack([model.predict(features) for model in self.models])

        # Clip and renormalize
        stacked = np.clip(stacked, a_min=0.0, a_max=None)
        if stacked.shape[1] > 1:
            row_sums = stacked.sum(axis=1, keepdims=True)
            # A divisor floor of 1 leaves rows that already sum to <= 1 untouched.
            stacked = stacked / np.maximum(row_sums, 1.0)

        return torch.tensor(stacked, dtype=torch.double)

print('CatBoostModel defined')

CatBoostModel defined


In [12]:
# XGBoost Model (from ens-model kernel)
class XGBModel:
    """One XGBoost regressor per target column, with clipped/renormalized output.

    ``data='single'`` uses the single-solvent featurizer and hyper-parameters;
    any other value uses the mixture featurizer and hyper-parameters.
    """

    def __init__(self, data='single', random_state=42):
        self.data_mode = data
        self.random_state = random_state

        is_single = data == 'single'
        self.smiles_featurizer = (
            PrecomputedFeaturizer() if is_single else PrecomputedFeaturizerMixed()
        )
        self.xgb_params = dict(
            random_state=random_state,
            n_estimators=400 if is_single else 350,
            learning_rate=0.05,
            max_depth=5 if is_single else 4,
            subsample=0.8 if is_single else 0.85,
            colsample_bytree=0.8 if is_single else 0.85,
            reg_alpha=0.1 if is_single else 0.05,
            reg_lambda=1.0 if is_single else 0.5,
            verbosity=0,
        )

        # Populated by train_model.
        self.models = None
        self.n_targets = None

    def train_model(self, train_X, train_Y):
        """Fit an independent regressor for every column of ``train_Y``."""
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values
        self.n_targets = targets.shape[1]

        self.models = []
        for target_idx in range(self.n_targets):
            regressor = xgb.XGBRegressor(**self.xgb_params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, X):
        """Predict all targets for ``X`` as a double tensor of shape (rows, targets).

        Negative predictions are clipped to 0; with more than one target, rows
        whose sum exceeds 1 are scaled down so they sum to 1.
        """
        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()

        stacked = np.column_stack([model.predict(features) for model in self.models])

        # Clip and renormalize
        stacked = np.clip(stacked, a_min=0.0, a_max=None)
        if stacked.shape[1] > 1:
            row_sums = stacked.sum(axis=1, keepdims=True)
            # A divisor floor of 1 leaves rows that already sum to <= 1 untouched.
            stacked = stacked / np.maximum(row_sums, 1.0)

        return torch.tensor(stacked, dtype=torch.double)

print('XGBModel defined')

XGBModel defined


In [13]:
# Ensemble Model (from ens-model kernel)
class EnsembleModel:
    """Weighted average of a CatBoostModel and an XGBModel.

    Raw weights are CatBoost 7 : XGBoost 6 for ``data='single'`` and 1 : 2 for
    any other value; they are normalized to sum to 1.
    """

    def __init__(self, data='single'):
        self.data_mode = data

        # Task-specific raw weights (CatBoost, XGBoost)
        raw_cat, raw_xgb = (7.0, 6.0) if data == 'single' else (1.0, 2.0)

        total = raw_cat + raw_xgb
        self.cat_weight = raw_cat / total
        self.xgb_weight = raw_xgb / total

        self.cat_model = CatBoostModel(data=data)
        self.xgb_model = XGBModel(data=data)

    def train_model(self, train_X, train_Y):
        """Train both base models on the same data, CatBoost first."""
        for member in (self.cat_model, self.xgb_model):
            member.train_model(train_X, train_Y)

    def predict(self, X):
        """Return the weighted sum of the two base models' predictions."""
        cat_part = self.cat_weight * self.cat_model.predict(X)
        xgb_part = self.xgb_weight * self.xgb_model.predict(X)
        return cat_part + xgb_part

print('EnsembleModel defined')
print(f"Single weights: CatBoost={7/13:.3f}, XGBoost={6/13:.3f}")
print(f"Full weights: CatBoost={1/3:.3f}, XGBoost={2/3:.3f}")

EnsembleModel defined
Single weights: CatBoost=0.538, XGBoost=0.462
Full weights: CatBoost=0.333, XGBoost=0.667


In [14]:
# Cross-validation for single solvent data: train a fresh ensemble per
# held-out solvent and record the MSE over that solvent's rows.
print("="*60)
print("Cross-validation: Single Solvent Data (Leave-One-Out)")
print("="*60)

X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: X={X_single.shape}, Y={Y_single.shape}")

# One fold per distinct solvent; derived from the data so the progress bar
# stays correct if the dataset changes.
n_single_folds = len(X_single["SOLVENT NAME"].unique())

all_mse_single = []
for (train_X, train_Y), (test_X, test_Y) in tqdm.tqdm(generate_leave_one_out_splits(X_single, Y_single), total=n_single_folds):
    model = EnsembleModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mse = np.mean((preds - test_Y.values) ** 2)
    all_mse_single.append(mse)

# Unweighted mean over folds (each solvent counts equally regardless of size).
mse_single = np.mean(all_mse_single)
print(f"\nSingle Solvent MSE: {mse_single:.6f} (+/- {np.std(all_mse_single):.6f})")

Cross-validation: Single Solvent Data (Leave-One-Out)
Single solvent data: X=(656, 3), Y=(656, 3)


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:01<00:34,  1.51s/it]

  8%|▊         | 2/24 [00:02<00:30,  1.37s/it]

 12%|█▎        | 3/24 [00:03<00:27,  1.30s/it]

 17%|█▋        | 4/24 [00:05<00:25,  1.29s/it]

 21%|██        | 5/24 [00:06<00:24,  1.27s/it]

 25%|██▌       | 6/24 [00:07<00:22,  1.26s/it]

 29%|██▉       | 7/24 [00:09<00:21,  1.26s/it]

 33%|███▎      | 8/24 [00:10<00:20,  1.26s/it]

 38%|███▊      | 9/24 [00:11<00:19,  1.31s/it]

 42%|████▏     | 10/24 [00:12<00:18,  1.29s/it]

 46%|████▌     | 11/24 [00:14<00:16,  1.29s/it]

 50%|█████     | 12/24 [00:15<00:15,  1.28s/it]

 54%|█████▍    | 13/24 [00:16<00:14,  1.28s/it]

 58%|█████▊    | 14/24 [00:18<00:12,  1.27s/it]

 62%|██████▎   | 15/24 [00:19<00:11,  1.27s/it]

 67%|██████▋   | 16/24 [00:20<00:10,  1.27s/it]

 71%|███████   | 17/24 [00:21<00:08,  1.26s/it]

 75%|███████▌  | 18/24 [00:23<00:07,  1.28s/it]

 79%|███████▉  | 19/24 [00:24<00:06,  1.28s/it]

 83%|████████▎ | 20/24 [00:25<00:05,  1.27s/it]

 88%|████████▊ | 21/24 [00:26<00:03,  1.28s/it]

 92%|█████████▏| 22/24 [00:28<00:02,  1.28s/it]

 96%|█████████▌| 23/24 [00:29<00:01,  1.30s/it]

100%|██████████| 24/24 [00:30<00:00,  1.31s/it]

100%|██████████| 24/24 [00:30<00:00,  1.29s/it]


Single Solvent MSE: 0.009551 (+/- 0.008479)





In [15]:
# Cross-validation for full data: train a fresh ensemble per held-out
# (solvent A, solvent B) pair and record the MSE over that pair's rows.
print("="*60)
print("Cross-validation: Full Data (Leave-One-Ramp-Out)")
print("="*60)

X_full, Y_full = load_data("full")
print(f"Full data: X={X_full.shape}, Y={Y_full.shape}")

# One fold per distinct solvent pair; derived from the data so the progress
# bar stays correct if the dataset changes.
n_full_folds = len(X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates())

all_mse_full = []
for (train_X, train_Y), (test_X, test_Y) in tqdm.tqdm(generate_leave_one_ramp_out_splits(X_full, Y_full), total=n_full_folds):
    model = EnsembleModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mse = np.mean((preds - test_Y.values) ** 2)
    all_mse_full.append(mse)

# Unweighted mean over folds (each ramp counts equally regardless of size).
mse_full = np.mean(all_mse_full)
print(f"\nFull Data MSE: {mse_full:.6f} (+/- {np.std(all_mse_full):.6f})")

Cross-validation: Full Data (Leave-One-Ramp-Out)
Full data: X=(1227, 5), Y=(1227, 3)


  0%|          | 0/13 [00:00<?, ?it/s]

  8%|▊         | 1/13 [00:01<00:13,  1.15s/it]

 15%|█▌        | 2/13 [00:02<00:12,  1.16s/it]

 23%|██▎       | 3/13 [00:03<00:11,  1.18s/it]

 31%|███       | 4/13 [00:04<00:10,  1.19s/it]

 38%|███▊      | 5/13 [00:05<00:09,  1.19s/it]

 46%|████▌     | 6/13 [00:07<00:08,  1.16s/it]

 54%|█████▍    | 7/13 [00:08<00:07,  1.17s/it]

 62%|██████▏   | 8/13 [00:09<00:05,  1.17s/it]

 69%|██████▉   | 9/13 [00:10<00:04,  1.16s/it]

 77%|███████▋  | 10/13 [00:11<00:03,  1.16s/it]

 85%|████████▍ | 11/13 [00:12<00:02,  1.16s/it]

 92%|█████████▏| 12/13 [00:13<00:01,  1.16s/it]

100%|██████████| 13/13 [00:15<00:00,  1.17s/it]

100%|██████████| 13/13 [00:15<00:00,  1.17s/it]


Full Data MSE: 0.009038 (+/- 0.006532)





In [16]:
# Calculate overall MSE as the row-count-weighted average of the two task MSEs
N_single = len(X_single)
N_full = len(X_full)
N_total = N_single + N_full

overall_mse = (mse_single * N_single + mse_full * N_full) / N_total

# Reference numbers used below, named once so the arithmetic and the printed
# text cannot drift apart.
BEST_PREVIOUS_CV_MSE = 0.008298  # CV MSE of the GP+MLP+LGBM ensemble (exp_030)
CV_TO_LB_SLOPE = 4.31            # slope of the CV -> LB line used in this notebook
CV_TO_LB_INTERCEPT = 0.0525      # intercept of the same line
TARGET_LB = 0.0347               # leaderboard score being aimed for

print("="*60)
print("SUMMARY")
print("="*60)
print(f"\nEns-Model Replica:")
print(f"  Single Solvent MSE: {mse_single:.6f}")
print(f"  Full Data MSE: {mse_full:.6f}")
print(f"  Overall MSE: {overall_mse:.6f}")

print(f"\nComparison:")
print(f"  Best GP+MLP+LGBM ensemble (exp_030): {BEST_PREVIOUS_CV_MSE:.6f}")
# Positive percentage means this ensemble has a higher (worse) CV MSE.
print(f"  This ensemble vs Best: {(overall_mse - BEST_PREVIOUS_CV_MSE) / BEST_PREVIOUS_CV_MSE * 100:.2f}%")

# Expected LB based on CV-LB relationship
expected_lb = CV_TO_LB_SLOPE * overall_mse + CV_TO_LB_INTERCEPT
print(f"\nExpected LB (based on CV-LB line): {expected_lb:.4f}")
print(f"Target LB: {TARGET_LB:.4f}")
print(f"Gap to target: {(expected_lb - TARGET_LB) / TARGET_LB * 100:.1f}%")

SUMMARY

Ens-Model Replica:
  Single Solvent MSE: 0.009551
  Full Data MSE: 0.009038
  Overall MSE: 0.009217

Comparison:
  Best GP+MLP+LGBM ensemble (exp_030): 0.008298
  This ensemble vs Best: 11.07%

Expected LB (based on CV-LB line): 0.0922
Target LB: 0.0347
Gap to target: 165.8%


In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################