# Predict 4 Targets (SM, P2, P3, Other)

**Strategy**: Explicitly model the "other products" fraction (1 - SM - P2 - P3) as a 4th target.

**Key Insight from Loop 86 Analysis**:
- Mass balance (SM + P2 + P3) is NOT 1.0 - it averages 0.7955, leaving ~20.5% (1 - 0.7955 = 0.2045) unaccounted for
- Mass balance varies significantly by solvent: 0.486 (2,2,2-Trifluoroethanol) to 0.994 (IPA)
- Correlation with conversion: -0.68 (higher conversion = lower mass balance)

**Hypothesis**: By predicting the "other" fraction explicitly, we capture solvent-specific behavior that might generalize better to unseen solvents.

**Model**: CatBoost+XGBoost ensemble (best performing tabular model)

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with one shared value so repeated runs draw the same random numbers
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# cuDNN: switch off the benchmark autotuner and request deterministic kernels
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Newly created floating-point tensors default to float64
torch.set_default_dtype(torch.double)

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory that holds every CSV file read by this notebook
DATA_PATH = '/home/data'

# Column groups used to slice the model inputs out of the raw tables.
# The two solvent-aware lists extend the shared numeric columns.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Load one of the two yield tables and split it into inputs and targets.

    Args:
        name: "full" reads ``catechol_full_data_yields.csv`` and selects the
            mixed-solvent input columns; "single_solvent" reads
            ``catechol_single_solvent_yields.csv`` and selects the
            single-solvent input columns.

    Returns:
        Tuple ``(X, Y)`` of DataFrames. ``Y`` always has the columns
        ``["Product 2", "Product 3", "SM"]`` in that order.

    Raises:
        ValueError: if ``name`` is not one of the two supported dataset names.
            Previously any other value (including typos) silently loaded the
            single-solvent table.
    """
    # Validate before touching the filesystem so a typo fails fast and loudly
    if name not in ("full", "single_solvent"):
        raise ValueError(
            f'Unknown dataset name {name!r}; expected "full" or "single_solvent"'
        )

    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[["Product 2", "Product 3", "SM"]]
    return X, Y

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the "SOLVENT NAME" column.
    Each item is ``((train_X, train_Y), (test_X, test_Y))`` where the test
    part contains exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        keep_for_training = solvent_names != held_out
        keep_for_testing = ~keep_for_training
        training_part = (X[keep_for_training], Y[keep_for_training])
        testing_part = (X[keep_for_testing], Y[keep_for_testing])
        yield training_part, testing_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds the
    rows whose solvent A and solvent B both match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        in_held_out_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_held_out_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_held_out_ramp], Y[in_held_out_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Analyze mass balance in the data
X, Y = load_data("single_solvent")

# Row-wise total of the three target columns, and the remainder up to 1
mass_balance = Y.sum(axis=1)
other = 1 - mass_balance

# Compute the four summary statistics for each series in one pass
summary_stats = ['mean', 'std', 'min', 'max']
mb_mean, mb_std, mb_min, mb_max = mass_balance.agg(summary_stats)
other_mean, other_std, other_min, other_max = other.agg(summary_stats)

print(f'Mass balance statistics:')
print(f'  Mean: {mb_mean:.4f}')
print(f'  Std: {mb_std:.4f}')
print(f'  Min: {mb_min:.4f}')
print(f'  Max: {mb_max:.4f}')
print(f'\n"Other" fraction (1 - mass_balance):')
print(f'  Mean: {other_mean:.4f}')
print(f'  Std: {other_std:.4f}')
print(f'  Min: {other_min:.4f}')
print(f'  Max: {other_max:.4f}')

Mass balance statistics:
  Mean: 0.7955
  Std: 0.1943
  Min: 0.0288
  Max: 1.0000

"Other" fraction (1 - mass_balance):
  Mean: 0.2045
  Std: 0.1943
  Min: -0.0000
  Max: 0.9712


In [4]:
# Load feature lookups
# Solvent descriptor tables; the first CSV column becomes the row index used for lookups
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Keep only the DRFP columns whose variance across the table is strictly positive,
# i.e. drop columns that are constant for every row
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = [col for col, variance in drfp_variance.items() if variance > 0]
DRFP_FILTERED = DRFP_DF.loc[:, nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [5]:
# Full Featurizer
class FullFeaturizer:
    """Turn raw experiment rows into one dense feature matrix.

    Each row gets five process features (the two numeric input columns,
    1000 / (temperature + 273.15), log(time + 1e-6) and their product)
    followed by the Spange, filtered-DRFP and ACS-PCA descriptors of its
    solvent. With ``mixed=True`` the descriptors of solvent A and solvent B
    are blended linearly using the "SolventB%" column.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        # 2 raw numeric inputs + 3 derived terms, plus every descriptor column
        descriptor_width = sum(
            table.shape[1] for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )
        self.feats_dim = 2 + 3 + descriptor_width

    def featurize(self, X, flip=False):
        """Return the feature matrix for ``X`` as a NumPy array.

        Args:
            X: DataFrame with the numeric input columns plus either
                "SOLVENT NAME" (``mixed=False``) or "SOLVENT A NAME",
                "SOLVENT B NAME" and "SolventB%" (``mixed=True``).
            flip: only read when ``mixed=True``; selects the alternative
                blending expression. Algebraically both expressions give
                ``A * (1 - pct) + B * pct``, so results agree up to rounding.
        """
        raw_inputs = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        time_m, temp_c = raw_inputs[:, 0:1], raw_inputs[:, 1:2]
        # Offset of 273.15 suggests a Celsius -> Kelvin conversion (units assumed, not verified here)
        inv_temp = 1000.0 / (temp_c + 273.15)
        # Small epsilon keeps the logarithm finite for a zero time value
        log_time = np.log(time_m + 1e-6)
        feature_blocks = [raw_inputs, inv_temp, log_time, inv_temp * log_time]

        descriptor_tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if not self.mixed:
            solvent_names = X["SOLVENT NAME"]
            for table in descriptor_tables:
                feature_blocks.append(table.loc[solvent_names].values)
        else:
            # presumably a fraction in [0, 1] rather than a percentage — TODO confirm against the data
            pct = X["SolventB%"].values.reshape(-1, 1)
            for table in descriptor_tables:
                desc_a = table.loc[X["SOLVENT A NAME"]].values
                desc_b = table.loc[X["SOLVENT B NAME"]].values
                if flip:
                    blended = desc_b * (1 - (1 - pct)) + desc_a * (1 - pct)
                else:
                    blended = desc_a * (1 - pct) + desc_b * pct
                feature_blocks.append(blended)

        return np.hstack(feature_blocks)

    def featurize_torch(self, X, flip=False):
        """Same as ``featurize`` but wrapped in a ``torch`` tensor."""
        features = self.featurize(X, flip)
        return torch.tensor(features)

print(f'Full feature dimension: {FullFeaturizer().feats_dim}')

Full feature dimension: 145


In [6]:
# CatBoost + XGBoost Ensemble with 4 targets
import catboost as cb
import xgboost as xgb
import tqdm

class FourTargetCatXGBEnsemble:
    """CatBoost + XGBoost ensemble fitted on four targets: P2, P3, SM and "Other".

    "Other" is defined as ``1 - (P2 + P3 + SM)``. One CatBoost and one XGBoost
    regressor are trained per target on standardized values, and their
    predictions are averaged with ``cat_weight`` / ``xgb_weight``.

    The per-target models are fitted independently of each other. With the
    default ``enforce_closure=False`` the "Other" model therefore has no
    effect on the three values returned by ``predict``. Set
    ``enforce_closure=True`` to make the fourth prediction count: the four
    predictions of every row are then shifted by the same amount so that they
    sum to exactly 1 before the first three columns are returned.

    Args:
        data: 'single' for single-solvent inputs, 'full' for mixed-solvent inputs.
        enforce_closure: when True, reconcile the four predictions so each
            row sums to 1 (least-squares projection onto that constraint).
    """

    N_TARGETS = 4  # P2, P3, SM, Other

    def __init__(self, data='single', enforce_closure=False):
        self.data = data
        self.mixed = (data == 'full')
        self.featurizer = FullFeaturizer(mixed=self.mixed)
        self.scalers = [StandardScaler() for _ in range(self.N_TARGETS)]
        self.catboost_models = []
        self.xgb_models = []
        self.cat_weight = 0.5
        self.xgb_weight = 0.5
        self.enforce_closure = enforce_closure

    def train_model(self, X, Y):
        """Fit one CatBoost and one XGBoost regressor per target.

        Args:
            X: input DataFrame accepted by ``FullFeaturizer.featurize``.
            Y: DataFrame whose columns are read positionally as [P2, P3, SM].
        """
        X_feats = self.featurizer.featurize(X)
        Y_vals = Y.values  # [P2, P3, SM]

        # Fourth target: whatever fraction the three measured columns do not account for
        other = 1 - Y_vals.sum(axis=1, keepdims=True)
        Y_4targets = np.hstack([Y_vals, other])  # [P2, P3, SM, Other]

        # Start from empty lists so retraining never mixes old and new models
        self.catboost_models = []
        self.xgb_models = []

        for i in range(self.N_TARGETS):
            # Each target gets its own scaler, refitted on the current training fold
            y_scaled = self.scalers[i].fit_transform(Y_4targets[:, i:i+1]).ravel()

            cat_model = cb.CatBoostRegressor(
                iterations=500,
                learning_rate=0.05,
                depth=6,
                l2_leaf_reg=3,
                random_seed=42,
                verbose=False
            )
            cat_model.fit(X_feats, y_scaled)
            self.catboost_models.append(cat_model)

            xgb_model = xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=6,
                reg_lambda=3,
                random_state=42,
                verbosity=0
            )
            xgb_model.fit(X_feats, y_scaled)
            self.xgb_models.append(xgb_model)

    def predict_all(self, X):
        """Predict all four targets.

        Returns:
            NumPy array of shape (n_rows, 4) with columns [P2, P3, SM, Other]
            in original (un-standardized) units. When ``enforce_closure`` is
            True every row sums to 1.
        """
        X_feats = self.featurizer.featurize(X)
        preds = []

        for i in range(self.N_TARGETS):
            cat_pred = self.catboost_models[i].predict(X_feats)
            xgb_pred = self.xgb_models[i].predict(X_feats)

            # Weighted average in standardized space, then undo the scaling
            pred_scaled = self.cat_weight * cat_pred + self.xgb_weight * xgb_pred
            pred = self.scalers[i].inverse_transform(pred_scaled.reshape(-1, 1)).ravel()
            preds.append(pred)

        preds_4 = np.stack(preds, axis=1)  # [P2, P3, SM, Other]

        if self.enforce_closure:
            # Spread each row's excess (sum - 1) equally over the four columns;
            # this is the smallest change, in the least-squares sense, that
            # makes the row sum to 1.
            excess = preds_4.sum(axis=1, keepdims=True) - 1.0
            preds_4 = preds_4 - excess / preds_4.shape[1]

        return preds_4

    def predict(self, X):
        """Predict the three submitted targets.

        Returns:
            ``torch`` tensor of shape (n_rows, 3) with columns [P2, P3, SM].
            The "Other" column is dropped here; it only influences the result
            when ``enforce_closure`` is True.
        """
        return torch.tensor(self.predict_all(X)[:, :3])

print('FourTargetCatXGBEnsemble defined')

FourTargetCatXGBEnsemble defined


In [7]:
# Cross-validation on single solvent data
X, Y = load_data("single_solvent")
n_solvents = len(X["SOLVENT NAME"].unique())
print(f'Single solvent data: {len(X)} samples, {n_solvents} solvents')

# One fold per held-out solvent: retrain from scratch, score on the held-out rows
all_mse = []
fold_progress = tqdm.tqdm(enumerate(generate_leave_one_out_splits(X, Y)))
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in fold_progress:
    model = FourTargetCatXGBEnsemble(data='single')
    model.train_model(train_X, train_Y)

    preds = model.predict(test_X).numpy()
    targets = test_Y.values

    # Mean squared error over every row and target of this fold
    all_mse.append(np.mean(np.square(preds - targets)))

# Unweighted average of the per-fold scores
single_mse = np.mean(all_mse)
print(f'\nSingle Solvent MSE: {single_mse:.6f}')

Single solvent data: 656 samples, 24 solvents


0it [00:00, ?it/s]

1it [00:02,  2.21s/it]

2it [00:04,  2.15s/it]

3it [00:06,  2.16s/it]

4it [00:08,  2.17s/it]

5it [00:10,  2.21s/it]

6it [00:13,  2.24s/it]

7it [00:15,  2.21s/it]

8it [00:17,  2.17s/it]

9it [00:19,  2.15s/it]

10it [00:21,  2.19s/it]

11it [00:24,  2.19s/it]

12it [00:26,  2.17s/it]

13it [00:28,  2.18s/it]

14it [00:30,  2.18s/it]

15it [00:32,  2.06s/it]

16it [00:34,  2.08s/it]

17it [00:36,  2.12s/it]

18it [00:38,  2.16s/it]

19it [00:41,  2.21s/it]

20it [00:43,  2.20s/it]

21it [00:45,  2.19s/it]

22it [00:47,  2.18s/it]

23it [00:49,  2.18s/it]

24it [00:52,  2.17s/it]

24it [00:52,  2.17s/it]


Single Solvent MSE: 0.010230





In [8]:
# Cross-validation on full data
X_full, Y_full = load_data("full")
print(f'Full data: {len(X_full)} samples')

# One fold per held-out (solvent A, solvent B) ramp
all_mse_full = []
ramp_progress = tqdm.tqdm(enumerate(generate_leave_one_ramp_out_splits(X_full, Y_full)))
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in ramp_progress:
    model = FourTargetCatXGBEnsemble(data='full')
    model.train_model(train_X, train_Y)

    preds = model.predict(test_X).numpy()
    targets = test_Y.values

    # Mean squared error over every row and target of this fold
    all_mse_full.append(np.mean(np.square(preds - targets)))

# Unweighted average of the per-fold scores
full_mse = np.mean(all_mse_full)
print(f'\nFull Data MSE: {full_mse:.6f}')

Full data: 1227 samples


0it [00:00, ?it/s]

1it [00:03,  3.85s/it]

2it [00:07,  3.60s/it]

3it [00:11,  3.69s/it]

4it [00:14,  3.76s/it]

5it [00:18,  3.74s/it]

6it [00:22,  3.66s/it]

7it [00:26,  3.74s/it]

8it [00:29,  3.69s/it]

9it [00:33,  3.70s/it]

10it [00:37,  3.74s/it]

11it [00:40,  3.74s/it]

12it [00:44,  3.74s/it]

13it [00:48,  3.78s/it]

13it [00:48,  3.73s/it]


Full Data MSE: 0.008117





In [9]:
# Calculate overall CV score
# Reference score this notebook compares against (CatBoost+XGBoost on 3 targets)
BASELINE_MSE = 0.008092

# Weight each task by its number of rows. The counts are read from the data
# files instead of being typed in, so they cannot drift from the datasets
# and do not depend on which cell last assigned X.
n_single = len(load_data("single_solvent")[0])
n_full = len(load_data("full")[0])
overall_mse = (single_mse * n_single + full_mse * n_full) / (n_single + n_full)

print(f'\n=== Four Target Results ===')
print(f'Single Solvent MSE: {single_mse:.6f}')
print(f'Full Data MSE: {full_mse:.6f}')
print(f'Overall MSE: {overall_mse:.6f}')
print(f'\nBest baseline (CatBoost+XGBoost 3 targets): {BASELINE_MSE:.6f}')
# Relative change versus the baseline, in percent (positive means worse)
print(f'Difference: {(overall_mse - BASELINE_MSE) / BASELINE_MSE * 100:.2f}%')


=== Four Target Results ===
Single Solvent MSE: 0.010230
Full Data MSE: 0.008117
Overall MSE: 0.008853

Best baseline (CatBoost+XGBoost 3 targets): 0.008092
Difference: 9.41%


In [10]:
# Let's also try the hierarchical prediction approach
# Predict: conversion (1-SM), selectivity (P2/(P2+P3)), mass_balance (SM+P2+P3)

class HierarchicalCatXGBEnsemble:
    """CatBoost + XGBoost ensemble fitted on three derived targets.

    Instead of the yields themselves, the models learn
    conversion (``1 - SM``), selectivity (``P2 / (P2 + P3)``) and
    mass balance (``SM + P2 + P3``). ``predict`` maps the three predicted
    quantities back to [P2, P3, SM].

    Args:
        data: 'single' for single-solvent inputs, 'full' for mixed-solvent inputs.
    """

    N_TARGETS = 3  # conversion, selectivity, mass_balance

    def __init__(self, data='single'):
        self.data = data
        self.mixed = (data == 'full')
        self.featurizer = FullFeaturizer(mixed=self.mixed)
        self.scalers = [StandardScaler() for _ in range(self.N_TARGETS)]
        self.catboost_models = []
        self.xgb_models = []
        self.cat_weight = 0.5
        self.xgb_weight = 0.5

    @staticmethod
    def _to_hierarchical_targets(Y_vals):
        """Convert [P2, P3, SM] rows into [conversion, selectivity, mass_balance] rows."""
        Y_vals = np.asarray(Y_vals, dtype=np.float64)
        P2, P3, SM = Y_vals[:, 0], Y_vals[:, 1], Y_vals[:, 2]
        conversion = 1 - SM
        mass_balance = SM + P2 + P3

        # Selectivity is undefined when no product was formed; use 0.5 there.
        # The masked divide only computes rows with P2 + P3 > 0, so no 0/0 is
        # ever evaluated and no RuntimeWarning depends on a global filter.
        total_products = P2 + P3
        selectivity = np.full(total_products.shape, 0.5, dtype=np.float64)
        np.divide(P2, total_products, out=selectivity, where=total_products > 0)

        return np.stack([conversion, selectivity, mass_balance], axis=1)

    @staticmethod
    def _to_yields(conversion, selectivity, mass_balance):
        """Convert predicted hierarchical quantities back into [P2, P3, SM] rows."""
        selectivity = np.clip(selectivity, 0, 1)  # a fraction must stay in [0, 1]
        SM = 1 - conversion
        # Products are what the mass balance leaves after SM; never negative
        total_products = np.maximum(mass_balance - SM, 0)
        P2 = total_products * selectivity
        P3 = total_products * (1 - selectivity)
        return np.stack([P2, P3, SM], axis=1)

    def train_model(self, X, Y):
        """Fit one CatBoost and one XGBoost regressor per hierarchical target.

        Args:
            X: input DataFrame accepted by ``FullFeaturizer.featurize``.
            Y: DataFrame whose columns are read positionally as [P2, P3, SM].
        """
        X_feats = self.featurizer.featurize(X)
        Y_hier = self._to_hierarchical_targets(Y.values)

        # Start from empty lists so retraining never mixes old and new models
        self.catboost_models = []
        self.xgb_models = []

        for i in range(self.N_TARGETS):
            # Each target gets its own scaler, refitted on the current training fold
            y_scaled = self.scalers[i].fit_transform(Y_hier[:, i:i+1]).ravel()

            cat_model = cb.CatBoostRegressor(
                iterations=500,
                learning_rate=0.05,
                depth=6,
                l2_leaf_reg=3,
                random_seed=42,
                verbose=False
            )
            cat_model.fit(X_feats, y_scaled)
            self.catboost_models.append(cat_model)

            xgb_model = xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=6,
                reg_lambda=3,
                random_state=42,
                verbosity=0
            )
            xgb_model.fit(X_feats, y_scaled)
            self.xgb_models.append(xgb_model)

    def predict(self, X):
        """Predict [P2, P3, SM] for every row of ``X``.

        Returns:
            ``torch`` tensor of shape (n_rows, 3) with columns [P2, P3, SM].
        """
        X_feats = self.featurizer.featurize(X)
        preds = []

        for i in range(self.N_TARGETS):
            cat_pred = self.catboost_models[i].predict(X_feats)
            xgb_pred = self.xgb_models[i].predict(X_feats)

            # Weighted average in standardized space, then undo the scaling
            pred_scaled = self.cat_weight * cat_pred + self.xgb_weight * xgb_pred
            pred = self.scalers[i].inverse_transform(pred_scaled.reshape(-1, 1)).ravel()
            preds.append(pred)

        conversion, selectivity, mass_balance = preds
        return torch.tensor(self._to_yields(conversion, selectivity, mass_balance))

print('HierarchicalCatXGBEnsemble defined')

HierarchicalCatXGBEnsemble defined


In [11]:
# Cross-validation with hierarchical model on single solvent data
X, Y = load_data("single_solvent")
print(f'Single solvent data: {len(X)} samples')

# One fold per held-out solvent: retrain from scratch, score on the held-out rows
all_mse_hier = []
fold_progress = tqdm.tqdm(enumerate(generate_leave_one_out_splits(X, Y)))
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in fold_progress:
    model = HierarchicalCatXGBEnsemble(data='single')
    model.train_model(train_X, train_Y)

    preds = model.predict(test_X).numpy()
    targets = test_Y.values

    # Mean squared error over every row and target of this fold
    all_mse_hier.append(np.mean(np.square(preds - targets)))

# Unweighted average of the per-fold scores
single_mse_hier = np.mean(all_mse_hier)
print(f'\nHierarchical Single Solvent MSE: {single_mse_hier:.6f}')

Single solvent data: 656 samples


0it [00:00, ?it/s]

1it [00:01,  1.59s/it]

2it [00:03,  1.64s/it]

3it [00:04,  1.61s/it]

4it [00:06,  1.58s/it]

5it [00:07,  1.57s/it]

6it [00:09,  1.58s/it]

7it [00:11,  1.58s/it]

8it [00:12,  1.57s/it]

9it [00:14,  1.56s/it]

10it [00:15,  1.57s/it]

11it [00:17,  1.57s/it]

12it [00:18,  1.58s/it]

13it [00:20,  1.59s/it]

14it [00:22,  1.60s/it]

15it [00:23,  1.53s/it]

16it [00:25,  1.54s/it]

17it [00:26,  1.55s/it]

18it [00:28,  1.56s/it]

19it [00:29,  1.57s/it]

20it [00:31,  1.56s/it]

21it [00:32,  1.57s/it]

22it [00:34,  1.61s/it]

23it [00:36,  1.63s/it]

24it [00:37,  1.63s/it]

24it [00:37,  1.58s/it]


Hierarchical Single Solvent MSE: 0.009784





In [12]:
# Cross-validation with hierarchical model on full data
X_full, Y_full = load_data("full")
print(f'Full data: {len(X_full)} samples')

# One fold per held-out (solvent A, solvent B) ramp
all_mse_full_hier = []
ramp_progress = tqdm.tqdm(enumerate(generate_leave_one_ramp_out_splits(X_full, Y_full)))
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in ramp_progress:
    model = HierarchicalCatXGBEnsemble(data='full')
    model.train_model(train_X, train_Y)

    preds = model.predict(test_X).numpy()
    targets = test_Y.values

    # Mean squared error over every row and target of this fold
    all_mse_full_hier.append(np.mean(np.square(preds - targets)))

# Unweighted average of the per-fold scores
full_mse_hier = np.mean(all_mse_full_hier)
print(f'\nHierarchical Full Data MSE: {full_mse_hier:.6f}')

Full data: 1227 samples


0it [00:00, ?it/s]

1it [00:02,  2.76s/it]

2it [00:05,  2.62s/it]

3it [00:08,  2.70s/it]

4it [00:10,  2.75s/it]

5it [00:13,  2.77s/it]

6it [00:16,  2.72s/it]

7it [00:19,  2.75s/it]

8it [00:21,  2.72s/it]

9it [00:24,  2.70s/it]

10it [00:27,  2.75s/it]

11it [00:30,  2.79s/it]

12it [00:33,  2.80s/it]

13it [00:35,  2.79s/it]

13it [00:35,  2.75s/it]


Hierarchical Full Data MSE: 0.008099





In [13]:
# Calculate overall hierarchical CV score
# Row-count-weighted combination of the two hierarchical CV scores
weighted_sum_hier = single_mse_hier * n_single + full_mse_hier * n_full
overall_mse_hier = weighted_sum_hier / (n_single + n_full)

# Relative change of each approach versus the 0.008092 baseline, in percent
pct_four_target = (overall_mse - 0.008092) / 0.008092 * 100
pct_hierarchical = (overall_mse_hier - 0.008092) / 0.008092 * 100

print(f'\n=== Hierarchical Prediction Results ===')
print(f'Single Solvent MSE: {single_mse_hier:.6f}')
print(f'Full Data MSE: {full_mse_hier:.6f}')
print(f'Overall MSE: {overall_mse_hier:.6f}')
print(f'\nBest baseline (CatBoost+XGBoost 3 targets): 0.008092')
print(f'Difference: {pct_hierarchical:.2f}%')

print(f'\n=== Comparison ===')
print(f'Four Target MSE: {overall_mse:.6f} ({pct_four_target:+.2f}%)')
print(f'Hierarchical MSE: {overall_mse_hier:.6f} ({pct_hierarchical:+.2f}%)')
print(f'Baseline MSE: 0.008092')


=== Hierarchical Prediction Results ===
Single Solvent MSE: 0.009784
Full Data MSE: 0.008099
Overall MSE: 0.008686

Best baseline (CatBoost+XGBoost 3 targets): 0.008092
Difference: 7.34%

=== Comparison ===
Four Target MSE: 0.008853 (+9.41%)
Hierarchical MSE: 0.008686 (+7.34%)
Baseline MSE: 0.008092


In [14]:
# Use the better approach for submission
# If four target is better, use FourTargetCatXGBEnsemble
# If hierarchical is better, use HierarchicalCatXGBEnsemble

# The lower overall CV score wins; a tie goes to the hierarchical model
if overall_mse < overall_mse_hier:
    best_approach = 'four_target'
else:
    best_approach = 'hierarchical'

print(f'Best approach: {best_approach}')
print(f'Best CV: {min(overall_mse, overall_mse_hier):.6f}')

Best approach: hierarchical
Best CV: 0.008686


In [15]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = HierarchicalCatXGBEnsemble(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:01,  1.56s/it]

2it [00:03,  1.58s/it]

3it [00:04,  1.60s/it]

4it [00:06,  1.58s/it]

5it [00:07,  1.57s/it]

6it [00:09,  1.59s/it]

7it [00:11,  1.58s/it]

8it [00:12,  1.58s/it]

9it [00:14,  1.56s/it]

10it [00:15,  1.57s/it]

11it [00:17,  1.57s/it]

12it [00:18,  1.58s/it]

13it [00:20,  1.58s/it]

14it [00:22,  1.58s/it]

15it [00:23,  1.51s/it]

16it [00:25,  1.56s/it]

17it [00:26,  1.62s/it]

18it [00:28,  1.66s/it]

19it [00:30,  1.66s/it]

20it [00:31,  1.67s/it]

21it [00:33,  1.65s/it]

22it [00:35,  1.63s/it]

23it [00:36,  1.62s/it]

24it [00:38,  1.61s/it]

24it [00:38,  1.60s/it]




In [16]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = HierarchicalCatXGBEnsemble(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:02,  2.73s/it]

2it [00:05,  2.59s/it]

3it [00:07,  2.67s/it]

4it [00:10,  2.76s/it]

5it [00:13,  2.83s/it]

6it [00:16,  2.79s/it]

7it [00:19,  2.79s/it]

8it [00:21,  2.71s/it]

9it [00:24,  2.72s/it]

10it [00:27,  2.79s/it]

11it [00:30,  2.79s/it]

12it [00:33,  2.80s/it]

13it [00:35,  2.80s/it]

13it [00:35,  2.77s/it]




In [17]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################