# Experiment 053: Exact Template Submission

**Goal:** Use EXACTLY the template code structure to ensure that the submission format is correct.

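**Approach:** Copy the template code exactly, changing only the model definition. The verification and CV cells that follow the final template cell are local diagnostics; because the template requires that cell to be the last one in the notebook, remove them before submitting.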

In [1]:
# Cell 1: Imports and setup (from template)
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

# Make float64 the default floating-point dtype for tensors created from here
# on, in line with the explicit .double() call applied to the MLP below.
torch.set_default_dtype(torch.double)

# Data path for local execution; load_data() and load_features() build their
# CSV paths from this directory.
DATA_PATH = "/home/data"

print("Imports complete.")

Imports complete.


In [2]:
# Cell 2: Data loading functions (adapted for local paths)

# Building blocks: the numeric process conditions and the solvent-identifying
# columns of each dataset.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_FEATURES = ["SOLVENT NAME"]
INPUT_LABELS_FULL_FEATURES = ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

# Complete input column lists: numeric columns first, solvent columns after.
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, *INPUT_LABELS_SINGLE_FEATURES]
INPUT_LABELS_FULL_SOLVENT = [*INPUT_LABELS_NUMERIC, *INPUT_LABELS_FULL_FEATURES]

# Columns that hold the regression targets.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: "full" for the solvent-mixture table or "single_solvent" for
            the single-solvent table. Any other value fails the assertion.

    Returns:
        A tuple (X, Y): X holds the input columns of the chosen table, Y the
        TARGET_LABELS columns of the same table.
    """
    assert name in ["full", "single_solvent"]

    # Pick the CSV file and the matching input column list in one place.
    if name == "single_solvent":
        csv_name, input_columns = "catechol_single_solvent_yields.csv", INPUT_LABELS_SINGLE_SOLVENT
    else:
        csv_name, input_columns = "catechol_full_data_yields.csv", INPUT_LABELS_FULL_SOLVENT

    table = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return table[input_columns], table[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read the `<name>_lookup.csv` table from DATA_PATH.

    The first CSV column becomes the index of the returned DataFrame.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one (train, test) split per distinct value of "SOLVENT NAME".

    Solvents are visited in sorted order. Each item is
    ((X_train, Y_train), (X_test, Y_test)), where the test part contains
    exactly the rows of the held-out solvent.
    """
    held_out_order = sorted(X["SOLVENT NAME"].unique())
    for held_out in held_out_order:
        keep_for_training = X["SOLVENT NAME"] != held_out
        keep_for_testing = ~keep_for_training
        training_part = (X[keep_for_training], Y[keep_for_training])
        testing_part = (X[keep_for_testing], Y[keep_for_testing])
        yield training_part, testing_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one (train, test) split per distinct (solvent A, solvent B) pair.

    Pairs are visited sorted by "SOLVENT A NAME", then "SOLVENT B NAME". Each
    item is ((X_train, Y_train), (X_test, Y_test)); a row is a training row
    when at least one of its two solvent names differs from the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    ramps = X[pair_columns].drop_duplicates().sort_values(by=pair_columns)
    for _, held_out_pair in ramps.iterrows():
        # Column-wise comparison against the held-out pair.
        name_differs = X[pair_columns] != held_out_pair
        in_train = name_differs.any(axis=1)
        in_test = ~in_train
        yield (X[in_train], Y[in_train]), (X[in_test], Y[in_test])

print("Data loading functions defined.")

Data loading functions defined.


In [3]:
# Cell 3: Base classes (from template)

class SmilesFeaturizer(ABC):
    """Common base for featurizers.

    Both methods raise NotImplementedError here, so a subclass has to supply
    its own constructor and its own featurize().
    """

    def __init__(self):
        # Constructing the bare base class is not supported.
        raise NotImplementedError()

    def featurize(self, X):
        """Turn `X` into model features; to be provided by subclasses."""
        raise NotImplementedError()

class BaseModel(ABC):
    """Minimal interface for models used by the cross-validation cells.

    Subclasses override train_model() and predict(); the versions here only
    raise NotImplementedError.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on the given training inputs and targets."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for `X`.

        `X` is optional only so that the historical zero-argument call keeps
        working; the notebook always calls predict(X), and the base signature
        now accepts that call instead of failing with a TypeError.
        """
        raise NotImplementedError

print("Base classes defined.")

Base classes defined.


In [4]:
# Cell 4: Featurizer (from template)

class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows backed by a descriptor lookup table.

    The table is loaded once in the constructor via load_features(); each row
    is then described by its numeric inputs followed by the table row of its
    "SOLVENT NAME".
    """

    def __init__(self, features='spange_descriptors'):
        assert features in ['drfps_catechol', 'fragprints', 'smiles', 'acs_pca_descriptors', 'spange_descriptors']
        self.features = features
        self.featurizer = load_features(self.features)
        # Descriptor columns plus the two INPUT_LABELS_NUMERIC columns.
        self.feats_dim = self.featurizer.shape[1] + 2

    def featurize(self, X):
        """Return a 2-D tensor: numeric columns, then the solvent's descriptors."""
        numeric_values = X[INPUT_LABELS_NUMERIC].values
        solvent_values = self.featurizer.loc[X["SOLVENT NAME"]].values
        pieces = (torch.tensor(numeric_values), torch.tensor(solvent_values))
        return torch.cat(pieces, dim=1)

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for solvent-mixture rows backed by a descriptor lookup table.

    Each row is described by its numeric inputs, the table row of solvent A,
    the table row of solvent B and finally the "SolventB%" column.
    """

    def __init__(self, features='spange_descriptors'):
        assert features in ['drfps_catechol', 'fragprints', 'smiles', 'acs_pca_descriptors', 'spange_descriptors']
        self.features = features
        self.featurizer = load_features(self.features)
        # Two descriptor blocks (A and B), two numeric columns and "SolventB%".
        self.feats_dim = self.featurizer.shape[1] * 2 + 3

    def featurize(self, X):
        """Return a 2-D tensor: numeric | descriptors of A | descriptors of B | SolventB%."""
        numeric_values = X[INPUT_LABELS_NUMERIC].values
        solvent_a_values = self.featurizer.loc[X["SOLVENT A NAME"]].values
        solvent_b_values = self.featurizer.loc[X["SOLVENT B NAME"]].values
        solvent_b_pct_values = X[["SolventB%"]].values
        ordered_blocks = (numeric_values, solvent_a_values, solvent_b_values, solvent_b_pct_values)
        return torch.cat(tuple(torch.tensor(block) for block in ordered_blocks), dim=1)

print("Featurizers defined.")

Featurizers defined.


In [5]:
# Cell 5: Simple MLP Model (from template, with minor improvements)

class MLPModel(BaseModel):
    """Small fully connected regressor on top of the precomputed featurizers.

    Inputs are standardised with statistics of the training fold, passed
    through Linear -> ReLU -> Dropout blocks and a final Linear layer, and
    predictions are clipped to [0, 1].

    Args:
        data: 'single' selects PrecomputedFeaturizer; any other value selects
            PrecomputedFeaturizerMixed (the notebook passes 'full').
        hidden_dims: Widths of the hidden layers.
        dropout: Dropout probability after every hidden layer.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size.
        seed: If not None, torch's global RNG is seeded with this value at the
            start of train_model(), making weight initialisation, shuffling
            and dropout repeatable. None leaves the RNG untouched.

    The defaults reproduce the values that used to be hard-coded.
    """

    def __init__(self, data='single', hidden_dims=(64, 32), dropout=0.2, lr=0.001,
                 weight_decay=1e-4, epochs=200, batch_size=32, seed=None):
        self.data = data
        if data == 'single':
            self.smiles_featurizer = PrecomputedFeaturizer()
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed()

        self.hidden_dims = tuple(hidden_dims)
        self.dropout = dropout
        self.lr = lr
        self.weight_decay = weight_decay
        self.epochs = epochs
        self.batch_size = batch_size
        self.seed = seed

        self.model = None
        self.scaler_mean = None
        self.scaler_std = None
        self.device = torch.device("cpu")

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit the scaler and a freshly initialised network on one training fold.

        Args:
            train_X: Input rows understood by the configured featurizer.
            train_Y: Target table with one column per output.
            device: Optional torch device (or device string) to train on;
                None means CPU.
            verbose: When True, print the mean training loss every 50 epochs
                and after the last epoch.

        Raises:
            ValueError: If `train_X` has no rows.
        """
        if len(train_X) == 0:
            raise ValueError("train_model requires at least one training row")
        if self.seed is not None:
            torch.manual_seed(self.seed)
        self.device = torch.device(device) if device is not None else torch.device("cpu")

        # Force float64 so integer-valued columns cannot cause dtype mismatches.
        X_tensor = self.smiles_featurizer.featurize(train_X).double()
        Y_tensor = torch.tensor(train_Y.values, dtype=torch.double)

        # Normalize inputs
        self.scaler_mean = X_tensor.mean(dim=0)
        if X_tensor.shape[0] > 1:
            spread = X_tensor.std(dim=0)
        else:
            # The sample std of a single row is NaN; treat it as zero spread.
            spread = torch.zeros_like(self.scaler_mean)
        self.scaler_std = spread + 1e-8
        X_tensor = (X_tensor - self.scaler_mean) / self.scaler_std

        # Simple MLP: one Linear/ReLU/Dropout block per hidden width.
        layers = []
        width = X_tensor.shape[1]
        for hidden in self.hidden_dims:
            layers += [nn.Linear(width, hidden), nn.ReLU(), nn.Dropout(self.dropout)]
            width = hidden
        layers.append(nn.Linear(width, Y_tensor.shape[1]))
        self.model = nn.Sequential(*layers).double().to(self.device)

        # Training
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        criterion = nn.MSELoss()

        dataset = TensorDataset(X_tensor, Y_tensor)
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for epoch in range(self.epochs):
            epoch_loss = 0.0
            for batch_X, batch_Y in loader:
                batch_X = batch_X.to(self.device)
                batch_Y = batch_Y.to(self.device)
                optimizer.zero_grad()
                loss = criterion(self.model(batch_X), batch_Y)
                loss.backward()
                optimizer.step()
                if verbose:
                    epoch_loss += loss.item() * batch_X.shape[0]
            if verbose and ((epoch + 1) % 50 == 0 or epoch + 1 == self.epochs):
                print(f"epoch {epoch + 1}/{self.epochs} - train MSE: {epoch_loss / len(dataset):.6f}")

    def predict(self, X):
        """Predict targets for `X`, clipped to [0, 1], as a CPU tensor.

        Raises:
            RuntimeError: If called before train_model().
        """
        if self.model is None:
            raise RuntimeError("MLPModel.predict() called before train_model()")

        X_tensor = self.smiles_featurizer.featurize(X).double()
        X_tensor = (X_tensor - self.scaler_mean) / self.scaler_std

        self.model.eval()
        with torch.no_grad():
            pred = self.model(X_tensor.to(self.device))

        # Clip to [0, 1]
        pred = torch.clamp(pred, 0, 1)

        # Always hand back a CPU tensor so callers can call .numpy() directly.
        return pred.cpu()

print("MLPModel defined.")

MLPModel defined.


In [6]:
# Quick test to verify model works
print("Testing model...")
X, Y = load_data("single_solvent")
print(f"Single solvent data: X={X.shape}, Y={Y.shape}")

# Test one fold
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

model = MLPModel()
model.train_model(train_X, train_Y)
preds = model.predict(test_X)

# Check the predictions before claiming success: one row per test sample, one
# column per target, finite values, all inside the clipped [0, 1] range.
expected_shape = (len(test_X), len(TARGET_LABELS))
assert tuple(preds.shape) == expected_shape, f"expected shape {expected_shape}, got {tuple(preds.shape)}"
assert torch.isfinite(preds).all(), "predictions contain NaN or infinite values"
assert preds.min() >= 0 and preds.max() <= 1, "predictions fall outside [0, 1]"

print(f"Predictions shape: {preds.shape}")
print(f"Predictions range: [{preds.min():.4f}, {preds.max():.4f}]")
print("Model test passed!")

Testing model...
Single solvent data: X=(656, 3), Y=(656, 3)


Predictions shape: torch.Size([37, 3])
Predictions range: [0.0028, 0.8619]
Model test passed!


In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MLPModel() # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Single solvent predictions: {len(submission_single_solvent)}")

0it [00:00, ?it/s]

1it [00:02,  2.81s/it]

2it [00:05,  2.82s/it]

3it [00:08,  2.76s/it]

4it [00:11,  2.73s/it]

5it [00:13,  2.76s/it]

6it [00:16,  2.80s/it]

7it [00:19,  2.81s/it]

8it [00:22,  2.80s/it]

9it [00:25,  2.86s/it]

10it [00:28,  2.85s/it]

11it [00:30,  2.83s/it]

12it [00:33,  2.83s/it]

13it [00:36,  2.82s/it]

14it [00:39,  2.82s/it]

15it [00:42,  2.82s/it]

16it [00:45,  2.83s/it]

17it [00:47,  2.86s/it]

18it [00:50,  2.86s/it]

19it [00:53,  2.84s/it]

20it [00:56,  2.83s/it]

21it [00:59,  2.84s/it]

22it [01:02,  2.83s/it]

23it [01:04,  2.83s/it]

24it [01:07,  2.82s/it]

24it [01:07,  2.82s/it]

Single solvent predictions: 656





In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MLPModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Full data predictions: {len(submission_full_data)}")

0it [00:00, ?it/s]

1it [00:05,  5.06s/it]

2it [00:10,  5.05s/it]

3it [00:15,  5.06s/it]

4it [00:20,  5.04s/it]

5it [00:25,  5.08s/it]

6it [00:30,  5.06s/it]

7it [00:35,  5.19s/it]

8it [00:40,  5.14s/it]

9it [00:46,  5.24s/it]

10it [00:51,  5.18s/it]

11it [00:56,  5.30s/it]

12it [01:02,  5.23s/it]

13it [01:07,  5.32s/it]

13it [01:07,  5.20s/it]

Full data predictions: 1227





In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Submission saved to /home/submission/submission.csv")
print(f"Total rows: {len(submission)}")

Submission saved to /home/submission/submission.csv
Total rows: 1883


In [10]:
# Verify submission format
# NOTE: the banner on the cell that writes submission.csv says it must be the
# final cell of a submitted notebook. This diagnostic cell comes after it, so
# remove it before submitting.
print("\n" + "="*60)
print("SUBMISSION VERIFICATION")
print("="*60)

df = pd.read_csv('/home/submission/submission.csv')

print(f"\nColumns: {df.columns.tolist()}")

# Fail loudly on a malformed file instead of only printing statistics.
required_columns = ['task', 'fold', 'row', 'target_1', 'target_2', 'target_3']
missing_columns = [name for name in required_columns if name not in df.columns]
assert not missing_columns, f"submission is missing columns: {missing_columns}"

print(f"Total rows: {len(df)}")
print(f"Tasks: {df['task'].unique()}")
print(f"Folds per task:")
print(df.groupby('task')['fold'].nunique())

print(f"\nTarget statistics:")
for col in ['target_1', 'target_2', 'target_3']:
    print(f"  {col}: min={df[col].min():.6f}, max={df[col].max():.6f}")
    print(f"    Values > 1: {(df[col] > 1).sum()}")
    print(f"    Values < 0: {(df[col] < 0).sum()}")

print(f"\nFirst 5 rows:")
print(df.head())

# The checks run after the printouts so the diagnostics above stay visible
# when one of them fails.
assert len(df) > 0, "submission file is empty"
for col in ['target_1', 'target_2', 'target_3']:
    assert df[col].notna().all(), f"{col} contains missing values"
    assert df[col].between(0, 1).all(), f"{col} has values outside [0, 1]"


SUBMISSION VERIFICATION

Columns: ['id', 'index', 'task', 'fold', 'row', 'target_1', 'target_2', 'target_3']
Total rows: 1883
Tasks: [0 1]
Folds per task:
task
0    24
1    13
Name: fold, dtype: int64

Target statistics:
  target_1: min=0.000000, max=0.434598
    Values > 1: 0
    Values < 0: 0
  target_2: min=0.000000, max=0.435151
    Values > 1: 0
    Values < 0: 0
  target_3: min=0.000000, max=1.000000
    Values > 1: 0
    Values < 0: 0

First 5 rows:
   id  index  task  fold  row  target_1  target_2  target_3
0   0      0     0     0    0  0.001120  0.001556  0.984652
1   1      1     0     0    1  0.010681  0.008359  0.949444
2   2      2     0     0    2  0.039946  0.027438  0.847869
3   3      3     0     0    3  0.090046  0.059719  0.678352
4   4      4     0     0    4  0.137220  0.091779  0.518236


In [11]:
# Calculate CV for logging
# NOTE: this cell trains new models, separate from the ones that produced
# submission.csv, and training is not seeded, so the scores are run-dependent
# estimates rather than the exact error of the submitted predictions. The cell
# also sits after the template's final cell and has to be removed before the
# notebook is submitted.
print("\n" + "="*60)
print("CV CALCULATION")
print("="*60)


def _cv_fold_mses(X, Y, split_fn, make_model):
    """Return the test MSE of every fold produced by `split_fn(X, Y)`.

    A new model from `make_model()` is trained on each fold's training part;
    the fold score is the mean squared error over all test rows and targets.
    """
    mses = []
    for (train_X, train_Y), (test_X, test_Y) in split_fn(X, Y):
        model = make_model()
        model.train_model(train_X, train_Y)
        # Same tensor-to-numpy conversion as the template submission cells.
        preds = model.predict(test_X).detach().cpu().numpy()
        mses.append(np.mean((preds - test_Y.values) ** 2))
    return mses


# Single solvent CV
X, Y = load_data("single_solvent")
fold_mses = _cv_fold_mses(X, Y, generate_leave_one_out_splits, MLPModel)

single_cv = np.mean(fold_mses)
print(f"Single solvent CV MSE: {single_cv:.6f}")

# Full data CV
X, Y = load_data("full")
full_fold_mses = _cv_fold_mses(
    X, Y, generate_leave_one_ramp_out_splits, lambda: MLPModel(data='full')
)

full_cv = np.mean(full_fold_mses)
print(f"Full data CV MSE: {full_cv:.6f}")

# NOTE(review): only the single-solvent score is logged as the final CV; the
# full-data score is printed above but not combined into it - confirm intended.
print(f"\nFINAL CV FOR LOGGING: {single_cv:.6f}")


CV CALCULATION


Single solvent CV MSE: 0.008504


Full data CV MSE: 0.014875

FINAL CV FOR LOGGING: 0.008504
