In [1]:
# Experiment 068: Extrapolation Detection + Conservative Predictions
# Goal: Reduce the CV-LB intercept by making conservative predictions when extrapolating
#
# Strategy:
# 1. Compute distance from test solvent to nearest training solvents (using Spange descriptors)
# 2. When extrapolating (high distance), blend predictions toward population mean
# 3. This could reduce the intercept by making conservative predictions for hard cases

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')

# Seed the global NumPy and PyTorch random number generators so that repeated
# runs of the notebook start from the same random state.
np.random.seed(42)
torch.manual_seed(42)
# Make float64 the default floating-point dtype for tensors and layers created
# after this point. NOTE(review): presumably needed so the nn.Linear weights
# match the tensors the featurizers build from pandas values -- confirm the
# CSV columns load as float64.
torch.set_default_dtype(torch.double)

# Cell-completion marker only.
print('Imports successful')

Imports successful


In [2]:
# Data loading functions
# Directory that holds the yield CSVs read by load_data and the
# "<name>_lookup.csv" descriptor tables read by load_features.
DATA_PATH = '/home/data'

# Input columns selected by load_data for the solvent-mixture ("full") dataset.
# "SolventB%" is used by the mixed featurizer as the weight p in
# A * (1 - p) + B * p. NOTE(review): that formula only interpolates if the
# column is a fraction in [0, 1] rather than a 0-100 percentage -- confirm.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Input columns selected by load_data for the single-solvent dataset.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# The two numeric process columns that the featurizers place in front of the
# solvent descriptors (this is where the "+ 2" in feats_dim comes from).
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Target columns returned as Y by load_data; the submission cells write them
# out, in this order, as target_1, target_2 and target_3.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the solvent-mixture table
            (``catechol_full_data_yields.csv``). Any other value selects the
            single-solvent table (``catechol_single_solvent_yields.csv``).

    Returns:
        Tuple ``(X, Y)`` of DataFrames taken from the same CSV: ``X`` holds
        the input columns for the chosen table and ``Y`` holds
        ``TARGET_LABELS``.
    """
    wants_mixtures = name == "full"
    if wants_mixtures:
        csv_file = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        # Every name other than "full" falls back to the single-solvent table.
        csv_file = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    table = pd.read_csv(csv_file)
    return table[input_columns], table[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read the descriptor lookup table ``<DATA_PATH>/<name>_lookup.csv``.

    Args:
        name: Stem of the lookup file, without the ``_lookup.csv`` suffix.

    Returns:
        DataFrame whose index is the first CSV column and whose remaining
        columns are the descriptor values.
    """
    lookup_file = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_file, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the distinct values found in the
    ``"SOLVENT NAME"`` column of ``X``.

    Args:
        X: Input DataFrame containing a ``"SOLVENT NAME"`` column.
        Y: Target DataFrame that can be filtered by the same row mask as ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        exactly the rows of the held-out solvent and the train part holds all
        other rows.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        is_held_out = X["SOLVENT NAME"] == held_out
        is_kept = ~is_held_out
        train_part = (X[is_kept], Y[is_kept])
        test_part = (X[is_held_out], Y[is_held_out])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. A row goes to the
    test part when it matches the held-out pair in the solvent-A column OR in
    the solvent-B column; a training row must therefore differ from the pair
    in both columns. Rows that share only one solvent with the held-out pair
    are consequently held out as well.

    Args:
        X: Input DataFrame with ``"SOLVENT A NAME"`` and ``"SOLVENT B NAME"``.
        Y: Target DataFrame that can be filtered by the same row mask as ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_ramps = X[pair_columns].drop_duplicates()
    for _, ramp in distinct_ramps.iterrows():
        # Column-wise comparison against the pair: a row is held out as soon
        # as either of its two solvent entries equals the pair's entry.
        shares_a_solvent = (X[pair_columns] == ramp).any(axis=1)
        is_kept = ~shares_a_solvent
        train_part = (X[is_kept], Y[is_kept])
        test_part = (X[shares_a_solvent], Y[shares_a_solvent])
        yield train_part, test_part

# Load Spange descriptors for extrapolation detection.
# SPANGE_DF is a module-level lookup (rows selected by solvent name via .loc)
# that ExtrapolationAwareMLPModel reads directly when it measures how far a
# test solvent lies from the training solvents.
SPANGE_DF = load_features('spange_descriptors')
# Sanity output: the shape of the descriptor table, then a cell-completion marker.
print(f'Spange descriptors shape: {SPANGE_DF.shape}')
print('Data loading functions defined')

Spange descriptors shape: (26, 13)
Data loading functions defined


In [3]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Interface for objects that turn an input table into model features.

    The base class is not usable on its own: both the constructor and
    ``featurize`` raise ``NotImplementedError``. Subclasses provide their own
    ``__init__`` (without delegating to this one) and their own ``featurize``.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """Convert the input table ``X`` into a feature tensor.

        The signature matches the concrete featurizers in this notebook, which
        all implement ``featurize(self, X)``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal interface shared by the models in this notebook.

    Subclasses override ``train_model`` and ``predict``; the versions here
    only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on the training inputs and targets.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for the input table ``X``.

        ``X`` is accepted here so the base signature agrees with how models
        are called (``model.predict(test_X)``). It defaults to ``None`` so
        existing argument-less calls keep their behaviour.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

In [4]:
# Featurizers
class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows backed by a precomputed lookup table.

    Each row becomes the two numeric process columns followed by the
    descriptor row looked up under the row's ``"SOLVENT NAME"``.
    """

    def __init__(self, features='spange_descriptors'):
        # The base-class constructor is deliberately not called: it raises.
        self.features = features
        self.featurizer = load_features(self.features)
        # Descriptor columns plus the two numeric inputs prepended by featurize().
        n_descriptor_columns = self.featurizer.shape[1]
        self.feats_dim = n_descriptor_columns + 2

    def featurize(self, X):
        """Build the model input tensor for the rows of ``X``.

        Args:
            X: DataFrame with the ``INPUT_LABELS_NUMERIC`` columns and a
                ``"SOLVENT NAME"`` column whose values exist in the lookup index.

        Returns:
            2-D tensor with one row per input row: numeric columns first,
            solvent descriptors after.
        """
        process_part = torch.tensor(X[INPUT_LABELS_NUMERIC].values)
        descriptor_rows = self.featurizer.loc[X["SOLVENT NAME"]]
        descriptor_part = torch.tensor(descriptor_rows.values)
        return torch.cat((process_part, descriptor_part), dim=1)

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for solvent-mixture rows backed by a precomputed lookup table.

    The descriptors of solvent A and solvent B are combined linearly with the
    row's ``"SolventB%"`` value ``p`` as ``A * (1 - p) + B * p``, and the two
    numeric process columns are placed in front.
    """

    def __init__(self, features='spange_descriptors'):
        # The base-class constructor is deliberately not called: it raises.
        self.features = features
        self.featurizer = load_features(self.features)
        # Descriptor columns plus the two numeric inputs prepended by featurize().
        n_descriptor_columns = self.featurizer.shape[1]
        self.feats_dim = n_descriptor_columns + 2

    def featurize(self, X):
        """Build the model input tensor for the rows of ``X``.

        Args:
            X: DataFrame with the ``INPUT_LABELS_NUMERIC`` columns, the
                ``"SOLVENT A NAME"`` / ``"SOLVENT B NAME"`` columns (values
                present in the lookup index) and ``"SolventB%"``.

        Returns:
            2-D tensor with one row per input row: numeric columns first,
            blended solvent descriptors after.
        """
        process_values = X[INPUT_LABELS_NUMERIC]
        descriptors_a = self.featurizer.loc[X["SOLVENT A NAME"]]
        descriptors_b = self.featurizer.loc[X["SOLVENT B NAME"]]
        process_part = torch.tensor(process_values.values)

        # Column vector so the weight broadcasts across all descriptor columns.
        weight_b = X["SolventB%"].values.reshape(-1, 1)
        blended = descriptors_a.values * (1 - weight_b) + descriptors_b.values * weight_b

        return torch.cat((process_part, torch.tensor(blended)), dim=1)

print('Featurizers defined')

Featurizers defined


In [5]:
# MLP Model with Extrapolation Detection
class ExtrapolationAwareMLPModel(nn.Module, BaseModel):
    """MLP regressor that pulls predictions toward the training mean when the
    test solvent lies far from the training solvents.

    The network maps the featurizer output through ``hidden_dims`` ReLU layers
    to ``output_dim`` sigmoid outputs. At prediction time the Euclidean
    distance (in ``SPANGE_DF`` descriptor space) from each test row to its
    nearest training solvent is turned into a blend weight in ``[0, 1]``:

        weight = clip((distance / reference_distance) / blend_threshold, 0, 1)
        prediction = (1 - weight) * mlp_output + weight * training_target_mean

    ``reference_distance`` is fitted in ``train_model`` as the largest
    nearest-neighbour distance among the training solvents themselves, so a
    normalised score of 1 means "as isolated as the most isolated training
    solvent". The scale is fixed at training time on purpose: normalising by
    the maximum of the test batch makes every row of a single-solvent batch
    score exactly 1, which (for any ``blend_threshold <= 1``) replaces the MLP
    output with the training mean.

    NOTE(review): distances are computed on the raw descriptor columns, so
    columns with a larger numeric range dominate. For mixture data the
    reference set is the pure solvents, not the training mixtures.

    Args:
        features: Name of the lookup table handed to the featurizer.
        hidden_dims: Sizes of the hidden layers, in order.
        output_dim: Number of regression targets.
        dropout: Dropout probability after each hidden layer (0 disables it).
        data: ``'single'`` for single-solvent rows; any other value selects
            the mixture featurizer and mixture columns.
        blend_threshold: Normalised distance at which the prediction becomes
            the training mean entirely. Must be positive.

    Raises:
        ValueError: If ``blend_threshold`` is not positive.
    """

    def __init__(self, features='spange_descriptors', hidden_dims=(64, 64), output_dim=3, dropout=0.0, data='single', blend_threshold=0.3):
        super(ExtrapolationAwareMLPModel, self).__init__()
        if blend_threshold <= 0:
            # A zero or negative threshold would yield inf/NaN or negative blend weights.
            raise ValueError(f"blend_threshold must be positive, got {blend_threshold}")
        self.data_type = data
        self.blend_threshold = blend_threshold

        if data == 'single':
            self.smiles_featurizer = PrecomputedFeaturizer(features=features)
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed(features=features)

        prev_dim = self.smiles_featurizer.feats_dim
        layers = []
        for h_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, h_dim))
            layers.append(nn.ReLU())
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        self.model = nn.Sequential(*layers)

        # State for extrapolation detection; populated by train_model().
        self.train_solvents = None
        self.train_solvent_features = None
        self.mean_pred = None
        self.nn_model = None
        self.reference_distance = None

    @staticmethod
    def _reference_distance(features):
        """Return the largest nearest-neighbour distance within ``features``.

        Falls back to 1.0 when fewer than two points are available or all
        points coincide, so callers can always divide by the result.
        """
        points = np.asarray(features, dtype=float)
        if points.shape[0] < 2:
            return 1.0
        deltas = points[:, None, :] - points[None, :, :]
        pairwise = np.sqrt((deltas ** 2).sum(axis=-1))
        # Exclude each point's zero distance to itself.
        np.fill_diagonal(pairwise, np.inf)
        scale = float(pairwise.min(axis=1).max())
        return scale if np.isfinite(scale) and scale > 0 else 1.0

    def _fit_extrapolation_detector(self, train_X):
        """Record the training solvents and fit the nearest-neighbour index."""
        if self.data_type == 'single':
            self.train_solvents = train_X["SOLVENT NAME"].unique().tolist()
        else:
            solvents_a = train_X["SOLVENT A NAME"].unique().tolist()
            solvents_b = train_X["SOLVENT B NAME"].unique().tolist()
            # Sorted so the stored order does not depend on set iteration order.
            self.train_solvents = sorted(set(solvents_a) | set(solvents_b))
        self.train_solvent_features = SPANGE_DF.loc[self.train_solvents].values

        self.nn_model = NearestNeighbors(n_neighbors=1, metric='euclidean')
        self.nn_model.fit(self.train_solvent_features)
        self.reference_distance = self._reference_distance(self.train_solvent_features)

    def train_model(self, train_X, train_Y, criterion=nn.MSELoss, optimizer=torch.optim.Adam,
                    num_epochs=100, batch_size=1048, device="cpu", verbose=True, lr=1e-3):
        """Fit the extrapolation detector, the blending mean and the MLP.

        Args:
            train_X: Training inputs in the layout expected by the featurizer.
            train_Y: Training targets; their column means become the value
                predictions are blended toward.
            criterion: Loss class, instantiated without arguments.
            optimizer: Optimizer class, called as ``optimizer(params, lr=lr)``.
            num_epochs: Number of passes over the training data.
            batch_size: Mini-batch size; batches are taken in row order.
            device: Device to train on; ``None`` selects CUDA when available.
            verbose: Currently unused; kept for interface compatibility.
            lr: Learning rate.
        """
        self.train()

        self._fit_extrapolation_detector(train_X)

        # Per-target training mean that extrapolating predictions shrink toward.
        self.mean_pred = torch.tensor(train_Y.values.mean(axis=0), dtype=torch.double)

        train_X_tensor = self.smiles_featurizer.featurize(train_X)
        train_Y_tensor = torch.tensor(train_Y.values)
        train_dataset = TensorDataset(train_X_tensor, train_Y_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)

        criterion = criterion()
        optimizer = optimizer(self.parameters(), lr=lr)

        for epoch in range(1, num_epochs + 1):
            self.train()
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = self.forward_raw(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

    def forward_raw(self, x):
        """Return the un-blended network output, squashed into (0, 1)."""
        return torch.sigmoid(self.model(x))

    def compute_extrapolation_score(self, X):
        """Compute how far test solvents are from the training solvents.

        Returns:
            1-D NumPy array with, for each row of ``X``, the Euclidean distance
            from its descriptor vector to the nearest training solvent.

        Raises:
            RuntimeError: If ``train_model`` has not been called yet.
        """
        if self.nn_model is None:
            raise RuntimeError("train_model must be called before computing extrapolation scores")

        if self.data_type == 'single':
            test_features = SPANGE_DF.loc[X["SOLVENT NAME"]].values
        else:
            # For mixtures, use the same weighted average of the two solvents'
            # descriptors that the mixed featurizer feeds to the network.
            feat_a = SPANGE_DF.loc[X["SOLVENT A NAME"]].values
            feat_b = SPANGE_DF.loc[X["SOLVENT B NAME"]].values
            pct = X["SolventB%"].values.reshape(-1, 1)
            test_features = feat_a * (1 - pct) + feat_b * pct

        if len(test_features) == 0:
            # Nothing to query; avoid handing an empty array to kneighbors().
            return np.zeros(0, dtype=float)

        distances, _ = self.nn_model.kneighbors(test_features)
        return distances.flatten()

    def predict(self, X):
        """Predict targets for ``X``, shrinking toward the training mean for
        rows whose solvent is far from the training solvents.

        Each row's blend weight depends only on that row and on quantities
        fitted in ``train_model``, never on the other rows in ``X``.

        Returns:
            CPU tensor with one row per input row and one column per target.

        Raises:
            RuntimeError: If ``train_model`` has not been called yet.
        """
        if self.nn_model is None or self.mean_pred is None or self.reference_distance is None:
            raise RuntimeError("train_model must be called before predict")

        self.eval()
        X_tensor = self.smiles_featurizer.featurize(X)

        # Run on whatever device the weights live on, then blend on the CPU
        # where mean_pred is stored.
        model_device = next(self.parameters()).device
        with torch.no_grad():
            raw_pred = self.forward_raw(X_tensor.to(model_device)).cpu()

        # Normalise by the scale fitted at training time (0 = on a training
        # solvent, 1 = as far as the most isolated training solvent).
        extrap_scores = self.compute_extrapolation_score(X)
        normalized_scores = extrap_scores / self.reference_distance

        # Higher extrapolation score = more conservative (closer to the mean);
        # at or above blend_threshold the prediction is the mean itself.
        blend_weights = np.clip(normalized_scores / self.blend_threshold, 0.0, 1.0)
        blend_weights = torch.tensor(blend_weights, dtype=torch.double).unsqueeze(1)

        # Blend: (1 - weight) * raw_pred + weight * mean_pred
        blended_pred = (1 - blend_weights) * raw_pred + blend_weights * self.mean_pred

        return blended_pred

    def forward(self, x):
        """nn.Module entry point; identical to ``forward_raw``."""
        return self.forward_raw(x)

print('ExtrapolationAwareMLPModel defined')

ExtrapolationAwareMLPModel defined


In [6]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ExtrapolationAwareMLPModel(blend_threshold=0.5) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:01,  1.10s/it]

2it [00:01,  1.54it/s]

3it [00:01,  1.94it/s]

4it [00:02,  2.32it/s]

5it [00:02,  2.55it/s]

6it [00:02,  2.72it/s]

7it [00:03,  2.75it/s]

8it [00:03,  2.86it/s]

9it [00:03,  2.70it/s]

10it [00:04,  2.82it/s]

11it [00:04,  2.78it/s]

12it [00:04,  2.85it/s]

13it [00:05,  2.93it/s]

14it [00:05,  3.00it/s]

15it [00:05,  2.83it/s]

16it [00:06,  2.94it/s]

17it [00:06,  2.99it/s]

18it [00:06,  2.88it/s]

19it [00:07,  2.79it/s]

20it [00:07,  2.89it/s]

21it [00:07,  2.74it/s]

22it [00:08,  2.88it/s]

23it [00:08,  2.99it/s]

24it [00:08,  3.02it/s]

24it [00:08,  2.69it/s]




In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ExtrapolationAwareMLPModel(data='full', blend_threshold=0.5) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  1.46it/s]

2it [00:01,  1.60it/s]

3it [00:01,  1.73it/s]

4it [00:02,  1.72it/s]

5it [00:02,  1.73it/s]

6it [00:03,  1.67it/s]

7it [00:04,  1.66it/s]

8it [00:04,  1.73it/s]

9it [00:05,  1.73it/s]

10it [00:05,  1.74it/s]

11it [00:06,  1.66it/s]

12it [00:07,  1.61it/s]

13it [00:07,  1.63it/s]

13it [00:07,  1.67it/s]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################