In [1]:
# Experiment 070: Extrapolation Detection v2
# 
# Key insight: When comparing to ALL solvents, the test solvent is close to ITSELF
# So we need to compare to all solvents EXCEPT the test solvent
# 
# This identifies solvents that are truly "outliers" in the solvent space
# Water, HFIP, Ethylene Glycol might be outliers

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist, cdist
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides numerical RuntimeWarnings (e.g. division by
# zero or mean of an empty slice) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Seed the NumPy and PyTorch global random number generators.
np.random.seed(42)
torch.manual_seed(42)
# Use float64 as the default floating-point dtype for torch from here on.
torch.set_default_dtype(torch.double)

print('Imports successful')

Imports successful


In [2]:
# Data loading functions
# Directory that holds the yield CSVs and the "<name>_lookup.csv" descriptor tables.
DATA_PATH = '/home/data'

# Input columns selected by load_data("full") (two-solvent mixtures).
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Input columns selected by load_data() for any other dataset name (single solvent).
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Numeric process inputs that the featurizers prepend to the solvent descriptors.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Target columns returned as Y by load_data.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: ``"full"`` reads the mixture table and selects
            ``INPUT_LABELS_FULL_SOLVENT``; any other value reads the
            single-solvent table and selects ``INPUT_LABELS_SINGLE_SOLVENT``.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns and ``Y`` the
        ``TARGET_LABELS`` columns of the same CSV.
    """
    full_requested = name == "full"
    csv_path = (
        f'{DATA_PATH}/catechol_full_data_yields.csv'
        if full_requested
        else f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    )
    input_columns = INPUT_LABELS_FULL_SOLVENT if full_requested else INPUT_LABELS_SINGLE_SOLVENT

    table = pd.read_csv(csv_path)
    return table[input_columns], table[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read ``<DATA_PATH>/<name>_lookup.csv`` with its first column as the index.

    Args:
        name: Stem of the lookup file (``"_lookup.csv"`` is appended).

    Returns:
        The lookup table as a DataFrame.
    """
    lookup_csv = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_csv, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct value of ``X["SOLVENT NAME"]``.

    Solvents are visited in sorted order. For each one, the test part holds
    the rows whose solvent equals it and the train part holds all other rows.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))``
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out in sorted(solvent_column.unique()):
        is_held_out = solvent_column == held_out
        is_kept = ~is_held_out
        yield (X[is_kept], Y[is_kept]), (X[is_held_out], Y[is_held_out])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per solvent that appears as solvent A or B.

    Solvents are visited in sorted order. For each one, the test part holds
    every row where it is solvent A or solvent B; the train part holds the
    rows where it is neither.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))``
    """
    column_a = X["SOLVENT A NAME"]
    column_b = X["SOLVENT B NAME"]
    solvent_pool = np.union1d(column_a.unique(), column_b.unique())
    for held_out in sorted(solvent_pool):
        involves_held_out = (column_a == held_out) | (column_b == held_out)
        # De Morgan: "A != s and B != s" is the complement of "A == s or B == s"
        avoids_held_out = ~involves_held_out
        yield (
            (X[avoids_held_out], Y[avoids_held_out]),
            (X[involves_held_out], Y[involves_held_out]),
        )

# Load Spange descriptors for ALL solvents (global).
# The table is read once and kept at module level because the outlier
# analysis further down reads SPANGE_DF directly.
SPANGE_DF = load_features('spange_descriptors')
print(f'Spange descriptors shape: {SPANGE_DF.shape}')
print('Data loading functions defined')

Spange descriptors shape: (26, 13)
Data loading functions defined


In [3]:
# Outlier analysis over the solvent descriptor table.
# A solvent's score is its mean Euclidean distance, in standardised descriptor
# space, to its k nearest neighbours among the OTHER solvents. The solvent's own
# row is removed first so it cannot match itself at distance zero.

solvent_scaler = StandardScaler()
scaled_features = solvent_scaler.fit_transform(SPANGE_DF.values)

k = 3  # number of nearest neighbours averaged into each score
outlier_scores = {}

for i, solvent in enumerate(SPANGE_DF.index):
    # Drop row i so only the remaining solvents are candidate neighbours
    other_features = np.delete(scaled_features, i, axis=0)

    # Distances from solvent i to every other solvent, smallest first
    distances = cdist(scaled_features[i:i + 1], other_features, metric='euclidean').ravel()
    distances.sort()
    outlier_scores[solvent] = distances[:k].mean()

# Rank from most to least isolated
sorted_scores = sorted(outlier_scores.items(), key=lambda item: item[1], reverse=True)
print("Solvent outlier scores (higher = more isolated):")
for solvent, score in sorted_scores:
    print(f"  {solvent}: {score:.4f}")

# Summary statistics of the scores
score_values = list(outlier_scores.values())
mean_score = np.mean(score_values)
std_score = np.std(score_values)
print(f"\nMean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Threshold (mean + 1*std): {mean_score + std_score:.4f}")

Solvent outlier scores (higher = more isolated):
  1,1,1,3,3,3-Hexafluoropropan-2-ol: 4.5701
  Cyclohexane: 4.1844
  Water: 3.7000
  Ethylene Glycol [1,2-Ethanediol]: 3.6865
  2,2,2-Trifluoroethanol: 3.5894
  Decanol: 2.8970
  DMA [N,N-Dimethylacetamide]: 2.8753
  Water.2,2,2-Trifluoroethanol: 2.8643
  Water.Acetonitrile: 2.7194
  Dihydrolevoglucosenone (Cyrene): 2.3906
  Methanol: 2.3689
  Acetonitrile: 2.3031
  Acetic Acid: 2.2813
  tert-Butanol [2-Methylpropan-2-ol]: 2.1456
  Acetonitrile.Acetic Acid: 1.8456
  IPA [Propan-2-ol]: 1.7483
  Ethanol: 1.7259
  Butanone [MEK]: 1.3767
  Ethyl Lactate: 1.3360
  Diethyl Ether [Ether]: 1.3061
  MTBE [tert-Butylmethylether]: 1.1296
  Dimethyl Carbonate: 1.0743
  2-Methyltetrahydrofuran [2-MeTHF]: 1.0678
  THF [Tetrahydrofuran]: 1.0429
  Methyl Propionate: 1.0343
  Ethyl Acetate: 0.9985

Mean: 2.2408, Std: 1.0424
Threshold (mean + 1*std): 3.2833


In [4]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Interface for objects that turn an input table into a feature tensor.

    The base class cannot be instantiated: its constructor raises
    ``NotImplementedError``, so subclasses must define their own ``__init__``
    (without delegating to this one) and override ``featurize``.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """Convert the input table ``X`` into model features.

        The signature matches the subclasses in this file
        (``featurize(self, X)``), so calling it on an instance that did not
        override it raises ``NotImplementedError`` rather than a ``TypeError``
        about argument counts.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface: fit with ``train_model``, infer with ``predict``.

    Both methods raise ``NotImplementedError`` here and are meant to be
    overridden by concrete models.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for the inputs ``X``.

        ``X`` is accepted so the base signature agrees with the concrete
        model in this file (``predict(self, X)``). It defaults to ``None``
        only so that the old zero-argument call keeps raising the same
        exception.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

In [5]:
# Featurizers
class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows backed by a descriptor lookup table.

    Each output row is the two ``INPUT_LABELS_NUMERIC`` columns followed by
    the lookup-table row of the solvent named in ``"SOLVENT NAME"``.
    """

    def __init__(self, features='spange_descriptors'):
        lookup = load_features(features)
        self.features = features
        self.featurizer = lookup
        # Descriptor columns plus the two numeric inputs prepended in featurize()
        self.feats_dim = lookup.shape[1] + 2

    def featurize(self, X):
        """Return a 2-D tensor: numeric inputs concatenated with solvent descriptors."""
        process_block = torch.tensor(X[INPUT_LABELS_NUMERIC].values)
        # .loc with the name column selects one descriptor row per input row
        descriptor_block = torch.tensor(self.featurizer.loc[X["SOLVENT NAME"]].values)
        return torch.cat((process_block, descriptor_block), dim=1)

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for two-solvent rows backed by a descriptor lookup table.

    Each output row is the two ``INPUT_LABELS_NUMERIC`` columns followed by a
    linear mix of the descriptor rows of solvent A and solvent B, weighted by
    ``"SolventB%"``.
    """

    def __init__(self, features='spange_descriptors'):
        lookup = load_features(features)
        self.features = features
        self.featurizer = lookup
        # Descriptor columns plus the two numeric inputs prepended in featurize()
        self.feats_dim = lookup.shape[1] + 2

    def featurize(self, X):
        """Return a 2-D tensor: numeric inputs concatenated with mixed descriptors."""
        descriptors_a = self.featurizer.loc[X["SOLVENT A NAME"]].values
        descriptors_b = self.featurizer.loc[X["SOLVENT B NAME"]].values

        # Column vector so it broadcasts across the descriptor columns.
        # NOTE(review): dividing by 100 assumes "SolventB%" is stored on a
        # 0-100 scale — confirm against the CSV; if it is already a 0-1
        # fraction the mixture collapses towards solvent A.
        percent_b = X["SolventB%"].values.reshape(-1, 1)
        share_b = percent_b / 100
        share_a = 1 - share_b
        mixed_descriptors = descriptors_a * share_a + descriptors_b * share_b

        process_block = torch.tensor(X[INPUT_LABELS_NUMERIC].values)
        descriptor_block = torch.tensor(mixed_descriptors)
        return torch.cat((process_block, descriptor_block), dim=1)

print('Featurizers defined')

Featurizers defined


In [6]:
# Pre-compute outlier scores for all solvents (excluding self)
# This is done ONCE globally

def compute_solvent_outlier_scores(k=3, features=None):
    """Score how isolated each solvent is in standardised descriptor space.

    The score of a solvent is the mean Euclidean distance to its ``k``
    nearest neighbours among the OTHER solvents (the solvent itself is
    excluded so it cannot match itself at distance zero). Descriptors are
    standardised with ``StandardScaler`` before distances are taken.

    Args:
        k: Number of nearest neighbours to average. If ``k`` exceeds the
            number of other solvents, all of them are averaged.
        features: Descriptor table with one row per solvent, indexed by
            solvent name. Defaults to the module-level ``SPANGE_DF``.

    Returns:
        dict mapping each index label of the table to its score, in table order.

    Raises:
        ValueError: if ``k < 1`` or the table has fewer than two rows. Either
            case would otherwise average an empty set of distances and
            silently produce NaN scores.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    descriptor_table = SPANGE_DF if features is None else features
    n_solvents = len(descriptor_table)
    if n_solvents < 2:
        raise ValueError("at least two solvents are required to measure distance to other solvents")

    scaled_features = StandardScaler().fit_transform(descriptor_table.values)

    # All pairwise distances at once; an infinite diagonal removes each
    # solvent from its own neighbour list.
    pairwise = cdist(scaled_features, scaled_features, metric='euclidean')
    np.fill_diagonal(pairwise, np.inf)
    pairwise.sort(axis=1)

    # Never reach into the infinite self-distance sorted to the end of each row
    n_neighbours = min(k, n_solvents - 1)
    scores = pairwise[:, :n_neighbours].mean(axis=1)

    return dict(zip(descriptor_table.index, scores))

# Module-level cache of the per-solvent scores plus their mean and standard
# deviation; these three names are read later when blend weights are derived.
SOLVENT_OUTLIER_SCORES = compute_solvent_outlier_scores(k=3)

_outlier_score_values = list(SOLVENT_OUTLIER_SCORES.values())
mean_outlier_score = np.mean(_outlier_score_values)
std_outlier_score = np.std(_outlier_score_values)

print(f"Pre-computed outlier scores. Mean: {mean_outlier_score:.4f}, Std: {std_outlier_score:.4f}")

Pre-computed outlier scores. Mean: 2.2408, Std: 1.0424


In [7]:
# Extrapolation-Aware MLP Model v2
# Uses pre-computed outlier scores to identify solvents that need conservative predictions

class ExtrapolationAwareMLPModelV2(nn.Module, BaseModel):
    """MLP yield model that shrinks predictions for isolated solvents.

    The network is a plain feed-forward MLP with a sigmoid output. At
    prediction time each row's network output is linearly blended with the
    per-target mean of the training labels. The blend weight is derived from
    the solvent's pre-computed outlier score, read from the module-level
    ``SOLVENT_OUTLIER_SCORES``, ``mean_outlier_score`` and
    ``std_outlier_score``.

    Args:
        features: Name of the descriptor lookup handed to the featurizer.
        hidden_dims: Width of each hidden layer, in order.
        output_dim: Number of predicted targets.
        dropout: Dropout probability after each hidden activation; no dropout
            layers are created when it is 0.
        data: ``'single'`` selects ``PrecomputedFeaturizer``; any other value
            selects ``PrecomputedFeaturizerMixed``.
        blend_threshold: z-score of the outlier score at which blending
            starts. The weight then rises linearly and saturates at 1
            ``BLEND_RAMP_WIDTH`` standard deviations later.
        verbose: When true, ``predict`` prints the outlier score and blend
            weight whenever the batch contains exactly one distinct solvent.
        epochs: Number of passes over the training data in ``train_model``.
        lr: Adam learning rate.
        batch_size: Mini-batch size used for training.
    """

    # Width, in outlier-score standard deviations, of the linear ramp over
    # which the blend weight goes from 0 to 1 once blend_threshold is exceeded.
    BLEND_RAMP_WIDTH = 2.0

    def __init__(self, features='spange_descriptors', hidden_dims=(64, 64), output_dim=3,
                 dropout=0.0, data='single', blend_threshold=1.5, verbose=True,
                 epochs=200, lr=0.001, batch_size=32):
        super().__init__()
        self.data_type = data
        self.blend_threshold = blend_threshold  # Number of std devs above mean to start blending
        self.verbose = verbose
        self.epochs = epochs
        self.lr = lr
        self.batch_size = batch_size

        if data == 'single':
            self.smiles_featurizer = PrecomputedFeaturizer(features=features)
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed(features=features)

        layers = []
        prev_dim = self.smiles_featurizer.feats_dim
        for h_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, h_dim))
            layers.append(nn.ReLU())
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        layers.append(nn.Sigmoid())  # Ensure [0,1] output
        self.model = nn.Sequential(*layers)

        # Training labels, kept so predict() can blend toward their mean
        self.train_Y = None
        self.input_scaler = StandardScaler()

    def get_blend_weight(self, solvent_name):
        """Return the weight in [0, 1] given to the training mean for one solvent.

        Solvents missing from ``SOLVENT_OUTLIER_SCORES`` are treated as having
        the mean score. If every solvent has the same score (zero standard
        deviation), no solvent stands out and the weight is 0.
        """
        if std_outlier_score == 0:
            # Avoid 0/0 -> NaN, which would propagate into every prediction
            return 0.0
        score = SOLVENT_OUTLIER_SCORES.get(solvent_name, mean_outlier_score)
        # Normalize: how many std devs above mean?
        z_score = (score - mean_outlier_score) / std_outlier_score
        # 0 up to the threshold, then a linear ramp that saturates at 1
        return np.clip((z_score - self.blend_threshold) / self.BLEND_RAMP_WIDTH, 0, 1)

    def _blend_weights_for(self, solvent_names):
        """Vector of blend weights, one per entry of ``solvent_names``."""
        return np.array([self.get_blend_weight(name) for name in solvent_names], dtype=float)

    def train_model(self, X_train, y_train):
        """Fit the input scaler and the MLP on the training fold.

        Args:
            X_train: Input table understood by the configured featurizer.
            y_train: Targets; either an object exposing ``.values`` (for
                example a DataFrame) or an array.
        """
        targets = y_train.values if hasattr(y_train, 'values') else y_train
        self.train_Y = targets

        X_featurized = self.smiles_featurizer.featurize(X_train)
        X_scaled = self.input_scaler.fit_transform(X_featurized.numpy())

        dataset = TensorDataset(torch.tensor(X_scaled), torch.tensor(targets))
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        criterion = nn.MSELoss()

        self.train()
        for _ in range(self.epochs):
            for batch_X, batch_Y in dataloader:
                optimizer.zero_grad()
                pred = self.model(batch_X)
                loss = criterion(pred, batch_Y)
                loss.backward()
                optimizer.step()

    def predict(self, X):
        """Predict targets for ``X`` and shrink outlier solvents toward the training mean.

        Returns:
            Tensor with one row per input row and one column per target.
        """
        self.eval()
        with torch.no_grad():
            X_featurized = self.smiles_featurizer.featurize(X)
            X_scaled = self.input_scaler.transform(X_featurized.numpy())
            X_tensor = torch.tensor(X_scaled)
            raw_pred = self.model(X_tensor).numpy()

        if self.data_type == 'single':
            blend_weights = self._blend_weights_for(X["SOLVENT NAME"].values)
            unique_solvents = X["SOLVENT NAME"].unique()
        else:
            # For mixtures, use the larger of the two solvents' blend weights.
            # NOTE(review): this ignores "SolventB%", so a row containing 0 %
            # of an isolated solvent is shrunk as much as a row made entirely
            # of it — confirm this is intended.
            blend_weights = np.maximum(
                self._blend_weights_for(X["SOLVENT A NAME"].values),
                self._blend_weights_for(X["SOLVENT B NAME"].values),
            )
            unique_solvents = np.union1d(X["SOLVENT A NAME"].unique(), X["SOLVENT B NAME"].unique())

        # Per-target mean of the training labels
        mean_pred = self.train_Y.mean(axis=0)

        # Blend: for outliers, move toward the mean
        weight_column = blend_weights.reshape(-1, 1)
        blended = (1 - weight_column) * raw_pred + weight_column * mean_pred

        # Diagnostic line, emitted whenever the batch holds a single distinct
        # solvent (every leave-one-solvent-out fold). It uses the same
        # fallback score as get_blend_weight so the two cannot disagree.
        if self.verbose and len(unique_solvents) == 1:
            solvent = unique_solvents[0]
            print(f"  Solvent: {solvent}, Outlier score: {SOLVENT_OUTLIER_SCORES.get(solvent, mean_outlier_score):.4f}, Blend weight: {blend_weights[0]:.4f}")

        return torch.tensor(blended)

print('ExtrapolationAwareMLPModelV2 defined')

ExtrapolationAwareMLPModelV2 defined


In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

# Any name other than "full" makes load_data read the single-solvent table.
X, Y = load_data("single_solvent")

# One fold per solvent, visited in sorted solvent order.
split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model is built and trained for every fold.
    model = ExtrapolationAwareMLPModelV2(blend_threshold=1.0) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # ("row" is the position within the fold's test set, not the CSV row index)
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"Single solvent predictions shape: {submission_single_solvent.shape}")

# Calculate CV score for single solvent
# The splits are regenerated in the same order, so fold_idx lines up with the
# "fold" column and predictions are compared to test_Y positionally.
X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)
all_mse = []
for fold_idx, split in enumerate(split_generator):
    (train_X, train_Y), (test_X, test_Y) = split
    fold_preds = submission_single_solvent[submission_single_solvent['fold'] == fold_idx]
    pred_values = fold_preds[['target_1', 'target_2', 'target_3']].values
    true_values = test_Y.values
    # Mean squared error over all rows and all three targets of this fold
    mse = ((pred_values - true_values) ** 2).mean()
    all_mse.append(mse)
# Unweighted mean of the per-fold MSEs (folds of different size count equally)
print(f"Single solvent CV MSE: {np.mean(all_mse):.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:03,  3.44s/it]

  Solvent: 1,1,1,3,3,3-Hexafluoropropan-2-ol, Outlier score: 4.5701, Blend weight: 0.6172


2it [00:06,  3.00s/it]

  Solvent: 2,2,2-Trifluoroethanol, Outlier score: 3.5894, Blend weight: 0.1468


3it [00:08,  2.81s/it]

  Solvent: 2-Methyltetrahydrofuran [2-MeTHF], Outlier score: 1.0678, Blend weight: 0.0000


4it [00:11,  2.73s/it]

  Solvent: Acetonitrile, Outlier score: 2.3031, Blend weight: 0.0000


5it [00:14,  2.73s/it]

  Solvent: Acetonitrile.Acetic Acid, Outlier score: 1.8456, Blend weight: 0.0000


6it [00:16,  2.73s/it]

  Solvent: Butanone [MEK], Outlier score: 1.3767, Blend weight: 0.0000


7it [00:19,  2.73s/it]

  Solvent: Cyclohexane, Outlier score: 4.1844, Blend weight: 0.4322


8it [00:22,  2.71s/it]

  Solvent: DMA [N,N-Dimethylacetamide], Outlier score: 2.8753, Blend weight: 0.0000


9it [00:24,  2.73s/it]

  Solvent: Decanol, Outlier score: 2.8970, Blend weight: 0.0000


10it [00:27,  2.74s/it]

  Solvent: Diethyl Ether [Ether], Outlier score: 1.3061, Blend weight: 0.0000


11it [00:30,  2.77s/it]

  Solvent: Dihydrolevoglucosenone (Cyrene), Outlier score: 2.3906, Blend weight: 0.0000


12it [00:33,  2.76s/it]

  Solvent: Dimethyl Carbonate, Outlier score: 1.0743, Blend weight: 0.0000


13it [00:35,  2.74s/it]

  Solvent: Ethanol, Outlier score: 1.7259, Blend weight: 0.0000


14it [00:38,  2.74s/it]

  Solvent: Ethyl Acetate, Outlier score: 0.9985, Blend weight: 0.0000


15it [00:41,  2.72s/it]

  Solvent: Ethyl Lactate, Outlier score: 1.3360, Blend weight: 0.0000


16it [00:44,  2.70s/it]

  Solvent: Ethylene Glycol [1,2-Ethanediol], Outlier score: 3.6865, Blend weight: 0.1934


17it [00:46,  2.73s/it]

  Solvent: IPA [Propan-2-ol], Outlier score: 1.7483, Blend weight: 0.0000


18it [00:49,  2.72s/it]

  Solvent: MTBE [tert-Butylmethylether], Outlier score: 1.1296, Blend weight: 0.0000


19it [00:52,  2.70s/it]

  Solvent: Methanol, Outlier score: 2.3689, Blend weight: 0.0000


20it [00:54,  2.69s/it]

  Solvent: Methyl Propionate, Outlier score: 1.0343, Blend weight: 0.0000


21it [00:57,  2.68s/it]

  Solvent: THF [Tetrahydrofuran], Outlier score: 1.0429, Blend weight: 0.0000


22it [01:00,  2.68s/it]

  Solvent: Water.2,2,2-Trifluoroethanol, Outlier score: 2.8643, Blend weight: 0.0000


23it [01:02,  2.67s/it]

  Solvent: Water.Acetonitrile, Outlier score: 2.7194, Blend weight: 0.0000


24it [01:05,  2.67s/it]

24it [01:05,  2.73s/it]

  Solvent: tert-Butanol [2-Methylpropan-2-ol], Outlier score: 2.1456, Blend weight: 0.0000
Single solvent predictions shape: (656, 6)
Single solvent CV MSE: 0.015360





In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# "full" makes load_data read the two-solvent mixture table.
X, Y = load_data("full")

# One fold per solvent: the test set is every row where that solvent is A or B.
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model is built and trained for every fold; data='full' selects
    # the mixture featurizer inside the model.
    model = ExtrapolationAwareMLPModelV2(data='full', blend_threshold=1.0) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # ("row" is the position within the fold's test set, not the CSV row index)
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions)
print(f"Full data predictions shape: {submission_full_data.shape}")

# Calculate CV score for full data
# The splits are regenerated in the same order, so fold_idx lines up with the
# "fold" column and predictions are compared to test_Y positionally.
X, Y = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_mse = []
for fold_idx, split in enumerate(split_generator):
    (train_X, train_Y), (test_X, test_Y) = split
    fold_preds = submission_full_data[submission_full_data['fold'] == fold_idx]
    pred_values = fold_preds[['target_1', 'target_2', 'target_3']].values
    true_values = test_Y.values
    # Mean squared error over all rows and all three targets of this fold
    mse = ((pred_values - true_values) ** 2).mean()
    all_mse.append(mse)
# Unweighted mean of the per-fold MSEs (folds of different size count equally)
print(f"Full data CV MSE: {np.mean(all_mse):.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:04,  4.66s/it]

2it [00:09,  4.65s/it]

3it [00:13,  4.42s/it]

4it [00:17,  4.31s/it]

5it [00:22,  4.44s/it]

6it [00:27,  4.67s/it]

7it [00:32,  4.75s/it]

8it [00:37,  4.74s/it]

9it [00:41,  4.73s/it]

10it [00:46,  4.70s/it]

11it [00:51,  4.82s/it]

12it [00:56,  4.89s/it]

13it [01:01,  4.81s/it]

14it [01:06,  4.89s/it]

15it [01:11,  4.94s/it]

16it [01:15,  4.87s/it]

17it [01:20,  4.85s/it]

18it [01:25,  4.92s/it]

19it [01:30,  4.84s/it]

20it [01:35,  4.91s/it]

21it [01:40,  4.83s/it]

22it [01:44,  4.79s/it]

23it [01:49,  4.76s/it]

24it [01:54,  4.85s/it]

24it [01:54,  4.78s/it]

Full data predictions shape: (2454, 6)
Full data CV MSE: 0.019177





In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Stack the single-solvent predictions (task 0) on top of the mixture predictions (task 1).
submission = pd.concat([submission_single_solvent, submission_full_data])
# reset_index() is called without drop=True, so the previous per-frame index is
# kept as an extra "index" column next to the six prediction columns.
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to /home/submission/
import shutil
shutil.copy("submission.csv", "/home/submission/submission.csv")

# Quick sanity summary: overall shape and min/max of each predicted target.
print(f"Submission saved. Shape: {submission.shape}")
print(f"Predictions range: target_1 [{submission['target_1'].min():.3f}, {submission['target_1'].max():.3f}]")
print(f"Predictions range: target_2 [{submission['target_2'].min():.3f}, {submission['target_2'].max():.3f}]")
print(f"Predictions range: target_3 [{submission['target_3'].min():.3f}, {submission['target_3'].max():.3f}]")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

Submission saved. Shape: (3110, 7)
Predictions range: target_1 [0.000, 0.533]
Predictions range: target_2 [0.000, 0.565]
Predictions range: target_3 [0.001, 0.991]
