In [None]:
# Experiment 069: Fixed Extrapolation Detection
# 
# FIX: Compare test solvents to ALL 24 solvents, not just training fold solvents
# This addresses the fundamental flaw identified by the evaluator
#
# Key changes from exp_068:
# 1. Compare to ALL 24 solvents, not just training fold solvents
# 2. Use k=3 nearest neighbors, not k=1
# 3. Normalize by mean inter-solvent distance
# 4. Only blend for TRUE outliers (normalized distance > 1.0; blend weight ramps linearly to 1 at 1.0 + threshold)
# 5. Use a good base model (MLP with proper training)

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist
import warnings
warnings.filterwarnings('ignore')

# Seed both NumPy and PyTorch with the same value so reruns are comparable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Make float64 the default torch dtype for every tensor and layer created below.
torch.set_default_dtype(torch.double)

print('Imports successful')

In [None]:
# Data loading functions
# Directory that holds the yield tables and the descriptor lookup CSVs.
DATA_PATH = '/home/data'

# Process variables shared by both datasets.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns of the single-solvent table.
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
# Input columns of the mixed-solvent ("full") table.
INPUT_LABELS_FULL_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Columns the models are trained to predict.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one yield table and split it into input and target frames.

    Args:
        name: ``"full"`` selects the mixed-solvent table. Any other value
            (this notebook passes ``"single_solvent"``) selects the
            single-solvent table.

    Returns:
        Tuple ``(X, Y)``: the input columns for the chosen table and the
        ``TARGET_LABELS`` columns, both taken from the same CSV.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    table = pd.read_csv(csv_path)
    inputs = table[input_columns]
    targets = table[TARGET_LABELS]
    return inputs, targets

def load_features(name="spange_descriptors"):
    """Load the descriptor lookup table ``<DATA_PATH>/<name>_lookup.csv``.

    The first CSV column becomes the frame's index (the featurizers index it
    by solvent name via ``.loc``).
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the unique values in the
    ``"SOLVENT NAME"`` column of ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        every row of the held-out solvent and the train part holds the rest.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out in sorted(solvent_column.unique()):
        in_test = solvent_column == held_out
        in_train = ~in_test
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per solvent for the mixed-solvent table.

    The held-out solvents are the sorted union of the values in the
    ``"SOLVENT A NAME"`` and ``"SOLVENT B NAME"`` columns. A row goes to the
    test part when the held-out solvent appears in either column, and to the
    train part when it appears in neither.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each held-out solvent.
    """
    column_a = X["SOLVENT A NAME"]
    column_b = X["SOLVENT B NAME"]
    candidates = np.union1d(column_a.unique(), column_b.unique())

    for held_out in sorted(candidates):
        in_test = (column_a == held_out) | (column_b == held_out)
        in_train = ~in_test
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

# Module-level Spange descriptor table; its rows are the reference set that the
# extrapolation-aware model measures distances against.
SPANGE_DF = load_features(name='spange_descriptors')
print(f'Spange descriptors shape: {SPANGE_DF.shape}')
print(f'All solvents: {SPANGE_DF.index.tolist()}')
print('Data loading functions defined')

In [None]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Interface for objects that turn an input table into model features.

    Both methods are placeholders that raise ``NotImplementedError``;
    subclasses are expected to override ``__init__`` and ``featurize``.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """Return the features for ``X``; must be overridden by subclasses.

        The signature matches the concrete featurizers in this notebook,
        which are called as ``featurizer.featurize(X)``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface used by the cross-validation cells.

    Subclasses override ``train_model`` and ``predict``; the base versions
    only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on one training fold; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``; must be overridden.

        ``X`` is accepted (and optional, so existing zero-argument calls keep
        working) so that a subclass which forgets to override this method
        fails with ``NotImplementedError`` rather than a ``TypeError`` when
        called as ``model.predict(test_X)``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

In [None]:
# Featurizers
class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows backed by a descriptor lookup table.

    Attributes:
        features: name of the lookup table passed to ``load_features``.
        featurizer: the loaded lookup DataFrame, indexed by solvent name.
        feats_dim: number of descriptor columns plus the two numeric inputs.
    """

    def __init__(self, features='spange_descriptors'):
        self.features = features
        self.featurizer = load_features(features)
        n_descriptors = self.featurizer.shape[1]
        # +2 for the INPUT_LABELS_NUMERIC columns prepended in featurize().
        self.feats_dim = n_descriptors + 2

    def featurize(self, X):
        """Return a tensor of ``[numeric inputs | solvent descriptors]`` per row of ``X``."""
        numeric_frame = X[INPUT_LABELS_NUMERIC]
        descriptor_frame = self.featurizer.loc[X["SOLVENT NAME"]]
        numeric_part = torch.tensor(numeric_frame.values)
        descriptor_part = torch.tensor(descriptor_frame.values)
        return torch.cat((numeric_part, descriptor_part), dim=1)

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for solvent-mixture rows backed by a descriptor lookup table.

    The descriptor of a mixture is the linear interpolation between the
    descriptors of solvent A and solvent B, weighted by ``SolventB%`` / 100.

    Attributes:
        features: name of the lookup table passed to ``load_features``.
        featurizer: the loaded lookup DataFrame, indexed by solvent name.
        feats_dim: number of descriptor columns plus the two numeric inputs.
    """

    def __init__(self, features='spange_descriptors'):
        self.features = features
        self.featurizer = load_features(features)
        n_descriptors = self.featurizer.shape[1]
        # +2 for the INPUT_LABELS_NUMERIC columns prepended in featurize().
        self.feats_dim = n_descriptors + 2

    def featurize(self, X):
        """Return a tensor of ``[numeric inputs | blended descriptors]`` per row of ``X``."""
        numeric_frame = X[INPUT_LABELS_NUMERIC]
        descriptors_a = self.featurizer.loc[X["SOLVENT A NAME"]].values
        descriptors_b = self.featurizer.loc[X["SOLVENT B NAME"]].values
        # Column vector so the fraction broadcasts across descriptor columns.
        fraction_b = X["SolventB%"].values.reshape(-1, 1) / 100
        blended = descriptors_a * (1 - fraction_b) + descriptors_b * fraction_b
        numeric_part = torch.tensor(numeric_frame.values)
        descriptor_part = torch.tensor(blended)
        return torch.cat((numeric_part, descriptor_part), dim=1)

print('Featurizers defined')

In [None]:
# Fixed Extrapolation-Aware MLP Model
# Key fix: Compare to ALL solvents, not just training fold solvents

class FixedExtrapolationAwareMLPModel(nn.Module, BaseModel):
    """MLP yield regressor that pulls predictions toward the training mean for outlying solvents.

    The network maps the featurizer output (standardized on the training
    fold) through ReLU hidden layers to ``output_dim`` sigmoid outputs.

    At prediction time each row gets an extrapolation score: the mean
    Euclidean distance from its standardized solvent descriptor to its three
    nearest rows of the global ``SPANGE_DF`` table, divided by the mean
    pairwise distance between all rows of that table. Rows scoring above 1.0
    are linearly blended with the per-target training mean; the blend weight
    reaches 1 at a score of ``1.0 + blend_threshold``.

    NOTE(review): the reference set is the whole ``SPANGE_DF`` table, so a
    test solvent that has a row there matches itself at distance 0 and the
    score averages that zero with the two next-nearest distances. Confirm
    that scores can realistically exceed 1.0; otherwise blending never fires.

    NOTE(review): the score always uses ``SPANGE_DF``, even when ``features``
    selects a different descriptor table for the network input.

    Args:
        features: lookup-table name handed to the featurizer.
        hidden_dims: sizes of the hidden layers, in order.
        output_dim: number of predicted targets.
        dropout: dropout probability after each hidden layer (0 disables it).
        data: ``'single'`` for single-solvent rows, anything else for mixtures.
        blend_threshold: width of the score range over which the blend weight
            ramps from 0 to 1. ``0`` means a hard switch at score 1.0.

    Raises:
        ValueError: if ``blend_threshold`` is negative.
    """

    def __init__(self, features='spange_descriptors', hidden_dims=(64, 64), output_dim=3,
                 dropout=0.0, data='single', blend_threshold=2.0):
        super(FixedExtrapolationAwareMLPModel, self).__init__()
        if blend_threshold < 0:
            # A negative width would invert the ramp and blend the *inliers*.
            raise ValueError(f"blend_threshold must be non-negative, got {blend_threshold}")
        self.data_type = data
        self.blend_threshold = blend_threshold

        if data == 'single':
            self.smiles_featurizer = PrecomputedFeaturizer(features=features)
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed(features=features)

        # Layers are created in order so the seeded weight initialisation is reproducible.
        input_dim = self.smiles_featurizer.feats_dim
        prev_dim = input_dim
        layers = []
        for h_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, h_dim))
            layers.append(nn.ReLU())
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        layers.append(nn.Sigmoid())  # Raw network outputs lie in (0, 1)
        self.model = nn.Sequential(*layers)

        # Extrapolation reference set: every solvent in the global Spange table,
        # independent of which solvents are in the current training fold.
        self.all_solvent_features = SPANGE_DF.values
        self.solvent_scaler = StandardScaler()
        self.scaled_all_solvents = self.solvent_scaler.fit_transform(self.all_solvent_features)

        # Mean pairwise distance between reference solvents, used to normalize scores.
        self.mean_inter_solvent_dist = np.mean(pdist(self.scaled_all_solvents))
        print(f"Mean inter-solvent distance: {self.mean_inter_solvent_dist:.4f}")

        # k=3 nearest-neighbour index over the reference set.
        self.nn_model = NearestNeighbors(n_neighbors=3, metric='euclidean')
        self.nn_model.fit(self.scaled_all_solvents)

        self.train_Y = None  # set by train_model(); predict() blends toward its mean
        self.input_scaler = StandardScaler()

    def compute_extrapolation_score(self, X):
        """Return one normalized distance-to-reference score per row of ``X``.

        For mixtures the query descriptor is the ``SolventB%``-weighted average
        of the two solvents' Spange descriptors. The score is the mean distance
        to the three nearest reference solvents divided by the mean pairwise
        distance between reference solvents.
        """
        if self.data_type == 'single':
            solvent_names = X["SOLVENT NAME"].values
            test_features = SPANGE_DF.loc[solvent_names].values
        else:
            solvent_a_names = X["SOLVENT A NAME"].values
            solvent_b_names = X["SOLVENT B NAME"].values
            pct = X["SolventB%"].values.reshape(-1, 1) / 100
            feat_a = SPANGE_DF.loc[solvent_a_names].values
            feat_b = SPANGE_DF.loc[solvent_b_names].values
            test_features = feat_a * (1 - pct) + feat_b * pct

        # Same scaler as the reference set so distances are comparable.
        test_scaled = self.solvent_scaler.transform(test_features)
        distances, _ = self.nn_model.kneighbors(test_scaled)
        avg_dist = distances.mean(axis=1)
        return avg_dist / self.mean_inter_solvent_dist

    def train_model(self, X_train, y_train):
        """Fit the input scaler and train the MLP on one fold.

        Training uses Adam (lr 0.001) on MSE loss for 200 epochs with shuffled
        mini-batches of 32. The raw targets are kept in ``self.train_Y`` so
        ``predict`` can blend toward their mean.
        """
        targets = y_train.values if hasattr(y_train, 'values') else y_train
        self.train_Y = targets

        X_featurized = self.smiles_featurizer.featurize(X_train)
        X_scaled = self.input_scaler.fit_transform(X_featurized.numpy())
        X_tensor = torch.tensor(X_scaled)
        Y_tensor = torch.tensor(targets)

        dataset = TensorDataset(X_tensor, Y_tensor)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        criterion = nn.MSELoss()

        self.train()
        for epoch in range(200):
            for batch_X, batch_Y in dataloader:
                optimizer.zero_grad()
                pred = self.model(batch_X)
                loss = criterion(pred, batch_Y)
                loss.backward()
                optimizer.step()

    def _blend_weights(self, extrap_scores):
        """Map extrapolation scores to blend weights in [0, 1]."""
        excess = extrap_scores - 1.0
        if self.blend_threshold == 0:
            # Zero-width ramp: hard switch instead of dividing by zero,
            # which would produce NaN for a score of exactly 1.0.
            return (excess > 0).astype(float)
        return np.clip(excess / self.blend_threshold, 0, 1)

    def predict(self, X):
        """Predict targets for ``X`` and blend outlying rows toward the training mean.

        Returns:
            A tensor with one row per row of ``X`` and one column per target.
        """
        self.eval()
        with torch.no_grad():
            X_featurized = self.smiles_featurizer.featurize(X)
            X_scaled = self.input_scaler.transform(X_featurized.numpy())
            X_tensor = torch.tensor(X_scaled)
            raw_pred = self.model(X_tensor).numpy()

        extrap_scores = self.compute_extrapolation_score(X)

        # Weight is 0 up to a score of 1.0, then ramps linearly and saturates
        # at 1 once the score reaches 1.0 + blend_threshold.
        blend_weights = self._blend_weights(extrap_scores)

        # Per-target mean of the training fold's targets.
        mean_pred = self.train_Y.mean(axis=0)

        # Convex combination of the network output and the training mean.
        weight_column = blend_weights.reshape(-1, 1)
        blended = (1 - weight_column) * raw_pred + weight_column * mean_pred

        # Debug output, only for small test sets (at most 20 rows).
        if len(extrap_scores) <= 20:
            print(f"  Extrap scores: {extrap_scores[:5]}")
            print(f"  Blend weights: {blend_weights[:5]}")

        return torch.tensor(blended)

print('FixedExtrapolationAwareMLPModel defined')

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = FixedExtrapolationAwareMLPModel(blend_threshold=2.0) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"Single solvent predictions shape: {submission_single_solvent.shape}")

# Calculate CV score for single solvent
# The folds are regenerated from scratch; this relies on the split generator
# yielding them in the same (sorted-solvent) order as in the prediction loop
# above, so that fold_idx selects the matching predictions.
X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)
all_mse = []
for fold_idx, split in enumerate(split_generator):
    (train_X, train_Y), (test_X, test_Y) = split
    # Predictions were appended in test_X row order, so they line up
    # positionally with test_Y.
    fold_preds = submission_single_solvent[submission_single_solvent['fold'] == fold_idx]
    pred_values = fold_preds[['target_1', 'target_2', 'target_3']].values
    true_values = test_Y.values
    # MSE over every row and all three targets of this fold.
    mse = ((pred_values - true_values) ** 2).mean()
    all_mse.append(mse)
# Unweighted mean over folds: each fold counts equally regardless of its size.
print(f"Single solvent CV MSE: {np.mean(all_mse):.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = FixedExtrapolationAwareMLPModel(data='full', blend_threshold=2.0) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions)
print(f"Full data predictions shape: {submission_full_data.shape}")

# Calculate CV score for full data
# The folds are regenerated from scratch; this relies on the split generator
# yielding them in the same (sorted-solvent) order as in the prediction loop
# above, so that fold_idx selects the matching predictions.
X, Y = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_mse = []
for fold_idx, split in enumerate(split_generator):
    (train_X, train_Y), (test_X, test_Y) = split
    # Predictions were appended in test_X row order, so they line up
    # positionally with test_Y.
    fold_preds = submission_full_data[submission_full_data['fold'] == fold_idx]
    pred_values = fold_preds[['target_1', 'target_2', 'target_3']].values
    true_values = test_Y.values
    # MSE over every row and all three targets of this fold.
    mse = ((pred_values - true_values) ** 2).mean()
    all_mse.append(mse)
# Unweighted mean over folds: each fold counts equally regardless of its size.
print(f"Full data CV MSE: {np.mean(all_mse):.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to /home/submission/
# shutil.copy does not create missing directories, so /home/submission must
# already exist or this line raises.
import shutil
shutil.copy("submission.csv", "/home/submission/submission.csv")

# Sanity check: report the overall shape and the min/max of each predicted
# target column across both tasks.
print(f"Submission saved. Shape: {submission.shape}")
print(f"Predictions range: target_1 [{submission['target_1'].min():.3f}, {submission['target_1'].max():.3f}]")
print(f"Predictions range: target_2 [{submission['target_2'].min():.3f}, {submission['target_2'].max():.3f}]")
print(f"Predictions range: target_3 [{submission['target_3'].min():.3f}, {submission['target_3'].max():.3f}]")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################