# Experiment 108: Chemical Similarity-Based Extrapolation Detection

## Goal
Use chemical similarity (Tanimoto on Morgan fingerprints) to detect when we're extrapolating to solvents that are very different from training solvents. When similarity is low, blend toward a conservative prediction.

## Key Insight
The CV-LB relationship is LB = 4.29 × CV + 0.0528. The intercept (0.0528) is already greater than the target (0.0347), so we need to CHANGE this relationship itself by being more conservative on truly novel solvents.

## Approach
1. Use the best base model (CatBoost + XGBoost ensemble)
2. Compute Morgan fingerprints for all solvents
3. For each test sample, compute max Tanimoto similarity to training solvents
4. If similarity < threshold, blend toward training mean
5. Test different similarity thresholds and blend weights

In [1]:
import numpy as np
import pandas as pd
import torch
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add data path
sys.path.append('/home/data/')

# Override the load functions to use local paths
DATA_PATH = '/home/data/'

TARGET_LABELS = ['Product 2', 'Product 3', 'SM']

def load_data_local(name="full"):
    """Read one of the catechol yield CSVs and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the solvent-mixture table, ``"single_solvent"``
            selects the single-solvent table. Anything else fails the assertion.

    Returns:
        ``(X, Y)``: the input columns for the chosen table and the
        ``TARGET_LABELS`` columns, both taken from the same CSV.
    """
    assert name in ("full", "single_solvent")
    use_full = name == "full"

    if use_full:
        csv_path = f'{DATA_PATH}catechol_full_data_yields.csv'
    else:
        csv_path = f'{DATA_PATH}catechol_single_solvent_yields.csv'

    input_labels = (
        ['SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%', 'Temperature', 'Residence Time']
        if use_full
        else ['SOLVENT NAME', 'Temperature', 'Residence Time']
    )

    frame = pd.read_csv(csv_path)
    return frame[input_labels], frame[TARGET_LABELS]

def load_features_local(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` feature table from ``DATA_PATH``.

    Args:
        name: One of the known lookup tables; any other value fails the assertion.

    Returns:
        DataFrame whose index is the first CSV column.
    """
    known_tables = ("spange_descriptors", "acs_pca_descriptors", "drfps_catechol", "fragprints", "smiles")
    assert name in known_tables
    lookup_path = f'{DATA_PATH}{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

# Import the split generators from utils
from utils import generate_leave_one_out_splits, generate_leave_one_ramp_out_splits

# Override load functions
load_data = load_data_local
load_features = load_features_local

# Get input labels from utils
from utils import INPUT_LABELS_FULL_SOLVENT, INPUT_LABELS_SINGLE_SOLVENT, INPUT_LABELS_NUMERIC

print("Imports successful")

Imports successful


In [2]:
# Test RDKit and compute fingerprints for all solvents
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# Load SMILES lookup
smiles_lookup = load_features("smiles")
print(f"SMILES lookup shape: {smiles_lookup.shape}")
print(f"Columns: {smiles_lookup.columns.tolist()}")
print(f"\nFirst few entries:")
print(smiles_lookup.head())

SMILES lookup shape: (26, 1)
Columns: ['solvent smiles']

First few entries:
                                           solvent smiles
SOLVENT NAME                                             
Cyclohexane                                      C1CCCCC1
Ethyl Acetate                                   O=C(OCC)C
Acetic Acid                                       CC(=O)O
2-Methyltetrahydrofuran [2-MeTHF]              O1C(C)CCC1
1,1,1,3,3,3-Hexafluoropropan-2-ol  C(C(F)(F)F)(C(F)(F)F)O


In [3]:
# Compute Morgan fingerprints for all solvents
def get_morgan_fp(smiles, radius=2, n_bits=2048):
    """Build a Morgan bit-vector fingerprint from a SMILES string.

    Args:
        smiles: SMILES text handed to RDKit's parser.
        radius: Morgan radius passed to the fingerprint call.
        n_bits: Length of the bit vector.

    Returns:
        The RDKit bit vector, or ``None`` when RDKit returns no molecule
        for ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        return AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
    return None

# Pick the first column whose name contains "SMILES" (case-insensitive)
smiles_col = [col for col in smiles_lookup.columns if 'SMILES' in col.upper()][0]
print(f"SMILES column: {smiles_col}")

# Fingerprint every solvent; SMILES that RDKit cannot parse are reported and left out of the map
solvent_fps = {}
for solvent_name in smiles_lookup.index:
    smiles = smiles_lookup.loc[solvent_name, smiles_col]
    fp = get_morgan_fp(smiles)
    if fp is None:
        print(f"Warning: Could not compute fingerprint for {solvent_name}: {smiles}")
        continue
    solvent_fps[solvent_name] = fp

print(f"\nComputed fingerprints for {len(solvent_fps)} solvents")

SMILES column: solvent smiles

Computed fingerprints for 26 solvents




In [4]:
# Compute Tanimoto similarity matrix between all solvents
def compute_tanimoto_similarity(fp1, fp2):
    """Return the Tanimoto similarity RDKit reports for the two fingerprints."""
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

# Pairwise Tanimoto similarity; rows and columns both follow the order of solvent_names
solvent_names = list(solvent_fps)
n_solvents = len(solvent_names)
similarity_matrix = np.array(
    [
        [compute_tanimoto_similarity(solvent_fps[row_name], solvent_fps[col_name])
         for col_name in solvent_names]
        for row_name in solvent_names
    ],
    dtype=float,
).reshape(n_solvents, n_solvents)

# Mask that excludes each solvent's similarity to itself
off_diagonal = ~np.eye(n_solvents, dtype=bool)

print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"\nSimilarity statistics:")
print(f"  Min: {similarity_matrix.min():.4f}")
print(f"  Max: {similarity_matrix.max():.4f}")
print(f"  Mean (off-diagonal): {similarity_matrix[off_diagonal].mean():.4f}")

Similarity matrix shape: (26, 26)

Similarity statistics:
  Min: 0.0000
  Max: 1.0000
  Mean (off-diagonal): 0.1081


In [5]:
# Base classes and feature engineering (from ens-model kernel)
from abc import ABC, abstractmethod
from functools import reduce

torch.set_default_dtype(torch.double)

class SmilesFeaturizer(ABC):
    """Interface for objects that turn solvent inputs into model features.

    The base class is not usable on its own: both ``__init__`` and
    ``featurize`` raise ``NotImplementedError`` and are meant to be overridden.
    """
    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """Convert the input rows ``X`` into features.

        Declared as an instance method taking only the inputs, which is the
        signature the concrete featurizers in this notebook implement.

        Raises:
            NotImplementedError: always; subclasses provide the implementation.
        """
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface: train on a split, then predict on inputs.

    Both methods raise ``NotImplementedError`` here; concrete models override them.
    """
    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on the given training inputs and targets."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for the inputs ``X``.

        ``X`` defaults to ``None`` so the old zero-argument call still reaches
        ``NotImplementedError``, while the signature now accepts the input
        argument that every concrete model's ``predict`` takes.
        """
        raise NotImplementedError

# Module-level memo for the solvent feature table: build_solvent_feature_table()
# stores its result here on first use and returns the stored table afterwards.
_SOLVENT_TABLE_CACHE = None

def feature_priority(name: str) -> int:
    """Rank a feature column by the source encoded in its name prefix.

    Higher values win when two correlated columns compete:
    ``spange_`` (5) > ``acs_`` (4) > ``drfps_`` (3) > ``frag_`` (2) >
    ``smiles_`` (1). Names with none of these prefixes rank 0.
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    for prefix, rank in ranked_prefixes:
        if name.startswith(prefix):
            return rank
    return 0

def filter_correlated_features(df, threshold=0.8):
    """Drop constant columns and one column from each highly correlated pair.

    Args:
        df: Feature table; only its numeric columns are examined.
        threshold: A pair is treated as redundant when its absolute
            correlation is strictly greater than this value.

    Returns:
        ``(df_filtered, dropped)``: ``df`` without the dropped columns, and the
        list of dropped column names (constant columns plus pair losers).
        When ``df`` has no numeric columns it is returned as-is with ``[]``.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        return df, []

    # Zero-variance columns are removed before computing correlations.
    spread = numeric.std(axis=0)
    constant_cols = spread[spread == 0].index.tolist()
    if constant_cols:
        numeric = numeric.drop(columns=constant_cols)

    # Keep only the strict upper triangle so every pair appears once.
    corr = numeric.corr().abs()
    upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
    upper = corr.where(upper_mask).fillna(0.0)
    names = upper.columns.tolist()

    candidate_pairs = [
        (names[i], names[j], upper.iloc[i, j])
        for i in range(len(names))
        for j in range(i + 1, len(names))
        if upper.iloc[i, j] > threshold
    ]

    to_drop = set()
    for first, second, _ in candidate_pairs:
        # A pair is already resolved once either member has been dropped.
        if first in to_drop or second in to_drop:
            continue
        rank_first = feature_priority(first)
        rank_second = feature_priority(second)
        if rank_first > rank_second:
            loser = second
        elif rank_second > rank_first:
            loser = first
        else:
            # Same source priority: drop whichever sits later in the original frame.
            pos_first = df.columns.get_loc(first)
            pos_second = df.columns.get_loc(second)
            loser = first if pos_first > pos_second else second
        to_drop.add(loser)

    all_to_drop = list(set(constant_cols).union(to_drop))
    return df.drop(columns=all_to_drop, errors="ignore"), all_to_drop

def add_numeric_features(X_numeric):
    """Return a copy of ``X_numeric`` with engineered process features appended.

    When both ``"Temperature"`` and ``"Residence Time"`` are present, the
    temperature is shifted by 273.15 and four columns are added in this order:
    ``T_x_RT``, ``RT_log``, ``T_inv``, ``RT_scaled``. Otherwise the copy is
    returned unchanged. The input frame is never modified.

    NOTE(review): ``RT_scaled`` divides by the mean residence time of the rows
    passed in, so the same row can get a different value in a train batch and
    a test batch - confirm this is intended.
    """
    augmented = X_numeric.copy()
    required = {"Temperature", "Residence Time"}
    if not required.issubset(set(augmented.columns)):
        return augmented

    shifted_temperature = augmented["Temperature"] + 273.15
    residence_time = augmented["Residence Time"]

    augmented["Temperature"] = shifted_temperature
    augmented["T_x_RT"] = shifted_temperature * residence_time
    # The small offset keeps the log finite for a zero residence time.
    augmented["RT_log"] = np.log(residence_time + 1e-6)
    augmented["T_inv"] = 1 / shifted_temperature
    augmented["RT_scaled"] = residence_time / residence_time.mean()
    return augmented

def build_solvent_feature_table(threshold=0.90):
    """Assemble the per-solvent feature table from all lookup sources.

    Each table returned by ``load_features`` gets a source prefix on its
    columns, loses any column whose name contains "SMILES", is indexed by
    ``"SOLVENT NAME"`` and is inner-joined with the others. The joined table is
    then passed through ``filter_correlated_features``.

    The result is memoised in ``_SOLVENT_TABLE_CACHE``. The threshold used to
    build it is remembered as well, so a later call with a different
    ``threshold`` rebuilds the table instead of silently returning one that was
    filtered at the earlier threshold.

    Args:
        threshold: Correlation cut-off forwarded to ``filter_correlated_features``.

    Returns:
        DataFrame indexed by solvent name holding the retained feature columns.
    """
    global _SOLVENT_TABLE_CACHE
    # A cache with no recorded threshold (e.g. installed from outside) is reused as-is.
    cached_threshold = getattr(build_solvent_feature_table, "_cached_threshold", None)
    if _SOLVENT_TABLE_CACHE is not None and cached_threshold in (None, threshold):
        return _SOLVENT_TABLE_CACHE
    print(">>> Building solvent feature table...")
    sources = ["spange_descriptors", "acs_pca_descriptors", "drfps_catechol", "fragprints", "smiles"]
    dfs = []
    for src in sources:
        df_src = load_features(src).copy()
        if "SOLVENT NAME" not in df_src.columns:
            df_src = df_src.reset_index().rename(columns={"index": "SOLVENT NAME"})
        if src in ["drfps_catechol", "fragprints"]:
            prefix = "drfps" if src == "drfps_catechol" else "frag"
            # Remove columns that are all 0 or all 1 across solvents.
            df_src = df_src.loc[:, (df_src != 0).any(axis=0)]
            df_src = df_src.loc[:, (df_src != 1).any(axis=0)]
            # Remove columns whose values sum to exactly 1.
            values = df_src.drop(columns=["SOLVENT NAME"])
            count = values.sum(axis=0)
            drop_cols = count[count == 1].index
            df_src = df_src.drop(columns=drop_cols)
            cols_to_rename = [c for c in df_src.columns if c != "SOLVENT NAME"]
            df_src = df_src.rename(columns={c: f"{prefix}_{c}" for c in cols_to_rename})
        else:
            if src == "spange_descriptors": prefix = "spange"
            elif src == "acs_pca_descriptors": prefix = "acs"
            elif src == "smiles": prefix = "smiles"
            else: prefix = src
            cols_to_rename = [c for c in df_src.columns if c != "SOLVENT NAME"]
            df_src = df_src.rename(columns={c: f"{prefix}_{c}" for c in cols_to_rename})
        # Raw SMILES text is not a usable feature column.
        smiles_like = [c for c in df_src.columns if "SMILES" in c.upper()]
        df_src = df_src.drop(columns=smiles_like, errors="ignore")
        df_src = df_src.set_index("SOLVENT NAME")
        dfs.append(df_src)
    # Inner join: only solvents present in every source survive.
    featurizer = reduce(lambda l, r: l.join(r, how="inner"), dfs)
    print(f"Combined feature table shape (before corr filter): {featurizer.shape}")
    featurizer_filtered, _ = filter_correlated_features(featurizer, threshold=threshold)
    print(f"Final solvent feature table shape: {featurizer_filtered.shape}")
    _SOLVENT_TABLE_CACHE = featurizer_filtered
    build_solvent_feature_table._cached_threshold = threshold
    return featurizer_filtered

print("Feature engineering functions defined")

Feature engineering functions defined


In [6]:
# Featurizers
class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows.

    Each output row is the engineered numeric columns followed by the solvent's
    row from the shared solvent feature table.
    """
    def __init__(self):
        self.featurizer = build_solvent_feature_table()
        # Run the numeric engineering on a one-row placeholder to count its output columns.
        placeholder = pd.DataFrame([[0] * len(INPUT_LABELS_NUMERIC)], columns=INPUT_LABELS_NUMERIC)
        n_numeric = add_numeric_features(placeholder).shape[1]
        n_solvent = self.featurizer.shape[1]
        self.feats_dim = n_numeric + n_solvent

    def featurize(self, X):
        """Return a double tensor of numeric features concatenated with solvent features."""
        numeric_block = add_numeric_features(X[INPUT_LABELS_NUMERIC].copy()).values
        # Look up each row's solvent in the table by name.
        solvent_block = self.featurizer.loc[X["SOLVENT NAME"]].values
        combined = np.concatenate([numeric_block, solvent_block], axis=1)
        return torch.tensor(combined, dtype=torch.double)

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for solvent-mixture rows.

    Solvent features are a linear blend of the two solvents' table rows,
    weighted by ``SolventB%``; the engineered numeric columns come first.
    """
    def __init__(self):
        self.featurizer = build_solvent_feature_table()
        # Run the numeric engineering on a one-row placeholder to count its output columns.
        placeholder = pd.DataFrame([[0] * len(INPUT_LABELS_NUMERIC)], columns=INPUT_LABELS_NUMERIC)
        n_numeric = add_numeric_features(placeholder).shape[1]
        n_solvent = self.featurizer.shape[1]
        self.feats_dim = n_numeric + n_solvent

    def featurize(self, X):
        """Return a double tensor of numeric features concatenated with blended solvent features."""
        numeric_block = add_numeric_features(X[INPUT_LABELS_NUMERIC].copy()).values
        feats_a = self.featurizer.loc[X["SOLVENT A NAME"]].values
        feats_b = self.featurizer.loc[X["SOLVENT B NAME"]].values
        # NOTE(review): the value is used directly as a fraction of B - confirm the
        # column is on a 0-1 scale rather than 0-100.
        share_b = X["SolventB%"].values.reshape(-1, 1)
        share_a = 1 - share_b
        blended = feats_a * share_a + feats_b * share_b
        combined = np.concatenate([numeric_block, blended], axis=1)
        return torch.tensor(combined, dtype=torch.double)

print("Featurizers defined")

Featurizers defined


In [7]:
# CatBoost Model
from catboost import CatBoostRegressor

class CatBoostModel(BaseModel):
    """Multi-target CatBoost regressor with mode-specific featurizer and hyperparameters.

    ``data="single"`` uses ``PrecomputedFeaturizer``; any other value uses
    ``PrecomputedFeaturizerMixed``. Predictions are clipped at zero and rows
    summing to more than 1 are rescaled to sum to 1.
    """
    def __init__(self, data="single", verbose=False, random_state=42):
        self.data_mode = data
        self.verbose = verbose
        self.random_state = random_state

        is_single = data == "single"
        self.smiles_featurizer = PrecomputedFeaturizer() if is_single else PrecomputedFeaturizerMixed()

        # Settings shared by both modes, followed by the per-mode tuning.
        shared = dict(
            random_seed=random_state, loss_function="MultiRMSE", depth=3,
            bootstrap_type="Bayesian", grow_policy="SymmetricTree",
            rsm=0.75, verbose=verbose,
        )
        if is_single:
            tuned = dict(learning_rate=0.07, n_estimators=1050,
                         l2_leaf_reg=3.5, bagging_temperature=0.225)
        else:
            tuned = dict(learning_rate=0.06, n_estimators=1100,
                         l2_leaf_reg=2.5, bagging_temperature=0.25)
        self.cat_params = {**shared, **tuned}

        self.model = None
        self.n_targets = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit a fresh CatBoost model; ``device`` and ``verbose`` are accepted but unused here."""
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values
        self.n_targets = targets.shape[1]
        self.model = CatBoostRegressor(**self.cat_params)
        self.model.fit(features, targets)

    def predict(self, X):
        """Return clipped, row-normalised predictions as a double tensor."""
        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()
        preds = np.asarray(self.model.predict(features))
        if preds.ndim == 1:
            preds = preds.reshape(-1, 1)
        preds = np.clip(preds, a_min=0.0, a_max=None)
        if preds.shape[1] > 1:
            # Only rows whose total exceeds 1 are scaled down; others are left alone.
            row_totals = preds.sum(axis=1, keepdims=True)
            preds = preds / np.maximum(row_totals, 1.0)
        return torch.tensor(preds, dtype=torch.double)

print("CatBoostModel defined")

CatBoostModel defined


In [8]:
# XGBoost Model
from xgboost import XGBRegressor

class XGBModel(BaseModel):
    """One XGBoost regressor per target, with mode-specific featurizer and tree settings.

    ``data="single"`` uses ``PrecomputedFeaturizer``; any other value uses
    ``PrecomputedFeaturizerMixed``. Predictions are clipped at zero and rows
    summing to more than 1 are rescaled to sum to 1.
    """
    def __init__(self, data="single", random_state=42, verbose=False):
        self.data_mode = data
        self.verbose = verbose
        self.random_state = random_state

        is_single = data == "single"
        self.smiles_featurizer = PrecomputedFeaturizer() if is_single else PrecomputedFeaturizerMixed()

        # Settings shared by both modes; only the tree construction strategy differs.
        shared = dict(
            random_state=random_state, objective="reg:squarederror",
            subsample=0.5, reg_lambda=0.6, reg_alpha=0.0,
            n_estimators=1000, min_child_weight=1, max_depth=4,
            max_delta_step=1, learning_rate=0.02, gamma=0.0,
            colsample_bytree=0.3, colsample_bylevel=0.6,
        )
        if is_single:
            strategy = dict(tree_method="hist", grow_policy="depthwise")
        else:
            strategy = dict(tree_method="approx", grow_policy="lossguide")
        self.xgb_params = {**shared, **strategy}

        self.models = None
        self.n_targets = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit one regressor per target column; ``device`` and ``verbose`` are accepted but unused here."""
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values
        self.n_targets = targets.shape[1]
        self.models = []
        for target_idx in range(self.n_targets):
            regressor = XGBRegressor(**self.xgb_params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, X):
        """Return clipped, row-normalised predictions as a double tensor (one column per target)."""
        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()
        preds = np.column_stack([regressor.predict(features) for regressor in self.models])
        preds = np.clip(preds, a_min=0.0, a_max=None)
        if preds.shape[1] > 1:
            # Only rows whose total exceeds 1 are scaled down; others are left alone.
            row_totals = preds.sum(axis=1, keepdims=True)
            preds = preds / np.maximum(row_totals, 1.0)
        return torch.tensor(preds, dtype=torch.double)

print("XGBModel defined")

XGBModel defined


In [9]:
# Ensemble Model
class EnsembleModel(BaseModel):
    """Fixed-weight blend of ``CatBoostModel`` and ``XGBModel``.

    Weights are 0.65/0.35 (CatBoost/XGBoost) for ``data="single"`` and
    0.60/0.40 otherwise. The blended output is clipped at zero and rows summing
    to more than 1 are rescaled to sum to 1.
    """
    def __init__(self, data="single", verbose=False):
        self.data_mode = data
        self.verbose = verbose
        cat_weight, xgb_weight = (0.65, 0.35) if data == "single" else (0.60, 0.40)
        self.weights = {"catboost": cat_weight, "xgb": xgb_weight}
        self.catboost_model = CatBoostModel(data=data, verbose=verbose)
        self.xgb_model = XGBModel(data=data, verbose=verbose)

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Train both members on the same split, CatBoost first."""
        for member in (self.catboost_model, self.xgb_model):
            member.train_model(train_X, train_Y, device, verbose)

    def predict(self, X):
        """Return the weighted, clipped, row-normalised blend as a double tensor."""
        cat_part = self.weights["catboost"] * self.catboost_model.predict(X).numpy()
        xgb_part = self.weights["xgb"] * self.xgb_model.predict(X).numpy()
        blended = np.clip(cat_part + xgb_part, a_min=0.0, a_max=None)
        if blended.shape[1] > 1:
            # Only rows whose total exceeds 1 are scaled down; others are left alone.
            row_totals = blended.sum(axis=1, keepdims=True)
            blended = blended / np.maximum(row_totals, 1.0)
        return torch.tensor(blended, dtype=torch.double)

print("EnsembleModel defined")

EnsembleModel defined


In [10]:
# SimilarityAwareModel - THE KEY INNOVATION
class SimilarityAwareModel(BaseModel):
    """
    Uses chemical similarity (Tanimoto on Morgan fingerprints) to detect
    when we're extrapolating to solvents that are very different from
    training solvents. When similarity is low, blend toward training mean.

    Key insight:
    - Chemical similarity is a domain-specific measure of how "different"
      a test solvent is from training solvents
    - If a test solvent is very different from all training solvents,
      we should be more conservative
    - This directly targets the intercept problem by making predictions
      more conservative for truly novel solvents

    Args:
        data: ``"single"`` for single-solvent rows; any other value is treated
            as solvent mixtures.
        similarity_threshold: Similarity at or above which no blending happens.
            A value <= 0 disables blending entirely.
        blend_weight: Maximum weight given to the training mean (reached at
            similarity 0).
        verbose: Forwarded to the base ensemble; also enables training prints.
    """
    def __init__(self, data="single", similarity_threshold=0.5, blend_weight=0.3, verbose=False):
        self.data_mode = data
        self.similarity_threshold = similarity_threshold
        self.blend_weight = blend_weight
        self.verbose = verbose
        self.base_model = EnsembleModel(data=data, verbose=verbose)
        self.train_fps = {}  # Fingerprints for training solvents
        self.train_mean = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Record the training mean and training-solvent fingerprints, then fit the ensemble."""
        # Store training mean for blending
        self.train_mean = train_Y.values.mean(axis=0)

        # Get unique training solvents and their fingerprints
        if self.data_mode == "single":
            train_solvents = train_X['SOLVENT NAME'].unique()
        else:
            train_solvents = list(set(train_X['SOLVENT A NAME'].unique()) |
                                  set(train_X['SOLVENT B NAME'].unique()))

        # Solvents without a fingerprint are skipped rather than raising
        self.train_fps = {name: solvent_fps[name] for name in train_solvents if name in solvent_fps}

        # Train base model
        self.base_model.train_model(train_X, train_Y, device, verbose)

        if self.verbose or verbose:
            print(f"[SimilarityAwareModel] Training solvents: {len(self.train_fps)}")
            print(f"[SimilarityAwareModel] Train mean: {self.train_mean}")

    def compute_max_similarity(self, solvent_name):
        """Compute max Tanimoto similarity to any training solvent.

        Returns 0.0 when the solvent has no fingerprint or there are no
        training fingerprints.
        """
        if solvent_name not in solvent_fps:
            return 0.0
        test_fp = solvent_fps[solvent_name]
        return max(
            (compute_tanimoto_similarity(test_fp, train_fp) for train_fp in self.train_fps.values()),
            default=0.0,
        )

    def _max_similarities(self, solvent_names):
        """Max similarity per row, computing each distinct solvent only once."""
        seen = {}
        per_row = []
        for name in solvent_names:
            if name not in seen:
                seen[name] = self.compute_max_similarity(name)
            per_row.append(seen[name])
        return np.array(per_row, dtype=float)

    def _blend_weights(self, similarities):
        """Map similarities to blend weights in ``[0, blend_weight]``.

        The weight is 0 at or above the threshold and grows linearly to
        ``blend_weight`` as similarity falls to 0.
        """
        if self.similarity_threshold <= 0:
            # Nothing can fall below a non-positive threshold; returning zeros also
            # avoids the 0/0 -> NaN the linear formula would produce.
            return np.zeros_like(similarities, dtype=float)
        shortfall = (self.similarity_threshold - similarities) / self.similarity_threshold
        return np.clip(shortfall * self.blend_weight, 0, self.blend_weight)

    def predict(self, X):
        """Return ensemble predictions blended toward the training mean for dissimilar solvents."""
        # Get base predictions
        base_pred = self.base_model.predict(X).numpy()

        # Compute similarity for each sample
        if self.data_mode == "single":
            similarities = self._max_similarities(X['SOLVENT NAME'].values)
        else:
            # For mixtures, use weighted average of similarities
            frac_b = X['SolventB%'].values
            frac_a = 1 - frac_b
            sim_a = self._max_similarities(X['SOLVENT A NAME'].values)
            sim_b = self._max_similarities(X['SOLVENT B NAME'].values)
            similarities = sim_a * frac_a + sim_b * frac_b

        # Blend toward mean when similarity is low
        weight = self._blend_weights(similarities).reshape(-1, 1)

        # Blend: (1 - weight) * base_pred + weight * train_mean
        final_pred = (1 - weight) * base_pred + weight * self.train_mean

        # Clip and renormalize (only rows whose total exceeds 1 are scaled down)
        final_pred = np.clip(final_pred, 0, 1)
        if final_pred.shape[1] > 1:
            totals = final_pred.sum(axis=1, keepdims=True)
            divisor = np.maximum(totals, 1.0)
            final_pred = final_pred / divisor

        return torch.tensor(final_pred, dtype=torch.double)

print("SimilarityAwareModel defined")

SimilarityAwareModel defined


In [11]:
# Evaluation function
import tqdm

def evaluate_model(model_class, data_mode, **kwargs):
    """Cross-validate ``model_class`` and return its MSE with the out-of-fold predictions.

    Args:
        model_class: Class constructed per fold as ``model_class(data=data_mode, **kwargs)``.
        data_mode: ``"single"`` loads the single-solvent data and uses
            ``generate_leave_one_out_splits``; any other value loads the full
            data and uses ``generate_leave_one_ramp_out_splits``.
        **kwargs: Extra constructor arguments for the model.

    Returns:
        ``(mse, all_predictions)``: the mean over targets of the per-target mean
        squared error, and the concatenated per-fold prediction frames.
    """
    if data_mode == "single":
        dataset_name, make_splits = "single_solvent", generate_leave_one_out_splits
    else:
        dataset_name, make_splits = "full", generate_leave_one_ramp_out_splits
    X, Y = load_data(dataset_name)
    split_generator = make_splits(X, Y)

    fold_frames = []
    progress = tqdm.tqdm(enumerate(split_generator), desc=f"{data_mode}")
    for _, ((train_X, train_Y), (test_X, test_Y)) in progress:
        # A fresh model per fold so nothing carries over between splits.
        model = model_class(data=data_mode, **kwargs)
        model.train_model(train_X, train_Y)
        fold_output = model.predict(test_X).numpy()
        fold_frames.append(
            pd.DataFrame(fold_output, columns=test_Y.columns, index=test_Y.index)
        )

    all_predictions = pd.concat(fold_frames)
    # Align the ground truth to the prediction index before scoring.
    squared_error = (all_predictions - Y.loc[all_predictions.index]) ** 2
    mse = squared_error.mean().mean()
    return mse, all_predictions

print("Evaluation function defined")

Evaluation function defined


In [12]:
# Baseline: score the plain EnsembleModel on both tasks before adding similarity blending
banner = "=" * 60
print(banner)
print("Evaluating baseline EnsembleModel")
print(banner)

baseline_single_mse, _ = evaluate_model(EnsembleModel, "single")
print(f"\nBaseline Single Solvent MSE: {baseline_single_mse:.6f}")

baseline_full_mse, _ = evaluate_model(EnsembleModel, "full")
print(f"Baseline Full Data MSE: {baseline_full_mse:.6f}")

# Weights for combining the two task scores
# (presumably the row counts of the two datasets - TODO confirm)
n_single, n_full = 656, 1227
baseline_combined = (baseline_single_mse * n_single + baseline_full_mse * n_full) / (n_single + n_full)
print(f"\nBaseline Combined MSE: {baseline_combined:.6f}")

Evaluating baseline EnsembleModel


single: 0it [00:00, ?it/s]

>>> Building solvent feature table...
Combined feature table shape (before corr filter): (24, 113)
Final solvent feature table shape: (24, 64)


single: 1it [00:01,  1.36s/it]

single: 2it [00:02,  1.11s/it]

single: 3it [00:03,  1.02s/it]

single: 4it [00:04,  1.02s/it]

single: 5it [00:05,  1.02it/s]

single: 6it [00:06,  1.05it/s]

single: 7it [00:06,  1.06it/s]

single: 8it [00:07,  1.07it/s]

single: 9it [00:08,  1.08it/s]

single: 10it [00:09,  1.07it/s]

single: 11it [00:10,  1.08it/s]

single: 12it [00:11,  1.08it/s]

single: 13it [00:12,  1.09it/s]

single: 14it [00:13,  1.09it/s]

single: 15it [00:14,  1.05it/s]

single: 16it [00:15,  1.02it/s]

single: 17it [00:16,  1.03it/s]

single: 18it [00:17,  1.05it/s]

single: 19it [00:18,  1.04it/s]

single: 20it [00:19,  1.04it/s]

single: 21it [00:20,  1.06it/s]

single: 22it [00:21,  1.08it/s]

single: 23it [00:21,  1.09it/s]

single: 24it [00:22,  1.10it/s]

single: 24it [00:22,  1.05it/s]





Baseline Single Solvent MSE: 0.008175


full: 0it [00:00, ?it/s]

full: 1it [00:02,  2.10s/it]

full: 2it [00:04,  2.09s/it]

full: 3it [00:06,  2.03s/it]

full: 4it [00:08,  2.00s/it]

full: 5it [00:10,  1.98s/it]

full: 6it [00:12,  2.03s/it]

full: 7it [00:14,  2.02s/it]

full: 8it [00:16,  2.10s/it]

full: 9it [00:18,  2.11s/it]

full: 10it [00:20,  2.15s/it]

full: 11it [00:22,  2.15s/it]

full: 12it [00:25,  2.14s/it]

full: 13it [00:27,  2.14s/it]

full: 13it [00:27,  2.09s/it]

Baseline Full Data MSE: 0.009784

Baseline Combined MSE: 0.009223





In [13]:
# Sweep a handful of (similarity_threshold, blend_weight) pairs for SimilarityAwareModel
test_configs = [
    {'similarity_threshold': threshold, 'blend_weight': weight}
    for threshold, weight in [(0.3, 0.2), (0.4, 0.2), (0.5, 0.2), (0.5, 0.3), (0.6, 0.3)]
]

results = []

print("\n" + "=" * 60)
print("Testing SimilarityAwareModel with different parameters")
print("=" * 60)

for config in test_configs:
    print(f"\n--- Config: {config} ---")

    single_mse, _ = evaluate_model(SimilarityAwareModel, "single", **config)
    print(f"Single Solvent MSE: {single_mse:.6f}")

    full_mse, _ = evaluate_model(SimilarityAwareModel, "full", **config)
    print(f"Full Data MSE: {full_mse:.6f}")

    # Same fixed weighting as the baseline cell so the scores are comparable
    combined = (single_mse * 656 + full_mse * 1227) / (656 + 1227)
    print(f"Combined MSE: {combined:.6f}")

    results.append(dict(config, single_mse=single_mse, full_mse=full_mse, combined_mse=combined))

results_df = pd.DataFrame(results)
print("\n" + "=" * 60)
print("Summary of Results")
print("=" * 60)
print(results_df.to_string(index=False))


Testing SimilarityAwareModel with different parameters

--- Config: {'similarity_threshold': 0.3, 'blend_weight': 0.2} ---


single: 0it [00:00, ?it/s]

single: 1it [00:00,  1.09it/s]

single: 2it [00:01,  1.10it/s]

single: 3it [00:02,  1.05it/s]

single: 4it [00:03,  1.05it/s]

single: 5it [00:04,  1.04it/s]

single: 6it [00:05,  1.06it/s]

single: 7it [00:06,  1.06it/s]

single: 8it [00:07,  1.07it/s]

single: 9it [00:08,  1.07it/s]

single: 10it [00:09,  1.05it/s]

single: 11it [00:10,  1.06it/s]

single: 12it [00:11,  1.07it/s]

single: 13it [00:12,  1.07it/s]

single: 14it [00:13,  1.06it/s]

single: 15it [00:14,  1.07it/s]

single: 16it [00:15,  1.07it/s]

single: 17it [00:15,  1.07it/s]

single: 18it [00:16,  1.08it/s]

single: 19it [00:17,  1.08it/s]

single: 20it [00:18,  1.09it/s]

single: 21it [00:19,  1.09it/s]

single: 22it [00:20,  1.10it/s]

single: 23it [00:21,  1.10it/s]

single: 24it [00:22,  1.10it/s]

single: 24it [00:22,  1.08it/s]




Single Solvent MSE: 0.008210


full: 0it [00:00, ?it/s]

full: 1it [00:02,  2.22s/it]

full: 2it [00:04,  2.23s/it]

full: 3it [00:06,  2.20s/it]

full: 4it [00:08,  2.19s/it]

full: 5it [00:10,  2.11s/it]

full: 6it [00:13,  2.17s/it]

full: 7it [00:15,  2.17s/it]

full: 8it [00:17,  2.19s/it]

full: 9it [00:19,  2.19s/it]

full: 10it [00:21,  2.16s/it]

full: 11it [00:23,  2.16s/it]

full: 12it [00:26,  2.17s/it]

full: 13it [00:28,  2.20s/it]

full: 13it [00:28,  2.18s/it]




Full Data MSE: 0.009729
Combined MSE: 0.009200

--- Config: {'similarity_threshold': 0.4, 'blend_weight': 0.2} ---


single: 0it [00:00, ?it/s]

single: 1it [00:00,  1.04it/s]

single: 2it [00:01,  1.02it/s]

single: 3it [00:02,  1.03it/s]

single: 4it [00:03,  1.06it/s]

single: 5it [00:04,  1.07it/s]

single: 6it [00:05,  1.08it/s]

single: 7it [00:06,  1.09it/s]

single: 8it [00:07,  1.09it/s]

single: 9it [00:08,  1.08it/s]

single: 10it [00:09,  1.09it/s]

single: 11it [00:10,  1.07it/s]

single: 12it [00:11,  1.07it/s]

single: 13it [00:12,  1.08it/s]

single: 14it [00:13,  1.09it/s]

single: 15it [00:14,  1.06it/s]

single: 16it [00:14,  1.07it/s]

single: 17it [00:15,  1.07it/s]

single: 18it [00:16,  1.06it/s]

single: 19it [00:17,  1.07it/s]

single: 20it [00:18,  1.04it/s]

single: 21it [00:19,  1.05it/s]

single: 22it [00:20,  1.06it/s]

single: 23it [00:21,  1.07it/s]

single: 24it [00:22,  1.08it/s]

single: 24it [00:22,  1.07it/s]




Single Solvent MSE: 0.008330


full: 0it [00:00, ?it/s]

full: 1it [00:01,  1.96s/it]

full: 2it [00:04,  2.24s/it]

full: 3it [00:06,  2.22s/it]

full: 4it [00:08,  2.13s/it]

full: 5it [00:10,  2.07s/it]

full: 6it [00:12,  2.10s/it]

full: 7it [00:14,  2.04s/it]

full: 8it [00:16,  2.08s/it]

full: 9it [00:19,  2.15s/it]

full: 10it [00:21,  2.12s/it]

full: 11it [00:23,  2.13s/it]

full: 12it [00:25,  2.14s/it]

full: 13it [00:27,  2.18s/it]

full: 13it [00:27,  2.13s/it]




Full Data MSE: 0.009766
Combined MSE: 0.009265

--- Config: {'similarity_threshold': 0.5, 'blend_weight': 0.2} ---


single: 0it [00:00, ?it/s]

single: 1it [00:00,  1.08it/s]

single: 2it [00:01,  1.09it/s]

single: 3it [00:02,  1.09it/s]

single: 4it [00:03,  1.10it/s]

single: 5it [00:04,  1.08it/s]

single: 6it [00:05,  1.08it/s]

single: 7it [00:06,  1.09it/s]

single: 8it [00:07,  1.08it/s]

single: 9it [00:08,  1.08it/s]

single: 10it [00:09,  1.09it/s]

single: 11it [00:10,  1.08it/s]

single: 12it [00:11,  1.06it/s]

single: 13it [00:12,  1.06it/s]

single: 14it [00:12,  1.07it/s]

single: 15it [00:13,  1.07it/s]

single: 16it [00:14,  1.07it/s]

single: 17it [00:15,  1.03it/s]

single: 18it [00:16,  1.04it/s]

single: 19it [00:17,  1.06it/s]

single: 20it [00:18,  1.05it/s]

single: 21it [00:19,  1.06it/s]

single: 22it [00:20,  1.06it/s]

single: 23it [00:21,  1.08it/s]

single: 24it [00:22,  1.09it/s]

single: 24it [00:22,  1.07it/s]




Single Solvent MSE: 0.008528


full: 0it [00:00, ?it/s]

full: 1it [00:02,  2.19s/it]

full: 2it [00:04,  2.15s/it]

full: 3it [00:06,  2.22s/it]

full: 4it [00:08,  2.12s/it]

full: 5it [00:10,  2.11s/it]

full: 6it [00:12,  2.17s/it]

full: 7it [00:14,  2.11s/it]

full: 8it [00:17,  2.16s/it]

full: 9it [00:19,  2.20s/it]

full: 10it [00:21,  2.20s/it]

full: 11it [00:23,  2.18s/it]

full: 12it [00:26,  2.19s/it]

full: 13it [00:28,  2.20s/it]

full: 13it [00:28,  2.17s/it]




Full Data MSE: 0.009908
Combined MSE: 0.009427

--- Config: {'similarity_threshold': 0.5, 'blend_weight': 0.3} ---


single: 0it [00:00, ?it/s]

single: 1it [00:00,  1.05it/s]

single: 2it [00:01,  1.07it/s]

single: 3it [00:02,  1.09it/s]

single: 4it [00:03,  1.09it/s]

single: 5it [00:04,  1.05it/s]

single: 6it [00:05,  1.07it/s]

single: 7it [00:06,  1.07it/s]

single: 8it [00:07,  1.08it/s]

single: 9it [00:08,  1.08it/s]

single: 10it [00:09,  1.08it/s]

single: 11it [00:10,  1.08it/s]

single: 12it [00:11,  1.07it/s]

single: 13it [00:12,  1.08it/s]

single: 14it [00:12,  1.09it/s]

single: 15it [00:13,  1.09it/s]

single: 16it [00:14,  1.09it/s]

single: 17it [00:15,  1.09it/s]

single: 18it [00:16,  1.08it/s]

single: 19it [00:17,  1.08it/s]

single: 20it [00:18,  1.01it/s]

single: 21it [00:19,  1.03it/s]

single: 22it [00:20,  1.05it/s]

single: 23it [00:21,  1.05it/s]

single: 24it [00:22,  1.06it/s]

single: 24it [00:22,  1.07it/s]




Single Solvent MSE: 0.008860


full: 0it [00:00, ?it/s]

full: 1it [00:02,  2.06s/it]

full: 2it [00:04,  2.16s/it]

full: 3it [00:06,  2.15s/it]

full: 4it [00:08,  2.22s/it]

full: 5it [00:10,  2.21s/it]

full: 6it [00:13,  2.18s/it]

full: 7it [00:15,  2.21s/it]

full: 8it [00:17,  2.22s/it]

full: 9it [00:19,  2.26s/it]

full: 10it [00:22,  2.24s/it]

full: 11it [00:24,  2.23s/it]

full: 12it [00:26,  2.22s/it]

full: 13it [00:28,  2.25s/it]

full: 13it [00:28,  2.22s/it]




Full Data MSE: 0.010138
Combined MSE: 0.009693

--- Config: {'similarity_threshold': 0.6, 'blend_weight': 0.3} ---


single: 0it [00:00, ?it/s]

single: 1it [00:00,  1.09it/s]

single: 2it [00:01,  1.07it/s]

single: 3it [00:02,  1.08it/s]

single: 4it [00:03,  1.08it/s]

single: 5it [00:04,  1.07it/s]

single: 6it [00:05,  1.08it/s]

single: 7it [00:06,  1.07it/s]

single: 8it [00:07,  1.08it/s]

single: 9it [00:08,  1.08it/s]

single: 10it [00:09,  1.09it/s]

single: 11it [00:10,  1.08it/s]

single: 12it [00:11,  1.08it/s]

single: 13it [00:12,  1.09it/s]

single: 14it [00:12,  1.08it/s]

single: 15it [00:13,  1.09it/s]

single: 16it [00:14,  1.07it/s]

single: 17it [00:15,  1.07it/s]

single: 18it [00:16,  1.07it/s]

single: 19it [00:17,  1.07it/s]

single: 20it [00:18,  1.07it/s]

single: 21it [00:19,  1.08it/s]

single: 22it [00:20,  1.08it/s]

single: 23it [00:21,  1.07it/s]

single: 24it [00:22,  1.07it/s]

single: 24it [00:22,  1.08it/s]




Single Solvent MSE: 0.009168


full: 0it [00:00, ?it/s]

full: 1it [00:02,  2.21s/it]

full: 2it [00:04,  2.17s/it]

full: 3it [00:06,  2.24s/it]

full: 4it [00:08,  2.22s/it]

full: 5it [00:10,  2.19s/it]

full: 6it [00:13,  2.21s/it]

full: 7it [00:15,  2.21s/it]

full: 8it [00:17,  2.21s/it]

full: 9it [00:19,  2.22s/it]

full: 10it [00:21,  2.17s/it]

full: 11it [00:24,  2.13s/it]

full: 12it [00:26,  2.11s/it]

full: 13it [00:28,  2.11s/it]

full: 13it [00:28,  2.17s/it]

Full Data MSE: 0.010386
Combined MSE: 0.009962

Summary of Results
 similarity_threshold  blend_weight  single_mse  full_mse  combined_mse
                  0.3           0.2    0.008210  0.009729      0.009200
                  0.4           0.2    0.008330  0.009766      0.009265
                  0.5           0.2    0.008528  0.009908      0.009427
                  0.5           0.3    0.008860  0.010138      0.009693
                  0.6           0.3    0.009168  0.010386      0.009962





In [14]:
# Select the sweep row with the lowest combined MSE and compare it with the baseline
best_idx = results_df['combined_mse'].idxmin()
best_config = results_df.loc[best_idx]
best_combined_mse = best_config['combined_mse']
improvement_pct = (baseline_combined - best_combined_mse) / baseline_combined * 100

print(f"\nBest configuration:")
print(f"  similarity_threshold: {best_config['similarity_threshold']}")
print(f"  blend_weight: {best_config['blend_weight']}")
print(f"  Combined MSE: {best_combined_mse:.6f}")
print(f"\nBaseline combined MSE: {baseline_combined:.6f}")
print(f"Improvement: {improvement_pct:.2f}%")


Best configuration:
  similarity_threshold: 0.3
  blend_weight: 0.2
  Combined MSE: 0.009200

Baseline combined MSE: 0.009223
Improvement: 0.25%


In [15]:
# Save metrics
import json

METRICS_PATH = '/home/code/experiments/108_chemical_similarity/metrics.json'

metrics = {
    'baseline_single_mse': float(baseline_single_mse),
    'baseline_full_mse': float(baseline_full_mse),
    'baseline_combined_mse': float(baseline_combined),
    'best_similarity_threshold': float(best_config['similarity_threshold']),
    'best_blend_weight': float(best_config['blend_weight']),
    'best_combined_mse': float(best_config['combined_mse']),
    'all_results': results,
    'cv_score': float(best_config['combined_mse']),
    'notes': 'Chemical similarity-based extrapolation detection using Tanimoto similarity on Morgan fingerprints.'
}

# Create the experiment directory first so a fresh run does not fail in open()
# with FileNotFoundError after the whole sweep has already been computed.
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
with open(METRICS_PATH, 'w') as f:
    json.dump(metrics, f, indent=2)

print("Metrics saved")

Metrics saved


## Submission Cells

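The cells below regenerate the cross-validation predictions with the best configuration found above (the grid-search row with the lowest combined MSE) and write the submission file.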

In [None]:
# Hand the winning grid-search settings over to the submission cells,
# which read them through the names best_st and best_bw.
best_st, best_bw = best_config['similarity_threshold'], best_config['blend_weight']
print(f"Using similarity_threshold={best_st}, blend_weight={best_bw}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Builds the single-solvent part of the submission: for every split yielded by
# generate_leave_one_out_splits a fresh model is trained on the training part
# and used to predict the held-out part.
# NOTE(review): this is a frozen template cell; only comments were added here.

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []  # one predictions DataFrame per split

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    # Each split unpacks into a (train inputs, train targets) pair and a
    # (held-out inputs, held-out targets) pair.
    (train_X, train_Y), (test_X, test_Y) = split

    # best_st / best_bw are the grid-search winners set in the parameter cell above.
    model = SimilarityAwareModel(data='single', similarity_threshold=best_st, blend_weight=best_bw) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)

    # Re-attach the held-out rows' index and the target column names so the
    # per-split frames can be concatenated afterwards.
    # presumably `predictions` is a torch tensor (it exposes .numpy()) - TODO confirm
    predictions_df = pd.DataFrame(
        predictions.numpy(),
        columns=test_Y.columns,
        index=test_Y.index
    )
    all_predictions.append(predictions_df)

submission_single_solvent = pd.concat(all_predictions)

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Builds the full-data part of the submission: for every split yielded by
# generate_leave_one_ramp_out_splits a fresh model is trained on the training
# part and used to predict the held-out part.
# This cell does not import tqdm itself; it relies on the `import tqdm` in the
# preceding single-solvent submission cell having been executed.
# NOTE(review): this is a frozen template cell; only comments were added here.

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []  # rebound here, so the single-solvent list is not reused

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    # Each split unpacks into a (train inputs, train targets) pair and a
    # (held-out inputs, held-out targets) pair.
    (train_X, train_Y), (test_X, test_Y) = split

    # best_st / best_bw are the grid-search winners set in the parameter cell above.
    model = SimilarityAwareModel(data='full', similarity_threshold=best_st, blend_weight=best_bw) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)

    # Re-attach the held-out rows' index and the target column names so the
    # per-split frames can be concatenated afterwards.
    # presumably `predictions` is a torch tensor (it exposes .numpy()) - TODO confirm
    predictions_df = pd.DataFrame(
        predictions.numpy(),
        columns=test_Y.columns,
        index=test_Y.index
    )
    all_predictions.append(predictions_df)

submission_full_data = pd.concat(all_predictions)

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Stacks the single-solvent predictions on top of the full-data predictions.
# reset_index() moves the original row labels into a regular column, and the
# fresh positional index is written to the CSV as the "id" column.
submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
# NOTE(review): the output locations are hard-coded absolute paths inside a cell
# marked DO NOT CHANGE - confirm this is acceptable for a valid submission.
submission.to_csv("/home/code/experiments/108_chemical_similarity/submission.csv", index=True)

# Also copy to main submission folder
import shutil
shutil.copy("/home/code/experiments/108_chemical_similarity/submission.csv", "/home/submission/submission.csv")

print(f"Submission shape: {submission.shape}")
print(f"Submission saved to /home/submission/submission.csv")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################