# Experiment 122: Solvent Similarity-Based Prediction Weighting

**Goal**: Address the extrapolation problem by weighting predictions based on similarity to training solvents.

**Key Insight**: The CV-LB intercept (0.0546) > target (0.0347) means we're making large errors on unseen solvents. By detecting when we're extrapolating (low similarity to training) and blending toward the population mean, we might reduce these errors.

**Approach**:
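1. Compute the Tanimoto similarity (Morgan fingerprints, radius 2, 2048 bits) between the test solvent and every training solvent
2. If the maximum similarity is below `blend_threshold` (extrapolating), blend the prediction toward the training mean; the lower the similarity, the stronger the blend, up to `blend_weight`
3. If the maximum similarity is at or above `blend_threshold` (interpolating), trust the model prediction unchanged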

**CRITICAL**: The model class `SimilarityWeightedModel` will be used in BOTH CV computation AND submission cells.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import catboost as cb
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with the same value so repeated runs start from the same RNG state
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Make float64 the default dtype for newly created torch tensors
torch.set_default_dtype(torch.double)

# Prefer the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# RDKit imports for similarity computation
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
print('RDKit imported successfully')

RDKit imported successfully


In [3]:
# Data loading functions
# Directory that holds the yield CSVs and descriptor lookup tables read by this notebook
DATA_PATH = '/home/data'

# Column groups of the yield tables:
#   - numeric process conditions shared by both datasets
#   - model-input columns of the single-solvent dataset
#   - model-input columns of the solvent-mixture ("full") dataset
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the two yield datasets and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the solvent-mixture dataset; any other value
            selects the single-solvent dataset.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: ``X`` holds the input columns of the
        chosen dataset, ``Y`` holds the "Product 2", "Product 3" and "SM" columns.
    """
    use_full = name == "full"
    csv_path = (
        f'{DATA_PATH}/catechol_full_data_yields.csv'
        if use_full
        else f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    )
    input_columns = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(csv_path)
    inputs = frame[input_columns]
    targets = frame[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds, one per solvent in sorted name order.

    Args:
        X: DataFrame with a "SOLVENT NAME" column.
        Y: Targets, row-aligned with ``X``.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
        every row of the held-out solvent and the train part holds the rest.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_train = solvent_names != held_out
        is_test = ~is_train
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out folds, one per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``.

    Args:
        X: DataFrame with "SOLVENT A NAME" and "SOLVENT B NAME" columns.
        Y: Targets, row-aligned with ``X``.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
        every row of the held-out pair and the train part holds the rest.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [4]:
# Load per-solvent lookup tables; the first CSV column becomes the row index,
# which later cells query with `.loc[<solvent name column>]`
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)
SMILES_DF = pd.read_csv(f'{DATA_PATH}/smiles_lookup.csv', index_col=0)

# Keep only DRFP columns with non-zero variance across rows, i.e. drop constant columns.
# Note: the threshold is "> 0", so this is not a "high-variance" selection.
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

# Report table shapes as a sanity check; tables with fewer rows cover fewer solvents
print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')
print(f'SMILES: {SMILES_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)
SMILES: (26, 1)


In [5]:
# Build the solvent-name -> Morgan bit-vector lookup once, so similarity queries never re-parse SMILES
MORGAN_FP = {}
for solvent in SMILES_DF.index:
    smiles = SMILES_DF.loc[solvent, 'solvent smiles']
    # No fingerprint for non-string entries or for multi-component SMILES (those containing '.')
    if not isinstance(smiles, str) or '.' in smiles:
        continue
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit could not parse this SMILES; leave the solvent without a fingerprint
        continue
    # Radius-2 Morgan fingerprint folded into 2048 bits
    MORGAN_FP[solvent] = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)

print(f'Pre-computed Morgan fingerprints for {len(MORGAN_FP)} solvents')
print(f'Solvents with fingerprints: {list(MORGAN_FP.keys())[:5]}...')

Pre-computed Morgan fingerprints for 23 solvents
Solvents with fingerprints: ['Cyclohexane', 'Ethyl Acetate', 'Acetic Acid', '2-Methyltetrahydrofuran [2-MeTHF]', '1,1,1,3,3,3-Hexafluoropropan-2-ol']...




In [6]:
# Featurizer
class Featurizer:
    """Turn rows of process conditions plus solvent names into a numeric feature matrix.

    Each row becomes: the two raw numeric inputs, three engineered kinetic
    terms, and the Spange, filtered-DRFP and ACS-PCA descriptors of the solvent.
    With ``mixed=True`` the descriptors of solvent A and solvent B are combined
    linearly using the "SolventB%" column as the weight of solvent B.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        # 2 raw numeric inputs + 3 engineered kinetic terms + width of each descriptor table
        descriptor_width = sum(
            table.shape[1] for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )
        self.feats_dim = 2 + 3 + descriptor_width

    def featurize(self, X, flip=False):
        """Build the feature matrix for ``X``.

        Args:
            X: DataFrame holding the numeric input columns and either
                "SOLVENT NAME" (single mode) or "SOLVENT A NAME",
                "SOLVENT B NAME" and "SolventB%" (mixed mode).
            flip: Only read in mixed mode; selects the alternative weight
                expression ``1 - (1 - pct)`` for solvent B instead of ``pct``.

        Returns:
            2-D NumPy array with one row per row of ``X``.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        time_m = numeric[:, 0:1]
        temp_c = numeric[:, 1:2]

        # Kinetic terms: 1000 / T[K], log(time) and their product
        inv_temp = 1000.0 / (temp_c + 273.15)
        log_time = np.log(time_m + 1e-6)  # small offset keeps log finite at time == 0
        columns = [numeric, inv_temp, log_time, inv_temp * log_time]

        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if not self.mixed:
            solvent_names = X["SOLVENT NAME"]
            columns.extend(table.loc[solvent_names].values for table in tables)
            return np.hstack(columns)

        pct = X["SolventB%"].values.reshape(-1, 1)
        weight_a = 1 - pct
        # Both forms weight solvent B by its fraction; the flipped form is kept
        # verbatim so results stay bit-identical with earlier runs.
        weight_b = 1 - (1 - pct) if flip else pct
        for table in tables:
            descr_a = table.loc[X["SOLVENT A NAME"]].values
            descr_b = table.loc[X["SOLVENT B NAME"]].values
            columns.append(descr_a * weight_a + descr_b * weight_b)
        return np.hstack(columns)

print(f'Feature dimension: {Featurizer().feats_dim}')

Feature dimension: 145


In [7]:
# Similarity-Weighted Model
class SimilarityWeightedModel:
    """CatBoost + XGBoost ensemble that shrinks predictions toward the training mean
    when the test solvent is dissimilar to every training solvent.

    Key insight: For test solvents very different from training, blend toward
    the population mean to reduce extrapolation error.

    Similarity is the maximum Tanimoto similarity between the test solvent's
    Morgan fingerprint and the fingerprints of the training solvents. Solvents
    without an entry in ``MORGAN_FP`` are treated as having similarity 0.

    This is the SAME class used in both CV and submission cells.
    """
    def __init__(self, data='single', blend_threshold=0.5, blend_weight=0.3):
        """
        Args:
            data: ``'full'`` for the solvent-mixture dataset, anything else for
                the single-solvent dataset.
            blend_threshold: Similarity below which predictions are blended
                toward the training mean.
            blend_weight: Blend strength reached at similarity 0; it falls
                linearly to 0 as the similarity approaches ``blend_threshold``.
        """
        self.data_type = data
        self.blend_threshold = blend_threshold  # Below this similarity, blend toward mean
        self.blend_weight = blend_weight  # How much to blend toward mean
        self.featurizer = Featurizer(mixed=(data=='full'))
        self.scaler = StandardScaler()
        self.catboost_models = []
        self.xgboost_models = []
        self.train_mean = None
        self.train_solvents = None

    def train_model(self, X_train, y_train):
        """Fit one CatBoost and one XGBoost regressor per target column.

        Calling this again re-fits from scratch: the scaler, training mean,
        training-solvent set and both model lists are all replaced.

        Args:
            X_train: Input DataFrame in the layout expected by the featurizer.
            y_train: DataFrame of targets, one column per target.
        """
        # Featurize
        X_feat = self.featurizer.featurize(X_train)
        y_vals = y_train.values

        # Store training mean and solvents
        self.train_mean = y_vals.mean(axis=0)
        if self.data_type == 'full':
            self.train_solvents = set(X_train["SOLVENT A NAME"].unique()) | set(X_train["SOLVENT B NAME"].unique())
        else:
            self.train_solvents = set(X_train["SOLVENT NAME"].unique())

        # Scale features
        X_scaled = self.scaler.fit_transform(X_feat)

        # Build fresh lists so a repeated call replaces earlier fits instead of
        # appending behind them (predict would otherwise keep using stale models).
        catboost_models = []
        xgboost_models = []
        for i in range(y_vals.shape[1]):
            # CatBoost
            cb_model = cb.CatBoostRegressor(
                iterations=500,
                learning_rate=0.05,
                depth=6,
                l2_leaf_reg=3,
                random_seed=42,
                verbose=False
            )
            cb_model.fit(X_scaled, y_vals[:, i])
            catboost_models.append(cb_model)

            # XGBoost
            xgb_model = xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=6,
                reg_lambda=1,
                random_state=42,
                verbosity=0
            )
            xgb_model.fit(X_scaled, y_vals[:, i])
            xgboost_models.append(xgb_model)

        # Publish only after every fit succeeded
        self.catboost_models = catboost_models
        self.xgboost_models = xgboost_models

    def _get_max_similarity(self, test_solvent):
        """Get maximum Tanimoto similarity between test solvent and training solvents.

        Returns 0.0 when the test solvent has no fingerprint or when no
        training solvent has one.
        """
        if test_solvent not in MORGAN_FP:
            return 0.0  # Unknown solvent, treat as low similarity

        test_fp = MORGAN_FP[test_solvent]
        similarities = (
            DataStructs.TanimotoSimilarity(test_fp, MORGAN_FP[train_solvent])
            for train_solvent in self.train_solvents
            if train_solvent in MORGAN_FP
        )
        return max(similarities, default=0.0)

    def _blend_factor(self, test_solvent):
        """Return the weight given to the training mean for this solvent, or None for no blending."""
        max_sim = self._get_max_similarity(test_solvent)
        if max_sim < self.blend_threshold:
            # Low similarity - blend toward mean
            # The lower the similarity, the more we blend
            return self.blend_weight * (1 - max_sim / self.blend_threshold)
        return None

    def predict(self, X):
        """Predict all targets for ``X`` and apply similarity-based blending.

        Args:
            X: Input DataFrame in the layout expected by the featurizer.

        Returns:
            ``torch.Tensor`` with one row per row of ``X`` and one column per
            target, clipped to [0, 1].

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if not self.catboost_models or self.train_mean is None:
            raise RuntimeError("train_model() must be called before predict()")

        X_feat = self.featurizer.featurize(X)
        X_scaled = self.scaler.transform(X_feat)

        # Average the two model families per target
        pred = np.column_stack([
            (cb_model.predict(X_scaled) + xgb_model.predict(X_scaled)) / 2
            for cb_model, xgb_model in zip(self.catboost_models, self.xgboost_models)
        ])

        # Apply similarity-based blending
        if self.data_type == 'full':
            # NOTE(review): only solvent A is checked; solvent B and "SolventB%"
            # do not influence the blend. Confirm this is intended for mixtures.
            test_solvents = X["SOLVENT A NAME"].values  # Use primary solvent
        else:
            test_solvents = X["SOLVENT NAME"].values

        # The similarity depends only on the solvent, so compute it once per
        # distinct solvent rather than once per row.
        factor_by_solvent = {}
        for idx, test_solvent in enumerate(test_solvents):
            if test_solvent not in factor_by_solvent:
                factor_by_solvent[test_solvent] = self._blend_factor(test_solvent)
            blend_factor = factor_by_solvent[test_solvent]
            if blend_factor is not None:
                pred[idx] = (1 - blend_factor) * pred[idx] + blend_factor * self.train_mean

        pred = np.clip(pred, 0, 1)

        return torch.tensor(pred)

print('SimilarityWeightedModel defined - will be used in both CV and submission cells')

SimilarityWeightedModel defined - will be used in both CV and submission cells


In [8]:
# Load both datasets once; the CV cells below reuse these frames.
# Any name other than "full" makes load_data read the single-solvent CSV.
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")
# Row counts as a quick sanity check on what was read
print(f"Single solvent: {len(X_single)} samples")
print(f"Full data: {len(X_full)} samples")

Single solvent: 656 samples
Full data: 1227 samples


In [None]:
# Leave-one-solvent-out cross-validation on the single-solvent dataset
print("Computing CV score...")

# One MSE per held-out solvent, averaged over all rows and targets of that fold
single_mses = []
single_folds = generate_leave_one_out_splits(X_single, Y_single)

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(single_folds):
    # A fresh model per fold  # SAME CLASS AS SUBMISSION
    model = SimilarityWeightedModel(data='single', blend_threshold=0.5, blend_weight=0.3)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()
    targets = test_Y.values
    squared_errors = (predictions - targets) ** 2

    mse = np.mean(squared_errors)
    single_mses.append(mse)

    # Report only every sixth fold to keep the output short
    if fold_idx % 6 == 0:
        print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

# Unweighted mean over folds
single_mse = np.mean(single_mses)
print(f"\nSingle solvent MSE: {single_mse:.6f}")

In [None]:
# Leave-one-ramp-out cross-validation on the solvent-mixture dataset
# One MSE per held-out (solvent A, solvent B) pair
full_mses = []
full_folds = generate_leave_one_ramp_out_splits(X_full, Y_full)

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(full_folds):
    # A fresh model per fold  # SAME CLASS AS SUBMISSION
    model = SimilarityWeightedModel(data='full', blend_threshold=0.5, blend_weight=0.3)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()
    targets = test_Y.values
    squared_errors = (predictions - targets) ** 2

    mse = np.mean(squared_errors)
    full_mses.append(mse)

    # Report only every third fold to keep the output short
    if fold_idx % 3 == 0:
        print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

# Unweighted mean over folds
full_mse = np.mean(full_mses)
print(f"\nFull data MSE: {full_mse:.6f}")

In [None]:
# Combined CV score: unweighted mean of the single-solvent and full-data MSEs
import json
import os

# Reference value that this run is compared against (printed below as "best CV")
BEST_CV_SCORE = 0.0081
METRICS_PATH = '/home/code/experiments/122_similarity_weighting/metrics.json'

cv_score = (single_mse + full_mse) / 2
print("\n=== CV Results ===")
print(f"Single solvent MSE: {single_mse:.6f}")
print(f"Full data MSE: {full_mse:.6f}")
print(f"Combined CV score: {cv_score:.6f}")

# Save metrics
# blend_threshold / blend_weight mirror the values passed to the model in the CV cells
metrics = {
    'cv_score': float(cv_score),
    'single_mse': float(single_mse),
    'full_mse': float(full_mse),
    'blend_threshold': 0.5,
    'blend_weight': 0.3
}
# Create the experiment directory if needed, so a missing folder cannot
# discard the results of the CV run above
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
with open(METRICS_PATH, 'w') as f:
    json.dump(metrics, f)

print(f"\nComparison with best CV: {BEST_CV_SCORE}")
print(f"This experiment: {cv_score:.6f}")
if cv_score < BEST_CV_SCORE:
    print("IMPROVEMENT! This is better than best CV.")
else:
    print(f"No improvement. Difference: {cv_score - BEST_CV_SCORE:.6f}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimilarityWeightedModel(data='single', blend_threshold=0.5, blend_weight=0.3)  # SAME CLASS AS CV
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimilarityWeightedModel(data='full', blend_threshold=0.5, blend_weight=0.3)  # SAME CLASS AS CV
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

print(f"Submission saved with {len(submission)} rows")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################