# Experiment 082: Solvent Similarity Weighting (Continuous)

**Rationale**: Instead of discrete clustering (which failed in exp_081), use CONTINUOUS similarity weighting:
1. For each test solvent, compute similarity to ALL training solvents using Spange descriptors
2. When the test solvent is dissimilar to all training solvents, blend predictions toward the population mean (the per-target mean of the training yields)
3. This addresses extrapolation by being conservative for dissimilar solvents

**Key difference from exp_081**: Uses continuous similarity weights, not discrete clusters.

In [1]:
import sys
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

# Single seed shared by the notebook: it seeds the NumPy and PyTorch global RNGs
# here and is also passed as `random_state` to each CatBoostRegressor below.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

print('Imports done')

Imports done


In [2]:
# Local data loading functions
def load_data(data_type):
    """Load reaction conditions and measured yields from the local CSV files.

    Args:
        data_type: ``"single_solvent"`` for the single-solvent experiments or
            ``"full"`` for the solvent-mixture (ramp) experiments.

    Returns:
        Tuple ``(X, Y)`` of DataFrames. ``X`` holds the condition columns
        (residence time, temperature and solvent name(s); the mixture data
        additionally has ``SolventB%``). ``Y`` holds the three target columns
        ``SM``, ``Product 2`` and ``Product 3``.

    Raises:
        ValueError: if ``data_type`` is not one of the two supported values.
            (Previously this fell through and failed with an
            ``UnboundLocalError`` on the ``return`` line.)
    """
    target_cols = ['SM', 'Product 2', 'Product 3']

    if data_type == "single_solvent":
        df = pd.read_csv('/home/data/catechol_single_solvent_yields.csv')
        X = df[['Residence Time', 'Temperature', 'SOLVENT NAME']]
    elif data_type == "full":
        df = pd.read_csv('/home/data/catechol_full_data_yields.csv')
        X = df[['Residence Time', 'Temperature', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%']]
    else:
        raise ValueError(
            f"Unknown data_type {data_type!r}; expected 'single_solvent' or 'full'"
        )

    Y = df[target_cols]
    return X, Y

def load_features(feature_type):
    """Load a solvent-descriptor lookup table from the local CSV files.

    Args:
        feature_type: name of the descriptor set; only
            ``'spange_descriptors'`` is supported.

    Returns:
        DataFrame read with the first CSV column as its index (the model code
        looks rows up by solvent name via ``.loc``).

    Raises:
        ValueError: if ``feature_type`` is not supported. (Previously the
            function silently returned ``None``, which only failed later when
            the caller accessed ``.values``.)
    """
    if feature_type == 'spange_descriptors':
        return pd.read_csv('/home/data/spange_descriptors_lookup.csv', index_col=0)
    raise ValueError(
        f"Unknown feature_type {feature_type!r}; expected 'spange_descriptors'"
    )

print('Data functions defined')

Data functions defined


In [3]:
# Official CV split functions (DO NOT MODIFY)
from typing import Any, Generator

def generate_leave_one_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield leave-one-solvent-out train/test splits.

    One split is produced per distinct value of ``X["SOLVENT NAME"]``, in the
    order returned by ``unique()``. For each solvent, every row with that
    solvent forms the test set and all remaining rows form the training set.

    Args:
        X: condition table; must contain a ``"SOLVENT NAME"`` column.
        Y: target table, filtered with the same boolean masks as ``X``
            (assumes Y shares X's index — TODO confirm).

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each held-out solvent.
    """
    for solvent in X["SOLVENT NAME"].unique():
        # Complementary boolean masks: every row lands in exactly one side
        # of the split for this solvent.
        train_mask = X["SOLVENT NAME"] != solvent
        test_mask = X["SOLVENT NAME"] == solvent
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

def generate_leave_one_ramp_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield leave-one-ramp-out train/test splits for the mixture data.

    A "ramp" is identified by the ordered pair (solvent A, solvent B): the two
    name columns are joined into a single ``"A_B"`` key. One split is produced
    per distinct key, in the order returned by ``unique()``; rows with that key
    form the test set and all other rows form the training set.

    Args:
        X: condition table; must contain ``"SOLVENT A NAME"`` and
            ``"SOLVENT B NAME"`` columns.
        Y: target table, filtered with the same boolean masks as ``X``
            (assumes Y shares X's index — TODO confirm).

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each held-out ramp.
    """
    # Per-row ramp key; the pair is ordered, so "A_B" and "B_A" are different ramps.
    ramps = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
    for ramp in ramps.unique():
        train_mask = ramps != ramp
        test_mask = ramps == ramp
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

print('CV split functions defined')

CV split functions defined


In [4]:
# Similarity-weighted model
class SimilarityWeightedModel:
    """CatBoost regressor with similarity-based shrinkage toward the training mean.

    One CatBoost model is fitted per target column. At prediction time each
    test row is compared, in standardized Spange-descriptor space, with every
    solvent seen during training. The less similar the row is to its nearest
    training solvent, the more its prediction is pulled toward the per-target
    mean of the training targets:

        blend  = blend_strength * (1 - max_similarity)
        output = (1 - blend) * model_prediction + blend * population_mean

    Notes:
        * The Spange scaler is fitted on every row of the lookup table, not
          only on the training solvents.
        * In mixture mode the test descriptor is the linearly mixed descriptor
          of solvents A and B, while the comparison set consists of the *pure*
          training solvents.
    """

    def __init__(self, data='single', blend_strength=0.3):
        """Set up descriptor tables and scalers; no training happens here.

        Args:
            data: ``'full'`` selects mixture mode (two solvents plus
                ``SolventB%``); any other value selects single-solvent mode.
            blend_strength: upper bound on the weight given to the population
                mean for a maximally dissimilar solvent.
        """
        self.data = data
        self.mixed = (data == 'full')
        self.blend_strength = blend_strength

        # Raw and standardized Spange descriptors, both indexed by solvent name.
        self.spange = load_features('spange_descriptors')
        self.scaler_spange = StandardScaler()
        self.spange_scaled = self.scaler_spange.fit_transform(self.spange.values)
        self.spange_scaled_df = pd.DataFrame(self.spange_scaled, index=self.spange.index)

        # Scaler for the model input features (fitted in train_model).
        self.scaler = StandardScaler()

        # State populated by train_model().
        self.population_mean = None
        self.train_solvent_descriptors = None
        self.train_solvents = None
        self.models = None

    def _max_similarity_per_row(self, test_descs, train_descs):
        """Vectorized similarity of each test row to its closest training solvent.

        For every test row the Euclidean distances to all training solvents are
        turned into similarities with ``exp(-distance / scale)``, where
        ``scale`` is that row's median distance (plus 1e-6 to avoid division by
        zero). The largest similarity per row is returned.

        Args:
            test_descs: array of shape (n_test, n_descriptors).
            train_descs: array of shape (n_train, n_descriptors).

        Returns:
            1-D array of length n_test with values in (0, 1].
        """
        distances = euclidean_distances(test_descs, train_descs)
        # Adaptive per-row scale based on the typical distance for that row.
        scales = np.median(distances, axis=1, keepdims=True) + 1e-6
        return np.exp(-distances / scales).max(axis=1)

    def _compute_similarity(self, test_solvent_desc, train_solvent_descs):
        """Similarity of one test descriptor to its closest training solvent.

        Args:
            test_solvent_desc: 1-D descriptor vector of the test sample.
            train_solvent_descs: array of shape (n_train, n_descriptors).

        Returns:
            Scalar in (0, 1]; 1 means the test descriptor coincides with a
            training solvent, values near 0 mean it is far from all of them.
        """
        row = np.asarray(test_solvent_desc).reshape(1, -1)
        return self._max_similarity_per_row(row, train_solvent_descs)[0]

    def _get_features(self, X):
        """Build the float32 model input matrix from a condition table.

        Single-solvent mode: [residence time, temperature, Spange descriptors].
        Mixture mode: [residence time, temperature, fraction of B, linearly
        mixed Spange descriptors of A and B].
        """
        res_time = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)

        if self.mixed:
            # SolventB% is given in percent; convert to a 0-1 fraction.
            sb_pct = X['SolventB%'].values.reshape(-1, 1) / 100.0
            feats_a = self.spange.loc[X['SOLVENT A NAME']].values
            feats_b = self.spange.loc[X['SOLVENT B NAME']].values
            # Linear mixing of the two pure-solvent descriptor vectors.
            solvent_feats = (1 - sb_pct) * feats_a + sb_pct * feats_b
            columns = [res_time, temp, sb_pct, solvent_feats]
        else:
            solvent_feats = self.spange.loc[X['SOLVENT NAME']].values
            columns = [res_time, temp, solvent_feats]

        return np.hstack(columns).astype(np.float32)

    def _get_solvent_descriptors(self, X):
        """Return per-row standardized solvent descriptors for similarity.

        In mixture mode the descriptors of A and B are mixed linearly by the
        fraction of B; otherwise the row's single solvent is looked up.
        """
        if self.mixed:
            sb_pct = X['SolventB%'].values.reshape(-1, 1) / 100.0
            feats_a = self.spange_scaled_df.loc[X['SOLVENT A NAME']].values
            feats_b = self.spange_scaled_df.loc[X['SOLVENT B NAME']].values
            return (1 - sb_pct) * feats_a + sb_pct * feats_b
        return self.spange_scaled_df.loc[X['SOLVENT NAME']].values

    def train_model(self, train_X, train_Y):
        """Fit the feature scaler and one CatBoost model per target column.

        Also records the per-target training mean and the set of solvent names
        present in the training data; both are used by ``predict``.

        Args:
            train_X: condition table for the training rows.
            train_Y: 2-D target table aligned row-wise with ``train_X``.
        """
        X_np = self._get_features(train_X)
        y_np = train_Y.values

        X_scaled = self.scaler.fit_transform(X_np)

        # Shrinkage target: per-target mean over all training rows.
        self.population_mean = y_np.mean(axis=0)

        # Per-row training descriptors are kept for inspection; predict()
        # compares against the unique pure training solvents instead.
        self.train_solvent_descriptors = self._get_solvent_descriptors(train_X)
        if self.mixed:
            self.train_solvents = (
                set(train_X['SOLVENT A NAME'].unique())
                | set(train_X['SOLVENT B NAME'].unique())
            )
        else:
            self.train_solvents = set(train_X['SOLVENT NAME'].unique())

        # One independent regressor per target column (number taken from the
        # data rather than hard-coded). Assign only after all fits succeed so
        # a failed fit never leaves a partially trained model list behind.
        models = []
        for t in range(y_np.shape[1]):
            model = CatBoostRegressor(
                iterations=500,
                learning_rate=0.05,
                depth=6,
                random_state=SEED,
                verbose=False
            )
            model.fit(X_scaled, y_np[:, t])
            models.append(model)
        self.models = models

    def predict(self, test_X):
        """Predict targets for ``test_X`` with similarity-based shrinkage.

        Args:
            test_X: condition table with the same columns used for training.

        Returns:
            ``torch.Tensor`` of shape (n_rows, n_targets), clipped to [0, 1].

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if not getattr(self, 'models', None):
            raise RuntimeError("predict() called before train_model()")

        X_np = self._get_features(test_X)
        X_scaled = self.scaler.transform(X_np)

        # Raw model predictions, one column per target.
        preds = np.column_stack([m.predict(X_scaled) for m in self.models])

        test_solvent_descs = self._get_solvent_descriptors(test_X)
        # Sort the names so the row order does not depend on set/hash ordering.
        train_unique_descs = self.spange_scaled_df.loc[sorted(self.train_solvents)].values

        # Blend factor per row: 0 = pure model prediction, larger values pull
        # toward the population mean (at most blend_strength).
        similarity = self._max_similarity_per_row(test_solvent_descs, train_unique_descs)
        blend_factor = (self.blend_strength * (1 - similarity))[:, np.newaxis]

        final_preds = (1 - blend_factor) * preds + blend_factor * self.population_mean

        # Clip to [0, 1].
        final_preds = np.clip(final_preds, 0, 1)

        return torch.tensor(final_preds)

print('SimilarityWeightedModel defined')

SimilarityWeightedModel defined


In [5]:
# Run leave-one-solvent-out CV on the single-solvent data.
# A fresh model is trained per fold; per-fold MSE is averaged over all rows
# and targets of the held-out solvent, and predictions are collected in the
# submission record format (task 0).
import tqdm

X, Y = load_data("single_solvent")
# Number of folds equals the number of distinct solvents; derive it from the
# data instead of hard-coding it so the progress bar stays correct.
n_solvents = len(X['SOLVENT NAME'].unique())
print(f"Single solvent data: {len(X)} samples, {n_solvents} solvents")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
fold_mses = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_solvents):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimilarityWeightedModel(data='single', blend_strength=0.3)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # Calculate fold MSE
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)

    # "row" is the position within the fold's test set, not the original index.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions)
# Unweighted mean/std over folds (each solvent counts equally regardless of size).
print(f"\nSingle solvent CV MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

Single solvent data: 656 samples, 24 solvents


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:00<00:11,  1.99it/s]

  8%|▊         | 2/24 [00:00<00:10,  2.12it/s]

 12%|█▎        | 3/24 [00:01<00:09,  2.13it/s]

 17%|█▋        | 4/24 [00:01<00:09,  2.13it/s]

 21%|██        | 5/24 [00:02<00:08,  2.14it/s]

 25%|██▌       | 6/24 [00:02<00:08,  2.17it/s]

 29%|██▉       | 7/24 [00:03<00:07,  2.21it/s]

 33%|███▎      | 8/24 [00:03<00:07,  2.18it/s]

 38%|███▊      | 9/24 [00:04<00:06,  2.17it/s]

 42%|████▏     | 10/24 [00:04<00:06,  2.21it/s]

 46%|████▌     | 11/24 [00:05<00:05,  2.20it/s]

 50%|█████     | 12/24 [00:05<00:05,  2.17it/s]

 54%|█████▍    | 13/24 [00:06<00:05,  2.17it/s]

 58%|█████▊    | 14/24 [00:06<00:04,  2.15it/s]

 62%|██████▎   | 15/24 [00:06<00:04,  2.13it/s]

 67%|██████▋   | 16/24 [00:07<00:03,  2.16it/s]

 71%|███████   | 17/24 [00:07<00:03,  2.18it/s]

 75%|███████▌  | 18/24 [00:08<00:02,  2.19it/s]

 79%|███████▉  | 19/24 [00:08<00:02,  2.17it/s]

 83%|████████▎ | 20/24 [00:09<00:01,  2.15it/s]

 88%|████████▊ | 21/24 [00:09<00:01,  2.16it/s]

 92%|█████████▏| 22/24 [00:10<00:00,  2.18it/s]

 96%|█████████▌| 23/24 [00:10<00:00,  2.18it/s]

100%|██████████| 24/24 [00:11<00:00,  2.23it/s]

100%|██████████| 24/24 [00:11<00:00,  2.17it/s]


Single solvent CV MSE: 0.012712 ± 0.011217





In [6]:
# Run leave-one-ramp-out CV on the full (mixture) data.
# A fresh model is trained per fold; per-fold MSE is averaged over all rows
# and targets of the held-out ramp, and predictions are collected in the
# submission record format (task 1).
X, Y = load_data("full")
print(f"Full data: {len(X)} samples")

# Number of folds equals the number of distinct (solvent A, solvent B) ramps.
# Build the same key the split generator uses instead of hard-coding the count.
n_ramps = (
    X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
).nunique()

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []
fold_mses = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_ramps):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimilarityWeightedModel(data='full', blend_strength=0.3)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # Calculate fold MSE
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)

    # "row" is the position within the fold's test set, not the original index.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions)
# Unweighted mean/std over folds (each ramp counts equally regardless of size).
print(f"\nFull data CV MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

Full data: 1227 samples


  0%|          | 0/13 [00:00<?, ?it/s]

  8%|▊         | 1/13 [00:00<00:09,  1.28it/s]

 15%|█▌        | 2/13 [00:01<00:08,  1.28it/s]

 23%|██▎       | 3/13 [00:02<00:07,  1.29it/s]

 31%|███       | 4/13 [00:03<00:06,  1.29it/s]

 38%|███▊      | 5/13 [00:04<00:06,  1.22it/s]

 46%|████▌     | 6/13 [00:04<00:05,  1.23it/s]

 54%|█████▍    | 7/13 [00:05<00:04,  1.24it/s]

 62%|██████▏   | 8/13 [00:06<00:03,  1.25it/s]

 69%|██████▉   | 9/13 [00:07<00:03,  1.26it/s]

 77%|███████▋  | 10/13 [00:07<00:02,  1.26it/s]

 85%|████████▍ | 11/13 [00:08<00:01,  1.25it/s]

 92%|█████████▏| 12/13 [00:09<00:00,  1.25it/s]

100%|██████████| 13/13 [00:10<00:00,  1.27it/s]

100%|██████████| 13/13 [00:10<00:00,  1.26it/s]


Full data CV MSE: 0.015364 ± 0.007802





In [None]:
# Combine and save submission
# Stack the task-0 (single solvent) and task-1 (mixture) prediction tables.
submission = pd.concat([submission_single_solvent, submission_full_data])
# NOTE(review): reset_index() without drop=True keeps the previous index
# (which restarts at 0 for each task) as an extra "index" column in the CSV —
# confirm that the expected submission format wants this column.
submission = submission.reset_index()
# The new 0..N-1 index is written out as the "id" column.
submission.index.name = "id"

print(f"Submission shape: {submission.shape}")

# Save
# NOTE(review): hard-coded absolute output path; the directory must already exist.
submission.to_csv("/home/submission/submission.csv", index=True)
print(f"\nSubmission saved to /home/submission/submission.csv")

# Verify
# Read the file back to check what was actually written to disk.
submission_check = pd.read_csv("/home/submission/submission.csv")
print(f"\nSubmission rows: {len(submission_check)}")

# Check prediction ranges
# Predictions are clipped to [0, 1] by the model, so min/max should fall in that range.
target_cols = ['target_1', 'target_2', 'target_3']
for col in target_cols:
    print(f"{col}: min={submission_check[col].min():.4f}, max={submission_check[col].max():.4f}")

In [None]:
# Print a completion banner.
# NOTE(review): no overall CV score is computed here; the per-task CV MSEs are
# printed by the two CV cells above.
print("="*50)
print("EXPERIMENT 082 COMPLETE")
print("="*50)