# Solvent Similarity Features Experiment

**CRITICAL INSIGHT**: The CV-LB relationship has intercept (0.0533) > target (0.0347).
- Even with CV = 0, LB would be 0.0533
- We CANNOT reach the target by improving CV alone
- We need to REDUCE THE INTERCEPT

**Hypothesis**: The CV-LB gap is due to extrapolation to unseen solvents. If we add features that measure similarity to training solvents, the model might generalize better.

**Implementation**:
- For each sample, compute distance to all training solvents in Spange descriptor space
- Add features: min distance, mean distance, max distance, and n_close (the number of training solvents within a distance threshold)
- This gives the model information about how "novel" the test solvent is

**Why this might work**:
- The model can learn to be more conservative when predicting for novel solvents
- This is a form of domain adaptation

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed the NumPy and PyTorch global random number generators.
np.random.seed(42)
torch.manual_seed(42)
# Request deterministic cuDNN kernels and disable cuDNN autotuning.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Make float64 the default torch dtype; the featurizers below also produce float64 arrays.
torch.set_default_dtype(torch.double)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory that holds every CSV read in this notebook.
DATA_PATH = '/home/data'

# Numeric process columns consumed by the featurizers.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns selected by load_data() for the single-solvent dataset.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns selected by load_data() for the solvent-mixture ("full") dataset.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the two yield datasets and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the solvent-mixture dataset; any other value
            selects the single-solvent dataset.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen dataset
        and ``Y`` holds the ``"Product 2"``, ``"Product 3"`` and ``"SM"`` columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(csv_path)
    inputs = frame[input_columns]
    targets = frame[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted name order. Each item is
    ``((train_X, train_Y), (test_X, test_Y))``, where the test part contains
    exactly the rows whose ``"SOLVENT NAME"`` is the held-out solvent.
    """
    held_out_names = sorted(X["SOLVENT NAME"].unique())
    for held_out in held_out_names:
        keep_for_training = X["SOLVENT NAME"] != held_out
        training_part = (X[keep_for_training], Y[keep_for_training])
        held_out_part = (X[~keep_for_training], Y[~keep_for_training])
        yield training_part, held_out_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((train_X, train_Y), (test_X, test_Y))``, where the test part contains
    exactly the rows belonging to the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load feature lookups. Each table uses its first CSV column as the index; the
# featurizers below look rows up by solvent name.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Drop DRFP columns that are constant across rows (zero variance). Every column
# with any variation at all is kept, not only the high-variance ones.
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Featurizer with Solvent Similarity Features
class SimilarityFeaturizer:
    """Featurizer that appends solvent-novelty features to the base descriptors.

    Base features per sample: residence time, temperature, three kinetic terms
    (1000 / T[K], log(time) and their product), followed by the Spange, DRFP
    and ACS-PCA descriptors of the solvent (blended linearly by ``SolventB%``
    for mixtures).

    Similarity features per solvent, computed in standardized Spange space
    against the *other* training solvents: minimum, mean and maximum Euclidean
    distance, plus ``n_close``, the number of training solvents closer than
    ``close_threshold``.

    A solvent's own row is always left out of the comparison. Otherwise every
    training sample would see ``min_dist == 0`` while a held-out solvent would
    be the first to show a non-zero value, so the feature would be constant
    during training and out of range at prediction time.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        self.train_solvents = None  # Sorted training solvent names; set by fit_train_solvents
        self.train_spange_scaled = None  # Scaled Spange rows, aligned with train_solvents
        self.close_threshold = None  # Distance cut-off used for n_close; set by fit_train_solvents
        self.spange_scaler = StandardScaler()

        # Base features: 2 (time, temp) + 3 (kinetics) + spange + drfp + acs columns
        # Similarity features: 4 (min_dist, mean_dist, max_dist, n_close)
        self.base_feats_dim = 2 + 3 + self.spange_df.shape[1] + self.drfp_df.shape[1] + self.acs_pca_df.shape[1]
        self.sim_feats_dim = 4
        self.feats_dim = self.base_feats_dim + self.sim_feats_dim

    def fit_train_solvents(self, train_X):
        """Record the training solvents and fit the Spange-space reference.

        Stores the sorted solvent names, their standardized Spange descriptors,
        and ``close_threshold`` (the median pairwise distance between distinct
        training solvents). For mixtures, both the A and B solvents count as
        training solvents.
        """
        if self.mixed:
            names = set(train_X["SOLVENT A NAME"].unique()) | set(train_X["SOLVENT B NAME"].unique())
        else:
            names = set(train_X["SOLVENT NAME"].unique())
        # Sorting makes the row order independent of set iteration order.
        self.train_solvents = sorted(names)

        train_spange = np.asarray(self.spange_df.loc[self.train_solvents].values, dtype=np.float64)
        self.train_spange_scaled = self.spange_scaler.fit_transform(train_spange)

        # One fixed cut-off shared by all queries. A per-query median would make
        # n_close roughly "half of the training solvents" for every solvent.
        n_train = len(self.train_solvents)
        if n_train > 1:
            deltas = self.train_spange_scaled[:, None, :] - self.train_spange_scaled[None, :, :]
            pairwise = np.linalg.norm(deltas, axis=2)
            self.close_threshold = float(np.median(pairwise[np.triu_indices(n_train, k=1)]))
        else:
            self.close_threshold = 0.0

    def compute_similarity_features(self, solvent_name):
        """Return ``[min_dist, mean_dist, max_dist, n_close]`` for one solvent.

        Distances are taken to every training solvent except ``solvent_name``
        itself. If no other training solvent exists, all four values are zero.

        Raises:
            RuntimeError: if ``fit_train_solvents`` has not been called.
        """
        if self.train_spange_scaled is None or self.close_threshold is None:
            raise RuntimeError("fit_train_solvents() must be called before computing similarity features")

        solvent_spange = self.spange_df.loc[solvent_name].values.reshape(1, -1)
        solvent_spange_scaled = self.spange_scaler.transform(solvent_spange)[0]

        distances = np.linalg.norm(self.train_spange_scaled - solvent_spange_scaled, axis=1)

        # Leave the query's own row out so seen and unseen solvents are described alike.
        is_other = np.fromiter(
            (name != solvent_name for name in self.train_solvents),
            dtype=bool,
            count=len(self.train_solvents),
        )
        distances = distances[is_other]
        if distances.size == 0:
            return np.zeros(self.sim_feats_dim)

        n_close = np.sum(distances < self.close_threshold)
        return np.array([np.min(distances), np.mean(distances), np.max(distances), n_close], dtype=np.float64)

    def _similarity_rows(self, solvent_names, per_solvent):
        """Stack cached similarity vectors in row order; shape is (n_rows, sim_feats_dim)."""
        rows = [per_solvent[name] for name in solvent_names]
        return np.array(rows, dtype=np.float64).reshape(-1, self.sim_feats_dim)

    def featurize(self, X, flip=False):
        """Build the feature matrix for ``X``.

        Args:
            X: frame with ``"Residence Time"``, ``"Temperature"`` and either
                ``"SOLVENT NAME"`` or, when ``mixed`` is set,
                ``"SOLVENT A NAME"``, ``"SOLVENT B NAME"`` and ``"SolventB%"``.
            flip: only read in mixed mode. It selects an algebraically
                equivalent form of the same linear blend, so results agree up
                to floating-point rounding.

        Returns:
            2-D float array: kinetic block, Spange, DRFP, ACS-PCA, similarity.
        """
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        temp_c = X_vals[:, 1:2]
        time_m = X_vals[:, 0:1]
        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)  # small offset keeps log finite at time == 0
        interaction = inv_temp * log_time
        X_kinetic = np.hstack([X_vals, inv_temp, log_time, interaction])

        if self.mixed:
            names_a = X["SOLVENT A NAME"]
            names_b = X["SOLVENT B NAME"]
            # NOTE(review): presumably SolventB% is a fraction in [0, 1] - confirm against the data.
            pct = X["SolventB%"].values.reshape(-1, 1)
            weight_a = 1 - pct
            weight_b = 1 - (1 - pct) if flip else pct

            X_spange = self.spange_df.loc[names_a].values * weight_a + self.spange_df.loc[names_b].values * weight_b
            X_drfp = self.drfp_df.loc[names_a].values * weight_a + self.drfp_df.loc[names_b].values * weight_b
            X_acs = self.acs_pca_df.loc[names_a].values * weight_a + self.acs_pca_df.loc[names_b].values * weight_b

            # Similarity depends only on the solvent, so compute it once per
            # distinct solvent and blend the two pure-solvent vectors by pct.
            per_solvent = {
                name: self.compute_similarity_features(name)
                for name in set(names_a) | set(names_b)
            }
            sim_a = self._similarity_rows(names_a, per_solvent)
            sim_b = self._similarity_rows(names_b, per_solvent)
            X_sim = sim_a * (1 - pct) + sim_b * pct
        else:
            names = X["SOLVENT NAME"]
            X_spange = self.spange_df.loc[names].values
            X_drfp = self.drfp_df.loc[names].values
            X_acs = self.acs_pca_df.loc[names].values

            per_solvent = {name: self.compute_similarity_features(name) for name in set(names)}
            X_sim = self._similarity_rows(names, per_solvent)

        return np.hstack([X_kinetic, X_spange, X_drfp, X_acs, X_sim])

    def featurize_torch(self, X, flip=False):
        """Return ``featurize(X, flip)`` as a torch tensor."""
        return torch.tensor(self.featurize(X, flip))

print(f'SimilarityFeaturizer feature dimension: {SimilarityFeaturizer().feats_dim}')

SimilarityFeaturizer feature dimension: 149


In [5]:
# Simple Featurizer for GP (no similarity features, just Spange + kinetics)
class SimpleFeaturizer:
    """Compact featurizer: raw inputs, kinetic terms and Spange descriptors only.

    No similarity features are produced. For mixtures the Spange descriptors of
    solvents A and B are blended linearly by ``SolventB%``.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        n_raw_inputs, n_kinetic_terms = 2, 3
        # 18 features when the Spange table has 13 columns
        self.feats_dim = n_raw_inputs + n_kinetic_terms + self.spange_df.shape[1]

    def featurize(self, X, flip=False):
        """Return a 2-D array of the kinetic block followed by the Spange block.

        ``flip`` is only read in mixed mode, where it selects an algebraically
        equivalent form of the same linear blend.
        """
        raw = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = raw[:, 0:1]
        temperature_c = raw[:, 1:2]

        temperature_k = temperature_c + 273.15
        inv_temp = 1000.0 / temperature_k
        log_time = np.log(residence_time + 1e-6)
        kinetic_block = np.hstack([raw, inv_temp, log_time, inv_temp * log_time])

        if not self.mixed:
            spange_block = self.spange_df.loc[X["SOLVENT NAME"]].values
            return np.hstack([kinetic_block, spange_block])

        spange_a = self.spange_df.loc[X["SOLVENT A NAME"]].values
        spange_b = self.spange_df.loc[X["SOLVENT B NAME"]].values
        frac_b = X["SolventB%"].values.reshape(-1, 1)
        if flip:
            spange_block = spange_b * (1 - (1 - frac_b)) + spange_a * (1 - frac_b)
        else:
            spange_block = spange_a * (1 - frac_b) + spange_b * frac_b

        return np.hstack([kinetic_block, spange_block])

print(f'Simple feature dimension (for GP): {SimpleFeaturizer().feats_dim}')

Simple feature dimension (for GP): 18


In [6]:
# Enhanced MLP with residual connections
class EnhancedMLP(nn.Module):
    """Three-layer MLP with LayerNorm, GELU, dropout and two residual blocks.

    The input is projected to ``hidden_dim``. Two blocks of
    linear -> LayerNorm -> GELU -> dropout are then each added back onto their
    own input, and a final linear layer maps to ``output_dim``.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=3, dropout=0.2):
        super().__init__()
        # Attribute names double as state_dict keys, and the construction order
        # fixes the order in which parameters are initialised.
        self.input_proj = nn.Linear(input_dim, hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.hidden1 = nn.Linear(hidden_dim, hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)
        self.hidden2 = nn.Linear(hidden_dim, hidden_dim)
        self.ln3 = nn.LayerNorm(hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.GELU()

    def forward(self, x):
        hidden = self.dropout(self.act(self.ln1(self.input_proj(x))))
        residual_blocks = ((self.hidden1, self.ln2), (self.hidden2, self.ln3))
        for linear, norm in residual_blocks:
            hidden = hidden + self.dropout(self.act(norm(linear(hidden))))
        return self.output(hidden)

print('EnhancedMLP defined')

EnhancedMLP defined


In [7]:
# GP + MLP + LGBM Ensemble with Similarity Features
class SimilarityEnsemble:
    """Weighted blend of a GP, an MLP and LightGBM, with similarity features.

    Uses the best ensemble weights from exp_035: GP(0.15) + MLP(0.55) + LGBM(0.3).
    The MLP and LightGBM models consume the SimilarityFeaturizer output, which
    includes the solvent-novelty features. The GPs consume the smaller
    SimpleFeaturizer output. Predictions are blended in standardized target
    space, mapped back and clipped to [0, 1].
    """

    def __init__(self, data='single', gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.3):
        self.data_type = data
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight

        # Featurizers: mixture mode only for the 'full' dataset
        uses_mixtures = data == 'full'
        self.sim_featurizer = SimilarityFeaturizer(mixed=uses_mixtures)
        self.simple_featurizer = SimpleFeaturizer(mixed=uses_mixtures)

        # Scalers: one per feature set, one for the targets
        self.sim_scaler = StandardScaler()
        self.simple_scaler = StandardScaler()
        self.y_scaler = StandardScaler()

        # Models: the MLP is built here; GPs and LGBMs are created in train_model
        self.mlp = EnhancedMLP(
            self.sim_featurizer.feats_dim, hidden_dim=128, output_dim=3, dropout=0.2
        ).to(device)
        self.gps = []
        self.lgbm_models = []

    @staticmethod
    def _fit_gp(inputs, target):
        """Fit one Gaussian process (constant * Matern 2.5 + white noise) on one target."""
        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=3, normalize_y=True, random_state=42)
        gp.fit(inputs, target)
        return gp

    @staticmethod
    def _fit_lgbm(inputs, target):
        """Fit one LightGBM regressor on one target."""
        lgbm_model = lgb.LGBMRegressor(
            n_estimators=200, learning_rate=0.05, max_depth=6,
            num_leaves=31, min_child_samples=10, reg_alpha=0.1, reg_lambda=0.1,
            random_state=42, verbose=-1
        )
        lgbm_model.fit(inputs, target)
        return lgbm_model

    def train_model(self, X, Y):
        """Fit featurizer state, scalers and all three model families on (X, Y)."""
        # The similarity features are defined relative to this training set.
        self.sim_featurizer.fit_train_solvents(X)

        rich_features = self.sim_featurizer.featurize(X)
        gp_features = self.simple_featurizer.featurize(X)

        rich_scaled = self.sim_scaler.fit_transform(rich_features)
        gp_scaled = self.simple_scaler.fit_transform(gp_features)
        targets_scaled = self.y_scaler.fit_transform(Y.values)

        # The MLP is trained first, then one GP and one LGBM per target column.
        self._train_mlp(rich_scaled, targets_scaled)

        self.gps = []
        for target_idx in range(3):
            self.gps.append(self._fit_gp(gp_scaled, targets_scaled[:, target_idx]))

        self.lgbm_models = []
        for target_idx in range(3):
            self.lgbm_models.append(self._fit_lgbm(rich_scaled, targets_scaled[:, target_idx]))

    def _train_mlp(self, X_scaled, Y_scaled):
        """Train the MLP for 150 epochs with AdamW, cosine LR decay and weighted MSE."""
        n_epochs = 150
        features = torch.tensor(X_scaled, dtype=torch.double).to(device)
        targets = torch.tensor(Y_scaled, dtype=torch.double).to(device)
        batches = DataLoader(TensorDataset(features, targets), batch_size=32, shuffle=True)

        optimizer = torch.optim.AdamW(self.mlp.parameters(), lr=1e-3, weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)

        # The third target column is weighted twice as heavily as the other two.
        target_weights = torch.tensor([1.0, 1.0, 2.0], dtype=torch.double).to(device)

        self.mlp.train()
        for _ in range(n_epochs):
            for batch_x, batch_y in batches:
                optimizer.zero_grad()
                squared_error = (self.mlp(batch_x) - batch_y) ** 2
                (squared_error * target_weights).mean().backward()
                optimizer.step()
            scheduler.step()

    def predict(self, X):
        """Return blended predictions for ``X`` as a tensor clipped to [0, 1]."""
        rich_scaled = self.sim_scaler.transform(self.sim_featurizer.featurize(X))
        gp_scaled = self.simple_scaler.transform(self.simple_featurizer.featurize(X))

        self.mlp.eval()
        with torch.no_grad():
            mlp_inputs = torch.tensor(rich_scaled, dtype=torch.double).to(device)
            mlp_out = self.mlp(mlp_inputs).cpu().numpy()

        gp_out = np.column_stack([gp.predict(gp_scaled) for gp in self.gps])
        lgbm_out = np.column_stack([booster.predict(rich_scaled) for booster in self.lgbm_models])

        # Blend in standardized target space, then undo the target scaling.
        blended_scaled = (
            self.gp_weight * gp_out +
            self.mlp_weight * mlp_out +
            self.lgbm_weight * lgbm_out
        )
        blended = self.y_scaler.inverse_transform(blended_scaled)
        return torch.tensor(np.clip(blended, 0, 1))

    def predict_with_tta(self, X):
        """Average predictions over the original and the A/B-swapped mixture.

        Single-solvent models return ``predict(X)`` unchanged.

        NOTE(review): both featurizers blend A and B linearly by ``SolventB%``,
        so the swapped frame looks like it yields the same features up to
        rounding - confirm whether this averaging changes anything.
        """
        if self.data_type == 'single':
            return self.predict(X)

        as_given = self.predict(X)

        swapped = X.copy()
        swapped["SOLVENT A NAME"], swapped["SOLVENT B NAME"] = X["SOLVENT B NAME"], X["SOLVENT A NAME"]
        swapped["SolventB%"] = 1 - X["SolventB%"]

        as_swapped = self.predict(swapped)

        return (as_given + as_swapped) / 2

print('SimilarityEnsemble defined with GP(0.15) + MLP(0.55) + LGBM(0.3) + Similarity Features')

SimilarityEnsemble defined with GP(0.15) + MLP(0.55) + LGBM(0.3) + Similarity Features


In [8]:
# Test the model
# Load the single-solvent dataset and report its size.
X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: {X_single.shape[0]} samples, {len(X_single['SOLVENT NAME'].unique())} unique solvents")

# Load the mixture dataset. A "ramp" is a distinct (solvent A, solvent B) pair,
# which is the unit held out by generate_leave_one_ramp_out_splits.
X_full, Y_full = load_data("full")
ramps = X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
print(f"Full data: {X_full.shape[0]} samples, {len(ramps)} unique ramps")

Single solvent data: 656 samples, 24 unique solvents
Full data: 1227 samples, 13 unique ramps


In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimilarityEnsemble(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:23, 23.49s/it]

2it [00:50, 25.32s/it]

3it [01:10, 23.15s/it]

4it [01:30, 21.91s/it]

5it [01:58, 24.18s/it]

6it [02:25, 24.94s/it]

7it [02:51, 25.22s/it]

8it [03:15, 25.03s/it]

9it [03:42, 25.70s/it]

10it [04:07, 25.30s/it]

11it [04:35, 26.06s/it]

12it [05:01, 26.24s/it]

13it [05:26, 25.67s/it]

14it [05:50, 25.24s/it]

15it [06:14, 24.94s/it]

16it [06:42, 25.81s/it]

17it [07:07, 25.61s/it]

18it [07:34, 25.98s/it]

19it [08:00, 26.08s/it]

20it [08:26, 25.93s/it]

21it [08:51, 25.62s/it]

22it [09:14, 24.87s/it]

23it [09:40, 25.21s/it]

24it [10:03, 24.54s/it]

24it [10:03, 25.14s/it]




In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimilarityEnsemble(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict_with_tta(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
# Calculate CV score (for verification only - NOT part of submission)
BASELINE_CV = 0.008194  # CV of exp_032, the reference run compared against below
LB_SLOPE = 4.23  # slope of the previously fitted CV -> LB line
LB_INTERCEPT = 0.0533  # intercept of the previously fitted CV -> LB line
TARGET_COLUMNS = ['target_1', 'target_2', 'target_3']

X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")

# Rebuild the actuals with the same split generators that produced the
# predictions, so the row order matches by construction.
actuals_single = np.vstack([
    held_out_Y.values
    for _, (_, held_out_Y) in generate_leave_one_out_splits(X_single, Y_single)
])
actuals_full = np.vstack([
    held_out_Y.values
    for _, (_, held_out_Y) in generate_leave_one_ramp_out_splits(X_full, Y_full)
])

# Get predictions
preds_single = submission_single_solvent[TARGET_COLUMNS].values
preds_full = submission_full_data[TARGET_COLUMNS].values

# Fail loudly on a row-count mismatch rather than risk NumPy broadcasting.
for task_label, task_actuals, task_preds in (
    ("single solvent", actuals_single, preds_single),
    ("full data", actuals_full, preds_full),
):
    if task_actuals.shape != task_preds.shape:
        raise ValueError(
            f"{task_label}: actuals have shape {task_actuals.shape} "
            f"but predictions have shape {task_preds.shape}"
        )

# Calculate MSE per task, then combine weighted by the number of rows
mse_single = np.mean((actuals_single - preds_single) ** 2)
mse_full = np.mean((actuals_full - preds_full) ** 2)
n_single = len(actuals_single)
n_full = len(actuals_full)
overall_mse = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)

print(f'\n=== SIMILARITY FEATURES CV SCORE ===')
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Overall MSE: {overall_mse:.6f}')

print(f'\n=== COMPARISON ===')
print(f'exp_032 (best CV, GP 0.15 + MLP 0.55 + LGBM 0.3): CV {BASELINE_CV:.6f}')
print(f'exp_046 (SIMILARITY FEATURES): CV {overall_mse:.6f}')

if overall_mse < BASELINE_CV:
    improvement = (BASELINE_CV - overall_mse) / BASELINE_CV * 100
    print(f'\n✓ IMPROVEMENT: {improvement:.2f}% better than exp_032!')
else:
    degradation = (overall_mse - BASELINE_CV) / BASELINE_CV * 100
    print(f'\n✗ WORSE: {degradation:.2f}% worse than exp_032')

# Estimate LB using old relationship
estimated_lb = LB_SLOPE * overall_mse + LB_INTERCEPT
print(f'\nEstimated LB (using old relationship): {estimated_lb:.4f}')
print(f'Best LB so far: 0.0877')
print(f'Target: 0.0347')
print(f'\nKey question: Do Similarity Features change the CV-LB relationship?')
print(f'If they reduce the intercept, we might be able to reach the target.')