In [1]:
# Experiment 072: Nearest Neighbor Blending (Instead of Global Mean)
#
# Key fix from exp_071: Instead of blending toward GLOBAL mean, blend toward
# the mean of k NEAREST training solvents. This preserves chemical similarity.
#
# For HFIP (highest outlier), this means blending toward other fluorinated alcohols
# like TFE, not toward the global average.
#
# Also: Disable extrapolation detection for full data (it hurt performance 383%)

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: float64 everywhere, deterministic cuDNN kernels, fixed seeds
# for both torch and numpy.
torch.set_default_dtype(torch.double)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.manual_seed(42)
np.random.seed(42)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
DATA_PATH = '/home/data'

INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the catechol yield CSVs and split it into inputs and targets.

    Args:
        name: "full" selects the solvent-mixture file; any other value selects
            the single-solvent file.

    Returns:
        (X, Y): X holds the input columns for the chosen dataset, Y holds the
        "Product 2", "Product 3" and "SM" columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(csv_path)
    return frame[input_columns], frame[["Product 2", "Product 3", "SM"]]

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted name order. Each item is
    ((X_train, Y_train), (X_test, Y_test)), where the test part contains
    exactly the rows whose "SOLVENT NAME" equals the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out_solvent in sorted(solvent_names.unique()):
        is_test = solvent_names == held_out_solvent
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in X. Each item is
    ((X_train, Y_train), (X_test, Y_test)), where the test part contains every
    row belonging to the held-out pair.
    """
    solvent_a = X["SOLVENT A NAME"]
    solvent_b = X["SOLVENT B NAME"]
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for name_a, name_b in zip(ramps["SOLVENT A NAME"], ramps["SOLVENT B NAME"]):
        held_out = (solvent_a == name_a) & (solvent_b == name_b)
        kept = ~held_out
        yield (X[kept], Y[kept]), (X[held_out], Y[held_out])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load feature lookups
# Each lookup table is indexed by its first CSV column.
SPANGE_DF, DRFP_DF, ACS_PCA_DF = (
    pd.read_csv(lookup_path, index_col=0)
    for lookup_path in (
        f'{DATA_PATH}/spange_descriptors_lookup.csv',
        f'{DATA_PATH}/drfps_catechol_lookup.csv',
        f'{DATA_PATH}/acs_pca_descriptors_lookup.csv',
    )
)

# Keep only the DRFP columns whose variance across rows is strictly positive;
# constant columns are dropped.
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance.index[(drfp_variance > 0).values].tolist()
DRFP_FILTERED = DRFP_DF.loc[:, nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Pre-compute outlier scores for all solvents (excluding self)
def compute_solvent_outlier_scores(k=3):
    """Score each solvent by its mean distance to its k nearest other solvents.

    The Spange descriptors are standardised with a StandardScaler fitted on
    every row of SPANGE_DF; distances are Euclidean in that scaled space and a
    solvent is never compared with itself.

    Args:
        k: number of nearest neighbours averaged into the score.

    Returns:
        (outlier_scores, solvent_scaler, scaled_features): a dict mapping
        solvent name to score, the fitted scaler, and the scaled descriptor
        matrix in SPANGE_DF row order.
    """
    solvent_scaler = StandardScaler()
    scaled_features = solvent_scaler.fit_transform(SPANGE_DF.values)

    outlier_scores = {}
    for row, solvent in enumerate(SPANGE_DF.index):
        # Remove the solvent's own row so its zero self-distance is never counted.
        others = np.delete(scaled_features, row, axis=0)
        to_others = cdist(scaled_features[row:row + 1], others, metric='euclidean')[0]
        to_others.sort()
        outlier_scores[solvent] = to_others[:k].mean()

    return outlier_scores, solvent_scaler, scaled_features

# Scores, fitted scaler and scaled descriptor matrix are computed once here and
# read as module-level globals by the model class.
SOLVENT_OUTLIER_SCORES, SOLVENT_SCALER, SCALED_SOLVENT_FEATURES = compute_solvent_outlier_scores(k=3)

# Mean and (population) standard deviation of the scores, used for z-scoring.
mean_outlier_score, std_outlier_score = (
    statistic(list(SOLVENT_OUTLIER_SCORES.values())) for statistic in (np.mean, np.std)
)

print(f"Pre-computed outlier scores. Mean: {mean_outlier_score:.4f}, Std: {std_outlier_score:.4f}")
print("\nTop outliers:")
# Report the six solvents with the largest scores, highest first.
for solvent in sorted(SOLVENT_OUTLIER_SCORES, key=SOLVENT_OUTLIER_SCORES.get, reverse=True)[:6]:
    score = SOLVENT_OUTLIER_SCORES[solvent]
    z = (score - mean_outlier_score) / std_outlier_score
    print(f"  {solvent}: {score:.4f} (z={z:.2f})")

Pre-computed outlier scores. Mean: 2.2408, Std: 1.0424

Top outliers:
  1,1,1,3,3,3-Hexafluoropropan-2-ol: 4.5701 (z=2.23)
  Cyclohexane: 4.1844 (z=1.86)
  Water: 3.7000 (z=1.40)
  Ethylene Glycol [1,2-Ethanediol]: 3.6865 (z=1.39)
  2,2,2-Trifluoroethanol: 3.5894 (z=1.29)
  Decanol: 2.8970 (z=0.63)


In [5]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Interface stub for featurizers.

    Both methods raise NotImplementedError; a concrete subclass is expected to
    override them.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X, Y):
        """Turn raw inputs into model features; must be overridden.

        `self` is part of the signature so that calling the stub on an instance
        of a subclass raises NotImplementedError rather than an arity TypeError.
        """
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface: `train_model` to fit, `predict` to infer.

    Both methods raise NotImplementedError here and are meant to be overridden.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets; must be overridden."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for X; must be overridden.

        X defaults to None so the previous zero-argument call form keeps
        working, while `predict(X)` on a subclass that forgot to override now
        raises NotImplementedError instead of an arity TypeError.
        """
        raise NotImplementedError

In [6]:
# Full Featurizer (for MLP and LGBM) - 145 features
class FullFeaturizer:
    """Builds the wide feature matrix from kinetic terms plus three descriptor tables.

    Each output row is five kinetic columns (residence time, temperature,
    1000 / temperature-in-Kelvin, log(residence time), and the product of the
    last two) followed by the solvent's Spange, variance-filtered DRFP and
    ACS-PCA descriptors. With ``mixed=True`` the descriptors of solvents A and B
    are linearly interpolated by "SolventB%".
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df, self.drfp_df, self.acs_pca_df = SPANGE_DF, DRFP_FILTERED, ACS_PCA_DF
        descriptor_width = sum(
            table.shape[1] for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )
        # 2 raw numeric inputs + 3 derived kinetic terms + all descriptor columns
        self.feats_dim = 2 + 3 + descriptor_width

    def featurize(self, X, flip=False):
        """Return the 2-D float feature array for the rows of X.

        Args:
            X: DataFrame with the numeric input columns plus either
                "SOLVENT NAME" (single) or "SOLVENT A NAME", "SOLVENT B NAME"
                and "SolventB%" (mixed).
            flip: accepted but not used by this implementation.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        inv_temp = 1000.0 / (numeric[:, 1:2] + 273.15)  # temperature column is offset to Kelvin
        log_time = np.log(residence_time + 1e-6)  # small offset keeps log finite at time == 0
        columns = [numeric, inv_temp, log_time, inv_temp * log_time]

        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if self.mixed:
            pairs = [
                (table.loc[X["SOLVENT A NAME"]].values, table.loc[X["SOLVENT B NAME"]].values)
                for table in tables
            ]
            # NOTE(review): dividing by 100 assumes "SolventB%" is on a 0-100 scale — confirm against the data.
            pct = X["SolventB%"].values.reshape(-1, 1) / 100.0
            columns.extend(a_part * (1 - pct) + b_part * pct for a_part, b_part in pairs)
        else:
            columns.extend(table.loc[X["SOLVENT NAME"]].values for table in tables)

        return np.hstack(columns)

print(f'Full feature dimension: {FullFeaturizer().feats_dim}')

Full feature dimension: 145


In [7]:
# Simple Featurizer (for GP) - 18 features
class SimpleFeaturizer:
    """Builds the compact feature matrix: kinetic terms plus Spange descriptors.

    Each output row is five kinetic columns (residence time, temperature,
    1000 / temperature-in-Kelvin, log(residence time), and the product of the
    last two) followed by the solvent's Spange descriptors. With ``mixed=True``
    the descriptors of solvents A and B are linearly interpolated by "SolventB%".
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        # 2 raw numeric inputs + 3 derived kinetic terms + Spange columns
        self.feats_dim = 2 + 3 + self.spange_df.shape[1]

    def featurize(self, X, flip=False):
        """Return the 2-D float feature array for the rows of X.

        Args:
            X: DataFrame with the numeric input columns plus either
                "SOLVENT NAME" (single) or "SOLVENT A NAME", "SOLVENT B NAME"
                and "SolventB%" (mixed).
            flip: accepted but not used by this implementation.
        """
        raw = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        kelvin = raw[:, 1:2] + 273.15
        inverse_temperature = 1000.0 / kelvin
        log_residence = np.log(raw[:, 0:1] + 1e-6)  # small offset keeps log finite at time == 0
        kinetic_block = np.hstack(
            [raw, inverse_temperature, log_residence, inverse_temperature * log_residence]
        )

        if not self.mixed:
            solvent_block = self.spange_df.loc[X["SOLVENT NAME"]].values
        else:
            descriptors_a = self.spange_df.loc[X["SOLVENT A NAME"]].values
            descriptors_b = self.spange_df.loc[X["SOLVENT B NAME"]].values
            # NOTE(review): dividing by 100 assumes "SolventB%" is on a 0-100 scale — confirm against the data.
            pct = X["SolventB%"].values.reshape(-1, 1) / 100.0
            solvent_block = descriptors_a * (1 - pct) + descriptors_b * pct

        return np.hstack([kinetic_block, solvent_block])

print(f'Simple feature dimension: {SimpleFeaturizer().feats_dim}')

Simple feature dimension: 18


In [8]:
# GP+MLP+LGBM Ensemble with Nearest Neighbor Blending
# Key fix: Blend toward k nearest training solvents, NOT global mean

class NNBlendGPMLPLGBMEnsemble(BaseModel):
    """Fixed-weight GP + MLP + LightGBM ensemble with nearest-neighbour blending.

    The three regressors are fitted on the same targets and combined with the
    given weights. For single-solvent data, predictions for a solvent whose
    pre-computed outlier z-score exceeds ``blend_threshold`` are pulled toward
    the mean observed training targets of its ``k_neighbors`` closest training
    solvents (Euclidean distance in standardised Spange space).

    Args:
        data: 'single' for single-solvent inputs; 'full' selects the mixture
            featurizers.
        blend_threshold: outlier z-score above which blending starts.
        k_neighbors: number of nearest training solvents that are averaged.
        gp_weight, mlp_weight, lgbm_weight: weights of the three base models.
        apply_blend_to_full: kept for interface compatibility only; mixture
            predictions are never blended, whatever its value.
    """

    def __init__(self, data='single', blend_threshold=1.5, k_neighbors=3,
                 gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.30,
                 apply_blend_to_full=False):  # Disable blending for full data by default
        self.data_type = data
        self.blend_threshold = blend_threshold
        self.k_neighbors = k_neighbors
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight
        self.apply_blend_to_full = apply_blend_to_full

        self.mixed = (data == 'full')
        self.full_featurizer = FullFeaturizer(mixed=self.mixed)
        self.simple_featurizer = SimpleFeaturizer(mixed=self.mixed)

        # One scaler per base model; the GP sees the simple features, the MLP
        # and LightGBM see the full features.
        self.gp_scaler = StandardScaler()
        self.mlp_scaler = StandardScaler()
        self.lgbm_scaler = StandardScaler()

        self.gp_models = []
        self.lgbm_models = []
        self.mlp = None

        self.train_Y = None
        self.training_solvents = None
        # solvent name -> mean of that solvent's training targets (single mode only)
        self.solvent_mean_predictions = {}

    def get_blend_weight(self, solvent_name):
        """Return the blend weight in [0, 1] for a solvent.

        The weight is ``clip((z - blend_threshold) / 2, 0, 1)`` where ``z`` is
        the solvent's outlier z-score. A solvent without a pre-computed score is
        given the mean score, i.e. z = 0.
        """
        score = SOLVENT_OUTLIER_SCORES.get(solvent_name, mean_outlier_score)
        z_score = (score - mean_outlier_score) / std_outlier_score
        blend_weight = np.clip((z_score - self.blend_threshold) / 2.0, 0, 1)
        return blend_weight

    def get_nearest_training_mean(self, test_solvent):
        """Return the average of the per-solvent target means of the nearest training solvents.

        Falls back to the global training-target mean when the test solvent has
        no Spange descriptors or when none of the nearest solvents has a stored
        mean (always the case in 'full' mode, where no per-solvent means are kept).
        """
        if test_solvent not in SPANGE_DF.index:
            return self.train_Y.mean(axis=0)  # Fallback to global mean

        test_features = SPANGE_DF.loc[test_solvent].values
        test_scaled = SOLVENT_SCALER.transform([test_features])[0]

        # Resolve name -> row position once (first occurrence wins) instead of
        # rebuilding and scanning the index list for every training solvent.
        row_of = {}
        for position, name in enumerate(SPANGE_DF.index):
            row_of.setdefault(name, position)

        distances = [
            (train_solvent,
             np.linalg.norm(test_scaled - SCALED_SOLVENT_FEATURES[row_of[train_solvent]]))
            for train_solvent in self.training_solvents
            if train_solvent in row_of
        ]
        distances.sort(key=lambda pair: pair[1])
        nearest = distances[:self.k_neighbors]

        nearest_preds = [
            self.solvent_mean_predictions[solvent]
            for solvent, _ in nearest
            if solvent in self.solvent_mean_predictions
        ]
        if len(nearest_preds) == 0:
            return self.train_Y.mean(axis=0)

        return np.mean(nearest_preds, axis=0)

    def train_model(self, X_train, y_train):
        """Fit the scalers and the three base models on one training fold.

        Args:
            X_train: DataFrame of raw inputs for the configured data type.
            y_train: targets, one column per output; a DataFrame or a 2-D array.
        """
        self.train_Y = y_train.values if hasattr(y_train, 'values') else y_train

        # Start from a clean slate so a refit never reuses means from an earlier fit.
        self.solvent_mean_predictions = {}
        if self.data_type == 'single':
            solvent_column = X_train["SOLVENT NAME"]
            self.training_solvents = solvent_column.unique().tolist()
            for solvent in self.training_solvents:
                # Positional boolean mask on the ndarray, so this works whether
                # y_train arrived as a DataFrame or as a plain array.
                rows = (solvent_column == solvent).values
                self.solvent_mean_predictions[solvent] = self.train_Y[rows].mean(axis=0)
        else:
            self.training_solvents = list(
                set(X_train["SOLVENT A NAME"].unique()) | set(X_train["SOLVENT B NAME"].unique())
            )

        # Prepare features
        X_simple = self.simple_featurizer.featurize(X_train)
        X_full = self.full_featurizer.featurize(X_train)

        X_gp = self.gp_scaler.fit_transform(X_simple)
        X_mlp = self.mlp_scaler.fit_transform(X_full)
        X_lgbm = self.lgbm_scaler.fit_transform(X_full)

        Y = self.train_Y

        # Order matters for reproducibility: the MLP draws from the global torch
        # RNG, so it is always fitted last, after the GP and LightGBM models.
        self._fit_gp_models(X_gp, Y)
        self._fit_lgbm_models(X_lgbm, Y)
        self._fit_mlp(X_mlp, Y)

    def _fit_gp_models(self, X_gp, Y):
        """Fit one Gaussian-process regressor per target column."""
        self.gp_models = []
        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        for i in range(Y.shape[1]):
            gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=3, random_state=42)
            gp.fit(X_gp, Y[:, i])
            self.gp_models.append(gp)

    def _fit_lgbm_models(self, X_lgbm, Y):
        """Fit one LightGBM booster (200 rounds) per target column."""
        self.lgbm_models = []
        lgbm_params = {
            'objective': 'regression', 'metric': 'mse', 'boosting_type': 'gbdt',
            'num_leaves': 31, 'learning_rate': 0.05, 'feature_fraction': 0.8,
            'bagging_fraction': 0.8, 'bagging_freq': 5, 'verbose': -1, 'seed': 42
        }
        for i in range(Y.shape[1]):
            train_data = lgb.Dataset(X_lgbm, label=Y[:, i])
            model = lgb.train(lgbm_params, train_data, num_boost_round=200)
            self.lgbm_models.append(model)

    def _fit_mlp(self, X_mlp, Y):
        """Train the multi-output MLP with Adam on MSE loss for 200 epochs."""
        input_dim = X_mlp.shape[1]
        # The final Sigmoid keeps every output inside (0, 1).
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 128), nn.ReLU(), nn.Dropout(0.1),
            nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.1),
            nn.Linear(64, Y.shape[1]), nn.Sigmoid()
        ).double().to(device)

        X_tensor = torch.tensor(X_mlp).to(device)
        Y_tensor = torch.tensor(Y).to(device)
        dataset = TensorDataset(X_tensor, Y_tensor)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=0.001, weight_decay=1e-5)
        criterion = nn.MSELoss()

        self.mlp.train()
        for epoch in range(200):
            for batch_X, batch_Y in dataloader:
                optimizer.zero_grad()
                pred = self.mlp(batch_X)
                loss = criterion(pred, batch_Y)
                loss.backward()
                optimizer.step()

    def predict(self, X):
        """Predict targets for X and return them as a torch tensor clipped to [0, 1].

        The weighted ensemble output is blended toward the nearest-neighbour
        training mean only in 'single' mode and only for solvents with a
        positive blend weight; 'full' mode returns the plain ensemble output.
        """
        # Prepare features
        X_simple = self.simple_featurizer.featurize(X)
        X_full = self.full_featurizer.featurize(X)

        X_gp = self.gp_scaler.transform(X_simple)
        X_mlp = self.mlp_scaler.transform(X_full)
        X_lgbm = self.lgbm_scaler.transform(X_full)

        # Get predictions from each model
        gp_preds = np.column_stack([gp.predict(X_gp) for gp in self.gp_models])
        lgbm_preds = np.column_stack([model.predict(X_lgbm) for model in self.lgbm_models])

        self.mlp.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_mlp).to(device)
            mlp_preds = self.mlp(X_tensor).cpu().numpy()

        # Ensemble prediction
        raw_pred = self.gp_weight * gp_preds + self.mlp_weight * mlp_preds + self.lgbm_weight * lgbm_preds

        blended = raw_pred
        if self.data_type == 'single':
            solvent_names = X["SOLVENT NAME"].values
            blended = raw_pred.copy()
            # dict.fromkeys gives the distinct solvents in a stable order.
            for solvent in dict.fromkeys(solvent_names):
                blend_weight = self.get_blend_weight(solvent)
                if blend_weight > 0:
                    nn_mean = self.get_nearest_training_mean(solvent)
                    mask = solvent_names == solvent
                    blended[mask] = (1 - blend_weight) * raw_pred[mask] + blend_weight * nn_mean

        # Clip to [0, 1]
        blended = np.clip(blended, 0, 1)

        return torch.tensor(blended)

print('NNBlendGPMLPLGBMEnsemble defined')

NNBlendGPMLPLGBMEnsemble defined


In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = NNBlendGPMLPLGBMEnsemble(data='single', blend_threshold=1.5, k_neighbors=3) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"Single solvent predictions shape: {submission_single_solvent.shape}")

# Calculate CV score for single solvent
X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)
all_mse = []
for fold_idx, split in enumerate(split_generator):
    (train_X, train_Y), (test_X, test_Y) = split
    fold_preds = submission_single_solvent[submission_single_solvent['fold'] == fold_idx]
    pred_values = fold_preds[['target_1', 'target_2', 'target_3']].values
    true_values = test_Y.values
    mse = ((pred_values - true_values) ** 2).mean()
    all_mse.append(mse)
    solvent = test_X["SOLVENT NAME"].iloc[0]
    blend_weight = model.get_blend_weight(solvent)
    if blend_weight > 0:
        print(f"  Fold {fold_idx} ({solvent}): MSE={mse:.6f}, blend_weight={blend_weight:.3f}")
print(f"Single solvent CV MSE: {np.mean(all_mse):.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:36, 36.91s/it]

2it [01:06, 32.68s/it]

3it [01:31, 29.34s/it]

4it [01:58, 28.35s/it]

5it [02:31, 30.00s/it]

6it [03:03, 30.46s/it]

7it [03:36, 31.48s/it]

8it [04:04, 30.38s/it]

9it [04:38, 31.33s/it]

10it [05:10, 31.69s/it]

11it [05:43, 31.92s/it]

12it [06:14, 31.87s/it]

13it [06:47, 32.05s/it]

14it [07:17, 31.52s/it]

15it [07:52, 32.56s/it]

16it [08:25, 32.66s/it]

17it [09:00, 33.29s/it]

18it [09:33, 33.32s/it]

19it [10:05, 32.84s/it]

20it [10:35, 32.20s/it]

21it [11:06, 31.61s/it]

22it [11:43, 33.27s/it]

23it [12:11, 31.70s/it]

24it [12:43, 31.75s/it]

24it [12:43, 31.80s/it]

Single solvent predictions shape: (656, 6)
  Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MSE=0.035719, blend_weight=0.367
  Fold 6 (Cyclohexane): MSE=0.006270, blend_weight=0.182
Single solvent CV MSE: 0.008623





In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = NNBlendGPMLPLGBMEnsemble(data='full', blend_threshold=1.5, apply_blend_to_full=False) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions)
print(f"Full data predictions shape: {submission_full_data.shape}")

# Calculate CV score for full data
X, Y = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_mse = []
for fold_idx, split in enumerate(split_generator):
    (train_X, train_Y), (test_X, test_Y) = split
    fold_preds = submission_full_data[submission_full_data['fold'] == fold_idx]
    pred_values = fold_preds[['target_1', 'target_2', 'target_3']].values
    true_values = test_Y.values
    mse = ((pred_values - true_values) ** 2).mean()
    all_mse.append(mse)
print(f"Full data CV MSE: {np.mean(all_mse):.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to /home/submission/
import shutil
shutil.copy("submission.csv", "/home/submission/submission.csv")

print(f"Submission saved. Shape: {submission.shape}")
print(f"Predictions range: target_1 [{submission['target_1'].min():.3f}, {submission['target_1'].max():.3f}]")
print(f"Predictions range: target_2 [{submission['target_2'].min():.3f}, {submission['target_2'].max():.3f}]")
print(f"Predictions range: target_3 [{submission['target_3'].min():.3f}, {submission['target_3'].max():.3f}]")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################