In [1]:
# Experiment 071: GP+MLP+LGBM Ensemble with Extrapolation Detection
#
# Combines the best model (exp_030 GP+MLP+LGBM, CV=0.0083) with extrapolation detection
# to try to reduce the CV-LB intercept.
#
# Key outlier solvents (z-score > 1.0):
# - HFIP: 4.57 (z=2.23) -> blend_weight=0.617
# - Cyclohexane: 4.18 (z=1.86) -> blend_weight=0.432
# - Water: 3.70 (z=1.40) -> blend_weight=0.200
# - Ethylene Glycol: 3.69 (z=1.39) -> blend_weight=0.193
# - TFE: 3.59 (z=1.29) -> blend_weight=0.147

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed the global NumPy and PyTorch random number generators.
np.random.seed(42)
torch.manual_seed(42)
# Request deterministic cuDNN behaviour and turn off its benchmark-based autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Make float64 the default torch dtype; the featurizers below produce float64 NumPy arrays.
torch.set_default_dtype(torch.double)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory that holds the yield CSVs and the descriptor lookup tables read in this notebook.
DATA_PATH = '/home/data'

# Numeric process conditions; the featurizers read these two columns in this order.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns selected by load_data() for the single-solvent dataset.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns selected by load_data() for the full (solvent-mixture) dataset.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the mixture dataset
            (``catechol_full_data_yields.csv``); any other value selects the
            single-solvent dataset (``catechol_single_solvent_yields.csv``).

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen dataset
        and ``Y`` holds the ``"Product 2"``, ``"Product 3"`` and ``"SM"`` columns.
    """
    use_full = name == "full"
    csv_path = (
        f'{DATA_PATH}/catechol_full_data_yields.csv'
        if use_full
        else f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    )
    frame = pd.read_csv(csv_path)
    input_columns = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT
    return frame[input_columns], frame[["Product 2", "Product 3", "SM"]]

def generate_leave_one_out_splits(X, Y):
    """Yield one leave-one-solvent-out split per distinct ``"SOLVENT NAME"``.

    Solvents are visited in sorted order. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows of the held-out solvent.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        is_test = X["SOLVENT NAME"] == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one split per distinct (``SOLVENT A NAME``, ``SOLVENT B NAME``) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows whose solvent A and solvent B both match the held-out pair.
    """
    ramp_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for solvent_a, solvent_b in zip(ramp_pairs["SOLVENT A NAME"], ramp_pairs["SOLVENT B NAME"]):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load feature lookups. The first CSV column becomes the index, which the
# featurizers use to look rows up by solvent name.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Drop DRFP columns that are constant across all solvents: only columns with
# strictly positive variance are kept (this is not a "high variance" threshold).
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Pre-compute outlier scores for all solvents (excluding self)
# This identifies which solvents are "outliers" in the solvent space

def compute_solvent_outlier_scores(k=3):
    """Score how isolated each solvent is in standardized Spange descriptor space.

    Every row of ``SPANGE_DF`` is standardized with a ``StandardScaler`` fitted
    on the whole table. A solvent's score is the mean Euclidean distance to its
    ``k`` closest *other* solvents (the solvent itself is excluded).

    Args:
        k: number of nearest other solvents to average over.

    Returns:
        dict mapping each index label of ``SPANGE_DF`` to its score.
    """
    standardized = StandardScaler().fit_transform(SPANGE_DF.values)

    def _mean_knn_distance(row_idx):
        # Remove the solvent's own row so it never counts as its own neighbour.
        others = np.delete(standardized, row_idx, axis=0)
        dists = cdist(standardized[row_idx:row_idx + 1], others, metric='euclidean').ravel()
        dists.sort()
        return dists[:k].mean()

    return {name: _mean_knn_distance(pos) for pos, name in enumerate(SPANGE_DF.index)}

# Score every solvent in SPANGE_DF once, up front; the ensemble looks these up by name.
SOLVENT_OUTLIER_SCORES = compute_solvent_outlier_scores(k=3)
# Mean and standard deviation over all solvent scores (np.std default, i.e. ddof=0),
# used to convert a raw score into a z-score.
mean_outlier_score = np.mean(list(SOLVENT_OUTLIER_SCORES.values()))
std_outlier_score = np.std(list(SOLVENT_OUTLIER_SCORES.values()))

print(f"Pre-computed outlier scores. Mean: {mean_outlier_score:.4f}, Std: {std_outlier_score:.4f}")
print("\nTop outliers:")
# Report the six highest-scoring solvents together with their z-scores.
for solvent, score in sorted(SOLVENT_OUTLIER_SCORES.items(), key=lambda x: x[1], reverse=True)[:6]:
    z = (score - mean_outlier_score) / std_outlier_score
    print(f"  {solvent}: {score:.4f} (z={z:.2f})")

Pre-computed outlier scores. Mean: 2.2408, Std: 1.0424

Top outliers:
  1,1,1,3,3,3-Hexafluoropropan-2-ol: 4.5701 (z=2.23)
  Cyclohexane: 4.1844 (z=1.86)
  Water: 3.7000 (z=1.40)
  Ethylene Glycol [1,2-Ethanediol]: 3.6865 (z=1.39)
  2,2,2-Trifluoroethanol: 3.5894 (z=1.29)
  Decanol: 2.8970 (z=0.63)


In [5]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Interface stub for featurizers.

    Both methods raise ``NotImplementedError``; a concrete featurizer is
    expected to override ``__init__`` and ``featurize``.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X, Y):
        """Turn ``X`` (and ``Y``) into model features; must be overridden.

        The original stub omitted ``self``, so calling it on an instance with
        both arguments raised ``TypeError`` instead of ``NotImplementedError``.
        """
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface used by the cross-validation cells.

    Subclasses override ``train_model`` and ``predict``; the base versions
    raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs ``X_train`` and targets ``y_train``."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``; must be overridden.

        ``X`` is optional only to stay backward-compatible with the previous
        zero-argument signature. Callers invoke ``model.predict(test_X)``, which
        the old signature rejected with ``TypeError`` rather than
        ``NotImplementedError``.
        """
        raise NotImplementedError

In [6]:
# Full Featurizer (for MLP and LGBM) - 145 features
class FullFeaturizer:
    """Kinetic features plus Spange, filtered-DRFP and ACS-PCA solvent descriptors.

    With ``mixed=False`` descriptors are looked up by ``"SOLVENT NAME"``.
    With ``mixed=True`` the descriptors of ``"SOLVENT A NAME"`` and
    ``"SOLVENT B NAME"`` are linearly combined using ``"SolventB%" / 100``.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        descriptor_width = sum(
            table.shape[1] for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )
        # 2 raw numeric inputs + 3 derived kinetic terms + all descriptor columns.
        self.feats_dim = 2 + 3 + descriptor_width

    @staticmethod
    def _kinetic_block(X):
        """Return [time, temperature, 1000/T_K, log(time), (1000/T_K) * log(time)]."""
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature_c = numeric[:, 1:2]
        inv_temp = 1000.0 / (temperature_c + 273.15)
        # Small offset keeps the logarithm finite for a zero residence time.
        log_time = np.log(residence_time + 1e-6)
        return np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

    def featurize(self, X, flip=False):
        """Build the feature matrix for ``X`` as a float NumPy array.

        Args:
            X: frame with the numeric input columns and the solvent column(s).
            flip: only used when ``mixed`` is true; swaps the roles of solvent
                A and solvent B in the weighted combination.
        """
        kinetic = self._kinetic_block(X)
        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)

        if not self.mixed:
            solvent_blocks = [table.loc[X["SOLVENT NAME"]].values for table in tables]
        else:
            pairs = [
                (table.loc[X["SOLVENT A NAME"]].values, table.loc[X["SOLVENT B NAME"]].values)
                for table in tables
            ]
            frac_b = X["SolventB%"].values.reshape(-1, 1) / 100.0
            if flip:
                pairs = [(from_b, from_a) for from_a, from_b in pairs]
            solvent_blocks = [
                first * (1 - frac_b) + second * frac_b for first, second in pairs
            ]

        return np.hstack([kinetic, *solvent_blocks])

    def featurize_torch(self, X, flip=False):
        """Same as :meth:`featurize` but wrapped in a ``torch`` tensor."""
        features = self.featurize(X, flip)
        return torch.tensor(features)

print(f'Full feature dimension: {FullFeaturizer().feats_dim}')

Full feature dimension: 145


In [7]:
# Simple Featurizer (for GP) - 18 features (Spange + Arrhenius kinetics)
class SimpleFeaturizer:
    """Kinetic features plus Spange solvent descriptors only.

    With ``mixed=False`` descriptors are looked up by ``"SOLVENT NAME"``.
    With ``mixed=True`` the descriptors of ``"SOLVENT A NAME"`` and
    ``"SOLVENT B NAME"`` are linearly combined using ``"SolventB%" / 100``.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        n_descriptors = self.spange_df.shape[1]
        # 2 raw numeric inputs + 3 derived kinetic terms + Spange columns.
        self.feats_dim = 2 + 3 + n_descriptors  # 18 features

    def _lookup(self, names):
        """Return the Spange descriptor rows for the given solvent names."""
        return self.spange_df.loc[names].values

    def featurize(self, X, flip=False):
        """Build the feature matrix for ``X`` as a float NumPy array.

        Args:
            X: frame with the numeric input columns and the solvent column(s).
            flip: only used when ``mixed`` is true; swaps the roles of solvent
                A and solvent B in the weighted combination.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature_c = numeric[:, 1:2]
        inv_temp = 1000.0 / (temperature_c + 273.15)
        # Small offset keeps the logarithm finite for a zero residence time.
        log_time = np.log(residence_time + 1e-6)
        kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        if not self.mixed:
            descriptors = self._lookup(X["SOLVENT NAME"])
        else:
            from_a = self._lookup(X["SOLVENT A NAME"])
            from_b = self._lookup(X["SOLVENT B NAME"])
            frac_b = X["SolventB%"].values.reshape(-1, 1) / 100.0
            first, second = (from_b, from_a) if flip else (from_a, from_b)
            descriptors = first * (1 - frac_b) + second * frac_b

        return np.hstack([kinetic, descriptors])

print(f'Simple feature dimension (for GP): {SimpleFeaturizer().feats_dim}')

Simple feature dimension (for GP): 18


In [8]:
# GP+MLP+LGBM Ensemble with Extrapolation Detection

class ExtrapolationAwareGPMLPLGBMEnsemble(BaseModel):
    """Weighted GP + MLP + LightGBM ensemble that shrinks outlier-solvent predictions.

    Three model families are trained on the same data: one Gaussian process per
    target on the SimpleFeaturizer features, one LightGBM booster per target and
    a single three-output MLP on the FullFeaturizer features. At prediction time
    their outputs are combined with fixed weights, and rows whose solvent has a
    high pre-computed outlier z-score are blended towards the per-target mean of
    the training targets.
    """
    def __init__(self, data='single', blend_threshold=1.0, gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.30):
        """Configure the ensemble; nothing is trained here.

        Args:
            data: 'full' selects mixture featurization (solvent A/B columns);
                any other value uses the single-solvent featurization. Only the
                exact value 'single' reads "SOLVENT NAME" in predict().
            blend_threshold: outlier z-score at which shrinkage towards the
                training mean starts (see get_blend_weight).
            gp_weight: weight of the GP predictions in the ensemble sum.
            mlp_weight: weight of the MLP predictions in the ensemble sum.
            lgbm_weight: weight of the LightGBM predictions in the ensemble sum.
        """
        self.data_type = data
        self.blend_threshold = blend_threshold
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight
        
        self.mixed = (data == 'full')
        self.full_featurizer = FullFeaturizer(mixed=self.mixed)
        self.simple_featurizer = SimpleFeaturizer(mixed=self.mixed)
        
        # Separate scalers: the GP sees the simple features, while the MLP and
        # LGBM each get their own scaler fitted on the full features.
        self.gp_scaler = StandardScaler()
        self.mlp_scaler = StandardScaler()
        self.lgbm_scaler = StandardScaler()
        
        self.gp_models = []  # One per target
        self.lgbm_models = []  # One per target
        self.mlp = None
        
        # Training targets, kept so predict() can compute their mean.
        self.train_Y = None
    
    def get_blend_weight(self, solvent_name):
        """Return the shrinkage weight in [0, 1] for one solvent.

        The solvent's pre-computed outlier score is converted to a z-score using
        the module-level mean/std. The weight is 0 up to ``blend_threshold``,
        rises linearly, and saturates at 1 once the z-score reaches
        ``blend_threshold + 2``. A solvent missing from SOLVENT_OUTLIER_SCORES
        is given the mean score, i.e. a z-score of 0.
        """
        score = SOLVENT_OUTLIER_SCORES.get(solvent_name, mean_outlier_score)
        z_score = (score - mean_outlier_score) / std_outlier_score
        blend_weight = np.clip((z_score - self.blend_threshold) / 2.0, 0, 1)
        return blend_weight
    
    def train_model(self, X_train, y_train):
        """Fit the scalers, the GPs, the LightGBM boosters and the MLP.

        Args:
            X_train: input frame accepted by both featurizers.
            y_train: targets; ``.values`` is used when the attribute exists.
                The code indexes target columns 0..2, so three targets are
                expected.
        """
        self.train_Y = y_train.values if hasattr(y_train, 'values') else y_train
        
        # Prepare features
        X_simple = self.simple_featurizer.featurize(X_train)
        X_full = self.full_featurizer.featurize(X_train)
        
        X_gp = self.gp_scaler.fit_transform(X_simple)
        X_mlp = self.mlp_scaler.fit_transform(X_full)
        X_lgbm = self.lgbm_scaler.fit_transform(X_full)
        
        Y = self.train_Y
        
        # Train GP models (one per target)
        self.gp_models = []
        # Scaled Matern-5/2 kernel plus a white-noise term; the same kernel
        # object is passed as the prior to each per-target regressor.
        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        for i in range(3):
            gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=3, random_state=42)
            gp.fit(X_gp, Y[:, i])
            self.gp_models.append(gp)
        
        # Train LGBM models (one per target)
        self.lgbm_models = []
        lgbm_params = {
            'objective': 'regression',
            'metric': 'mse',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'seed': 42
        }
        for i in range(3):
            train_data = lgb.Dataset(X_lgbm, label=Y[:, i])
            # Fixed 200 boosting rounds; no validation set or early stopping is used.
            model = lgb.train(lgbm_params, train_data, num_boost_round=200)
            self.lgbm_models.append(model)
        
        # Train MLP
        input_dim = X_mlp.shape[1]
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 3),
            nn.Sigmoid()  # Ensure [0,1] output
        ).double().to(device)
        
        X_tensor = torch.tensor(X_mlp).to(device)
        Y_tensor = torch.tensor(Y).to(device)
        
        dataset = TensorDataset(X_tensor, Y_tensor)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
        
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=0.001, weight_decay=1e-5)
        criterion = nn.MSELoss()
        
        # Fixed 200 epochs of mini-batch training on all three targets jointly.
        self.mlp.train()
        for epoch in range(200):
            for batch_X, batch_Y in dataloader:
                optimizer.zero_grad()
                pred = self.mlp(batch_X)
                loss = criterion(pred, batch_Y)
                loss.backward()
                optimizer.step()
    
    def predict(self, X):
        """Predict the three targets for ``X``.

        Returns:
            torch tensor with one row per input row and one column per target:
            the weighted ensemble prediction, blended towards the training-target
            mean according to the solvent blend weight, then clipped to [0, 1].
        """
        # Prepare features
        X_simple = self.simple_featurizer.featurize(X)
        X_full = self.full_featurizer.featurize(X)
        
        # Reuse the scalers fitted in train_model().
        X_gp = self.gp_scaler.transform(X_simple)
        X_mlp = self.mlp_scaler.transform(X_full)
        X_lgbm = self.lgbm_scaler.transform(X_full)
        
        # GP predictions
        gp_preds = np.column_stack([gp.predict(X_gp) for gp in self.gp_models])
        
        # LGBM predictions
        lgbm_preds = np.column_stack([model.predict(X_lgbm) for model in self.lgbm_models])
        
        # MLP predictions
        # eval() switches the dropout layers off for inference.
        self.mlp.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_mlp).to(device)
            mlp_preds = self.mlp(X_tensor).cpu().numpy()
        
        # Ensemble prediction
        raw_pred = self.gp_weight * gp_preds + self.mlp_weight * mlp_preds + self.lgbm_weight * lgbm_preds
        
        # Apply extrapolation detection
        if self.data_type == 'single':
            solvent_names = X["SOLVENT NAME"].values
            blend_weights = np.array([self.get_blend_weight(s) for s in solvent_names])
        else:
            # For mixtures, use max of both solvents' blend weights
            # NOTE(review): the mixture ratio ("SolventB%") is not taken into
            # account here, so a row with 0% of an outlier solvent is shrunk as
            # much as a row with 100% of it - confirm this is intended.
            solvent_a_names = X["SOLVENT A NAME"].values
            solvent_b_names = X["SOLVENT B NAME"].values
            blend_weights_a = np.array([self.get_blend_weight(s) for s in solvent_a_names])
            blend_weights_b = np.array([self.get_blend_weight(s) for s in solvent_b_names])
            blend_weights = np.maximum(blend_weights_a, blend_weights_b)
        
        # Compute population mean from training data
        mean_pred = self.train_Y.mean(axis=0)
        
        # Blend: for outliers, move toward mean
        blended = (1 - blend_weights.reshape(-1, 1)) * raw_pred + blend_weights.reshape(-1, 1) * mean_pred
        
        # Clip to [0, 1]
        blended = np.clip(blended, 0, 1)
        
        return torch.tensor(blended)

print('ExtrapolationAwareGPMLPLGBMEnsemble defined')

ExtrapolationAwareGPMLPLGBMEnsemble defined


In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ExtrapolationAwareGPMLPLGBMEnsemble(data='single', blend_threshold=1.0) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"Single solvent predictions shape: {submission_single_solvent.shape}")

# Calculate CV score for single solvent
X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)
all_mse = []
for fold_idx, split in enumerate(split_generator):
    (train_X, train_Y), (test_X, test_Y) = split
    fold_preds = submission_single_solvent[submission_single_solvent['fold'] == fold_idx]
    pred_values = fold_preds[['target_1', 'target_2', 'target_3']].values
    true_values = test_Y.values
    mse = ((pred_values - true_values) ** 2).mean()
    all_mse.append(mse)
print(f"Single solvent CV MSE: {np.mean(all_mse):.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:31, 31.27s/it]

2it [01:00, 29.97s/it]

3it [01:27, 28.55s/it]

4it [01:54, 28.12s/it]

5it [02:28, 30.18s/it]

6it [02:59, 30.44s/it]

7it [03:32, 31.43s/it]

8it [04:00, 30.07s/it]

9it [04:32, 30.85s/it]

10it [05:07, 32.11s/it]

11it [05:41, 32.65s/it]

12it [06:13, 32.48s/it]

13it [06:44, 32.15s/it]

14it [07:15, 31.77s/it]

15it [07:50, 32.53s/it]

16it [08:23, 32.72s/it]

17it [09:00, 34.02s/it]

18it [09:33, 33.62s/it]

19it [10:04, 33.08s/it]

20it [10:35, 32.27s/it]

21it [11:05, 31.72s/it]

22it [11:41, 32.90s/it]

23it [12:09, 31.61s/it]

24it [12:42, 32.03s/it]

24it [12:42, 31.79s/it]

Single solvent predictions shape: (656, 6)
Single solvent CV MSE: 0.009978





In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ExtrapolationAwareGPMLPLGBMEnsemble(data='full', blend_threshold=1.0) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions)
print(f"Full data predictions shape: {submission_full_data.shape}")

# Calculate CV score for full data
X, Y = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_mse = []
for fold_idx, split in enumerate(split_generator):
    (train_X, train_Y), (test_X, test_Y) = split
    fold_preds = submission_full_data[submission_full_data['fold'] == fold_idx]
    pred_values = fold_preds[['target_1', 'target_2', 'target_3']].values
    true_values = test_Y.values
    mse = ((pred_values - true_values) ** 2).mean()
    all_mse.append(mse)
print(f"Full data CV MSE: {np.mean(all_mse):.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [01:38, 98.45s/it]

2it [03:11, 95.37s/it]

3it [04:44, 94.38s/it]

4it [06:25, 96.91s/it]

5it [08:18, 102.64s/it]

6it [09:41, 95.89s/it] 

7it [11:44, 104.95s/it]

8it [13:19, 101.59s/it]

9it [14:57, 100.53s/it]

10it [16:37, 100.40s/it]

11it [18:22, 101.68s/it]

12it [20:04, 101.78s/it]

13it [21:50, 103.18s/it]

13it [21:50, 100.81s/it]

Full data predictions shape: (1227, 6)
Full data CV MSE: 0.040984





In [11]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to /home/submission/
import shutil
shutil.copy("submission.csv", "/home/submission/submission.csv")

print(f"Submission saved. Shape: {submission.shape}")
print(f"Predictions range: target_1 [{submission['target_1'].min():.3f}, {submission['target_1'].max():.3f}]")
print(f"Predictions range: target_2 [{submission['target_2'].min():.3f}, {submission['target_2'].max():.3f}]")
print(f"Predictions range: target_3 [{submission['target_3'].min():.3f}, {submission['target_3'].max():.3f}]")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

Submission saved. Shape: (1883, 7)
Predictions range: target_1 [0.000, 0.398]
Predictions range: target_2 [0.000, 0.377]
Predictions range: target_3 [0.000, 1.000]


In [12]:
# Debug: Check submission shape
# Relies on submission_single_solvent / submission_full_data created by the two
# cross-validation cells above.
print(f"Single solvent shape: {submission_single_solvent.shape}")
print(f"Full data shape: {submission_full_data.shape}")

# Check expected shapes
# One prediction row is written per test row, so the row counts printed above
# should equal the dataset lengths printed here.
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")
print(f"Expected single: {len(Y_single)}")
print(f"Expected full: {len(Y_full)}")

Single solvent shape: (656, 6)
Full data shape: (1227, 6)
Expected single: 656
Expected full: 1227


In [13]:
# Check number of folds for full data\nX_full, Y_full = load_data(\"full\")\nramps = X_full[[\"SOLVENT A NAME\", \"SOLVENT B NAME\"]].drop_duplicates()\nprint(f\"Number of unique ramps: {len(ramps)}\")\nprint(ramps)

In [14]:
# Check number of folds for full data
# generate_leave_one_ramp_out_splits yields one fold per distinct
# (solvent A, solvent B) pair, so the number of pairs is the number of folds.
X_full, Y_full = load_data("full")
ramps = X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
print(f"Number of unique ramps: {len(ramps)}")
print(ramps)

Number of unique ramps: 13
                          SOLVENT A NAME                     SOLVENT B NAME
0                               Methanol   Ethylene Glycol [1,2-Ethanediol]
122    1,1,1,3,3,3-Hexafluoropropan-2-ol  2-Methyltetrahydrofuran [2-MeTHF]
246                          Cyclohexane                  IPA [Propan-2-ol]
350                   Water.Acetonitrile                       Acetonitrile
475                         Acetonitrile           Acetonitrile.Acetic Acid
600    2-Methyltetrahydrofuran [2-MeTHF]              Diethyl Ether [Ether]
724               2,2,2-Trifluoroethanol       Water.2,2,2-Trifluoroethanol
849          DMA [N,N-Dimethylacetamide]                            Decanol
959                              Ethanol              THF [Tetrahydrofuran]
1086     Dihydrolevoglucosenone (Cyrene)                      Ethyl Acetate
1122        MTBE [tert-Butylmethylether]                     Butanone [MEK]
1156  tert-Butanol [2-Methylpropan-2-ol]                 Dime

In [16]:
# Check if all solvents in full data are in the lookup tables
# A solvent missing from a lookup index would make the featurizers' .loc
# lookups fail, so both "Missing from ..." sets are expected to be empty.
X_full, Y_full = load_data("full")
all_solvents_full = set(X_full["SOLVENT A NAME"].unique()) | set(X_full["SOLVENT B NAME"].unique())
print(f"Solvents in full data: {all_solvents_full}")
print(f"\nSolvents in DRFP: {set(DRFP_FILTERED.index)}")
print(f"\nMissing from DRFP: {all_solvents_full - set(DRFP_FILTERED.index)}")
print(f"\nMissing from ACS_PCA: {all_solvents_full - set(ACS_PCA_DF.index)}")

Solvents in full data: {'Ethylene Glycol [1,2-Ethanediol]', '1,1,1,3,3,3-Hexafluoropropan-2-ol', 'DMA [N,N-Dimethylacetamide]', 'Butanone [MEK]', 'Water.2,2,2-Trifluoroethanol', 'Ethyl Lactate', 'Methyl Propionate', 'Dimethyl Carbonate', 'Diethyl Ether [Ether]', 'Ethyl Acetate', 'Cyclohexane', 'Methanol', 'IPA [Propan-2-ol]', 'Water.Acetonitrile', 'Ethanol', 'Decanol', '2-Methyltetrahydrofuran [2-MeTHF]', 'Acetonitrile', 'MTBE [tert-Butylmethylether]', 'tert-Butanol [2-Methylpropan-2-ol]', 'Dihydrolevoglucosenone (Cyrene)', '2,2,2-Trifluoroethanol', 'THF [Tetrahydrofuran]', 'Acetonitrile.Acetic Acid'}

Solvents in DRFP: {'Butanone [MEK]', 'Dimethyl Carbonate', 'Diethyl Ether [Ether]', 'Ethyl Acetate', 'IPA [Propan-2-ol]', 'Water.Acetonitrile', 'tert-Butanol [2-Methylpropan-2-ol]', 'Ethylene Glycol [1,2-Ethanediol]', 'THF [Tetrahydrofuran]', '1,1,1,3,3,3-Hexafluoropropan-2-ol', 'DMA [N,N-Dimethylacetamide]', 'Water.2,2,2-Trifluoroethanol', 'Ethyl Lactate', 'Methyl Propionate', 'Cyclohex