# ChemBERTa Frozen Embeddings as Features

**Hypothesis**: Pre-trained ChemBERTa embeddings capture chemistry knowledge that can't be learned from 24 solvents. Using them as FROZEN features (not fine-tuning) could reduce the CV-LB intercept.

**Key difference from previous ChemBERTa attempts**:
- Previous attempts FINE-TUNED ChemBERTa on our small dataset
- This approach uses FROZEN embeddings as features
- The embeddings already contain chemistry knowledge from pre-training on millions of molecules

**Implementation**:
1. Load pre-trained ChemBERTa model
2. Extract embeddings for each solvent SMILES (frozen, no gradients)
3. Use embeddings as additional features for LGBM
4. Compare with baseline GP+MLP+LGBM ensemble (CV=0.008298)

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed the global NumPy and PyTorch RNGs so repeated Run-All executions start
# from the same state. LightGBM is seeded separately through `random_state`.
np.random.seed(42)
torch.manual_seed(42)
# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Load the pre-trained ChemBERTa tokenizer and encoder.
# (Embeddings themselves are extracted in a later cell.)
from transformers import AutoModel, AutoTokenizer

print("Loading ChemBERTa model...")
model_name = "seyonec/ChemBERTa-zinc-base-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
chemberta_model = AutoModel.from_pretrained(model_name)
chemberta_model.eval()  # Set to eval mode; the encoder is only used for inference here
chemberta_model = chemberta_model.to(device)  # Keep the weights on the device chosen above
print(f"ChemBERTa loaded. Hidden size: {chemberta_model.config.hidden_size}")

Loading ChemBERTa model...


ChemBERTa loaded. Hidden size: 768


In [3]:
# Load the solvent-name -> SMILES lookup table (first CSV column becomes the index).
# The embeddings are computed from it in the next cell.
DATA_PATH = '/home/data'  # Root directory of every CSV read in this notebook
SMILES_DF = pd.read_csv(f'{DATA_PATH}/smiles_lookup.csv', index_col=0)
print(f"SMILES lookup shape: {SMILES_DF.shape}")
print(SMILES_DF.head())

SMILES lookup shape: (26, 1)
                                           solvent smiles
SOLVENT NAME                                             
Cyclohexane                                      C1CCCCC1
Ethyl Acetate                                   O=C(OCC)C
Acetic Acid                                       CC(=O)O
2-Methyltetrahydrofuran [2-MeTHF]              O1C(C)CCC1
1,1,1,3,3,3-Hexafluoropropan-2-ol  C(C(F)(F)F)(C(F)(F)F)O


In [4]:
# Extract frozen embeddings for each solvent
def get_embedding(smiles):
    """Return the mean-pooled ChemBERTa representation of one SMILES string.

    Uses the module-level ``tokenizer``, ``chemberta_model`` and ``device``.
    The forward pass runs under ``torch.no_grad()``, so the encoder stays
    frozen and no autograd graph is built.

    Args:
        smiles: SMILES string to encode (truncated to 128 tokens).

    Returns:
        1-D NumPy array holding the last hidden state averaged over every
        token position the tokenizer emitted.

    Note:
        The average is a plain mean over the sequence axis and does not look
        at the attention mask. That is presumably harmless while a single
        string is encoded per call; confirm before batching several strings.
    """
    encoded = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=128)
    encoded = {field: tensor.to(device) for field, tensor in encoded.items()}
    with torch.no_grad():
        hidden_states = chemberta_model(**encoded).last_hidden_state
    # Collapse the sequence axis, then hand back a flat host-side array.
    pooled = hidden_states.mean(dim=1)
    return pooled.cpu().numpy().flatten()

# Build the solvent-name -> embedding lookup, one encoder pass per solvent.
print("Extracting ChemBERTa embeddings for all solvents...")
chemberta_embeddings = {}
for solvent_name, smiles in SMILES_DF['solvent smiles'].items():
    embedding = get_embedding(smiles)
    chemberta_embeddings[solvent_name] = embedding
    print(f"  {solvent_name}: {smiles[:30]}... -> embedding shape {embedding.shape}")

# Arrange the vectors as one row per solvent and one column per embedding dimension.
embedding_dim = len(list(chemberta_embeddings.values())[0])
CHEMBERTA_DF = pd.DataFrame(chemberta_embeddings).T
CHEMBERTA_DF.columns = [f'chemberta_{i}' for i in range(embedding_dim)]
print(f"\nChemBERTa embeddings shape: {CHEMBERTA_DF.shape}")

Extracting ChemBERTa embeddings for all solvents...


  Cyclohexane: C1CCCCC1... -> embedding shape (768,)
  Ethyl Acetate: O=C(OCC)C... -> embedding shape (768,)
  Acetic Acid: CC(=O)O... -> embedding shape (768,)
  2-Methyltetrahydrofuran [2-MeTHF]: O1C(C)CCC1... -> embedding shape (768,)
  1,1,1,3,3,3-Hexafluoropropan-2-ol: C(C(F)(F)F)(C(F)(F)F)O... -> embedding shape (768,)
  IPA [Propan-2-ol]: CC(O)C... -> embedding shape (768,)
  Ethanol: CCO... -> embedding shape (768,)
  Methanol: CO... -> embedding shape (768,)
  Ethylene Glycol [1,2-Ethanediol]: OCCO... -> embedding shape (768,)
  Acetonitrile: CC#N... -> embedding shape (768,)
  Water: O... -> embedding shape (768,)
  Diethyl Ether [Ether]: CCOCC... -> embedding shape (768,)
  MTBE [tert-Butylmethylether]: CC(C)(C)OC... -> embedding shape (768,)
  Dimethyl Carbonate: COC(=O)OC... -> embedding shape (768,)
  tert-Butanol [2-Methylpropan-2-ol]: CC(C)(C)O... -> embedding shape (768,)
  DMA [N,N-Dimethylacetamide]: CN(C)C(C)=O... -> embedding shape (768,)
  2,2,2-Trifluoroethanol: 

In [5]:
# Data loading functions
# Column groups used to slice the raw CSVs:
#   - numeric process conditions shared by both datasets,
#   - inputs of the single-solvent dataset,
#   - inputs of the mixed-solvent ("full") dataset.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the catechol yield CSVs and split it into inputs and targets.

    Args:
        name: ``"full"`` selects ``catechol_full_data_yields.csv`` together
            with the mixed-solvent input columns. Any other value selects
            ``catechol_single_solvent_yields.csv`` with the single-solvent
            input columns.

    Returns:
        ``(X, Y)``: ``X`` holds the selected input columns and ``Y`` the
        ``"Product 2"``, ``"Product 3"`` and ``"SM"`` columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    df = pd.read_csv(csv_path)
    return df[input_columns], df[["Product 2", "Product 3", "SM"]]

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds, visiting solvents in sorted order.

    Args:
        X: Input frame with a ``"SOLVENT NAME"`` column.
        Y: Target frame aligned row-for-row with ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        every row of one solvent and the train part holds all other rows.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out in sorted(solvent_column.unique()):
        is_test = solvent_column == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield folds that each hold out one ordered (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. The pair is
    ordered, so (A, B) and (B, A) form separate folds.

    Args:
        X: Input frame with ``"SOLVENT A NAME"`` and ``"SOLVENT B NAME"`` columns.
        Y: Target frame aligned row-for-row with ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        every row of the held-out pair.
    """
    distinct_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [6]:
# Load other feature lookups (each indexed by its first CSV column)
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Drop constant DRFP columns: keep every column whose variance is non-zero.
# (This is a zero-variance filter, not a "high-variance" threshold.)
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

# NOTE(review): the recorded output shows 24 rows for DRFP / ACS PCA but 26 for
# Spange / ChemBERTa, so a `.loc` lookup of a solvent missing from the smaller
# tables would raise KeyError - confirm that no dataset row uses those solvents.
print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')
print(f'ChemBERTa: {CHEMBERTA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)
ChemBERTa: (26, 768)


In [7]:
# Featurizer with ChemBERTa embeddings
class ChemBERTaFeaturizer:
    """Turn raw experiment rows into a dense feature matrix.

    Output columns are ``[kinetic | Spange | DRFP | ACS PCA | ChemBERTa]``;
    the ChemBERTa slice is present only when ``use_chemberta`` is true. The
    descriptor tables are the module-level lookups indexed by solvent name.
    """

    def __init__(self, mixed=False, use_chemberta=True):
        """Bind the lookup tables and precompute the output width.

        Args:
            mixed: If true, rows describe a two-solvent mixture and descriptors
                are blended by ``"SolventB%"``; otherwise rows carry a single
                ``"SOLVENT NAME"``.
            use_chemberta: Whether to append the ChemBERTa embedding columns.
        """
        self.mixed = mixed
        self.use_chemberta = use_chemberta
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        self.chemberta_df = CHEMBERTA_DF

        # 2 raw numeric inputs + 3 derived kinetic terms, then one slice per table.
        self.feats_dim = 2 + 3 + sum(table.shape[1] for table in self._active_tables())

    def _active_tables(self):
        """Return the descriptor tables in output-column order."""
        tables = [self.spange_df, self.drfp_df, self.acs_pca_df]
        if self.use_chemberta:
            tables.append(self.chemberta_df)
        return tables

    @staticmethod
    def _blend(first, second, second_weight):
        """Weighted average ``first * (1 - w) + second * w``."""
        return first * (1 - second_weight) + second * second_weight

    def featurize(self, X, flip=False):
        """Build the feature matrix for the rows of ``X``.

        Kinetic columns are: the two raw numeric inputs, ``1000 / T`` with the
        temperature shifted by 273.15 (i.e. treated as degrees Celsius),
        ``log(time + 1e-6)``, and the product of those two derived terms.

        Args:
            X: Frame with the numeric input columns plus either
                ``"SOLVENT NAME"`` or the A/B solvent names and ``"SolventB%"``.
            flip: Only used for mixtures: blends with B as the first solvent
                and weight ``1 - pct``. Because the blend is a weighted
                average, this describes the same mixture as ``flip=False``;
                results can differ only by floating-point rounding.

        Returns:
            2-D NumPy array with one row per row of ``X``.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        time_m = numeric[:, 0:1]
        temp_k = numeric[:, 1:2] + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)
        pieces = [numeric, inv_temp, log_time, inv_temp * log_time]

        if self.mixed:
            # assumes "SolventB%" is a fraction in [0, 1] rather than 0-100 - TODO confirm
            pct = X["SolventB%"].values.reshape(-1, 1)
            for table in self._active_tables():
                a_values = table.loc[X["SOLVENT A NAME"]].values
                b_values = table.loc[X["SOLVENT B NAME"]].values
                if flip:
                    pieces.append(self._blend(b_values, a_values, 1 - pct))
                else:
                    pieces.append(self._blend(a_values, b_values, pct))
        else:
            for table in self._active_tables():
                pieces.append(table.loc[X["SOLVENT NAME"]].values)

        return np.hstack(pieces)

print(f'Feature dimension with ChemBERTa: {ChemBERTaFeaturizer(use_chemberta=True).feats_dim}')
print(f'Feature dimension without ChemBERTa: {ChemBERTaFeaturizer(use_chemberta=False).feats_dim}')

Feature dimension with ChemBERTa: 913
Feature dimension without ChemBERTa: 145


In [8]:
# LGBM Model with ChemBERTa features
class ChemBERTaLGBMModel:
    """One LightGBM regressor per target column on top of ``ChemBERTaFeaturizer``.

    Attributes:
        data: ``'single'`` or ``'full'``; ``'full'`` switches the featurizer to
            mixed-solvent mode.
        use_chemberta: Whether the ChemBERTa columns are part of the features.
        featurizer: The ``ChemBERTaFeaturizer`` that builds the feature matrix.
        models: Fitted regressors, one per target column (empty until trained).
        scalers: One entry per target, parallel to ``models``. Every entry is
            the same fitted ``StandardScaler`` because the inputs are shared.
    """

    def __init__(self, data='single', use_chemberta=True):
        self.data = data
        self.use_chemberta = use_chemberta
        self.featurizer = ChemBERTaFeaturizer(mixed=(data == 'full'), use_chemberta=use_chemberta)
        self.models = []
        self.scalers = []

    def train_model(self, X, Y):
        """Fit one regressor per column of ``Y``.

        Args:
            X: Raw input frame accepted by the featurizer.
            Y: Target frame; each column gets its own regressor.
        """
        X_feat = self.featurizer.featurize(X)
        Y_np = Y.values
        n_targets = Y_np.shape[1]

        # The feature matrix is identical for every target, so the scaler is
        # fitted once and shared instead of being refitted inside the loop.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_feat)

        self.models = []
        self.scalers = [scaler] * n_targets

        for target_idx in range(n_targets):
            # NOTE(review): LightGBM only applies row subsampling when
            # `subsample_freq` > 0, so `subsample=0.8` presumably has no effect
            # with the default frequency - confirm before relying on it.
            model = lgb.LGBMRegressor(
                n_estimators=200,
                learning_rate=0.05,
                max_depth=6,
                num_leaves=31,
                min_child_samples=5,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.1,
                reg_lambda=0.1,
                random_state=42,
                verbose=-1
            )
            model.fit(X_scaled, Y_np[:, target_idx])
            self.models.append(model)

    def predict(self, X):
        """Predict every target for the rows of ``X``.

        Args:
            X: Raw input frame accepted by the featurizer.

        Returns:
            ``torch.Tensor`` with one row per input row and one column per
            trained target.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if not self.models:
            raise RuntimeError("train_model must be called before predict")

        X_feat = self.featurizer.featurize(X)
        preds = [
            model.predict(scaler.transform(X_feat))
            for scaler, model in zip(self.scalers, self.models)
        ]
        return torch.tensor(np.stack(preds, axis=1))

print('ChemBERTaLGBMModel defined')

ChemBERTaLGBMModel defined


In [9]:
# Cross-validation for single solvent data
print("="*60)
print("Cross-validation: Single Solvent Data (Leave-One-Out)")
print("="*60)

X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: X={X_single.shape}, Y={Y_single.shape}")


def _single_solvent_fold_mses(use_chemberta):
    """Run leave-one-solvent-out CV and return the per-fold MSEs.

    A fresh ``ChemBERTaLGBMModel`` is trained for every fold. Each fold's MSE
    is the mean squared error over all held-out rows and all targets.

    Args:
        use_chemberta: Forwarded to ``ChemBERTaLGBMModel``.

    Returns:
        List with one MSE per held-out solvent.
    """
    # One fold per distinct solvent: derive the progress-bar total from the
    # data instead of hard-coding it.
    n_folds = len(X_single["SOLVENT NAME"].unique())
    fold_mses = []
    splits = generate_leave_one_out_splits(X_single, Y_single)
    for (train_X, train_Y), (test_X, test_Y) in tqdm.tqdm(splits, total=n_folds):
        model = ChemBERTaLGBMModel(data='single', use_chemberta=use_chemberta)
        model.train_model(train_X, train_Y)
        preds = model.predict(test_X).numpy()
        fold_mses.append(np.mean((preds - test_Y.values) ** 2))
    return fold_mses


# Test with ChemBERTa features
# Folds are averaged unweighted, so every solvent counts equally regardless of
# how many rows it has.
all_mse_with = _single_solvent_fold_mses(use_chemberta=True)
mse_with_chemberta = np.mean(all_mse_with)
print(f"\nSingle Solvent MSE WITH ChemBERTa: {mse_with_chemberta:.6f}")

# Test without ChemBERTa features (baseline)
all_mse_without = _single_solvent_fold_mses(use_chemberta=False)
mse_without_chemberta = np.mean(all_mse_without)
print(f"Single Solvent MSE WITHOUT ChemBERTa: {mse_without_chemberta:.6f}")
print(f"\nDifference: {(mse_with_chemberta - mse_without_chemberta) / mse_without_chemberta * 100:.2f}%")

Cross-validation: Single Solvent Data (Leave-One-Out)
Single solvent data: X=(656, 3), Y=(656, 3)


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:00<00:12,  1.86it/s]

  8%|▊         | 2/24 [00:01<00:12,  1.82it/s]

 12%|█▎        | 3/24 [00:01<00:11,  1.86it/s]

 17%|█▋        | 4/24 [00:02<00:10,  1.86it/s]

 21%|██        | 5/24 [00:02<00:10,  1.85it/s]

 25%|██▌       | 6/24 [00:03<00:09,  1.83it/s]

 29%|██▉       | 7/24 [00:03<00:09,  1.81it/s]

 33%|███▎      | 8/24 [00:04<00:08,  1.83it/s]

 38%|███▊      | 9/24 [00:04<00:08,  1.83it/s]

 42%|████▏     | 10/24 [00:05<00:07,  1.85it/s]

 46%|████▌     | 11/24 [00:06<00:07,  1.82it/s]

 50%|█████     | 12/24 [00:06<00:06,  1.80it/s]

 54%|█████▍    | 13/24 [00:07<00:06,  1.82it/s]

 58%|█████▊    | 14/24 [00:07<00:05,  1.81it/s]

 62%|██████▎   | 15/24 [00:08<00:05,  1.77it/s]

 67%|██████▋   | 16/24 [00:08<00:04,  1.80it/s]

 71%|███████   | 17/24 [00:09<00:03,  1.80it/s]

 75%|███████▌  | 18/24 [00:09<00:03,  1.78it/s]

 79%|███████▉  | 19/24 [00:10<00:02,  1.78it/s]

 83%|████████▎ | 20/24 [00:11<00:02,  1.78it/s]

 88%|████████▊ | 21/24 [00:11<00:01,  1.79it/s]

 92%|█████████▏| 22/24 [00:12<00:01,  1.81it/s]

 96%|█████████▌| 23/24 [00:12<00:00,  1.83it/s]

100%|██████████| 24/24 [00:13<00:00,  1.81it/s]

100%|██████████| 24/24 [00:13<00:00,  1.81it/s]





Single Solvent MSE WITH ChemBERTa: 0.016890


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:00<00:05,  4.08it/s]

  8%|▊         | 2/24 [00:00<00:05,  4.16it/s]

 12%|█▎        | 3/24 [00:00<00:05,  3.76it/s]

 17%|█▋        | 4/24 [00:01<00:05,  3.67it/s]

 21%|██        | 5/24 [00:01<00:05,  3.77it/s]

 25%|██▌       | 6/24 [00:01<00:04,  3.80it/s]

 29%|██▉       | 7/24 [00:01<00:04,  3.90it/s]

 33%|███▎      | 8/24 [00:02<00:04,  3.80it/s]

 38%|███▊      | 9/24 [00:02<00:03,  3.77it/s]

 42%|████▏     | 10/24 [00:02<00:03,  3.83it/s]

 46%|████▌     | 11/24 [00:02<00:03,  3.84it/s]

 50%|█████     | 12/24 [00:03<00:03,  3.83it/s]

 54%|█████▍    | 13/24 [00:03<00:02,  3.81it/s]

 58%|█████▊    | 14/24 [00:03<00:02,  3.86it/s]

 62%|██████▎   | 15/24 [00:03<00:02,  3.92it/s]

 67%|██████▋   | 16/24 [00:04<00:02,  3.97it/s]

 71%|███████   | 17/24 [00:04<00:01,  3.93it/s]

 75%|███████▌  | 18/24 [00:04<00:01,  3.63it/s]

 79%|███████▉  | 19/24 [00:05<00:01,  3.54it/s]

 83%|████████▎ | 20/24 [00:05<00:01,  3.69it/s]

 88%|████████▊ | 21/24 [00:05<00:00,  3.79it/s]

 92%|█████████▏| 22/24 [00:05<00:00,  3.79it/s]

 96%|█████████▌| 23/24 [00:06<00:00,  3.75it/s]

100%|██████████| 24/24 [00:06<00:00,  3.82it/s]

100%|██████████| 24/24 [00:06<00:00,  3.81it/s]

Single Solvent MSE WITHOUT ChemBERTa: 0.012062

Difference: 40.03%





In [10]:
# Cross-validation for full data
print("="*60)
print("Cross-validation: Full Data (Leave-One-Ramp-Out)")
print("="*60)

X_full, Y_full = load_data("full")
print(f"Full data: X={X_full.shape}, Y={Y_full.shape}")


def _full_data_fold_mses(use_chemberta):
    """Run leave-one-ramp-out CV and return the per-fold MSEs.

    A fresh ``ChemBERTaLGBMModel`` in ``'full'`` mode is trained for every
    fold. Each fold's MSE is the mean squared error over all held-out rows and
    all targets.

    Args:
        use_chemberta: Forwarded to ``ChemBERTaLGBMModel``.

    Returns:
        List with one MSE per held-out (solvent A, solvent B) pair.
    """
    # One fold per distinct ordered solvent pair: derive the progress-bar
    # total from the data instead of hard-coding it.
    n_folds = len(X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates())
    fold_mses = []
    splits = generate_leave_one_ramp_out_splits(X_full, Y_full)
    for (train_X, train_Y), (test_X, test_Y) in tqdm.tqdm(splits, total=n_folds):
        model = ChemBERTaLGBMModel(data='full', use_chemberta=use_chemberta)
        model.train_model(train_X, train_Y)
        preds = model.predict(test_X).numpy()
        fold_mses.append(np.mean((preds - test_Y.values) ** 2))
    return fold_mses


# Test with ChemBERTa features
all_mse_full_with = _full_data_fold_mses(use_chemberta=True)
mse_full_with_chemberta = np.mean(all_mse_full_with)
print(f"\nFull Data MSE WITH ChemBERTa: {mse_full_with_chemberta:.6f}")

# Test without ChemBERTa features (baseline)
all_mse_full_without = _full_data_fold_mses(use_chemberta=False)
mse_full_without_chemberta = np.mean(all_mse_full_without)
print(f"Full Data MSE WITHOUT ChemBERTa: {mse_full_without_chemberta:.6f}")
print(f"\nDifference: {(mse_full_with_chemberta - mse_full_without_chemberta) / mse_full_without_chemberta * 100:.2f}%")

Cross-validation: Full Data (Leave-One-Ramp-Out)
Full data: X=(1227, 5), Y=(1227, 3)


  0%|          | 0/13 [00:00<?, ?it/s]

  8%|▊         | 1/13 [00:01<00:23,  2.00s/it]

 15%|█▌        | 2/13 [00:03<00:21,  1.96s/it]

 23%|██▎       | 3/13 [00:05<00:19,  1.98s/it]

 31%|███       | 4/13 [00:08<00:18,  2.02s/it]

 38%|███▊      | 5/13 [00:10<00:16,  2.04s/it]

 46%|████▌     | 6/13 [00:12<00:14,  2.04s/it]

 54%|█████▍    | 7/13 [00:14<00:12,  2.01s/it]

 62%|██████▏   | 8/13 [00:16<00:10,  2.02s/it]

 69%|██████▉   | 9/13 [00:18<00:08,  2.03s/it]

 77%|███████▋  | 10/13 [00:20<00:05,  2.00s/it]

 85%|████████▍ | 11/13 [00:22<00:04,  2.05s/it]

 92%|█████████▏| 12/13 [00:24<00:02,  2.06s/it]

100%|██████████| 13/13 [00:26<00:00,  2.05s/it]

100%|██████████| 13/13 [00:26<00:00,  2.03s/it]





Full Data MSE WITH ChemBERTa: 0.013525


  0%|          | 0/13 [00:00<?, ?it/s]

  8%|▊         | 1/13 [00:00<00:05,  2.02it/s]

 15%|█▌        | 2/13 [00:00<00:05,  2.05it/s]

 23%|██▎       | 3/13 [00:01<00:04,  2.03it/s]

 31%|███       | 4/13 [00:02<00:04,  1.92it/s]

 38%|███▊      | 5/13 [00:02<00:04,  1.89it/s]

 46%|████▌     | 6/13 [00:03<00:03,  1.91it/s]

 54%|█████▍    | 7/13 [00:03<00:03,  1.97it/s]

 62%|██████▏   | 8/13 [00:04<00:02,  1.90it/s]

 69%|██████▉   | 9/13 [00:04<00:02,  1.79it/s]

 77%|███████▋  | 10/13 [00:05<00:01,  1.77it/s]

 85%|████████▍ | 11/13 [00:05<00:01,  1.82it/s]

 92%|█████████▏| 12/13 [00:06<00:00,  1.87it/s]

100%|██████████| 13/13 [00:06<00:00,  1.85it/s]

100%|██████████| 13/13 [00:06<00:00,  1.88it/s]

Full Data MSE WITHOUT ChemBERTa: 0.009972

Difference: 35.62%





In [11]:
# Calculate overall MSE (weighted average)
# Each dataset's fold-averaged MSE is weighted by that dataset's row count.
N_single = len(X_single)
N_full = len(X_full)
N_total = N_single + N_full

overall_mse_with = (mse_with_chemberta * N_single + mse_full_with_chemberta * N_full) / N_total
overall_mse_without = (mse_without_chemberta * N_single + mse_full_without_chemberta * N_full) / N_total

summary_rule = "="*60
print(summary_rule)
print("SUMMARY")
print(summary_rule)

# One report section per model variant: heading, then the three MSE lines.
for section_heading, section_single, section_full, section_overall in (
    ("\nWITH ChemBERTa embeddings:", mse_with_chemberta, mse_full_with_chemberta, overall_mse_with),
    ("\nWITHOUT ChemBERTa embeddings (baseline LGBM):", mse_without_chemberta, mse_full_without_chemberta, overall_mse_without),
):
    print(section_heading)
    print(f"  Single Solvent MSE: {section_single:.6f}")
    print(f"  Full Data MSE: {section_full:.6f}")
    print(f"  Overall MSE: {section_overall:.6f}")

# Relative change in percent; positive means ChemBERTa is worse.
vs_baseline_pct = (overall_mse_with - overall_mse_without) / overall_mse_without * 100
vs_best_pct = (overall_mse_with - 0.008298) / 0.008298 * 100

print("\nComparison:")
print(f"  ChemBERTa vs Baseline: {vs_baseline_pct:.2f}%")
print("  Best GP+MLP+LGBM ensemble (exp_030): 0.008298")
print(f"  ChemBERTa LGBM vs Best: {vs_best_pct:.2f}%")

SUMMARY

WITH ChemBERTa embeddings:
  Single Solvent MSE: 0.016890
  Full Data MSE: 0.013525
  Overall MSE: 0.014697

WITHOUT ChemBERTa embeddings (baseline LGBM):
  Single Solvent MSE: 0.012062
  Full Data MSE: 0.009972
  Overall MSE: 0.010700

Comparison:
  ChemBERTa vs Baseline: 37.35%
  Best GP+MLP+LGBM ensemble (exp_030): 0.008298
  ChemBERTa LGBM vs Best: 77.12%


In [13]:
# Try with PCA-reduced ChemBERTa embeddings
from sklearn.decomposition import PCA

print("="*60)
print("Testing PCA-reduced ChemBERTa embeddings")
print("="*60)

# Try different PCA dimensions (max 26 since we have 26 solvents)
pca_dims = [5, 10, 15, 20, 25]

# The two classes below are declared once, outside the sweep. They reuse the
# featurization and training code of ChemBERTaFeaturizer / ChemBERTaLGBMModel
# instead of repeating it on every loop iteration.
class ChemBERTaPCAFeaturizer(ChemBERTaFeaturizer):
    """``ChemBERTaFeaturizer`` whose ChemBERTa slice is the current PCA projection.

    The module-level ``CHEMBERTA_PCA_DF`` is read at construction time, so it
    must be assigned before an instance is created.
    """

    def __init__(self, mixed=False):
        super().__init__(mixed=mixed, use_chemberta=True)
        # Swap the full embedding table for the PCA-reduced one.
        self.chemberta_df = CHEMBERTA_PCA_DF
        self.feats_dim = 2 + 3 + self.spange_df.shape[1] + self.drfp_df.shape[1] + self.acs_pca_df.shape[1] + self.chemberta_df.shape[1]


class ChemBERTaPCALGBMModel(ChemBERTaLGBMModel):
    """``ChemBERTaLGBMModel`` wired to ``ChemBERTaPCAFeaturizer``.

    ``train_model`` and ``predict`` are inherited unchanged.
    """

    def __init__(self, data='single'):
        self.data = data
        self.use_chemberta = True
        self.featurizer = ChemBERTaPCAFeaturizer(mixed=(data == 'full'))
        self.models = []
        self.scalers = []


for n_components in pca_dims:
    # Apply PCA to ChemBERTa embeddings.
    # The projection is fitted on every solvent's embedding, including the one
    # held out in each fold (unsupervised: no target values are involved).
    # random_state is fixed because scikit-learn's automatic solver choice may
    # select the randomized SVD for this input shape, which would otherwise
    # depend on the global NumPy RNG state.
    pca = PCA(n_components=n_components, random_state=42)
    chemberta_pca = pca.fit_transform(CHEMBERTA_DF.values)
    CHEMBERTA_PCA_DF = pd.DataFrame(
        chemberta_pca,
        index=CHEMBERTA_DF.index,
        columns=[f'chemberta_pca_{i}' for i in range(n_components)]
    )

    # Quick CV on single solvent
    all_mse = []
    for (train_X, train_Y), (test_X, test_Y) in generate_leave_one_out_splits(X_single, Y_single):
        model = ChemBERTaPCALGBMModel(data='single')
        model.train_model(train_X, train_Y)
        preds = model.predict(test_X).numpy()
        mse = np.mean((preds - test_Y.values) ** 2)
        all_mse.append(mse)

    single_mse = np.mean(all_mse)
    print(f"PCA n_components={n_components}: Single Solvent MSE = {single_mse:.6f} (vs baseline {mse_without_chemberta:.6f})")

Testing PCA-reduced ChemBERTa embeddings


PCA n_components=5: Single Solvent MSE = 0.012305 (vs baseline 0.012062)


PCA n_components=10: Single Solvent MSE = 0.014098 (vs baseline 0.012062)


PCA n_components=15: Single Solvent MSE = 0.014733 (vs baseline 0.012062)


PCA n_components=20: Single Solvent MSE = 0.013933 (vs baseline 0.012062)


PCA n_components=25: Single Solvent MSE = 0.014101 (vs baseline 0.012062)


In [14]:
# Test ChemBERTa embeddings ONLY (no other features)
print("="*60)
print("Testing ChemBERTa embeddings ONLY (no other features)")
print("="*60)

class ChemBERTaOnlyFeaturizer:
    """Kinetic terms plus PCA-reduced ChemBERTa embeddings, nothing else.

    Output columns are ``[kinetic (5) | ChemBERTa PCA (n_pca)]``; the Spange,
    DRFP and ACS descriptor tables are not used.
    """

    def __init__(self, mixed=False, n_pca=10):
        """Fit the PCA projection and store the reduced lookup table.

        Args:
            mixed: If true, rows describe a two-solvent mixture and embeddings
                are blended by ``"SolventB%"``; otherwise rows carry a single
                ``"SOLVENT NAME"``.
            n_pca: Number of principal components kept.
        """
        self.mixed = mixed
        self.n_pca = n_pca
        # Apply PCA to reduce dimensionality.
        # A new featurizer is built for every CV fold, so the PCA is seeded to
        # give each fold the same projection. Without a seed, scikit-learn's
        # automatic solver choice may pick the randomized SVD, whose result
        # depends on the global NumPy RNG state.
        pca = PCA(n_components=n_pca, random_state=42)
        chemberta_pca = pca.fit_transform(CHEMBERTA_DF.values)
        self.chemberta_df = pd.DataFrame(
            chemberta_pca,
            index=CHEMBERTA_DF.index,
            columns=[f'chemberta_pca_{i}' for i in range(n_pca)]
        )
        self.feats_dim = 2 + 3 + n_pca  # kinetic + chemberta

    def featurize(self, X, flip=False):
        """Build the feature matrix for the rows of ``X``.

        Kinetic columns are: the two raw numeric inputs, ``1000 / T`` with the
        temperature shifted by 273.15, ``log(time + 1e-6)``, and the product
        of those two derived terms.

        Args:
            X: Frame with the numeric input columns plus either
                ``"SOLVENT NAME"`` or the A/B solvent names and ``"SolventB%"``.
            flip: Only used for mixtures: blends with B first and weight
                ``1 - pct``. This describes the same weighted average as
                ``flip=False`` up to floating-point rounding.

        Returns:
            2-D NumPy array with one row per row of ``X``.
        """
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        time_m = X_vals[:, 0:1]
        temp_k = X_vals[:, 1:2] + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time
        X_kinetic = np.hstack([X_vals, inv_temp, log_time, interaction])

        if self.mixed:
            A_chemberta = self.chemberta_df.loc[X["SOLVENT A NAME"]].values
            B_chemberta = self.chemberta_df.loc[X["SOLVENT B NAME"]].values
            # assumes "SolventB%" is a fraction in [0, 1] rather than 0-100 - TODO confirm
            pct = X["SolventB%"].values.reshape(-1, 1)
            if flip:
                X_chemberta = B_chemberta * (1 - (1-pct)) + A_chemberta * (1-pct)
            else:
                X_chemberta = A_chemberta * (1 - pct) + B_chemberta * pct
        else:
            X_chemberta = self.chemberta_df.loc[X["SOLVENT NAME"]].values

        return np.hstack([X_kinetic, X_chemberta])

class ChemBERTaOnlyLGBMModel:
    """One LightGBM regressor per target column on top of ``ChemBERTaOnlyFeaturizer``.

    Attributes:
        data: ``'single'`` or ``'full'``; ``'full'`` switches the featurizer to
            mixed-solvent mode.
        featurizer: The ``ChemBERTaOnlyFeaturizer`` that builds the features.
        models: Fitted regressors, one per target column (empty until trained).
        scalers: One entry per target, parallel to ``models``. Every entry is
            the same fitted ``StandardScaler`` because the inputs are shared.
    """

    def __init__(self, data='single', n_pca=10):
        self.data = data
        self.featurizer = ChemBERTaOnlyFeaturizer(mixed=(data == 'full'), n_pca=n_pca)
        self.models = []
        self.scalers = []

    def train_model(self, X, Y):
        """Fit one regressor per column of ``Y``.

        Args:
            X: Raw input frame accepted by the featurizer.
            Y: Target frame; each column gets its own regressor.
        """
        X_feat = self.featurizer.featurize(X)
        Y_np = Y.values
        n_targets = Y_np.shape[1]

        # The feature matrix is identical for every target, so the scaler is
        # fitted once and shared instead of being refitted inside the loop.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_feat)

        self.models = []
        self.scalers = [scaler] * n_targets

        for target_idx in range(n_targets):
            # NOTE(review): LightGBM only applies row subsampling when
            # `subsample_freq` > 0, so `subsample=0.8` presumably has no effect
            # with the default frequency - confirm before relying on it.
            model = lgb.LGBMRegressor(
                n_estimators=200, learning_rate=0.05, max_depth=6,
                num_leaves=31, min_child_samples=5, subsample=0.8,
                colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1,
                random_state=42, verbose=-1
            )
            model.fit(X_scaled, Y_np[:, target_idx])
            self.models.append(model)

    def predict(self, X):
        """Predict every target for the rows of ``X``.

        Args:
            X: Raw input frame accepted by the featurizer.

        Returns:
            ``torch.Tensor`` with one row per input row and one column per
            trained target.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if not self.models:
            raise RuntimeError("train_model must be called before predict")

        X_feat = self.featurizer.featurize(X)
        preds = [
            model.predict(scaler.transform(X_feat))
            for scaler, model in zip(self.scalers, self.models)
        ]
        return torch.tensor(np.stack(preds, axis=1))

# Test ChemBERTa only
# One fold per distinct solvent: derive the progress-bar total from the data
# instead of hard-coding it.
n_only_folds = len(X_single["SOLVENT NAME"].unique())
all_mse_only = []
for (train_X, train_Y), (test_X, test_Y) in tqdm.tqdm(generate_leave_one_out_splits(X_single, Y_single), total=n_only_folds):
    model = ChemBERTaOnlyLGBMModel(data='single', n_pca=10)
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Fold score: mean squared error over all held-out rows and all targets.
    mse = np.mean((preds - test_Y.values) ** 2)
    all_mse_only.append(mse)

# Folds are averaged unweighted, matching how the baseline number was computed.
mse_chemberta_only = np.mean(all_mse_only)
print(f"\nChemBERTa ONLY (no Spange/DRFP/ACS): Single Solvent MSE = {mse_chemberta_only:.6f}")
print(f"Baseline LGBM (with Spange/DRFP/ACS): Single Solvent MSE = {mse_without_chemberta:.6f}")
print(f"Difference: {(mse_chemberta_only - mse_without_chemberta) / mse_without_chemberta * 100:.2f}%")

Testing ChemBERTa embeddings ONLY (no other features)


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:00<00:05,  4.40it/s]

  8%|▊         | 2/24 [00:00<00:06,  3.67it/s]

 12%|█▎        | 3/24 [00:00<00:05,  3.59it/s]

 17%|█▋        | 4/24 [00:01<00:05,  3.68it/s]

 21%|██        | 5/24 [00:01<00:05,  3.45it/s]

 25%|██▌       | 6/24 [00:01<00:04,  3.63it/s]

 29%|██▉       | 7/24 [00:01<00:04,  3.65it/s]

 33%|███▎      | 8/24 [00:02<00:04,  3.64it/s]

 38%|███▊      | 9/24 [00:02<00:04,  3.62it/s]

 42%|████▏     | 10/24 [00:02<00:03,  3.64it/s]

 46%|████▌     | 11/24 [00:03<00:03,  3.70it/s]

 50%|█████     | 12/24 [00:03<00:03,  3.74it/s]

 54%|█████▍    | 13/24 [00:03<00:03,  3.56it/s]

 58%|█████▊    | 14/24 [00:03<00:02,  3.75it/s]

 62%|██████▎   | 15/24 [00:04<00:02,  3.56it/s]

 67%|██████▋   | 16/24 [00:04<00:02,  3.61it/s]

 71%|███████   | 17/24 [00:04<00:01,  3.54it/s]

 75%|███████▌  | 18/24 [00:04<00:01,  3.71it/s]

 79%|███████▉  | 19/24 [00:05<00:01,  3.67it/s]

 83%|████████▎ | 20/24 [00:05<00:01,  3.69it/s]

 88%|████████▊ | 21/24 [00:05<00:00,  3.59it/s]

 92%|█████████▏| 22/24 [00:06<00:00,  3.51it/s]

 96%|█████████▌| 23/24 [00:06<00:00,  3.56it/s]

100%|██████████| 24/24 [00:06<00:00,  3.49it/s]

100%|██████████| 24/24 [00:06<00:00,  3.61it/s]


ChemBERTa ONLY (no Spange/DRFP/ACS): Single Solvent MSE = 0.020806
Baseline LGBM (with Spange/DRFP/ACS): Single Solvent MSE = 0.012062
Difference: 72.49%





In [15]:
# Final summary
print("="*60)
print("FINAL SUMMARY: ChemBERTa Frozen Embeddings Experiment")
print("="*60)

print("""
HYPOTHESIS: Pre-trained ChemBERTa embeddings capture chemistry knowledge 
that can't be learned from 24 solvents. Using them as FROZEN features 
(not fine-tuning) could reduce the CV-LB intercept.

RESULTS:
1. Full ChemBERTa (768 dims) + existing features: 37% WORSE than baseline
2. PCA-reduced ChemBERTa (5-25 dims) + existing features: 2-22% WORSE
3. ChemBERTa ONLY (no domain features): 72% WORSE than baseline

CONCLUSION:
ChemBERTa frozen embeddings do NOT help for this problem. The domain-specific 
features (Spange descriptors, DRFP, ACS PCA) already capture the relevant 
chemistry information better than generic pre-trained embeddings.

This is likely because:
1. ChemBERTa was pre-trained on general molecular properties, not solvent effects
2. The Spange descriptors are specifically designed for solvent polarity/acidity
3. DRFP captures reaction-specific fingerprints
4. Adding 768 noisy dimensions hurts the model more than it helps

NEXT STEPS:
- Try the "mixall" kernel approach with GroupKFold(5)
- Focus on the existing domain-specific features
- Consider other pre-trained models specifically for solvent properties
""")

# Record the best CV score from this experiment.
# The exp_030 reference (0.008298) is compared against the row-weighted overall
# MSE in the summary cell, so the like-for-like figure here is the overall
# baseline, not the single-solvent MSE on its own.
best_cv_this_exp = overall_mse_without  # Baseline LGBM without ChemBERTa (single + full)
print(f"Best CV from this experiment: {best_cv_this_exp:.6f}")
print(f"Best CV overall (exp_030): 0.008298")
print(f"This experiment vs best: {(best_cv_this_exp - 0.008298) / 0.008298 * 100:.2f}% worse")

FINAL SUMMARY: ChemBERTa Frozen Embeddings Experiment

HYPOTHESIS: Pre-trained ChemBERTa embeddings capture chemistry knowledge 
that can't be learned from 24 solvents. Using them as FROZEN features 
(not fine-tuning) could reduce the CV-LB intercept.

RESULTS:
1. Full ChemBERTa (768 dims) + existing features: 37% WORSE than baseline
2. PCA-reduced ChemBERTa (5-25 dims) + existing features: 2-22% WORSE
3. ChemBERTa ONLY (no domain features): 72% WORSE than baseline

CONCLUSION:
ChemBERTa frozen embeddings do NOT help for this problem. The domain-specific 
features (Spange descriptors, DRFP, ACS PCA) already capture the relevant 
chemistry information better than generic pre-trained embeddings.

This is likely because:
1. ChemBERTa was pre-trained on general molecular properties, not solvent effects
2. The Spange descriptors are specifically designed for solvent polarity/acidity
3. DRFP captures reaction-specific fingerprints
4. Adding 768 noisy dimensions hurts the model more than it h

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

# Single-solvent evaluation loop: train a fresh model on every split and
# collect its predictions in long format (one dict per predicted row).
# NOTE(review): load_data and generate_leave_one_out_splits are defined outside
# this cell; their exact return types are not visible here.
X, Y = load_data("single_solvent")

# Each item produced by the generator unpacks as
# ((train_X, train_Y), (test_X, test_Y)) -- see the first line of the loop.
split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

# tqdm wraps an enumerate() iterator, so it shows a running count only.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    # test_Y is unpacked but never used in this cell; only predictions are stored.
    (train_X, train_Y), (test_X, test_Y) = split

    # A new model instance is created and trained for every fold.
    model = ChemBERTaLGBMModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    # predict() must return an object supporting .detach().cpu().numpy()
    # (presumably a torch tensor -- confirm against the model class).
    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "task": 0 tags the rows from this single-solvent loop; "row" is the
    # position of the sample within this fold's predictions.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
# Nothing is written to disk here: the records are only gathered into a DataFrame.
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Full-data evaluation loop: train a fresh model on every split and collect
# its predictions in long format (one dict per predicted row).
# NOTE(review): load_data and generate_leave_one_ramp_out_splits are defined
# outside this cell; their exact return types are not visible here.
X, Y = load_data("full")

# Each item produced by the generator unpacks as
# ((train_X, train_Y), (test_X, test_Y)) -- see the first line of the loop.
split_generator = generate_leave_one_ramp_out_splits(X, Y)
# Rebinds all_predictions to a new empty list for this loop.
all_predictions = []

# tqdm wraps an enumerate() iterator, so it shows a running count only.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    # test_Y is unpacked but never used in this cell; only predictions are stored.
    (train_X, train_Y), (test_X, test_Y) = split

    # A new model instance is created and trained for every fold.
    model = ChemBERTaLGBMModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    # predict() must return an object supporting .detach().cpu().numpy()
    # (presumably a torch tensor -- confirm against the model class).
    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "task": 1 tags the rows from this full-data loop; "row" is the position
    # of the sample within this fold's predictions.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
# Nothing is written to disk here: the records are only gathered into a DataFrame.
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Stack the single-solvent and full-data prediction frames row-wise; each frame
# keeps its own index at this point, so index labels can repeat.
submission = pd.concat([submission_single_solvent, submission_full_data])
# reset_index() is called without drop=True, so the previous index is kept as
# a regular column named "index" and a fresh 0..n-1 index is installed.
# NOTE(review): presumably the grader expects this extra column -- do not "fix" it.
submission = submission.reset_index()
submission.index.name = "id"
# index=True writes the "id" index as the first column of the CSV.
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################