# Experiment 097: Frozen ChemBERTa Embeddings + Ensemble

**Goal**: Use pre-trained ChemBERTa embeddings as features instead of training from scratch.

**Rationale**:
- GNNs trained from scratch failed (exp_096: CV 452% worse)
- ChemBERTa was pre-trained on 10M+ SMILES strings
- Frozen embeddings capture molecular structure without overfitting
- Combines pre-trained knowledge with our best ensemble approach

**Approach**:
1. Extract ChemBERTa embeddings for all 24 solvents
2. Use embeddings + Spange descriptors + Arrhenius features for GP+MLP+LGBM ensemble
3. Compare CV to baseline (0.0081)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
import lightgbm as lgb
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

# Report whether CUDA is usable and, if so, which device index 0 maps to.
cuda_ready = torch.cuda.is_available()
print(f"GPU available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Data loading functions
# Directory prefix for every CSV read by load_data() and load_features().
DATA_PATH = '/home/data'

# Numeric process columns; not referenced by the code visible in this notebook.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns selected by load_data() for the single-solvent dataset.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns selected by load_data() for the full (two-solvent) dataset.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Target columns returned as Y by load_data(); models predict them in this order.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the full (two-solvent) CSV; any other value
            selects the single-solvent CSV.

    Returns:
        A ``(X, Y)`` tuple of DataFrames: the input columns for the chosen
        dataset and the ``TARGET_LABELS`` columns.
    """
    wants_full = name == "full"
    csv_name = 'catechol_full_data_yields.csv' if wants_full else 'catechol_single_solvent_yields.csv'
    input_columns = INPUT_LABELS_FULL_SOLVENT if wants_full else INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return frame[input_columns], frame[TARGET_LABELS]

def load_features(name):
    """Load ``<DATA_PATH>/<name>_lookup.csv`` using its first column as the index."""
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out splits, one per distinct "SOLVENT NAME".

    Solvents are visited in sorted order. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
    exactly the rows of the held-out solvent.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        is_train = X["SOLVENT NAME"] != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out splits over distinct (solvent A, solvent B) pairs.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds every
    row whose solvent A and solvent B both match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in unique_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

In [None]:
# Base classes
class SmilesFeaturizer(ABC):
    """Interface stub for objects that turn inputs into feature arrays.

    Both methods raise ``NotImplementedError`` here, so a subclass has to
    supply its own ``__init__`` and ``featurize``. Neither method is marked
    with ``@abstractmethod``.
    """

    def __init__(self):
        """Always raises; a subclass must define its own constructor."""
        raise NotImplementedError

    def featurize(self, X):
        """Return features for ``X``; not implemented in this base class."""
        raise NotImplementedError

class BaseModel(ABC):
    """Interface stub for models exposing ``train_model`` and ``predict``.

    The two methods raise ``NotImplementedError`` until a subclass overrides
    them. Neither is marked with ``@abstractmethod``.
    """

    def __init__(self):
        """Do nothing; the base class keeps no state."""
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on ``X_train`` / ``y_train``; not implemented here."""
        raise NotImplementedError

    def predict(self, X):
        """Return predictions for ``X``; not implemented here."""
        raise NotImplementedError

In [None]:
# Load SMILES lookup
# Both lookup tables live under DATA_PATH as "<name>_lookup.csv" and are read
# with their first column as the index.
SMILES_DF = load_features("smiles")
SPANGE_DF = load_features("spange_descriptors")

for table_label, table in (("SMILES lookup", SMILES_DF), ("Spange", SPANGE_DF)):
    print(f"{table_label}: {table.shape}")
print(f"\nSample SMILES:")
print(SMILES_DF.head())

In [None]:
# Load ChemBERTa model and extract embeddings
from transformers import AutoModel, AutoTokenizer

# The tokenizer and the encoder are loaded from the same checkpoint name.
CHEMBERTA_CHECKPOINT = "seyonec/ChemBERTa-zinc-base-v1"

print("Loading ChemBERTa model...")
tokenizer = AutoTokenizer.from_pretrained(CHEMBERTA_CHECKPOINT)
model = AutoModel.from_pretrained(CHEMBERTA_CHECKPOINT)
# The encoder is never trained in this notebook, so keep it in evaluation mode.
# NOTE: later cells rebind the global name `model` to a ChemBERTaEnsembleModel,
# so the embedding-extraction cell has to run before them.
model.eval()

print(f"Model loaded. Hidden size: {model.config.hidden_size}")

In [None]:
# Extract embeddings for all solvents
def extract_chemberta_embedding(smiles, tokenizer, model):
    """Return the mask-weighted mean of the last hidden state for one SMILES.

    Args:
        smiles: SMILES string passed to ``tokenizer``.
        tokenizer: Callable returning a mapping that contains
            ``'attention_mask'`` and can be unpacked into ``model``.
        model: Callable whose result exposes ``last_hidden_state``.

    Returns:
        A NumPy array holding the pooled embedding with size-1 dimensions
        squeezed away.
    """
    encoded = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # Broadcast the attention mask over the hidden dimension so masked
        # positions contribute nothing to the sum.
        mask = encoded['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
        masked_total = (hidden_states * mask).sum(dim=1)
        # Clamp the token count so an all-zero mask cannot divide by zero.
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        pooled = masked_total / token_count
    return pooled.squeeze().numpy()

# Extract embeddings for all solvents
print("Extracting ChemBERTa embeddings for all solvents...")
# Maps each index label of SMILES_DF to its pooled embedding vector.
embeddings_dict = {}
for solvent_name in SMILES_DF.index:
    solvent_smiles = SMILES_DF.loc[solvent_name, 'solvent smiles']
    solvent_embedding = extract_chemberta_embedding(solvent_smiles, tokenizer, model)
    print(f"  {solvent_name}: {solvent_embedding.shape}")
    embeddings_dict[solvent_name] = solvent_embedding

# One row per solvent, one column per embedding dimension.
CHEMBERTA_DF = pd.DataFrame.from_dict(embeddings_dict, orient='index')
print(f"\nChemBERTa embeddings shape: {CHEMBERTA_DF.shape}")

In [None]:
# MLP model
class SimpleMLP(nn.Module):
    """Feed-forward regressor made of Linear -> BatchNorm1d -> ReLU -> Dropout stages.

    Args:
        input_dim: Width of the input feature vector.
        output_dim: Width of the final linear layer.
        hidden_dims: Width of each hidden stage, in order.
        dropout: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, output_dim=3, hidden_dims=[128, 64], dropout=0.1):
        super().__init__()
        widths = [input_dim, *hidden_dims]
        stages = []
        # Consecutive width pairs give the (in, out) size of every hidden stage.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        stages.append(nn.Linear(widths[-1], output_dim))
        self.network = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked stages and return the raw outputs."""
        return self.network(x)

print("MLP defined")

In [None]:
# ChemBERTa Ensemble Model
class ChemBERTaEnsembleModel(BaseModel):
    """Weighted GP + MLP + LightGBM ensemble over ChemBERTa-based features.

    All three sub-models are trained on the same standardized feature matrix,
    which concatenates residence time, temperature, three Arrhenius-style
    terms, the solvent's ChemBERTa embedding and its Spange descriptors. The
    solvent lookups come from the module-level ``CHEMBERTA_DF`` and
    ``SPANGE_DF`` tables, so both must exist before features are built.

    Args:
        data: ``'single'`` reads the "SOLVENT NAME" column; any other value
            reads "SOLVENT A NAME", "SOLVENT B NAME" and "SolventB%" and
            blends the two solvents' features.
        hidden_dims: Hidden layer widths forwarded to ``SimpleMLP``.
        dropout: Dropout probability forwarded to ``SimpleMLP``.
        num_epochs: Number of passes over the training data for the MLP.
        lr: Adam learning rate for the MLP.
    """

    # Mini-batch size used when fitting the MLP.
    _BATCH_SIZE = 32
    # Upper bound on the number of rows the GP is fitted on.
    _MAX_GP_SAMPLES = 200

    def __init__(self, data='single', hidden_dims=[128, 64], dropout=0.1, num_epochs=100, lr=0.001):
        super().__init__()
        self.data = data
        # Copy so instances never share the default list object.
        self.hidden_dims = list(hidden_dims)
        self.dropout = dropout
        self.num_epochs = num_epochs
        self.lr = lr

        # The MLP is built in train_model(), once the feature width is known.
        self.mlp = None
        self.lgbm = MultiOutputRegressor(lgb.LGBMRegressor(
            num_leaves=31,
            learning_rate=0.1,
            n_estimators=100,
            random_state=42,
            verbosity=-1
        ))

        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        self.gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)

        self.scaler = StandardScaler()

        # Ensemble weights, in the order GP, MLP, LGBM.
        self.weights = [0.3, 0.4, 0.3]

    @staticmethod
    def _to_lgbm_frame(X_scaled):
        """Wrap a feature matrix in a DataFrame with stable string column names."""
        feature_names = [str(i) for i in range(X_scaled.shape[1])]
        return pd.DataFrame(X_scaled, columns=feature_names)

    def _get_features(self, X):
        """Build the raw (unscaled) feature matrix for the rows of ``X``.

        Returns:
            A 2-D array whose columns are: time, temperature, 1000 / (T + 273.15),
            log(time + 1e-6), their product, the ChemBERTa embedding and the
            Spange descriptors.
        """
        time = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)
        # NOTE(review): the 273.15 offset assumes Temperature is in Celsius - confirm.
        temp_k = temp + 273.15
        inv_temp = 1000.0 / temp_k
        # The small offset keeps the logarithm finite for a zero residence time.
        log_time = np.log(time + 1e-6)
        interaction = inv_temp * log_time

        if self.data == 'single':
            chemberta = CHEMBERTA_DF.loc[X['SOLVENT NAME']].values
            spange = SPANGE_DF.loc[X['SOLVENT NAME']].values
        else:
            # Blend the two solvents' features linearly by the share of solvent B.
            # NOTE(review): assumes "SolventB%" is a fraction in [0, 1], not 0-100 - confirm.
            pct = X['SolventB%'].values.reshape(-1, 1)
            chemberta_a = CHEMBERTA_DF.loc[X['SOLVENT A NAME']].values
            chemberta_b = CHEMBERTA_DF.loc[X['SOLVENT B NAME']].values
            chemberta = (1 - pct) * chemberta_a + pct * chemberta_b

            spange_a = SPANGE_DF.loc[X['SOLVENT A NAME']].values
            spange_b = SPANGE_DF.loc[X['SOLVENT B NAME']].values
            spange = (1 - pct) * spange_a + pct * spange_b

        return np.hstack([time, temp, inv_temp, log_time, interaction, chemberta, spange])

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit the scaler and the three sub-models.

        Args:
            train_X: DataFrame with the input columns required by ``data`` mode.
            train_Y: DataFrame of targets, one column per output.
            device: Torch device for the MLP; defaults to CUDA when available,
                otherwise CPU.
            verbose: Accepted for interface compatibility; currently unused.
        """
        X_np = self._get_features(train_X)
        Y_np = train_Y.values

        X_scaled = self.scaler.fit_transform(X_np)

        self.lgbm.fit(self._to_lgbm_frame(X_scaled), Y_np)

        # The GP is fitted on a random subset to bound its training cost.
        n_gp = min(self._MAX_GP_SAMPLES, len(X_scaled))
        indices = np.random.choice(len(X_scaled), n_gp, replace=False)
        # Fit on every target column. Fitting only column 0 and copying its
        # prediction into all outputs would feed the first target's estimate
        # into the ensemble for the other targets.
        self.gp.fit(X_scaled[indices], Y_np[indices])

        self.mlp = SimpleMLP(
            input_dim=X_scaled.shape[1],
            output_dim=3,
            hidden_dims=self.hidden_dims,
            dropout=self.dropout
        )

        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.mlp.to(device)

        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        Y_tensor = torch.tensor(Y_np, dtype=torch.float32).to(device)

        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=self.lr)
        criterion = nn.MSELoss()

        dataset = TensorDataset(X_tensor, Y_tensor)
        # Dropping the trailing partial batch avoids a single-row batch, which
        # BatchNorm1d rejects in training mode. With fewer rows than one batch
        # that would drop everything and leave the MLP untrained, so the
        # partial batch is kept in that case.
        drop_partial = len(dataset) >= self._BATCH_SIZE
        loader = DataLoader(dataset, batch_size=self._BATCH_SIZE, shuffle=True, drop_last=drop_partial)

        self.mlp.train()
        for epoch in range(self.num_epochs):
            for batch_X, batch_Y in loader:
                optimizer.zero_grad()
                pred = self.mlp(batch_X)
                loss = criterion(pred, batch_Y)
                loss.backward()
                optimizer.step()

    def predict(self, X):
        """Return the clipped, weighted ensemble prediction for ``X``.

        Returns:
            A ``torch.double`` tensor with one row per input row and one
            column per target, with negative values clipped to zero.
        """
        X_np = self._get_features(X)
        X_scaled = self.scaler.transform(X_np)

        self.mlp.eval()
        device = next(self.mlp.parameters()).device
        with torch.no_grad():
            X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            mlp_preds = self.mlp(X_tensor).cpu().numpy()

        lgb_preds = self.lgbm.predict(self._to_lgbm_frame(X_scaled))

        # The GP was fitted on all targets, so it returns one column per target.
        gp_preds = self.gp.predict(X_scaled)

        final_preds = (
            self.weights[0] * gp_preds +
            self.weights[1] * mlp_preds +
            self.weights[2] * lgb_preds
        )

        # Clip to non-negative values.
        final_preds = np.clip(final_preds, 0, None)

        return torch.tensor(final_preds, dtype=torch.double)

print("ChemBERTaEnsembleModel defined")
print(f"Feature dimension: {5 + CHEMBERTA_DF.shape[1] + SPANGE_DF.shape[1]}")

In [None]:
# Test the model
# Smoke test: a short fit on the whole single-solvent table, then a prediction
# for its first five rows. This checks the pipeline runs; it is not a score.
X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: {X_single.shape}, {Y_single.shape}")

# NOTE: this rebinds the global `model`, replacing the ChemBERTa encoder.
model = ChemBERTaEnsembleModel(data='single', num_epochs=20)
model.train_model(X_single, Y_single)

preds = model.predict(X_single.head(5))
print(f"Test predictions shape: {preds.shape}")
print(f"Sample predictions:\n{preds[:3]}")

In [None]:
# Run CV
import tqdm

def compute_cv_score(verbose=True):
    """Cross-validate ChemBERTaEnsembleModel on both datasets.

    The single-solvent table is scored with leave-one-solvent-out splits and
    the full table with leave-one-ramp-out splits. A fold's score is the mean
    squared error over all of its rows and targets; a dataset's score is the
    unweighted mean of its fold scores.

    Args:
        verbose: When true, print every fold score and each summary line.

    Returns:
        ``(single_cv, full_cv, combined_cv)`` where ``combined_cv`` is the
        plain average of the two dataset scores.
    """

    def _mean_fold_mse(splits, data_mode, fold_label):
        # Train a fresh model per fold and average the per-fold MSE values.
        fold_scores = []
        for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(splits):
            fold_model = ChemBERTaEnsembleModel(data=data_mode, num_epochs=100)
            fold_model.train_model(train_X, train_Y)

            fold_preds = fold_model.predict(test_X).detach().cpu().numpy()
            fold_mse = np.mean((fold_preds - test_Y.values) ** 2)
            fold_scores.append(fold_mse)
            if verbose:
                print(f"{fold_label} Fold {fold_idx}: MSE = {fold_mse:.6f}")
        return np.mean(fold_scores)

    # Single solvent CV
    X_single, Y_single = load_data("single_solvent")
    single_cv = _mean_fold_mse(
        generate_leave_one_out_splits(X_single, Y_single), 'single', "Single"
    )
    if verbose:
        print(f"\nSingle Solvent CV MSE: {single_cv:.6f}")

    # Full data CV
    X_full, Y_full = load_data("full")
    full_cv = _mean_fold_mse(
        generate_leave_one_ramp_out_splits(X_full, Y_full), 'full', "Full"
    )
    if verbose:
        print(f"\nFull Data CV MSE: {full_cv:.6f}")

    combined_cv = (single_cv + full_cv) / 2
    if verbose:
        print(f"\n=== Combined CV MSE: {combined_cv:.6f} ===")

    return single_cv, full_cv, combined_cv

print("Running CV with ChemBERTa embeddings...")
single_cv, full_cv, combined_cv = compute_cv_score()

In [None]:
# Save results
import json

# Relative gain over the 0.0081 baseline, in percent; positive means a lower CV.
improvement_text = f"{(0.0081 - combined_cv) / 0.0081 * 100:.2f}%"

results = dict(
    cv_score=float(combined_cv),
    single_cv=float(single_cv),
    full_cv=float(full_cv),
    model='ChemBERTaEnsembleModel (GP+MLP+LGBM with ChemBERTa embeddings)',
    baseline_cv=0.0081,
    improvement=improvement_text,
)

metrics_path = '/home/code/experiments/097_chemberta_embeddings/metrics.json'
with open(metrics_path, 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved")
print(f"Combined CV: {combined_cv:.6f}")
print(f"Baseline CV: 0.0081")
print(f"Improvement: {improvement_text}")

## Generate Submission (if CV is better than baseline)

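The following cells follow the official template structure. Note that they run unconditionally: the `GENERATE_SUBMISSION` flag set in the next cell is informational only and is not checked by the template cells.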

**CRITICAL**: The model class in submission cells MUST match the CV computation class (`ChemBERTaEnsembleModel`).

In [None]:
# Check if we should generate submission
# Flag whether the combined CV beats the 0.0081 baseline. The cells below do
# not read this flag, so it only drives the messages printed here.
GENERATE_SUBMISSION = bool(combined_cv < 0.0081)
if GENERATE_SUBMISSION:
    print(f"CV {combined_cv:.6f} is BETTER than baseline 0.0081!")
    print("Generating submission...")
else:
    print(f"CV {combined_cv:.6f} is WORSE than baseline 0.0081")
    print("Not generating submission.")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ChemBERTaEnsembleModel(data='single', num_epochs=100)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ChemBERTaEnsembleModel(data='full', num_epochs=100)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to standard location
import shutil
import os
os.makedirs('/home/submission', exist_ok=True)
shutil.copy("submission.csv", "/home/submission/submission.csv")
print("Submission saved!")
print(f"Shape: {submission.shape}")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################