# Experiment 041: ChemBERTa Pre-trained Embeddings

**Hypothesis:** Pre-trained molecular embeddings from ChemBERTa (trained on millions of molecules) will transfer better to unseen solvents than models trained from scratch.

**Approach:**
1. Use ChemBERTa to get 768-dim embeddings for each solvent SMILES
2. Combine with Arrhenius kinetics features
3. Feed into MLP model
4. Test on single fold first, then full CV if promising

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Check for transformers
# Import Hugging Face transformers; if it is missing, install it into the
# environment of the interpreter that is running this kernel and retry.
try:
    from transformers import AutoModel, AutoTokenizer
    print('Transformers available')
except ImportError:
    print('Installing transformers...')
    import importlib
    import subprocess
    import sys
    # `sys.executable -m pip` targets this kernel's environment (a bare `pip`
    # on PATH may belong to another Python). check=True surfaces a failed
    # install immediately instead of as a second, unexplained ImportError.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', 'transformers', '-q'],
        check=True,
    )
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    from transformers import AutoModel, AutoTokenizer
    print('Transformers installed')

# Seed the PyTorch and NumPy global RNGs, then select the compute device:
# CUDA when PyTorch reports it as available, otherwise the CPU.
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Transformers available
Using device: cuda


In [9]:
# Data loading: dataset directory, input-column schemas, and the CSV loader
DATA_PATH = '/home/data'

INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = INPUT_LABELS_NUMERIC + ["SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = INPUT_LABELS_NUMERIC + ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the catechol yield CSVs and split it into inputs and targets.

    Args:
        name: "full" selects the full (two-solvent) dataset; any other value
            selects the single-solvent dataset.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: ``X`` holds the input columns for the
        chosen dataset and ``Y`` holds the "SM", "Product 2" and "Product 3"
        columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_cols = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_cols = INPUT_LABELS_SINGLE_SOLVENT
    table = pd.read_csv(csv_path)
    target_cols = ["SM", "Product 2", "Product 3"]  # target column names as they appear in the CSVs
    return table[input_cols], table[target_cols]

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Read the solvent -> SMILES table; the first CSV column becomes the index
SMILES_DF = pd.read_csv(f'{DATA_PATH}/smiles_lookup.csv', index_col=0)
print(f'SMILES lookup: {SMILES_DF.shape[0]} solvents')
print(SMILES_DF.head(5))

SMILES lookup: 26 solvents
                                           solvent smiles
SOLVENT NAME                                             
Cyclohexane                                      C1CCCCC1
Ethyl Acetate                                   O=C(OCC)C
Acetic Acid                                       CC(=O)O
2-Methyltetrahydrofuran [2-MeTHF]              O1C(C)CCC1
1,1,1,3,3,3-Hexafluoropropan-2-ol  C(C(F)(F)F)(C(F)(F)F)O


In [4]:
# ChemBERTa Featurizer
class ChemBERTaFeaturizer:
    """Turn SMILES strings into embedding vectors using a pre-trained ChemBERTa model.

    The tokenizer and model are loaded once in the constructor, the model is
    put in eval mode and moved to the notebook-level ``device``. Embeddings are
    memoised per SMILES string in ``self.cache``.

    Args:
        model_name: Hugging Face checkpoint to load. Defaults to the checkpoint
            this notebook was written against, so ``ChemBERTaFeaturizer()``
            behaves as before.
    """

    DEFAULT_MODEL_NAME = 'seyonec/ChemBERTa-zinc-base-v1'

    def __init__(self, model_name=DEFAULT_MODEL_NAME):
        print('Loading ChemBERTa model...')
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()
        self.model.to(device)
        print(f'ChemBERTa loaded on {device}')

        # Cache for embeddings, keyed by the exact SMILES string passed in
        self.cache = {}

    def get_embedding(self, smiles):
        """Return the embedding for a SMILES string, computing it at most once.

        A SMILES containing '.' is treated as a mixture: each non-empty
        component is embedded separately and the component embeddings are
        averaged with equal weight.

        Args:
            smiles: SMILES string (single molecule or '.'-separated mixture).

        Returns:
            1-D NumPy array (768 values for the default checkpoint, per the
            shapes printed by this notebook). The cached array itself is
            returned, so callers should not modify it in place.

        Raises:
            ValueError: if the string consists only of '.' separators.
        """
        if smiles in self.cache:
            return self.cache[smiles]

        # Handle mixture SMILES by averaging components
        if '.' in smiles:
            # Skip empty pieces (e.g. from a trailing '.') so an empty string
            # is never tokenised and averaged into the result.
            components = [part for part in smiles.split('.') if part]
            if not components:
                raise ValueError(f"SMILES {smiles!r} contains no components")
            embeddings = [self._get_single_embedding(s) for s in components]
            embedding = np.mean(embeddings, axis=0)
        else:
            embedding = self._get_single_embedding(smiles)

        self.cache[smiles] = embedding
        return embedding

    def _get_single_embedding(self, smiles):
        """Get embedding for a single SMILES (no mixtures).

        Runs one forward pass without gradient tracking and returns the
        last-layer hidden state of the first token as a 1-D NumPy array.
        """
        with torch.no_grad():
            inputs = self.tokenizer(smiles, return_tensors='pt', padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = self.model(**inputs)
            # First-token ([CLS]-position) embedding of the single sequence in the batch
            embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()
        return embedding

# Initialize the shared featurizer once: construction loads the tokenizer and
# model and moves the model to `device`, so later cells reuse this instance.
featurizer = ChemBERTaFeaturizer()

Loading ChemBERTa model...


ChemBERTa loaded on cuda


In [5]:
# Pre-compute embeddings for all solvents
# SOLVENT_EMBEDDINGS maps solvent name -> embedding array. Solvents whose
# embedding cannot be computed are reported and collected, because a missing
# entry here only resurfaces later as a KeyError during feature building.
SOLVENT_EMBEDDINGS = {}
failed_solvents = []
for solvent_name, smiles in SMILES_DF['solvent smiles'].items():
    try:
        embedding = featurizer.get_embedding(smiles)
    except Exception as e:
        print(f"ERROR {solvent_name}: {e}")
        failed_solvents.append(solvent_name)
        continue
    SOLVENT_EMBEDDINGS[solvent_name] = embedding
    print(f"{solvent_name}: embedding shape {embedding.shape}")

print(f"\nTotal embeddings: {len(SOLVENT_EMBEDDINGS)}")
if failed_solvents:
    # Repeat failures after the per-solvent log so they are not overlooked.
    print(f"WARNING: no embedding for {len(failed_solvents)} solvent(s): {failed_solvents}")

Cyclohexane: embedding shape (768,)
Ethyl Acetate: embedding shape (768,)
Acetic Acid: embedding shape (768,)
2-Methyltetrahydrofuran [2-MeTHF]: embedding shape (768,)
1,1,1,3,3,3-Hexafluoropropan-2-ol: embedding shape (768,)
IPA [Propan-2-ol]: embedding shape (768,)
Ethanol: embedding shape (768,)
Methanol: embedding shape (768,)
Ethylene Glycol [1,2-Ethanediol]: embedding shape (768,)
Acetonitrile: embedding shape (768,)
Water: embedding shape (768,)
Diethyl Ether [Ether]: embedding shape (768,)
MTBE [tert-Butylmethylether]: embedding shape (768,)
Dimethyl Carbonate: embedding shape (768,)
tert-Butanol [2-Methylpropan-2-ol]: embedding shape (768,)
DMA [N,N-Dimethylacetamide]: embedding shape (768,)
2,2,2-Trifluoroethanol: embedding shape (768,)
Dihydrolevoglucosenone (Cyrene): embedding shape (768,)
Decanol: embedding shape (768,)
Butanone [MEK]: embedding shape (768,)
Ethyl Lactate: embedding shape (768,)
Methyl Propionate: embedding shape (768,)
THF [Tetrahydrofuran]: embedding sha

In [10]:
# ChemBERTa MLP Model
class ChemBERTaMLPModel(nn.Module):
    """Feed-forward regressor mapping a feature vector to three outputs.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, followed by
    a final Linear layer with 3 outputs.

    Args:
        input_dim: Number of input features.
        hidden_dims: Width of each hidden stage, in order. Any iterable of
            ints is accepted; the default is a tuple so no mutable object is
            shared between instances.
        dropout: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, hidden_dims=(256, 128, 64), dropout=0.3):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = h_dim
        # 3 outputs, matching the target columns selected by load_data:
        # "SM", "Product 2", "Product 3"
        layers.append(nn.Linear(prev_dim, 3))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to ``x``; the last dimension of the result has size 3."""
        return self.network(x)

print('ChemBERTaMLPModel defined')

ChemBERTaMLPModel defined


In [11]:
# ChemBERTa Wrapper for training and prediction
class ChemBERTaWrapper:
    """Train and apply an ensemble of ChemBERTaMLPModel regressors.

    Features are five kinetics terms derived from residence time and
    temperature, concatenated with the solvent's pre-computed ChemBERTa
    embedding (looked up by name in the notebook-level SOLVENT_EMBEDDINGS).

    Args:
        data: 'single' reads the solvent from the "SOLVENT NAME" column; any
            other value treats rows as two-solvent mixtures.
        n_models: Number of independently initialised models in the ensemble.
    """

    def __init__(self, data='single', n_models=3):
        self.data_type = data
        self.n_models = n_models
        self.models = []
        self.scalers = []
        self.solvent_embeddings = SOLVENT_EMBEDDINGS

    def _get_features(self, X):
        """Extract features: ChemBERTa embeddings + Arrhenius kinetics.

        Args:
            X: DataFrame with "Residence Time" and "Temperature" columns plus
                the solvent columns for the configured data type.

        Returns:
            float32 array with one row per row of ``X``: 5 kinetics values
            followed by the solvent embedding.

        Raises:
            KeyError: if a solvent name has no entry in the embedding table.
        """
        features_list = []

        for _, row in X.iterrows():
            # Kinetics features
            time_m = row['Residence Time']
            temp_c = row['Temperature']
            temp_k = temp_c + 273.15  # treats Temperature as Celsius

            kinetics = np.array([
                time_m,
                temp_c,
                1.0 / temp_k,  # Arrhenius
                np.log(time_m + 1),  # log time
                time_m / temp_k  # interaction
            ])

            # ChemBERTa embedding
            if self.data_type == 'single':
                solvent_name = row['SOLVENT NAME']
                embedding = self.solvent_embeddings[solvent_name]
            else:
                # Full solvent (mixture): blend the two embeddings linearly.
                # Assumes "SolventB%" is expressed on a 0-100 scale - TODO confirm
                # against the CSV.
                solvent_a = row['SOLVENT A NAME']
                solvent_b = row['SOLVENT B NAME']
                pct_b = row['SolventB%'] / 100.0

                emb_a = self.solvent_embeddings[solvent_a]
                emb_b = self.solvent_embeddings[solvent_b]
                embedding = (1 - pct_b) * emb_a + pct_b * emb_b

            # Combine
            features = np.concatenate([kinetics, embedding])
            features_list.append(features)

        return np.array(features_list, dtype=np.float32)

    @staticmethod
    def _make_loader(X_tensor, y_tensor, batch_size=32):
        """Build a shuffled training loader that never yields a one-sample batch.

        BatchNorm1d cannot compute batch statistics from a single sample in
        training mode, so when the dataset size leaves a remainder of exactly
        one, that trailing batch is dropped. Shuffling changes which sample is
        left out each epoch.

        Raises:
            ValueError: if fewer than two samples are provided.
        """
        n_samples = len(X_tensor)
        if n_samples < 2:
            raise ValueError(
                f"Need at least 2 training samples for BatchNorm training, got {n_samples}"
            )
        drop_last = (n_samples % batch_size == 1)
        dataset = TensorDataset(X_tensor, y_tensor)
        return DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

    def train_model(self, X_train, y_train, epochs=200):
        """Train ensemble of MLP models.

        Fits one StandardScaler on the training features, then trains
        ``n_models`` networks with AdamW, cosine learning-rate annealing and
        MSE loss. Previously trained models and scalers are discarded.

        Args:
            X_train: Input DataFrame (see ``_get_features``).
            y_train: DataFrame of targets, one column per model output.
            epochs: Number of passes over the training data per model.

        Returns:
            self, to allow chaining.
        """
        X_feat = self._get_features(X_train)
        y_np = y_train.values.astype(np.float32)

        self.models = []
        self.scalers = []

        # The scaler fit, tensors and loader are the same for every ensemble
        # member, so build them once. Members differ through their random
        # initialisation and batch order.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_feat)
        X_tensor = torch.tensor(X_scaled).to(device)
        y_tensor = torch.tensor(y_np).to(device)
        loader = self._make_loader(X_tensor, y_tensor, batch_size=32)
        input_dim = X_scaled.shape[1]  # 5 kinetics + embedding width
        loss_fn = nn.MSELoss()

        for _ in range(self.n_models):
            # Create model
            model = ChemBERTaMLPModel(input_dim).to(device)

            # Training
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

            model.train()
            for _epoch in range(epochs):
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    pred = model(X_batch)
                    loss = loss_fn(pred, y_batch)
                    loss.backward()
                    optimizer.step()
                scheduler.step()

            model.eval()
            self.models.append(model)
            # Keep one scaler entry per model so predict() can zip them.
            self.scalers.append(scaler)

        return self

    def predict(self, X_test):
        """Predict using ensemble.

        Args:
            X_test: Input DataFrame (see ``_get_features``).

        Returns:
            CPU torch tensor with one row per row of ``X_test``: the mean of
            the individual models' predictions.
        """
        X_feat = self._get_features(X_test)

        all_preds = []
        for model, scaler in zip(self.models, self.scalers):
            X_scaled = scaler.transform(X_feat)
            X_tensor = torch.tensor(X_scaled).to(device)

            with torch.no_grad():
                pred = model(X_tensor).cpu()
            all_preds.append(pred)

        # Average predictions
        return torch.stack(all_preds).mean(dim=0)

print('ChemBERTaWrapper defined')

ChemBERTaWrapper defined


In [12]:
# Quick test on single fold
# Hold out the alphabetically first solvent, train one model on all the other
# solvents, and score the held-out rows with mean squared error.
X_single, Y_single = load_data("single_solvent")
test_solvent = min(X_single["SOLVENT NAME"].unique())
mask = X_single["SOLVENT NAME"].ne(test_solvent)  # True = training row

print(f"Test solvent: {test_solvent}")
print(f"Training samples: {mask.sum()}, Test samples: {(~mask).sum()}")

model = ChemBERTaWrapper(data='single', n_models=1)
model.train_model(X_single[mask], Y_single[mask], epochs=100)
preds = model.predict(X_single[~mask])

actuals = Y_single[~mask].values
mse = np.mean(np.square(actuals - preds.numpy()))
print(f"\nTest fold MSE: {mse:.6f}")
print(f"Predictions shape: {preds.shape}")
# Reference numbers below are hard-coded results quoted from earlier experiments.
print(f"\nComparison: exp_035 baseline CV = 0.008194")
print(f"GNN (exp_040) test fold MSE = 0.068767")

Test solvent: 1,1,1,3,3,3-Hexafluoropropan-2-ol
Training samples: 619, Test samples: 37



Test fold MSE: 0.061135
Predictions shape: torch.Size([37, 3])

Comparison: exp_035 baseline CV = 0.008194
GNN (exp_040) test fold MSE = 0.068767


In [None]:
# ChemBERTa alone is not enough (MSE 0.061135 vs baseline 0.008194)
# Let's try combining ChemBERTa with Spange descriptors

# Read the Spange descriptor table; the first CSV column becomes the index
spange_df = pd.read_csv(f'{DATA_PATH}/spange_descriptors.csv', index_col=0)
print(f"Spange descriptors: {spange_df.shape}")
print(spange_df.head())

# Building blocks for combined features: ChemBERTa + Spange + Arrhenius.
# SPANGE_DICT maps each index label to its selected descriptor values as float32.
SPANGE_COLS = ['alpha', 'beta', 'pi_star', 'eps', 'cohesive_pressure', 'viscosity', 'surface_tension']
SPANGE_DICT = {
    solvent: spange_df.loc[solvent, SPANGE_COLS].values.astype(np.float32)
    for solvent in spange_df.index
}
print(f"\nSpange dict: {len(SPANGE_DICT)} solvents")

In [None]:
# If promising, run full CV
# Leave-one-solvent-out: each solvent in turn is held out as the test fold,
# a fresh 3-model ensemble is trained on the rest, and the fold MSE is recorded.
print("Running full leave-one-solvent-out CV...")

all_solvents = sorted(X_single["SOLVENT NAME"].unique())
fold_mses = []

for test_solvent in all_solvents:
    mask = X_single["SOLVENT NAME"] != test_solvent

    model = ChemBERTaWrapper(data='single', n_models=3)
    model.train_model(X_single[mask], Y_single[mask], epochs=150)
    preds = model.predict(X_single[~mask])

    actuals = Y_single[~mask].values
    mse = np.mean((actuals - preds.numpy())**2)
    fold_mses.append(mse)
    print(f"{test_solvent}: MSE = {mse:.6f}")

# Unweighted mean and standard deviation across folds
mean_mse = np.mean(fold_mses)
std_mse = np.std(fold_mses)
print(f"\n=== ChemBERTa CV Results ===")
# The plus-minus sign was previously stored as mis-decoded text ("Â±").
print(f"Mean MSE: {mean_mse:.6f} ± {std_mse:.6f}")
print(f"\nComparison:")
print(f"  exp_035 baseline: CV = 0.008194")
print(f"  GNN (exp_040): test fold MSE = 0.068767")