# ChemBERTa Embeddings for Solvent Representation

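**Problem**: The fitted relationship between cross-validation (CV) score and leaderboard (LB) score has an intercept (0.0525) that is larger than the target score (0.0347), so improving CV alone with the current approach cannot reach the target.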

**Solution**: Use pre-trained ChemBERTa embeddings from SMILES. ChemBERTa is trained on millions of molecules and captures chemical knowledge.

**Key**: This notebook has EXACTLY 3 submission cells at the end (no extra cells).

In [None]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch so repeated runs draw the same random numbers.
np.random.seed(42)
torch.manual_seed(42)
# Ask cuDNN for deterministic kernels and switch off its autotuner,
# which may otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.set_default_dtype(torch.float32)

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

In [None]:
# Data loading functions
# Directory that holds every CSV file read by this notebook.
DATA_PATH = '/home/data'

# Input column names. The single-solvent and full-solvent lists are the
# columns load_data() selects for each dataset; the numeric list is not
# referenced by any cell visible in this notebook.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the two yield datasets and split it into inputs and targets.

    Args:
        name: "full" selects the solvent-mixture dataset; any other value
            selects the single-solvent dataset.

    Returns:
        A tuple (X, Y) of DataFrames: X holds the input columns for the chosen
        dataset and Y holds the "Product 2", "Product 3" and "SM" columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(csv_path)
    inputs = frame[input_columns]
    targets = frame[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the "SOLVENT NAME" column.

    Yields:
        ((X_train, Y_train), (X_test, Y_test)) where the test part contains
        exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_train = solvent_names != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in X.

    Yields:
        ((X_train, Y_train), (X_test, Y_test)) where the test part contains
        exactly the rows whose solvent pair is the held-out one.
    """
    solvent_a = X["SOLVENT A NAME"]
    solvent_b = X["SOLVENT B NAME"]
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()

    for name_a, name_b in unique_pairs.itertuples(index=False, name=None):
        in_ramp = (solvent_a == name_a) & (solvent_b == name_b)
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

In [None]:
# Load SMILES lookup
# The first CSV column becomes the index; later cells iterate this index as
# solvent names and read the 'solvent smiles' column for each of them.
SMILES_DF = pd.read_csv(f'{DATA_PATH}/smiles_lookup.csv', index_col=0)
print(f'SMILES lookup: {len(SMILES_DF)} solvents')
print(SMILES_DF.head())

In [None]:
# Load ChemBERTa model and tokenizer
from transformers import AutoModel, AutoTokenizer

print('Loading ChemBERTa model...')
# Checkpoint identifier passed to from_pretrained for both tokenizer and model.
model_name = 'seyonec/ChemBERTa-zinc-base-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
chemberta = AutoModel.from_pretrained(model_name).to(device)
# Inference only: eval() disables dropout so embeddings are deterministic.
chemberta.eval()
print(f'ChemBERTa loaded: {model_name}')
print(f'Hidden size: {chemberta.config.hidden_size}')

In [None]:
# Pre-compute ChemBERTa embeddings for all solvents
def get_chemberta_embedding(smiles):
    """Return the ChemBERTa embedding of a SMILES string as a flat NumPy array.

    Only the text before the first '.' is embedded, so for a dot-separated
    multi-component SMILES every component after the first is ignored.

    Args:
        smiles: SMILES string of the solvent.

    Returns:
        1-D NumPy array holding the hidden state of the first token.
    """
    first_component = smiles.split('.')[0] if '.' in smiles else smiles

    encoded = tokenizer(first_component, return_tensors='pt', padding=True, truncation=True, max_length=128)
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

    # No gradients are needed: the language model is used as a frozen encoder.
    with torch.no_grad():
        hidden_states = chemberta(**encoded).last_hidden_state
        # Sequence position 0 is the leading [CLS]-style token.
        first_token_state = hidden_states[:, 0, :]
        embedding = first_token_state.cpu().numpy().flatten()

    return embedding

# Pre-compute embeddings for all solvents
# Each solvent in the lookup table is embedded exactly once here, so feature
# construction later is a plain dictionary lookup by solvent name.
SOLVENT_EMBEDDINGS = {
    solvent_name: get_chemberta_embedding(SMILES_DF.loc[solvent_name, 'solvent smiles'])
    for solvent_name in SMILES_DF.index
}

embedding_vectors = list(SOLVENT_EMBEDDINGS.values())
print(f'Pre-computed {len(SOLVENT_EMBEDDINGS)} solvent embeddings')
print(f'Embedding dimension: {len(embedding_vectors[0])}')

In [None]:
# ChemBERTa + MLP Model
class ChemBERTaModel:
    """MLP regressor on kinetics features plus pre-computed ChemBERTa embeddings.

    Features are five kinetics terms built from "Residence Time" and
    "Temperature", concatenated with a solvent embedding looked up in the
    module-level SOLVENT_EMBEDDINGS dictionary. The network ends in a Sigmoid,
    so every one of the three predicted targets lies in [0, 1].

    Args:
        data: 'single' reads the "SOLVENT NAME" column; any other value reads
            "SOLVENT A NAME", "SOLVENT B NAME" and "SolventB%" and blends the
            two solvent embeddings.
        hidden_dims: Sizes of the hidden layers. Defaults to [128, 64].
    """

    def __init__(self, data='single', hidden_dims=None):
        self.data_type = data
        # None stands in for the default so that instances never share (and
        # can never mutate) one list object created at definition time.
        self.hidden_dims = [128, 64] if hidden_dims is None else list(hidden_dims)
        self.scaler = StandardScaler()
        self.model = None

    def _get_features(self, X):
        """Get combined features: kinetics + ChemBERTa embeddings.

        Args:
            X: DataFrame with "Residence Time", "Temperature" and the solvent
                columns required by the configured data type.

        Returns:
            2-D NumPy array with five kinetics columns followed by the
            embedding columns.

        Raises:
            KeyError: If a solvent name is missing from SOLVENT_EMBEDDINGS.
        """
        # Kinetics features
        time_m = X["Residence Time"].values.reshape(-1, 1)
        temp_c = X["Temperature"].values.reshape(-1, 1)
        # The 273.15 offset treats "Temperature" as degrees Celsius.
        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        # The small offset keeps the logarithm finite for a zero residence time.
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time
        kinetics = np.hstack([time_m, temp_c, inv_temp, log_time, interaction])

        # ChemBERTa embeddings
        if self.data_type == 'single':
            embeddings = np.array([SOLVENT_EMBEDDINGS[name] for name in X["SOLVENT NAME"]])
        else:
            emb_a = np.array([SOLVENT_EMBEDDINGS[name] for name in X["SOLVENT A NAME"]])
            emb_b = np.array([SOLVENT_EMBEDDINGS[name] for name in X["SOLVENT B NAME"]])
            # NOTE(review): assumes "SolventB%" is a fraction in [0, 1]; a
            # 0-100 percentage would need dividing by 100 first - confirm.
            pct = X["SolventB%"].values.reshape(-1, 1)
            # Weighted combination
            embeddings = (1 - pct) * emb_a + pct * emb_b

        return np.hstack([kinetics, embeddings])

    def train_model(self, X_train, Y_train, epochs=200):
        """Fit the feature scaler and train a freshly built MLP.

        Args:
            X_train: Input DataFrame (see _get_features for required columns).
            Y_train: Targets with three columns; a DataFrame or a NumPy array.
            epochs: Number of passes over the training data.
        """
        Y_np = Y_train.values if hasattr(Y_train, 'values') else Y_train

        # Get features
        X_features = self._get_features(X_train)
        X_scaled = self.scaler.fit_transform(X_features)

        # Build MLP
        input_dim = X_scaled.shape[1]
        layers = [nn.BatchNorm1d(input_dim)]
        prev_dim = input_dim
        for h in self.hidden_dims:
            layers.extend([nn.Linear(prev_dim, h), nn.ReLU(), nn.Dropout(0.2)])
            prev_dim = h
        layers.append(nn.Linear(prev_dim, 3))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers).to(device)

        # Training
        X_torch = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        Y_torch = torch.tensor(Y_np, dtype=torch.float32).to(device)

        optimizer = torch.optim.Adam(self.model.parameters(), lr=5e-4, weight_decay=1e-5)
        criterion = nn.HuberLoss()

        dataset = TensorDataset(X_torch, Y_torch)
        batch_size = 32
        n_samples = len(dataset)
        # BatchNorm1d cannot compute batch statistics from a single sample in
        # training mode and raises. Drop the trailing batch only when it would
        # hold exactly one sample; every other case keeps all batches.
        drop_singleton = n_samples > batch_size and n_samples % batch_size == 1
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_singleton)

        self.model.train()
        for epoch in range(epochs):
            for xb, yb in loader:
                optimizer.zero_grad()
                pred = self.model(xb)
                loss = criterion(pred, yb)
                loss.backward()
                optimizer.step()

    def predict(self, X_test):
        """Predict the three targets for new inputs.

        Args:
            X_test: Input DataFrame (see _get_features for required columns).

        Returns:
            CPU tensor of dtype float64 with one row per input row and three
            columns, clamped to [0, 1].
        """
        X_features = self._get_features(X_test)
        X_scaled = self.scaler.transform(X_features)

        # eval() makes BatchNorm use running statistics and disables Dropout.
        self.model.eval()
        with torch.no_grad():
            X_torch = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            preds = self.model(X_torch).cpu()

        return torch.clamp(preds, 0, 1).double()

print('ChemBERTaModel defined')

In [None]:
# Quick test on single fold
# Smoke test: hold out the alphabetically first solvent, train for fewer
# epochs than the submission cells use, and report the MSE on the held-out rows.
X_single, Y_single = load_data("single_solvent")
test_solvent = sorted(X_single["SOLVENT NAME"].unique())[0]
# True for training rows, False for rows of the held-out solvent.
mask = X_single["SOLVENT NAME"] != test_solvent

print(f"Test solvent: {test_solvent}")
print(f"Training samples: {mask.sum()}, Test samples: {(~mask).sum()}")

model = ChemBERTaModel(data='single')
model.train_model(X_single[mask], Y_single[mask], epochs=100)
preds = model.predict(X_single[~mask])

actuals = Y_single[~mask].values
# predict() returns a CPU tensor, so .numpy() needs no device transfer.
mse = np.mean((actuals - preds.numpy()) ** 2)
print(f'Test fold MSE: {mse:.6f}')

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ChemBERTaModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y, epochs=200)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

# Calculate single solvent MSE
# Ground truth is stacked solvent by solvent in sorted order, which is the
# order in which generate_leave_one_out_splits produced the folds, so the rows
# line up one-to-one with the predictions collected above.
actuals_single = []
for solvent in sorted(X["SOLVENT NAME"].unique()):
    mask = X["SOLVENT NAME"] == solvent
    actuals_single.append(Y[mask].values)
actuals_single = np.vstack(actuals_single)
preds_single = submission_single_solvent[['target_1', 'target_2', 'target_3']].values
# Mean over all rows and all three targets.
mse_single = np.mean((actuals_single - preds_single) ** 2)
print(f'\nSingle Solvent MSE: {mse_single:.6f}')

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ChemBERTaModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y, epochs=200)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

# Calculate full data MSE
# Ground truth is stacked ramp by ramp in drop_duplicates() order, which is
# the order in which generate_leave_one_ramp_out_splits produced the folds,
# so the rows line up one-to-one with the predictions collected above.
actuals_full = []
ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
for _, row in ramps.iterrows():
    mask = (X["SOLVENT A NAME"] == row["SOLVENT A NAME"]) & (X["SOLVENT B NAME"] == row["SOLVENT B NAME"])
    actuals_full.append(Y[mask].values)
actuals_full = np.vstack(actuals_full)
preds_full = submission_full_data[['target_1', 'target_2', 'target_3']].values
mse_full = np.mean((actuals_full - preds_full) ** 2)
print(f'\nFull Data MSE: {mse_full:.6f}')

# Calculate overall MSE
# Row-count-weighted average of the two task MSEs; relies on mse_single and
# actuals_single left in scope by the single-solvent cell.
n_single = len(actuals_single)
n_full = len(actuals_full)
overall_mse = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)
print(f'\nOverall MSE: {overall_mse:.6f}')
print(f'Baseline (exp_030): CV 0.008298')

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################