# Experiment 093: ChemProp Pre-trained Embeddings

**Goal**: Use ChemProp's pre-trained molecular embeddings as features for our best model.

**Rationale**: 
- The benchmark paper achieved MSE 0.0039 using GNNs
- Our GNN attempts achieved CV ~0.018-0.026 (roughly 5-7x worse than the benchmark)
- Pre-trained models have learned chemistry from millions of molecules
- Using pre-trained embeddings as features may capture chemistry better than hand-crafted features

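**Approach**:
1. Extract ChemProp embeddings for all solvents (note: the cells below only probe the ChemProp API; the molecular features actually computed are RDKit Morgan fingerprints reduced with PCA, plus RDKit 2D descriptors and the Spange descriptors)
2. Use these molecular features + Arrhenius features as input to GP+MLP+LGBM ensemble
3. Compare CV to baseline (0.008298)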

In [None]:
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
import lightgbm as lgb
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

print("Imports complete")
print(f"GPU available: {torch.cuda.is_available()}")

In [None]:
# Report the installed ChemProp version and the public names it exposes
import chemprop
print(f"ChemProp version: {chemprop.__version__}")

# Only names without a leading underscore are listed
print("\nChemProp modules:")
public_names = [name for name in dir(chemprop) if not name.startswith('_')]
for attr in public_names:
    print(f"  {attr}")

In [None]:
# Data loading functions
# Directory that holds every CSV read by this notebook (yield tables and lookups).
DATA_PATH = '/home/data'

# Numeric process-condition columns shared by both yield tables.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns selected from the single-solvent yield table.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns selected from the full (two-solvent mixture) yield table.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Target columns predicted by the models (three outputs per row).
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the catechol yield tables and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the mixed-solvent table; any other value
            selects the single-solvent table.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: the input columns for the chosen
        table and the ``TARGET_LABELS`` columns.
    """
    use_full = name == "full"
    csv_name = 'catechol_full_data_yields.csv' if use_full else 'catechol_single_solvent_yields.csv'
    input_columns = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT

    table = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return table[input_columns], table[TARGET_LABELS]

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds, one per solvent in sorted name order.

    Each item is ``((train_X, train_Y), (test_X, test_Y))`` where the test
    part holds every row whose "SOLVENT NAME" equals the held-out solvent.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out_solvent in sorted(solvent_column.unique()):
        is_test = solvent_column == held_out_solvent
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one fold per distinct (solvent A, solvent B) pair, in first-seen order.

    Each item is ``((train_X, train_Y), (test_X, test_Y))`` where the test
    part holds every row belonging to the held-out solvent pair.
    """
    col_a, col_b = "SOLVENT A NAME", "SOLVENT B NAME"
    unique_pairs = X[[col_a, col_b]].drop_duplicates()
    for solvent_a, solvent_b in zip(unique_pairs[col_a], unique_pairs[col_b]):
        is_test = (X[col_a] == solvent_a) & (X[col_b] == solvent_b)
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

print('Data loading functions defined')

In [None]:
# Load SMILES lookup (the first CSV column becomes the index)
SMILES_DF = pd.read_csv(f'{DATA_PATH}/smiles_lookup.csv', index_col=0)

# Quick look at the table: size, column names and the first rows
print(f"SMILES lookup shape: {SMILES_DF.shape}")
print(f"Columns: {list(SMILES_DF.columns)}")
print(f"\nSample SMILES:")
print(SMILES_DF.head(5))

In [None]:
# Try to get ChemProp embeddings
# ChemProp 2.x has a different API than 1.x
# NOTE: this cell only checks that the 2.x modules import and lists the
# solvents; it does not extract any embeddings.

# Keep the try body limited to the imports so that only a missing/old
# ChemProp install is reported as an import problem.
try:
    from chemprop.featurizers import MoleculeFeaturizer
    from chemprop.data import MoleculeDatapoint, MoleculeDataset
except ImportError as e:
    print(f"Import error: {e}")
    print("Trying alternative approach...")
else:
    print("ChemProp 2.x API available")

    # Get unique solvents
    # The SMILES column is 'solvent smiles' -- the same column the Morgan
    # fingerprint cell reads from SMILES_DF.
    smiles_list = SMILES_DF['solvent smiles'].tolist()
    solvent_names = SMILES_DF.index.tolist()

    print(f"\nNumber of solvents: {len(smiles_list)}")
    print(f"Sample SMILES: {smiles_list[:3]}")

In [None]:
# Use RDKit to compute Morgan fingerprints as molecular features
# This is a simpler but effective approach
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors
import numpy as np

def compute_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    """Return the Morgan bit-vector fingerprint of ``smiles`` as a NumPy array.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius passed to RDKit.
        n_bits: Length of the bit vector.

    Returns:
        Array of length ``n_bits``; all zeros when RDKit cannot parse ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
        return np.array(bit_vector)
    # Unparseable SMILES: fall back to an empty fingerprint
    return np.zeros(n_bits)

def compute_rdkit_descriptors(smiles):
    """Compute RDKit 2D descriptors for a molecule.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        1-D NumPy array with one value per descriptor, in the order listed
        in ``descriptor_fns`` below. All zeros (same length) when RDKit
        cannot parse ``smiles``.
    """

    def balaban_j_or_zero(mol):
        # Best-effort: a failing Balaban J computation is recorded as 0
        # instead of aborting the whole descriptor vector. `Exception`
        # (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        try:
            return Descriptors.BalabanJ(mol)
        except Exception:
            return 0

    # Ordered descriptor table; the output length is derived from it so the
    # invalid-SMILES fallback can never drift out of sync.
    descriptor_fns = (
        Descriptors.MolWt,              # Molecular weight
        Descriptors.MolLogP,            # LogP
        Descriptors.TPSA,               # TPSA
        Descriptors.NumRotatableBonds,  # Number of rotatable bonds
        Descriptors.NumHDonors,         # Number of H-bond donors
        Descriptors.NumHAcceptors,      # Number of H-bond acceptors
        Descriptors.HeavyAtomCount,     # Number of heavy atoms
        Descriptors.RingCount,          # Number of rings
        Descriptors.FractionCSP3,       # Fraction of sp3 carbons
        Descriptors.NumAromaticRings,   # Number of aromatic rings
        Descriptors.MolMR,              # Molar refractivity
        balaban_j_or_zero,              # Balaban J
        Descriptors.BertzCT,            # BertzCT
        Descriptors.Chi0,               # Chi0
        Descriptors.Chi1,               # Chi1
    )

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(len(descriptor_fns))  # Return zeros if invalid

    return np.array([fn(mol) for fn in descriptor_fns])

# Compute features for all solvents
print("Computing molecular features...")
smiles_list = list(SMILES_DF['solvent smiles'])
solvent_names = list(SMILES_DF.index)

# Morgan fingerprints: one row per solvent (2048 bits with the defaults)
morgan_fps = np.array(list(map(compute_morgan_fingerprint, smiles_list)))
print(f"Morgan fingerprints shape: {morgan_fps.shape}")

# RDKit descriptors: one row per solvent
rdkit_descs = np.array(list(map(compute_rdkit_descriptors, smiles_list)))
print(f"RDKit descriptors shape: {rdkit_descs.shape}")

# Wrap both matrices in DataFrames indexed by solvent name so they can be
# looked up with .loc[...] later
MORGAN_DF = pd.DataFrame(data=morgan_fps, index=solvent_names)
RDKIT_DF = pd.DataFrame(data=rdkit_descs, index=solvent_names)

print(f"\nMorgan DF shape: {MORGAN_DF.shape}")
print(f"RDKit DF shape: {RDKIT_DF.shape}")

In [None]:
# Load existing feature lookups for comparison
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Keep only the DRFP columns whose variance is strictly positive
# (constant columns carry no information)
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = list(drfp_variance.index[drfp_variance > 0])
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

# Shape summary of every feature table
for label, frame in (
    ('Spange', SPANGE_DF),
    ('DRFP filtered', DRFP_FILTERED),
    ('ACS PCA', ACS_PCA_DF),
    ('Morgan', MORGAN_DF),
    ('RDKit', RDKIT_DF),
):
    print(f'{label}: {frame.shape}')

In [None]:
# Reduce Morgan fingerprint dimensionality using PCA
from sklearn.decomposition import PCA

# PCA on Morgan fingerprints
# PCA cannot produce more components than min(n_samples, n_features) and
# raises a ValueError when asked to. MORGAN_DF has one row per solvent, so the
# requested 50 components are clamped to what the data supports.
n_morgan_components = min(50, *MORGAN_DF.shape)
pca_morgan = PCA(n_components=n_morgan_components, random_state=42)
morgan_pca = pca_morgan.fit_transform(MORGAN_DF.values)
# Re-use MORGAN_DF's own index so the rows stay aligned with the solvent names
MORGAN_PCA_DF = pd.DataFrame(morgan_pca, index=MORGAN_DF.index)
print(f"Morgan PCA shape: {MORGAN_PCA_DF.shape}")
print(f"Explained variance ratio: {pca_morgan.explained_variance_ratio_.sum():.3f}")

In [None]:
# Base classes
class SmilesFeaturizer(ABC):
    """Placeholder interface for featurizers; both methods must be overridden."""

    def __init__(self):
        """Always raises: subclasses have to supply their own constructor."""
        raise NotImplementedError()

    def featurize(self, X):
        """Always raises: subclasses have to supply the featurization of ``X``."""
        raise NotImplementedError()

class BaseModel(ABC):
    """Minimal model interface: construct freely, override train and predict."""

    def __init__(self):
        """Nothing to set up at this level."""
        return None

    def train_model(self, X_train, y_train):
        """Always raises: subclasses implement fitting on ``X_train``/``y_train``."""
        raise NotImplementedError()

    def predict(self, X):
        """Always raises: subclasses implement prediction for ``X``."""
        raise NotImplementedError()

In [None]:
# MLP model
class SimpleMLP(nn.Module):
    """Feed-forward regressor: repeated Linear -> BatchNorm1d -> ReLU -> Dropout
    stages followed by a final Linear output layer.

    Args:
        input_dim: Number of input features.
        output_dim: Number of regression outputs.
        hidden_dims: Width of each hidden stage, in order. Any iterable of
            ints works; an empty one yields a single Linear layer. The
            default is an immutable tuple so it cannot be shared/mutated
            across instances.
        dropout: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, output_dim=3, hidden_dims=(128, 64), dropout=0.1):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = h_dim
        # Output head maps the last hidden width (or the raw input) to the targets
        layers.append(nn.Linear(prev_dim, output_dim))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the raw outputs."""
        return self.network(x)

print("MLP defined")

In [None]:
# Model with Morgan fingerprint features
class MorganFeatureModel(BaseModel):
    """Model using Morgan fingerprints + RDKit descriptors + Arrhenius features.

    A weighted ensemble of a Gaussian process, an MLP and LightGBM, all fed
    the same standardized feature matrix (kinetic features followed by the
    Morgan, RDKit and Spange solvent descriptors).
    """

    def __init__(self, data='single', use_morgan_pca=True):
        """Build the three sub-models and bind the solvent lookup tables.

        Args:
            data: ``'single'`` looks solvents up by "SOLVENT NAME"; any other
                value treats rows as A/B solvent mixtures.
            use_morgan_pca: Use the PCA-reduced Morgan table instead of the
                raw fingerprint table.
        """
        super().__init__()
        self.data = data
        self.use_morgan_pca = use_morgan_pca

        # Feature sources
        self.morgan_df = MORGAN_PCA_DF if use_morgan_pca else MORGAN_DF
        self.rdkit_df = RDKIT_DF
        self.spange_df = SPANGE_DF

        # Calculate feature dimension
        # Kinetic: time, temp, 1000/T[K], log(t), (1000/T[K]) * log(t) = 5 features
        # Solvent descriptor widths are taken from the lookup tables themselves.
        self.feats_dim = 5 + self.morgan_df.shape[1] + self.rdkit_df.shape[1] + self.spange_df.shape[1]

        # MLP
        self.mlp = SimpleMLP(
            input_dim=self.feats_dim,
            output_dim=3,
            hidden_dims=[128, 64],
            dropout=0.1
        )

        # LightGBM
        self.lgbm = MultiOutputRegressor(lgb.LGBMRegressor(
            num_leaves=31,
            learning_rate=0.1,
            n_estimators=100,
            random_state=42,
            verbosity=-1
        ))

        # GP
        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        self.gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)

        # Scaler
        self.scaler = StandardScaler()

        # Ensemble weights
        self.weights = [0.3, 0.4, 0.3]  # GP, MLP, LGBM

    def _solvent_features(self, lookup_df, X):
        """Return one row of ``lookup_df`` descriptors per row of ``X``.

        In single-solvent mode this is a direct lookup; otherwise the A and B
        solvent rows are blended linearly by "SolventB%".
        """
        if self.data == 'single':
            return lookup_df.loc[X["SOLVENT NAME"]].values

        # Mixed solvents - weighted average
        # NOTE(review): assumes "SolventB%" is a fraction in [0, 1] rather than
        # a 0-100 percentage -- confirm against the data file.
        pct = X["SolventB%"].values.reshape(-1, 1)
        a_vals = lookup_df.loc[X["SOLVENT A NAME"]].values
        b_vals = lookup_df.loc[X["SOLVENT B NAME"]].values
        return a_vals * (1 - pct) + b_vals * pct

    def _get_features(self, X):
        """Extract features from input data.

        Returns a 2-D array: 5 kinetic columns followed by the Morgan, RDKit
        and Spange descriptor columns.
        """
        # Column order follows INPUT_LABELS_NUMERIC: [Residence Time, Temperature]
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        temp_c = X_vals[:, 1:2]
        time_m = X_vals[:, 0:1]
        temp_k = temp_c + 273.15  # treats Temperature as degrees Celsius
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)  # small offset guards against log(0)
        interaction = inv_temp * log_time
        X_kinetic = np.hstack([X_vals, inv_temp, log_time, interaction])

        X_morgan = self._solvent_features(self.morgan_df, X)
        X_rdkit = self._solvent_features(self.rdkit_df, X)
        X_spange = self._solvent_features(self.spange_df, X)

        return np.hstack([X_kinetic, X_morgan, X_rdkit, X_spange])

    def train_model(self, train_X, train_Y, num_epochs=100, lr=1e-3, batch_size=32,
                    optimizer=torch.optim.Adam, criterion=nn.MSELoss, device=None, verbose=False):
        """Fit the scaler and all three sub-models on the training data.

        Args:
            train_X: DataFrame of inputs (numeric and solvent columns).
            train_Y: DataFrame of targets, one column per output.
            num_epochs: Number of MLP training epochs.
            lr: MLP learning rate.
            batch_size: MLP mini-batch size.
            optimizer: Optimizer class, instantiated on the MLP parameters.
            criterion: Loss class, instantiated with no arguments.
            device: Torch device for the MLP; defaults to CUDA when available.
            verbose: Accepted for interface compatibility; currently unused.
        """
        # Get features
        X_np = self._get_features(train_X)
        train_Y_np = train_Y.values

        # Scale
        X_scaled = self.scaler.fit_transform(X_np)

        # DataFrame for GBDT
        feature_names = [str(i) for i in range(X_scaled.shape[1])]
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)

        # Train LightGBM
        self.lgbm.fit(X_scaled_df, train_Y_np)

        # Train GP (on subset for speed)
        # NOTE(review): the subset comes from the global NumPy RNG, so it is
        # only reproducible if the caller seeds np.random.
        n_gp = min(200, len(X_scaled))
        indices = np.random.choice(len(X_scaled), n_gp, replace=False)
        # Fit on ALL target columns. Fitting on column 0 only and copying that
        # prediction into every output made the GP term of the ensemble
        # predict the first target for the other targets as well.
        self.gp.fit(X_scaled[indices], train_Y_np[indices])

        # Train MLP
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
        train_Y_tensor = torch.tensor(train_Y_np, dtype=torch.float32)

        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.mlp.to(device)

        optimizer_inst = optimizer(self.mlp.parameters(), lr=lr)
        # drop_last=True discards a trailing partial batch; with fewer than
        # batch_size rows the loader yields nothing and the MLP stays untrained.
        train_loader = DataLoader(
            TensorDataset(X_tensor, train_Y_tensor),
            batch_size=batch_size, shuffle=True, drop_last=True
        )

        criterion_inst = criterion()
        for epoch in range(num_epochs):
            self.mlp.train()
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer_inst.zero_grad()
                loss = criterion_inst(self.mlp(inputs), targets)
                loss.backward()
                optimizer_inst.step()

    def predict(self, test_X):
        """Return the weighted GP/MLP/LGBM ensemble prediction as a tensor.

        Args:
            test_X: DataFrame with the same columns used for training.

        Returns:
            ``torch.Tensor`` with one row per input row and one column per target.
        """
        X_np = self._get_features(test_X)
        X_scaled = self.scaler.transform(X_np)

        # DataFrame for GBDT
        feature_names = [str(i) for i in range(X_scaled.shape[1])]
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)

        # MLP predictions
        self.mlp.eval()
        device = next(self.mlp.parameters()).device
        with torch.no_grad():
            X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            mlp_preds = self.mlp(X_tensor).cpu().numpy()

        # LGBM predictions
        lgb_preds = self.lgbm.predict(X_scaled_df)

        # GP predictions: one column per target the GP was fitted on. The
        # reshape keeps a single-target fit 2-D so it still broadcasts below.
        gp_preds = np.asarray(self.gp.predict(X_scaled)).reshape(len(X_scaled), -1)

        # Weighted ensemble
        final_preds = (
            self.weights[0] * gp_preds +
            self.weights[1] * mlp_preds +
            self.weights[2] * lgb_preds
        )

        return torch.tensor(final_preds)

print("MorganFeatureModel defined")
print(f"Feature dimension: {5 + MORGAN_PCA_DF.shape[1] + RDKIT_DF.shape[1] + SPANGE_DF.shape[1]}")

In [None]:
# Quick test
# Smoke check: short training run on the single-solvent table, then predict
# the first few rows to confirm the output shape.
X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: {X_single.shape}, {Y_single.shape}")

model = MorganFeatureModel(data='single')
model.train_model(train_X=X_single, train_Y=Y_single, num_epochs=10)
preds = model.predict(X_single.head(5))
print(f"Test predictions shape: {preds.shape}")
print(f"Sample predictions:\n{preds[:3]}")

In [None]:
# Run CV
import tqdm

def _run_cv_folds(split_generator, data, label, verbose):
    """Train a fresh MorganFeatureModel per fold and return the per-fold MSEs.

    Args:
        split_generator: Iterable of ``((train_X, train_Y), (test_X, test_Y))``.
        data: Value forwarded to ``MorganFeatureModel(data=...)``.
        label: Prefix used in the per-fold progress line.
        verbose: Print one line per fold when true.
    """
    fold_mses = []
    for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(split_generator):
        # A new model per fold so nothing fitted on one fold reaches another
        model = MorganFeatureModel(data=data)
        model.train_model(train_X, train_Y, num_epochs=100)

        predictions_np = model.predict(test_X).detach().cpu().numpy()

        mse = np.mean((predictions_np - test_Y.values) ** 2)
        fold_mses.append(mse)
        if verbose:
            print(f"{label} Fold {fold_idx}: MSE = {mse:.6f}")
    return fold_mses


def compute_cv_score(verbose=True):
    """Compute CV score with Morgan fingerprint features.

    Runs leave-one-solvent-out CV on the single-solvent table and
    leave-one-ramp-out CV on the full table.

    Args:
        verbose: Print per-fold and summary MSEs when true.

    Returns:
        Tuple ``(single_cv, full_cv, combined_cv)``: the unweighted mean of
        the fold MSEs for each task and the plain average of the two.
    """
    # Single solvent CV
    X_single, Y_single = load_data("single_solvent")
    single_cv = np.mean(_run_cv_folds(
        generate_leave_one_out_splits(X_single, Y_single), 'single', 'Single', verbose
    ))
    if verbose:
        print(f"\nSingle Solvent CV MSE: {single_cv:.6f}")

    # Full data CV
    X_full, Y_full = load_data("full")
    full_cv = np.mean(_run_cv_folds(
        generate_leave_one_ramp_out_splits(X_full, Y_full), 'full', 'Full', verbose
    ))
    if verbose:
        print(f"\nFull Data CV MSE: {full_cv:.6f}")

    combined_cv = (single_cv + full_cv) / 2
    if verbose:
        print(f"\n=== Combined CV MSE: {combined_cv:.6f} ===")

    return single_cv, full_cv, combined_cv

# Run the cross-validation when this cell executes and keep the three scores
# as globals; the results-saving and submission-gate cells read them.
print("Running CV with Morgan fingerprint features...")
single_cv, full_cv, combined_cv = compute_cv_score()

In [None]:
# Save results
import json
import os

# Baseline combined CV MSE this experiment is compared against
BASELINE_CV = 0.008298
# Relative improvement over the baseline in percent (negative means worse)
improvement_pct = (BASELINE_CV - combined_cv) / BASELINE_CV * 100

results = {
    'cv_score': float(combined_cv),
    'single_cv': float(single_cv),
    'full_cv': float(full_cv),
    'model': 'MorganFeatureModel (GP+MLP+LGBM with Morgan FP + RDKit + Spange)',
    'baseline_cv': BASELINE_CV,
    'improvement': f"{improvement_pct:.2f}%"
}

metrics_path = '/home/code/experiments/093_chemprop_pretrained/metrics.json'
# Create the experiment directory first so a fresh checkout does not lose the
# (expensive) CV results to a FileNotFoundError.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved")
print(f"Combined CV: {combined_cv:.6f}")
print(f"Baseline CV: {BASELINE_CV:.6f}")
print(f"Improvement: {improvement_pct:.2f}%")

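## Generate Submission (if CV is better than baseline)

The following cells follow the official template structure. Note that the check below only sets and prints the `GENERATE_SUBMISSION` flag; the template cells do not read it, so they run and write `submission.csv` regardless of the CV result.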

In [None]:
# Check if we should generate submission
# The flag is true only when the combined CV is strictly below the baseline
GENERATE_SUBMISSION = bool(combined_cv < 0.008298)
if not GENERATE_SUBMISSION:
    print(f"CV {combined_cv:.6f} is WORSE than baseline 0.008298")
    print("Not generating submission.")
else:
    print(f"CV {combined_cv:.6f} is BETTER than baseline 0.008298!")
    print("Generating submission...")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MorganFeatureModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MorganFeatureModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to standard location
import shutil
import os
os.makedirs('/home/submission', exist_ok=True)
shutil.copy("submission.csv", "/home/submission/submission.csv")
print("Submission saved!")
print(f"Shape: {submission.shape}")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################