# Experiment 020: Graph Neural Network (GNN) for Molecular Property Prediction

**Key insight from research:**
- Paper arxiv:2512.19530 achieved MSE 0.0039 using GNN (25x better than tabular ensembles)
- GNN can learn molecular structure patterns that generalize to unseen solvents
- This is the only approach with demonstrated target-level performance

**Architecture:**
- Use RDKit to convert solvent SMILES to molecular graphs
- Graph Convolutional Network (GCN) for molecular encoding
- Combine molecular embeddings with process conditions (Residence Time, Temperature, and the SolventB% fraction for mixtures)
- A single shared MLP prediction head with one sigmoid output per target (Product 2, Product 3, SM)

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# GNN imports
from rdkit import Chem
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv, global_mean_pool, global_add_pool

# Directory that holds the yield tables and the SMILES lookup CSV.
DATA_PATH = '/home/data'

# Make float32 the default floating-point dtype; the graph tensors built in
# this notebook are created as float32 as well.
torch.set_default_dtype(torch.float32)

# Choose the compute device once: CUDA when available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"CUDA available: {torch.cuda.is_available()}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# --- UTILITY FUNCTIONS ---
# Names of the three target columns predicted by every model in this notebook.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]


def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: "full" selects the mixed-solvent table, "single_solvent" the
            single-solvent table.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: ``X`` holds the process/solvent columns
        of the selected table, ``Y`` holds the ``TARGET_LABELS`` columns.

    Raises:
        AssertionError: if ``name`` is not one of the two supported values.
    """
    assert name in ["full", "single_solvent"]

    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT NAME"]

    table = pd.read_csv(csv_path)
    return table[input_columns], table[TARGET_LABELS]

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct solvent, in sorted solvent order.

    Args:
        X: DataFrame with a "SOLVENT NAME" column.
        Y: DataFrame of targets, row-aligned with ``X``.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
        every row of the held-out solvent and the train part holds the rest.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        in_test = X["SOLVENT NAME"] == held_out
        train_part = (X[~in_test], Y[~in_test])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``.

    Args:
        X: DataFrame with "SOLVENT A NAME" and "SOLVENT B NAME" columns.
        Y: DataFrame of targets, row-aligned with ``X``.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
        every row of the held-out solvent pair and the train part the rest.
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for solvent_a, solvent_b in unique_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        yield (X[~in_ramp], Y[~in_ramp]), (X[in_ramp], Y[in_ramp])

# Build the solvent -> SMILES mapping. The CSV's first column becomes the
# index, so the dictionary keys are the index labels and the values are the
# entries of the 'solvent smiles' column.
SMILES_DF = pd.read_csv(f'{DATA_PATH}/smiles_lookup.csv', index_col=0)
SMILES_DICT = SMILES_DF.loc[:, 'solvent smiles'].to_dict()
print(f"SMILES lookup: {len(SMILES_DICT)} solvents")

SMILES lookup: 26 solvents


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Placeholder base class for featurizers.

    Both methods raise ``NotImplementedError``, so a concrete featurizer has
    to override ``__init__`` as well as ``featurize``.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        raise NotImplementedError

class BaseModel(ABC):
    """Common interface of the models in this notebook.

    Subclasses implement ``train_model(X_train, y_train)`` and ``predict(X)``;
    the versions here only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on the training inputs and targets."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``.

        The parameter matches the ``predict(X)`` signature used by every
        subclass and caller; it defaults to ``None`` so the former zero-argument
        call still reaches ``NotImplementedError`` instead of a ``TypeError``.
        """
        raise NotImplementedError

In [4]:
# --- MOLECULAR GRAPH UTILITIES ---
def smiles_to_graph(smiles):
    """Build a PyTorch Geometric ``Data`` graph from a SMILES string.

    Nodes are the atoms RDKit reports for the molecule; each carries five raw
    features: atomic number, degree, formal charge, hybridization enum value
    and aromatic flag. Every bond is stored as two directed edges.

    Parsing is best-effort: when RDKit rejects the full string, the text before
    the first '.' is tried, and when that fails too the graph of water ('O') is
    used. A string RDKit accepts is used as a whole, including multi-fragment
    SMILES such as 'O.CC#N'.

    Args:
        smiles: SMILES string of the solvent.

    Returns:
        ``Data`` with ``x`` (float32 atom features) and ``edge_index`` (long).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Second attempt: only the first dot-separated component.
        mol = Chem.MolFromSmiles(smiles.split('.')[0])
    if mol is None:
        # Last resort: water.
        mol = Chem.MolFromSmiles('O')

    node_rows = [
        [
            atom.GetAtomicNum(),
            atom.GetDegree(),
            atom.GetFormalCharge(),
            int(atom.GetHybridization()),
            int(atom.GetIsAromatic()),
        ]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(node_rows, dtype=torch.float32)

    # Each bond yields both directions so message passing is symmetric.
    directed_pairs = []
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        directed_pairs.extend(([begin, end], [end, begin]))

    if directed_pairs:
        edge_index = torch.tensor(directed_pairs, dtype=torch.long).t().contiguous()
    else:
        # No bonds (e.g. a single-atom molecule): empty [2, 0] edge list.
        edge_index = torch.zeros((2, 0), dtype=torch.long)

    return Data(x=x, edge_index=edge_index)

# Smoke test 1: ethanol. The recorded output shows 3 atoms and 4 directed
# edges (each of the 2 bonds is stored in both directions).
test_graph = smiles_to_graph('CCO')  # Ethanol
print(f"Ethanol graph: {test_graph.num_nodes} atoms, {test_graph.num_edges} edges")
# Smoke test 2: a dot-separated two-component SMILES. The recorded output
# (4 atoms, 4 edges) shows that RDKit parses the whole string, so the
# first-component fallback in smiles_to_graph is not used here and both
# components end up in one disconnected graph.
test_graph = smiles_to_graph('O.CC#N')  # Water.Acetonitrile mixture
print(f"Water.Acetonitrile graph: {test_graph.num_nodes} atoms, {test_graph.num_edges} edges")

Ethanol graph: 3 atoms, 4 edges
Water.Acetonitrile graph: 4 atoms, 4 edges


In [5]:
# --- GNN MODEL ---
class MolecularGNN(nn.Module):
    """GCN encoder over a solvent graph, fused with process conditions.

    Pipeline:
    - three GCNConv layers, each followed by ReLU, update the atom features
    - global mean pooling reduces the atoms of every graph to one vector
    - a two-layer MLP embeds the 3 condition values into 32 dimensions
    - both vectors are concatenated and fed to an MLP head whose final
      Sigmoid keeps every output in (0, 1)

    Args:
        atom_features: number of input features per atom.
        hidden_dim: width of the GCN layers and of the graph embedding.
        output_dim: number of predicted targets.
    """
    def __init__(self, atom_features=5, hidden_dim=64, output_dim=3):
        super().__init__()

        condition_dim = 32

        # Graph encoder
        self.conv1 = GCNConv(atom_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, hidden_dim)

        # Encoder for the condition vector [RT, Temp, SolventB%]
        self.condition_encoder = nn.Sequential(
            nn.Linear(3, condition_dim),
            nn.ReLU(),
            nn.Linear(condition_dim, condition_dim),
        )

        # Head applied to [graph embedding | condition embedding]
        self.head = nn.Sequential(
            nn.Linear(hidden_dim + condition_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, output_dim),
            nn.Sigmoid(),
        )

    def forward(self, graph_batch, conditions):
        """Predict targets for a batch of graphs and their condition rows.

        ``graph_batch`` must expose ``x``, ``edge_index`` and ``batch``;
        ``conditions`` needs one row of 3 values per graph.
        """
        node_state = graph_batch.x
        edge_index = graph_batch.edge_index
        for conv in (self.conv1, self.conv2, self.conv3):
            node_state = F.relu(conv(node_state, edge_index))

        # One embedding per graph: [batch_size, hidden_dim]
        graph_embedding = global_mean_pool(node_state, graph_batch.batch)

        # [batch_size, 32]
        condition_embedding = self.condition_encoder(conditions)

        fused = torch.cat([graph_embedding, condition_embedding], dim=1)
        return self.head(fused)


print("MolecularGNN defined")

MolecularGNN defined


In [6]:
# --- GNN MODEL WRAPPER ---
class GNNModel(BaseModel):
    """GCN-based yield model that wraps ``MolecularGNN``.

    data='single': the graph of ``SOLVENT NAME`` is encoded and the third
    condition column is a constant zero.

    data='full': only the graph of ``SOLVENT A NAME`` is encoded. Solvent B is
    NOT turned into a graph; the mixture reaches the network solely through the
    ``SolventB%`` condition value. Blending the two solvent embeddings by
    ``SolventB%`` would be a possible extension but is not implemented here.

    A solvent name missing from ``SMILES_DICT`` silently falls back to the
    graph of water ('O').
    """

    # Training hyperparameters.
    EPOCHS = 100
    BATCH_SIZE = 32
    LEARNING_RATE = 1e-3
    WEIGHT_DECAY = 1e-4

    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.mixed = (data == 'full')
        self.model = None  # created in train_model()
        self.scaler = StandardScaler()

        # Pre-compute one graph per known solvent.
        self.solvent_graphs = {
            name: smiles_to_graph(smiles) for name, smiles in SMILES_DICT.items()
        }

    def _get_conditions(self, X):
        """Return an [n, 3] array: residence time, temperature, third column.

        The third column is ``SolventB% / 100`` for mixed data (assumes the CSV
        stores a percentage -- TODO confirm) and zeros for single-solvent data.
        """
        rt = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)
        if self.mixed:
            third = X['SolventB%'].values.reshape(-1, 1) / 100.0
        else:
            third = np.zeros((len(X), 1))
        return np.hstack([rt, temp, third])

    def _lookup_graph(self, solvent):
        """Return the cached graph for ``solvent``, or a water graph if unknown."""
        graph = self.solvent_graphs.get(solvent)
        if graph is None:
            graph = smiles_to_graph('O')  # Fallback
        return graph

    def _get_graphs(self, X):
        """Return one molecular graph per row of ``X``.

        Mixed data uses the primary solvent (A) only; single-solvent data uses
        ``SOLVENT NAME``.
        """
        column = 'SOLVENT A NAME' if self.mixed else 'SOLVENT NAME'
        return [self._lookup_graph(solvent) for solvent in X[column]]

    def train_model(self, X_train, y_train):
        """Fit the condition scaler and train a fresh ``MolecularGNN``.

        Runs ``EPOCHS`` epochs of shuffled mini-batch Adam on an MSE loss.
        """
        conditions_scaled = self.scaler.fit_transform(self._get_conditions(X_train))
        graphs = self._get_graphs(X_train)

        # Convert conditions and targets once instead of once per mini-batch.
        cond_all = torch.tensor(conditions_scaled, dtype=torch.float32, device=device)
        y_all = torch.tensor(y_train.values, dtype=torch.float32, device=device)

        self.model = MolecularGNN(atom_features=5, hidden_dim=64, output_dim=3).to(device)
        optimizer = torch.optim.Adam(
            self.model.parameters(), lr=self.LEARNING_RATE, weight_decay=self.WEIGHT_DECAY
        )
        criterion = nn.MSELoss()

        n_samples = len(graphs)
        self.model.train()
        for _ in range(self.EPOCHS):
            order = np.random.permutation(n_samples)
            for start in range(0, n_samples, self.BATCH_SIZE):
                batch_idx = order[start:start + self.BATCH_SIZE]

                graph_batch = Batch.from_data_list([graphs[i] for i in batch_idx]).to(device)
                rows = torch.as_tensor(batch_idx, dtype=torch.long, device=device)

                optimizer.zero_grad()
                pred = self.model(graph_batch, cond_all[rows])
                loss = criterion(pred, y_all[rows])
                loss.backward()
                optimizer.step()

    def predict(self, X):
        """Return predictions for ``X`` as a float64 CPU tensor of shape [n, 3].

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if self.model is None:
            raise RuntimeError("GNNModel.predict() called before train_model()")

        conditions_scaled = self.scaler.transform(self._get_conditions(X))
        graphs = self._get_graphs(X)

        self.model.eval()
        with torch.no_grad():
            graph_batch = Batch.from_data_list(graphs).to(device)
            cond_tensor = torch.tensor(conditions_scaled, dtype=torch.float32).to(device)
            pred = self.model(graph_batch, cond_tensor)

        # Convert to double for template compatibility
        return pred.cpu().double()


print("GNNModel defined")

GNNModel defined


In [7]:
# --- QUICK VALIDATION TEST ---
# Sanity check of GNNModel on the first 3 folds of each split scheme only;
# the full cross-validation is run by the submission cells.
print("Testing GNNModel...")
# NOTE: despite the names, X_test / Y_test hold the complete single-solvent
# dataset; the actual train/test parts come from the split generator.
X_test, Y_test = load_data("single_solvent")

errors = []
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_test, Y_test)):
    if i >= 3: break  # only the first three held-out solvents
    model = GNNModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # MAE averaged over all rows and all three targets of the held-out fold.
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    # Every row of the test fold has the same solvent, so the first row names it.
    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Single Fold {i} ({solvent}): MAE = {mae:.4f}")

# Unweighted mean of the per-fold MAEs (folds may differ in size).
print(f"\nSingle solvent quick test MAE: {np.mean(errors):.4f}")

# Test full data
# Same check on the mixed-solvent data with leave-one-ramp-out splits.
print("\nTesting on full data...")
X_full, Y_full = load_data("full")
errors_full = []
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_ramp_out_splits(X_full, Y_full)):
    if i >= 3: break  # only the first three held-out solvent pairs
    model = GNNModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mae = np.mean(np.abs(preds - test_Y.values))
    errors_full.append(mae)
    print(f"Full Fold {i}: MAE = {mae:.4f}")

print(f"\nFull data quick test MAE: {np.mean(errors_full):.4f}")

Testing GNNModel...


Single Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.2505


Single Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1118


Single Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0502

Single solvent quick test MAE: 0.1375

Testing on full data...


Full Fold 0: MAE = 0.0840


Full Fold 1: MAE = 0.1285


Full Fold 2: MAE = 0.0833

Full data quick test MAE: 0.0986


In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GNNModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:06,  6.33s/it]

2it [00:12,  6.34s/it]

3it [00:18,  6.22s/it]

4it [00:24,  6.19s/it]

5it [00:31,  6.32s/it]

6it [00:37,  6.36s/it]

7it [00:44,  6.37s/it]

8it [00:50,  6.36s/it]

9it [00:57,  6.38s/it]

10it [01:03,  6.37s/it]

11it [01:09,  6.37s/it]

12it [01:16,  6.37s/it]

13it [01:22,  6.37s/it]

14it [01:28,  6.39s/it]

15it [01:35,  6.40s/it]

16it [01:41,  6.44s/it]

17it [01:48,  6.59s/it]

18it [01:55,  6.58s/it]

19it [02:01,  6.56s/it]

20it [02:08,  6.56s/it]

21it [02:14,  6.54s/it]

22it [02:21,  6.54s/it]

23it [02:27,  6.52s/it]

24it [02:34,  6.52s/it]

24it [02:34,  6.44s/it]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GNNModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:11, 11.37s/it]

2it [00:22, 11.31s/it]

3it [00:34, 11.39s/it]

4it [00:45, 11.32s/it]

5it [00:56, 11.25s/it]

6it [01:07, 11.22s/it]

7it [01:18, 11.20s/it]

8it [01:30, 11.23s/it]

9it [01:41, 11.23s/it]

10it [01:53, 11.53s/it]

11it [02:05, 11.73s/it]

12it [02:17, 11.86s/it]

13it [02:29, 11.94s/it]

13it [02:29, 11.53s/it]




In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [11]:
# Calculate CV score
# Compares the predictions collected by the two submission cells against the
# true targets. The splits are regenerated here only to recover the held-out
# targets in the same fold order in which the predictions were appended.
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")

# Single solvent CV
single_preds = submission_single_solvent[['target_1', 'target_2', 'target_3']].values
single_true = []
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_single, Y_single)):
    single_true.append(test_Y.values)
# Stack the per-fold targets so the rows line up with single_preds.
single_true = np.vstack(single_true)
# MAE over every row and every target (larger folds weigh more).
single_mae = np.mean(np.abs(single_preds - single_true))
print(f"Single solvent CV MAE: {single_mae:.4f}")

# Full data CV
full_preds = submission_full_data[['target_1', 'target_2', 'target_3']].values
full_true = []
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_ramp_out_splits(X_full, Y_full)):
    full_true.append(test_Y.values)
full_true = np.vstack(full_true)
full_mae = np.mean(np.abs(full_preds - full_true))
print(f"Full data CV MAE: {full_mae:.4f}")

# Combined
# Plain average of the two task MAEs, independent of the number of rows per task.
combined_mae = (single_mae + full_mae) / 2
print(f"\nCombined CV MAE: {combined_mae:.4f}")

Single solvent CV MAE: 0.1008
Full data CV MAE: 0.0972

Combined CV MAE: 0.0990


In [12]:
# --- IMPROVED GNN MODEL ---
# Let's try a more sophisticated architecture

from torch_geometric.nn import GATConv, NNConv, Set2Set

def smiles_to_graph_v2(smiles):
    """Build a ``Data`` graph with 13 atom features and 4 bond features.

    Parsing is best-effort: the full string first, then the text before the
    first '.', then water ('O').

    Args:
        smiles: SMILES string of the solvent.

    Returns:
        ``Data`` with ``x`` (float32 atom features), ``edge_index`` (long) and
        ``edge_attr`` (float32, one row per directed edge).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        mol = Chem.MolFromSmiles(smiles.split('.')[0])
    if mol is None:
        mol = Chem.MolFromSmiles('O')

    # 13 raw values per atom, in this fixed order.
    node_rows = [
        [
            atom.GetAtomicNum(),
            atom.GetDegree(),
            atom.GetFormalCharge(),
            int(atom.GetHybridization()),
            int(atom.GetIsAromatic()),
            atom.GetTotalNumHs(),
            atom.GetNumRadicalElectrons(),
            int(atom.IsInRing()),
            atom.GetMass() / 100.0,  # mass scaled down by 100
            atom.GetExplicitValence(),
            int(atom.GetChiralTag()),
            atom.GetTotalValence(),
            int(atom.GetNoImplicit()),
        ]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(node_rows, dtype=torch.float32)

    # Each bond becomes two directed edges that share the same 4 bond features:
    # bond order, conjugation flag, ring flag, stereo enum value.
    directed_pairs = []
    edge_rows = []
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        bond_row = [
            float(bond.GetBondTypeAsDouble()),
            int(bond.GetIsConjugated()),
            int(bond.IsInRing()),
            int(bond.GetStereo()),
        ]
        directed_pairs.extend(([begin, end], [end, begin]))
        edge_rows.extend((bond_row, bond_row))

    if directed_pairs:
        edge_index = torch.tensor(directed_pairs, dtype=torch.long).t().contiguous()
        edge_attr = torch.tensor(edge_rows, dtype=torch.float32)
    else:
        # No bonds: keep well-formed empty tensors.
        edge_index = torch.zeros((2, 0), dtype=torch.long)
        edge_attr = torch.zeros((0, 4), dtype=torch.float32)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# Smoke test on ethanol: the recorded output shows a [3, 13] atom-feature
# matrix and a [4, 4] edge-feature matrix (2 bonds stored in both directions).
print("Testing enhanced graph...")
test_graph = smiles_to_graph_v2('CCO')
print(f"Ethanol: {test_graph.num_nodes} atoms, {test_graph.num_edges} edges, atom_features={test_graph.x.shape}, edge_features={test_graph.edge_attr.shape}")

Testing enhanced graph...
Ethanol: 3 atoms, 4 edges, atom_features=torch.Size([3, 13]), edge_features=torch.Size([4, 4])


In [13]:
# --- IMPROVED GNN WITH GAT ---
class ImprovedMolecularGNN(nn.Module):
    """Graph attention encoder over a solvent graph, fused with conditions.

    Pipeline: linear atom embedding -> three GATConv layers, each followed by
    BatchNorm1d (ELU after the first two) -> global mean pooling -> concat with
    a 64-dim condition embedding -> MLP head ending in Sigmoid.

    Only ``x``, ``edge_index`` and ``batch`` of the input graphs are read: the
    layers are built without an edge-feature dimension, so ``edge_attr`` is not
    consumed, and the layers are chained sequentially without skip connections.

    Args:
        atom_features: number of input features per atom.
        hidden_dim: width of the atom embedding and of every GAT layer output.
            Presumably has to be divisible by ``heads`` so the multi-head
            output width matches ``BatchNorm1d(hidden_dim)`` -- TODO confirm.
        output_dim: number of predicted targets.
        heads: number of attention heads in the first two GAT layers.
    """
    def __init__(self, atom_features=13, hidden_dim=128, output_dim=3, heads=4):
        super().__init__()

        per_head_dim = hidden_dim // heads
        condition_dim = 64

        # Project raw atom features to the hidden width.
        self.atom_embed = nn.Linear(atom_features, hidden_dim)

        # Attention layers: two multi-head layers, then one single-head layer.
        self.gat1 = GATConv(hidden_dim, per_head_dim, heads=heads, dropout=0.2)
        self.gat2 = GATConv(hidden_dim, per_head_dim, heads=heads, dropout=0.2)
        self.gat3 = GATConv(hidden_dim, hidden_dim, heads=1, dropout=0.2)

        # One normalization per attention layer.
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.bn3 = nn.BatchNorm1d(hidden_dim)

        # Encoder for the 3 condition values.
        self.condition_encoder = nn.Sequential(
            nn.Linear(3, condition_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(condition_dim, condition_dim),
        )

        # Head applied to [graph embedding | condition embedding].
        self.head = nn.Sequential(
            nn.Linear(hidden_dim + condition_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, output_dim),
            nn.Sigmoid(),
        )

    def forward(self, graph_batch, conditions):
        """Predict targets for a batch of graphs and their condition rows."""
        edge_index = graph_batch.edge_index

        hidden = self.atom_embed(graph_batch.x)
        hidden = F.elu(self.bn1(self.gat1(hidden, edge_index)))
        hidden = F.elu(self.bn2(self.gat2(hidden, edge_index)))
        hidden = self.bn3(self.gat3(hidden, edge_index))  # no activation after the last layer

        graph_embedding = global_mean_pool(hidden, graph_batch.batch)
        condition_embedding = self.condition_encoder(conditions)

        fused = torch.cat([graph_embedding, condition_embedding], dim=1)
        return self.head(fused)


print("ImprovedMolecularGNN defined")

ImprovedMolecularGNN defined


In [14]:
# --- IMPROVED GNN MODEL WRAPPER ---
class ImprovedGNNModel(BaseModel):
    """GAT-based yield model that wraps ``ImprovedMolecularGNN``.

    Uses the 13-feature graphs from ``smiles_to_graph_v2`` and trains with
    AdamW, gradient clipping, ReduceLROnPlateau and early stopping.

    Both the scheduler and the early stopping monitor the mean TRAINING loss of
    the epoch (no validation set is held out), and the weights kept are those of
    the last epoch run, not of the best epoch.

    data='single' encodes ``SOLVENT NAME``; data='full' encodes only
    ``SOLVENT A NAME`` and passes the mixture through the ``SolventB%``
    condition value. Unknown solvent names fall back to the graph of water.
    """

    # Training hyperparameters.
    EPOCHS = 200
    BATCH_SIZE = 32
    LEARNING_RATE = 5e-4
    WEIGHT_DECAY = 1e-3
    GRAD_CLIP_NORM = 1.0
    LR_FACTOR = 0.5
    LR_PATIENCE = 20
    EARLY_STOP_PATIENCE = 40

    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.mixed = (data == 'full')
        self.model = None  # created in train_model()
        self.scaler = StandardScaler()

        # Pre-compute one graph per known solvent.
        self.solvent_graphs = {
            name: smiles_to_graph_v2(smiles) for name, smiles in SMILES_DICT.items()
        }

    def _get_conditions(self, X):
        """Return an [n, 3] array: residence time, temperature, third column.

        The third column is ``SolventB% / 100`` for mixed data (assumes the CSV
        stores a percentage -- TODO confirm) and zeros for single-solvent data.
        """
        rt = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)
        if self.mixed:
            third = X['SolventB%'].values.reshape(-1, 1) / 100.0
        else:
            third = np.zeros((len(X), 1))
        return np.hstack([rt, temp, third])

    def _lookup_graph(self, solvent):
        """Return the cached graph for ``solvent``, or a water graph if unknown."""
        graph = self.solvent_graphs.get(solvent)
        if graph is None:
            graph = smiles_to_graph_v2('O')
        return graph

    def _get_graphs(self, X):
        """Return one molecular graph per row of ``X`` (solvent A for mixtures)."""
        column = 'SOLVENT A NAME' if self.mixed else 'SOLVENT NAME'
        return [self._lookup_graph(solvent) for solvent in X[column]]

    def train_model(self, X_train, y_train):
        """Fit the condition scaler and train a fresh ``ImprovedMolecularGNN``."""
        conditions_scaled = self.scaler.fit_transform(self._get_conditions(X_train))
        graphs = self._get_graphs(X_train)

        # Convert conditions and targets once instead of once per mini-batch.
        cond_all = torch.tensor(conditions_scaled, dtype=torch.float32, device=device)
        y_all = torch.tensor(y_train.values, dtype=torch.float32, device=device)

        self.model = ImprovedMolecularGNN(atom_features=13, hidden_dim=128, output_dim=3).to(device)

        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=self.LEARNING_RATE, weight_decay=self.WEIGHT_DECAY
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=self.LR_FACTOR, patience=self.LR_PATIENCE
        )
        criterion = nn.MSELoss()

        n_samples = len(graphs)
        best_loss = float('inf')
        stale_epochs = 0

        self.model.train()
        for _ in range(self.EPOCHS):
            order = np.random.permutation(n_samples)
            epoch_loss = 0.0
            n_batches = 0

            for start in range(0, n_samples, self.BATCH_SIZE):
                batch_idx = order[start:start + self.BATCH_SIZE]

                graph_batch = Batch.from_data_list([graphs[i] for i in batch_idx]).to(device)
                rows = torch.as_tensor(batch_idx, dtype=torch.long, device=device)

                optimizer.zero_grad()
                pred = self.model(graph_batch, cond_all[rows])
                loss = criterion(pred, y_all[rows])
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.GRAD_CLIP_NORM)
                optimizer.step()

                epoch_loss += loss.item()
                n_batches += 1

            # Average over the batches actually run. The former denominator
            # `n_samples // batch_size + 1` counted one batch too many whenever
            # n_samples was an exact multiple of the batch size.
            avg_loss = epoch_loss / max(n_batches, 1)
            scheduler.step(avg_loss)

            # Early stopping on the training loss.
            if avg_loss < best_loss:
                best_loss = avg_loss
                stale_epochs = 0
            else:
                stale_epochs += 1
                if stale_epochs >= self.EARLY_STOP_PATIENCE:
                    break

    def predict(self, X):
        """Return predictions for ``X`` as a float64 CPU tensor of shape [n, 3].

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if self.model is None:
            raise RuntimeError("ImprovedGNNModel.predict() called before train_model()")

        conditions_scaled = self.scaler.transform(self._get_conditions(X))
        graphs = self._get_graphs(X)

        self.model.eval()
        with torch.no_grad():
            graph_batch = Batch.from_data_list(graphs).to(device)
            cond_tensor = torch.tensor(conditions_scaled, dtype=torch.float32).to(device)
            pred = self.model(graph_batch, cond_tensor)

        return pred.cpu().double()


print("ImprovedGNNModel defined")

ImprovedGNNModel defined


In [15]:
# Quick test of improved GNN
# Runs only the first 3 leave-one-solvent-out folds of the single-solvent
# data, i.e. the same folds as the earlier GNNModel quick test.
print("Testing ImprovedGNNModel...")
# NOTE: X_test / Y_test hold the complete single-solvent dataset; the
# train/test parts come from the split generator.
X_test, Y_test = load_data("single_solvent")

errors = []
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_test, Y_test)):
    if i >= 3: break  # only the first three held-out solvents
    model = ImprovedGNNModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # MAE averaged over all rows and all three targets of the held-out fold.
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Single Fold {i} ({solvent}): MAE = {mae:.4f}")

# Unweighted mean of the per-fold MAEs.
print(f"\nImproved GNN quick test MAE: {np.mean(errors):.4f}")

Testing ImprovedGNNModel...


Single Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1616


Single Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1417


Single Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0259

Improved GNN quick test MAE: 0.1097


In [16]:
# Let's try a hybrid approach: RDKit descriptors + MLP
# This might work better than pure GNN with limited data

from rdkit.Chem import Descriptors, rdMolDescriptors

def get_rdkit_descriptors(smiles):
    """Compute a fixed vector of 28 RDKit descriptors for a SMILES string.

    Parsing is best-effort: the full string first, then the text before the
    first '.', then water ('O').

    Every descriptor is evaluated exactly once; any value that is ``None``,
    NaN or infinite is replaced by 0.

    Args:
        smiles: SMILES string of the solvent.

    Returns:
        1-D ``numpy`` array with 28 entries, in the order of ``descriptor_fns``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        mol = Chem.MolFromSmiles(smiles.split('.')[0])
    if mol is None:
        mol = Chem.MolFromSmiles('O')

    # The order defines the feature layout and must stay stable.
    descriptor_fns = (
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Descriptors.TPSA,
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
        Descriptors.NumRotatableBonds,
        Descriptors.NumAromaticRings,
        Descriptors.NumAliphaticRings,
        Descriptors.FractionCSP3,
        rdMolDescriptors.CalcNumHeavyAtoms,
        rdMolDescriptors.CalcNumHeteroatoms,
        Descriptors.NumValenceElectrons,
        Descriptors.MaxPartialCharge,
        Descriptors.MinPartialCharge,
        Descriptors.MaxAbsPartialCharge,
        Descriptors.BalabanJ,
        Descriptors.BertzCT,
        Descriptors.Chi0,
        Descriptors.Chi1,
        Descriptors.HallKierAlpha,
        Descriptors.Kappa1,
        Descriptors.Kappa2,
        Descriptors.LabuteASA,
        Descriptors.PEOE_VSA1,
        Descriptors.PEOE_VSA2,
        Descriptors.SMR_VSA1,
        Descriptors.SlogP_VSA1,
        Descriptors.EState_VSA1,
    )
    raw_values = [fn(mol) for fn in descriptor_fns]

    # Replace None/NaN/inf with 0. This single filter also covers the cases
    # the former per-descriptor conditionals were guarding against.
    cleaned = [
        0 if (value is None or np.isnan(value) or np.isinf(value)) else value
        for value in raw_values
    ]
    return np.array(cleaned)

# Smoke test on ethanol: prints the descriptor count and the first five
# values, which are MolWt, MolLogP, TPSA, NumHDonors and NumHAcceptors.
test_desc = get_rdkit_descriptors('CCO')
print(f"Ethanol descriptors: {len(test_desc)} features")
print(f"Sample values: {test_desc[:5]}")

Ethanol descriptors: 28 features
Sample values: [ 4.6069e+01 -1.4000e-03  2.0230e+01  1.0000e+00  1.0000e+00]


In [17]:
# Pre-compute RDKit descriptors for all solvents
# RDKIT_DESC_DICT maps each key of SMILES_DICT to the descriptor array
# returned by get_rdkit_descriptors, so models can look descriptors up by
# solvent name instead of recomputing them per row.
RDKIT_DESC_DICT = {}
for name, smiles in SMILES_DICT.items():
    RDKIT_DESC_DICT[name] = get_rdkit_descriptors(smiles)
print(f"Computed RDKit descriptors for {len(RDKIT_DESC_DICT)} solvents")

# --- HYBRID MODEL: RDKit Descriptors + MLP ---
class RDKitMLPModel(BaseModel):
    """MLP regressor on RDKit molecular descriptors plus process conditions.

    Each row is encoded as ``[Residence Time, Temperature, (SolventB
    fraction), descriptors...]``, standardised with a ``StandardScaler``
    fitted on the training rows, and fed to a small fully connected network
    whose sigmoid output layer emits three values in (0, 1).

    Args:
        data: ``'full'`` selects the solvent-mixture layout (columns
            ``SOLVENT A NAME``, ``SOLVENT B NAME``, ``SolventB%``); any other
            value selects the single-solvent layout (``SOLVENT NAME``).
        epochs: Number of full-batch optimisation steps run by
            ``train_model``.
        lr: AdamW learning rate.
        weight_decay: AdamW weight-decay coefficient.
    """

    # Width of the all-zero fallback row, used only if RDKIT_DESC_DICT is empty.
    _DEFAULT_DESC_WIDTH = 28

    def __init__(self, data='single', *, epochs=200, lr=1e-3, weight_decay=1e-3):
        self.data_type = data
        self.mixed = (data == 'full')
        self.model = None
        self.scaler = StandardScaler()
        self.epochs = epochs
        self.lr = lr
        self.weight_decay = weight_decay

    def _descriptor_matrix(self, names):
        """Return a ``(len(names), n_desc)`` array with one row per name.

        Names missing from ``RDKIT_DESC_DICT`` get an all-zero row. The row
        width is read from the dictionary itself so the fallback always has
        the same length as the real descriptor vectors.
        """
        sample = next(iter(RDKIT_DESC_DICT.values()), None)
        width = self._DEFAULT_DESC_WIDTH if sample is None else len(sample)
        # Built once per call; np.vstack copies, so sharing it is safe.
        fallback = np.zeros(width)
        rows = [RDKIT_DESC_DICT.get(name, fallback) for name in names]
        if not rows:
            return np.empty((0, width))
        return np.vstack(rows)

    def _get_features(self, X):
        """Build the unscaled feature matrix for the rows of DataFrame ``X``.

        Returns:
            A 2-D float array with one row per row of ``X``. Columns are the
            process conditions followed by the descriptor block; in mixture
            mode the descriptors of solvents A and B are blended linearly by
            the solvent-B fraction (``SolventB% / 100``).
        """
        residence = X['Residence Time'].to_numpy(dtype=float)
        temperature = X['Temperature'].to_numpy(dtype=float)

        if self.mixed:
            pct_b = X['SolventB%'].to_numpy(dtype=float) / 100.0
            desc_a = self._descriptor_matrix(X['SOLVENT A NAME'])
            desc_b = self._descriptor_matrix(X['SOLVENT B NAME'])

            # Weighted average of descriptors, broadcast over descriptor columns
            weight_b = pct_b[:, np.newaxis]
            desc = (1 - weight_b) * desc_a + weight_b * desc_b
            process = np.column_stack([residence, temperature, pct_b])
        else:
            desc = self._descriptor_matrix(X['SOLVENT NAME'])
            process = np.column_stack([residence, temperature])

        return np.hstack([process, desc])

    def train_model(self, X_train, y_train):
        """Fit the scaler and train a freshly initialised network.

        Training is full-batch: every epoch performs a single AdamW step on
        the MSE over all training rows. The trained network is stored in
        ``self.model``.

        Args:
            X_train: DataFrame in the layout selected by ``data``.
            y_train: Object exposing ``.values`` with three target columns.
        """
        X_scaled = self.scaler.fit_transform(self._get_features(X_train))
        y = y_train.values

        input_dim = X_scaled.shape[1]

        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 3),
            nn.Sigmoid()
        ).to(device)

        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        criterion = nn.MSELoss()

        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        y_tensor = torch.tensor(y, dtype=torch.float32).to(device)

        self.model.train()
        for _ in range(self.epochs):
            optimizer.zero_grad()
            pred = self.model(X_tensor)
            loss = criterion(pred, y_tensor)
            loss.backward()
            optimizer.step()

    def predict(self, X):
        """Predict the three targets for every row of ``X``.

        Returns:
            A CPU ``float64`` tensor with one row per row of ``X`` and three
            columns.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.model is None:
            raise RuntimeError("RDKitMLPModel.predict() called before train_model()")

        X_scaled = self.scaler.transform(self._get_features(X))

        # eval() switches the Dropout layers off for inference
        self.model.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            pred = self.model(X_tensor)

        return pred.cpu().double()

# Cell-completion marker: printed once the class definition above has run.
print("RDKitMLPModel defined")

Computed RDKit descriptors for 26 solvents
RDKitMLPModel defined


In [18]:
# Quick test of RDKit MLP
# Only the first 5 leave-one-out folds are evaluated, so the MAE printed at
# the end is a fast sanity check, not the full cross-validation score.
print("Testing RDKitMLPModel...")
# NOTE: despite the names, X_test / Y_test hold everything returned by
# load_data("single_solvent"); the per-fold train/test partition is produced
# by generate_leave_one_out_splits below.
X_test, Y_test = load_data("single_solvent")

errors = []  # one MAE value per evaluated fold
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_test, Y_test)):
    if i >= 5: break  # cap the check at five folds
    model = RDKitMLPModel(data='single')  # fresh, untrained model for every fold
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Mean absolute error over all rows and all target columns of the fold
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    solvent = test_X['SOLVENT NAME'].iloc[0]  # fold label: solvent of the first test row
    print(f"Single Fold {i} ({solvent}): MAE = {mae:.4f}")

# Unweighted mean of the per-fold MAEs
print(f"\nRDKit MLP quick test MAE: {np.mean(errors):.4f}")

Testing RDKitMLPModel...


Single Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.2035
Single Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1334


Single Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0344
Single Fold 3 (Acetonitrile): MAE = 0.0783


Single Fold 4 (Acetonitrile.Acetic Acid): MAE = 0.1746

RDKit MLP quick test MAE: 0.1248


In [19]:
# Let's try Morgan fingerprints - these are known to work well for molecular property prediction
from rdkit.Chem import AllChem

def get_morgan_fingerprint(smiles, radius=2, n_bits=1024):
    """Return the Morgan bit-vector fingerprint of *smiles* as a numpy array.

    Parsing falls back in two steps when RDKit rejects the input: first the
    part of the string before the first ``'.'`` is tried, then the SMILES
    ``'O'`` is used as a last resort, so a fingerprint is always produced.

    Args:
        smiles: SMILES string of the molecule (``'.'``-separated components
            are allowed).
        radius: Morgan radius forwarded to RDKit.
        n_bits: Length of the bit vector, and therefore of the result.
    """
    parsed = Chem.MolFromSmiles(smiles)

    # First fallback: only the leading '.'-separated component
    if parsed is None:
        leading_component = smiles.split('.')[0]
        parsed = Chem.MolFromSmiles(leading_component)

    # Last resort: substitute water so callers always receive a vector
    if parsed is None:
        parsed = Chem.MolFromSmiles('O')

    bit_vector = AllChem.GetMorganFingerprintAsBitVect(parsed, radius, nBits=n_bits)
    return np.array(bit_vector)

# Pre-compute fingerprints for all solvents
# MORGAN_FP_DICT maps every key of SMILES_DICT to its fingerprint, computed
# with get_morgan_fingerprint's default radius and bit length, so fingerprints
# can later be looked up by solvent name.
MORGAN_FP_DICT = {}
for name, smiles in SMILES_DICT.items():
    MORGAN_FP_DICT[name] = get_morgan_fingerprint(smiles)
print(f"Computed Morgan fingerprints for {len(MORGAN_FP_DICT)} solvents")

# Test
# Smoke test on ethanol ('CCO'): report the vector length and how many bits are set.
test_fp = get_morgan_fingerprint('CCO')
print(f"Ethanol fingerprint: {len(test_fp)} bits, {np.sum(test_fp)} bits set")

Computed Morgan fingerprints for 26 solvents
Ethanol fingerprint: 1024 bits, 6 bits set


In [20]:
# --- MORGAN FINGERPRINT + MLP MODEL ---
class MorganMLPModel(BaseModel):
    """MLP regressor on Morgan fingerprints + RDKit descriptors.

    Each row is encoded as ``[Residence Time, Temperature, (SolventB
    fraction), RDKit descriptors..., fingerprint bits...]``, standardised
    with a ``StandardScaler`` fitted on the training rows, and fed to a fully
    connected network whose sigmoid output layer emits three values in (0, 1).

    Args:
        data: ``'full'`` selects the solvent-mixture layout (columns
            ``SOLVENT A NAME``, ``SOLVENT B NAME``, ``SolventB%``); any other
            value selects the single-solvent layout (``SOLVENT NAME``).
        epochs: Number of full-batch optimisation steps run by
            ``train_model``.
        lr: AdamW learning rate.
        weight_decay: AdamW weight-decay coefficient.
    """

    # Widths of the all-zero fallback rows, used only if a lookup table is empty.
    _DEFAULT_FP_WIDTH = 1024
    _DEFAULT_DESC_WIDTH = 28

    def __init__(self, data='single', *, epochs=200, lr=1e-3, weight_decay=1e-3):
        self.data_type = data
        self.mixed = (data == 'full')
        self.model = None
        self.scaler = StandardScaler()
        self.epochs = epochs
        self.lr = lr
        self.weight_decay = weight_decay

    @staticmethod
    def _lookup_matrix(table, names, default_width):
        """Return a ``(len(names), width)`` array with one ``table`` row per name.

        Names missing from ``table`` get an all-zero row. The width is read
        from the table itself (``default_width`` is used only when the table
        is empty) so the fallback always matches the real vectors.
        """
        sample = next(iter(table.values()), None)
        width = default_width if sample is None else len(sample)
        # Built once per call; np.vstack copies, so sharing it is safe.
        fallback = np.zeros(width)
        rows = [table.get(name, fallback) for name in names]
        if not rows:
            return np.empty((0, width))
        return np.vstack(rows)

    def _solvent_blocks(self, names):
        """Return the ``(descriptor, fingerprint)`` matrices for ``names``."""
        desc = self._lookup_matrix(RDKIT_DESC_DICT, names, self._DEFAULT_DESC_WIDTH)
        fp = self._lookup_matrix(MORGAN_FP_DICT, names, self._DEFAULT_FP_WIDTH)
        return desc, fp

    def _get_features(self, X):
        """Build the unscaled feature matrix for the rows of DataFrame ``X``.

        Returns:
            A 2-D float array with one row per row of ``X``. Columns are the
            process conditions, then the descriptor block, then the
            fingerprint block; in mixture mode both blocks are blended
            linearly by the solvent-B fraction (``SolventB% / 100``).
        """
        residence = X['Residence Time'].to_numpy(dtype=float)
        temperature = X['Temperature'].to_numpy(dtype=float)

        if self.mixed:
            pct_b = X['SolventB%'].to_numpy(dtype=float) / 100.0
            desc_a, fp_a = self._solvent_blocks(X['SOLVENT A NAME'])
            desc_b, fp_b = self._solvent_blocks(X['SOLVENT B NAME'])

            # Weighted average, broadcast over the feature columns
            weight_b = pct_b[:, np.newaxis]
            fp = (1 - weight_b) * fp_a + weight_b * fp_b
            desc = (1 - weight_b) * desc_a + weight_b * desc_b
            process = np.column_stack([residence, temperature, pct_b])
        else:
            desc, fp = self._solvent_blocks(X['SOLVENT NAME'])
            process = np.column_stack([residence, temperature])

        return np.hstack([process, desc, fp])

    def train_model(self, X_train, y_train):
        """Fit the scaler and train a freshly initialised network.

        Training is full-batch: every epoch performs a single AdamW step on
        the MSE over all training rows. The trained network is stored in
        ``self.model``.

        Args:
            X_train: DataFrame in the layout selected by ``data``.
            y_train: Object exposing ``.values`` with three target columns.
        """
        X_scaled = self.scaler.fit_transform(self._get_features(X_train))
        y = y_train.values

        input_dim = X_scaled.shape[1]

        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 3),
            nn.Sigmoid()
        ).to(device)

        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        criterion = nn.MSELoss()

        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        y_tensor = torch.tensor(y, dtype=torch.float32).to(device)

        self.model.train()
        for _ in range(self.epochs):
            optimizer.zero_grad()
            pred = self.model(X_tensor)
            loss = criterion(pred, y_tensor)
            loss.backward()
            optimizer.step()

    def predict(self, X):
        """Predict the three targets for every row of ``X``.

        Returns:
            A CPU ``float64`` tensor with one row per row of ``X`` and three
            columns.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.model is None:
            raise RuntimeError("MorganMLPModel.predict() called before train_model()")

        X_scaled = self.scaler.transform(self._get_features(X))

        # eval() switches the Dropout layers off for inference
        self.model.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            pred = self.model(X_tensor)

        return pred.cpu().double()

# Cell-completion marker: printed once the class definition above has run.
print("MorganMLPModel defined")

# Quick test
# Only the first 5 leave-one-out folds are evaluated, so the MAE printed at
# the end is a fast sanity check, not the full cross-validation score.
print("\nTesting MorganMLPModel...")
# NOTE: despite the names, X_test / Y_test hold everything returned by
# load_data("single_solvent"); the per-fold train/test partition is produced
# by generate_leave_one_out_splits below.
X_test, Y_test = load_data("single_solvent")

errors = []  # one MAE value per evaluated fold
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_test, Y_test)):
    if i >= 5: break  # cap the check at five folds
    model = MorganMLPModel(data='single')  # fresh, untrained model for every fold
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Mean absolute error over all rows and all target columns of the fold
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    solvent = test_X['SOLVENT NAME'].iloc[0]  # fold label: solvent of the first test row
    print(f"Single Fold {i} ({solvent}): MAE = {mae:.4f}")

# Unweighted mean of the per-fold MAEs
print(f"\nMorgan MLP quick test MAE: {np.mean(errors):.4f}")

MorganMLPModel defined

Testing MorganMLPModel...


Single Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.2630


Single Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1127


Single Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0600


Single Fold 3 (Acetonitrile): MAE = 0.0662


Single Fold 4 (Acetonitrile.Acetic Acid): MAE = 0.1890

Morgan MLP quick test MAE: 0.1382


In [21]:
# Let's go back to our best approach - Spange descriptors + MLP with improvements
# Load Spange descriptors
# index_col=0 turns the first CSV column into the row index; rows are later
# fetched with SPANGE_DF.loc[<solvent name>], so that column presumably holds
# the solvent names used in the yield tables (verify against the CSV).
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
# Report the table shape and the descriptor column names as a sanity check
print(f"Spange descriptors: {SPANGE_DF.shape}")
print(SPANGE_DF.columns.tolist())

Spange descriptors: (26, 13)
['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta']


In [22]:
# --- ENHANCED MLP WITH SPANGE DESCRIPTORS ---
# This is our best approach with improvements: 300 epochs, LR scheduling, gradient clipping

class EnhancedSpangeMLPModel(BaseModel):
    """Enhanced MLP using Spange descriptors with better training.

    Each row is encoded as ``[Residence Time, Temperature, (SolventB
    fraction), Spange descriptors...]``, standardised with a
    ``StandardScaler`` fitted on the training rows, and fed to a fully
    connected network whose sigmoid output layer emits three values in (0, 1).
    Training adds gradient-norm clipping and a ``ReduceLROnPlateau`` schedule
    driven by the training loss.

    Args:
        data: ``'full'`` selects the solvent-mixture layout (columns
            ``SOLVENT A NAME``, ``SOLVENT B NAME``, ``SolventB%``); any other
            value selects the single-solvent layout (``SOLVENT NAME``).
        epochs: Number of full-batch optimisation steps run by
            ``train_model``.
        lr: Initial AdamW learning rate.
        weight_decay: AdamW weight-decay coefficient.
    """

    def __init__(self, data='single', *, epochs=300, lr=1e-3, weight_decay=1e-3):
        self.data_type = data
        self.mixed = (data == 'full')
        self.model = None
        self.scaler = StandardScaler()
        self.epochs = epochs
        self.lr = lr
        self.weight_decay = weight_decay

    @staticmethod
    def _descriptor_matrix(names):
        """Return a ``(len(names), n_desc)`` array of ``SPANGE_DF`` rows.

        Names absent from ``SPANGE_DF.index`` get an all-zero row whose width
        is the number of columns of ``SPANGE_DF``, so the fallback always
        matches the real descriptor rows.
        """
        width = SPANGE_DF.shape[1]
        known = SPANGE_DF.index
        # Built once per call; np.vstack copies, so sharing it is safe.
        fallback = np.zeros(width)
        rows = [
            SPANGE_DF.loc[name].values if name in known else fallback
            for name in names
        ]
        if not rows:
            return np.empty((0, width))
        return np.vstack(rows)

    def _get_features(self, X):
        """Build the unscaled feature matrix for the rows of DataFrame ``X``.

        Returns:
            A 2-D array with one row per row of ``X``. Columns are the
            process conditions followed by the descriptor block; in mixture
            mode the descriptors of solvents A and B are blended linearly by
            the solvent-B fraction (``SolventB% / 100``).
        """
        residence = X['Residence Time'].to_numpy(dtype=float)
        temperature = X['Temperature'].to_numpy(dtype=float)

        if self.mixed:
            pct_b = X['SolventB%'].to_numpy(dtype=float) / 100.0
            desc_a = self._descriptor_matrix(X['SOLVENT A NAME'])
            desc_b = self._descriptor_matrix(X['SOLVENT B NAME'])

            # Weighted average, broadcast over the descriptor columns
            weight_b = pct_b[:, np.newaxis]
            desc = (1 - weight_b) * desc_a + weight_b * desc_b
            process = np.column_stack([residence, temperature, pct_b])
        else:
            desc = self._descriptor_matrix(X['SOLVENT NAME'])
            process = np.column_stack([residence, temperature])

        return np.hstack([process, desc])

    def train_model(self, X_train, y_train):
        """Fit the scaler and train a freshly initialised network.

        Training is full-batch: every epoch performs one AdamW step on the
        MSE over all training rows, with the gradient norm clipped to 1.0.
        The learning rate is halved whenever the training loss has not
        improved for more than 30 epochs. The trained network is stored in
        ``self.model``.

        Args:
            X_train: DataFrame in the layout selected by ``data``.
            y_train: Object exposing ``.values`` with three target columns.
        """
        X_scaled = self.scaler.fit_transform(self._get_features(X_train))
        y = y_train.values

        input_dim = X_scaled.shape[1]

        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 3),
            nn.Sigmoid()
        ).to(device)

        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=30)
        criterion = nn.MSELoss()

        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        y_tensor = torch.tensor(y, dtype=torch.float32).to(device)

        self.model.train()
        for _ in range(self.epochs):
            optimizer.zero_grad()
            pred = self.model(X_tensor)
            loss = criterion(pred, y_tensor)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            optimizer.step()
            # The plateau scheduler monitors the (dropout-noisy) training loss
            scheduler.step(loss.item())

    def predict(self, X):
        """Predict the three targets for every row of ``X``.

        Returns:
            A CPU ``float64`` tensor with one row per row of ``X`` and three
            columns.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.model is None:
            raise RuntimeError("EnhancedSpangeMLPModel.predict() called before train_model()")

        X_scaled = self.scaler.transform(self._get_features(X))

        # eval() switches the Dropout layers off for inference
        self.model.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            pred = self.model(X_tensor)

        return pred.cpu().double()

# Cell-completion marker: printed once the class definition above has run.
print("EnhancedSpangeMLPModel defined")

EnhancedSpangeMLPModel defined
