# Experiment 096: Multi-Order GAT with Attention Readout

**Goal**: Implement a proper Graph Attention Network that actually performs attention-based message passing over the molecular graph, rather than an MLP that is merely labelled a GNN.

**Architecture based on MoGAT paper**:
1. Multi-order graph attention - extract embeddings from EVERY GAT layer
2. Attention-based readout - learned weighted sum of graph embeddings
3. Proper node features from RDKit
4. DRFP features integrated after graph pooling
5. Reaction conditions: temperature (T) and residence time (RT)

**Benchmark paper achieved MSE 0.0039** - this proves the target is reachable with GNNs.

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

# PyTorch Geometric imports
from torch_geometric.nn import GATConv, global_mean_pool
from torch_geometric.data import Data, Batch

# RDKit imports
from rdkit import Chem
from rdkit.Chem import AllChem

print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# Data loading functions
# Directory that every CSV in this notebook is read from.
DATA_PATH = "/home/data"

# Column groups used to slice the yields tables into inputs and targets.
# The solvent-specific input lists extend the shared numeric columns.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = INPUT_LABELS_NUMERIC + ["SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = INPUT_LABELS_NUMERIC + [
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one yields CSV and split it into input and target frames.

    Args:
        name: ``"full"`` selects the full (two-solvent) table; any other
            value selects the single-solvent table.

    Returns:
        Tuple ``(X, Y)`` where ``X`` holds the input columns for the chosen
        table and ``Y`` holds the ``TARGET_LABELS`` columns.
    """
    use_full = name == "full"
    csv_name = 'catechol_full_data_yields.csv' if use_full else 'catechol_single_solvent_yields.csv'
    input_columns = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT

    table = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return table[input_columns], table[TARGET_LABELS]

def load_features(name):
    """Load ``{DATA_PATH}/{name}_lookup.csv``, using its first column as the index."""
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the ``"SOLVENT NAME"`` column.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_test = solvent_names == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        every row whose solvent pair equals the held-out pair.
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    pair_iter = zip(unique_pairs["SOLVENT A NAME"], unique_pairs["SOLVENT B NAME"])
    for solvent_a, solvent_b in pair_iter:
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Base classes
class SmilesFeaturizer(ABC):
    """Template base class for featurizers.

    Nothing is implemented here: both the constructor and ``featurize``
    raise ``NotImplementedError``, so a subclass has to override both.
    """

    def __init__(self):
        """Always raises; the base class cannot be constructed directly."""
        raise NotImplementedError()

    def featurize(self, X):
        """Placeholder for the subclass implementation; always raises."""
        raise NotImplementedError()

class BaseModel(ABC):
    """Template base class for models exposing ``train_model`` and ``predict``.

    The constructor does nothing; both methods raise ``NotImplementedError``
    until a subclass overrides them.
    """

    def __init__(self):
        """No state is set up in the base class."""

    def train_model(self, X_train, y_train):
        """Placeholder for the subclass training routine; always raises."""
        raise NotImplementedError()

    def predict(self, X):
        """Placeholder for the subclass prediction routine; always raises."""
        raise NotImplementedError()

In [4]:
# Load the lookup tables (each is indexed by its first CSV column)
SPANGE_DF = load_features("spange_descriptors")
DRFP_DF = load_features("drfps_catechol")
SMILES_DF = load_features("smiles")  # resolves to {DATA_PATH}/smiles_lookup.csv

# Keep only DRFP columns whose variance across rows is strictly positive;
# constant columns carry no information for the model.
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance.index[drfp_variance > 0].tolist()
DRFP_FILTERED = DRFP_DF.loc[:, nonzero_variance_cols]

# Shape summary followed by a peek at the SMILES table
print(
    f"Spange: {SPANGE_DF.shape}\n"
    f"DRFP filtered: {DRFP_FILTERED.shape}\n"
    f"SMILES: {SMILES_DF.shape}\n"
    "\nSample SMILES:"
)
print(SMILES_DF.head())

Spange: (26, 13)
DRFP filtered: (24, 122)
SMILES: (26, 1)

Sample SMILES:
                                           solvent smiles
SOLVENT NAME                                             
Cyclohexane                                      C1CCCCC1
Ethyl Acetate                                   O=C(OCC)C
Acetic Acid                                       CC(=O)O
2-Methyltetrahydrofuran [2-MeTHF]              O1C(C)CCC1
1,1,1,3,3,3-Hexafluoropropan-2-ol  C(C(F)(F)F)(C(F)(F)F)O


In [5]:
# Molecule Graph Converter
class MoleculeGraph:
    """Builds PyTorch Geometric ``Data`` graphs from SMILES strings via RDKit."""

    @staticmethod
    def smiles_to_graph(smiles):
        """Convert a SMILES string into a ``Data`` object.

        Each atom becomes a node with 7 numeric features: atomic number,
        degree, formal charge, hybridization (as int), aromatic flag,
        total hydrogen count, and radical-electron count. Each bond
        contributes two directed edges (one per direction).

        Returns:
            ``Data(x, edge_index)``, or ``None`` when RDKit cannot parse the
            SMILES or the molecule has no atoms.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        # One 7-element feature row per atom, in RDKit atom order
        atom_rows = [
            [
                atom.GetAtomicNum(),
                atom.GetDegree(),
                atom.GetFormalCharge(),
                int(atom.GetHybridization()),
                int(atom.GetIsAromatic()),
                atom.GetTotalNumHs(),
                atom.GetNumRadicalElectrons(),
            ]
            for atom in mol.GetAtoms()
        ]
        if not atom_rows:
            return None

        # Store every bond in both directions
        directed_pairs = []
        for bond in mol.GetBonds():
            begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            directed_pairs.append([begin, end])
            directed_pairs.append([end, begin])

        x = torch.tensor(atom_rows, dtype=torch.float)

        if directed_pairs:
            edge_index = torch.tensor(directed_pairs, dtype=torch.long).t().contiguous()
        else:
            # Molecule without bonds: empty edge list of shape (2, 0)
            edge_index = torch.zeros((2, 0), dtype=torch.long)

        return Data(x=x, edge_index=edge_index)

# Sanity check: convert the first SMILES in the lookup and show the tensor shapes
test_smiles = SMILES_DF['solvent smiles'].iat[0]
print(f"Test SMILES: {test_smiles}")

test_graph = MoleculeGraph.smiles_to_graph(test_smiles)
print(f"Graph: {test_graph}")
print(f"Node features shape: {test_graph.x.shape}")
print(f"Edge index shape: {test_graph.edge_index.shape}")

Test SMILES: C1CCCCC1
Graph: Data(x=[6, 7], edge_index=[2, 12])
Node features shape: torch.Size([6, 7])
Edge index shape: torch.Size([2, 12])


In [6]:
# Multi-Order GAT Model
class MultiOrderGAT(nn.Module):
    """GAT stack whose per-layer graph embeddings are fused by learned attention.

    After every GAT layer the node states are mean-pooled per graph; a
    softmax over the layers weights those pooled embeddings into a single
    graph vector. That vector is concatenated with a linear projection of
    the DRFP features and the two scalar conditions (T, RT), then passed
    through an MLP head that emits 3 outputs.
    """

    def __init__(self, node_dim=7, hidden_dim=64, num_layers=3, num_heads=4, drfp_dim=122):
        """Create the sub-modules.

        Args:
            node_dim: Number of input features per node.
            hidden_dim: Width of node, graph, and DRFP embeddings.
            num_layers: Number of stacked GAT layers.
            num_heads: Attention heads per GAT layer (averaged, not concatenated).
            drfp_dim: Length of the DRFP feature vector.
        """
        super().__init__()

        self.num_layers = num_layers

        # Lift raw node features to the hidden width
        self.node_embed = nn.Linear(node_dim, hidden_dim)

        # concat=False keeps every layer's output at hidden_dim
        self.gat_layers = nn.ModuleList([
            GATConv(hidden_dim, hidden_dim, heads=num_heads, concat=False, dropout=0.2)
            for _ in range(num_layers)
        ])

        # One scalar score per layer-level graph embedding
        self.order_attention = nn.Linear(hidden_dim, 1)

        self.drfp_proj = nn.Linear(drfp_dim, hidden_dim)

        # Head input = graph embedding + DRFP embedding + T + RT
        head_in_dim = hidden_dim * 2 + 2
        self.output = nn.Sequential(
            nn.Linear(head_in_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 3),
        )

    def forward(self, data, T, RT, drfp):
        """Run the network on a batch of graphs plus per-graph side inputs.

        Args:
            data: Batched graph object providing ``x``, ``edge_index`` and ``batch``.
            T, RT: Tensors concatenated along dim 1 with the embeddings, so
                they must be 2-D with one row per graph.
            drfp: Tensor whose last dimension is ``drfp_dim``.

        Returns:
            Output of the MLP head (last dimension 3).
        """
        hidden = self.node_embed(data.x)

        # Pool node states to graph level after every GAT layer
        pooled_per_layer = []
        for layer in self.gat_layers:
            hidden = F.relu(layer(hidden, data.edge_index))
            pooled_per_layer.append(global_mean_pool(hidden, data.batch))

        orders = torch.stack(pooled_per_layer, dim=1)  # [batch, num_layers, hidden_dim]
        layer_scores = self.order_attention(orders).squeeze(-1)  # [batch, num_layers]
        layer_weights = F.softmax(layer_scores, dim=1)
        graph_feat = (orders * layer_weights.unsqueeze(-1)).sum(dim=1)  # [batch, hidden_dim]

        drfp_feat = self.drfp_proj(drfp)

        fused = torch.cat([graph_feat, drfp_feat, T, RT], dim=1)
        return self.output(fused)

print("MultiOrderGAT defined")

MultiOrderGAT defined


In [7]:
# Multi-Order GAT Wrapper
class MultiOrderGATWrapper(BaseModel):
    """Wrapper for MultiOrderGAT to match the ``train_model``/``predict`` template.

    In ``'single'`` mode each row uses the graph and DRFP vector of its
    solvent. In any other mode (mixtures) each row uses solvent A's graph
    only; solvent B contributes solely through the DRFP vector, which is the
    ``SolventB%``-weighted blend of both solvents' DRFP rows.
    """

    def __init__(self, data='single', hidden_dim=64, num_layers=3, num_epochs=200, lr=0.001):
        """Store hyper-parameters; the network itself is built in ``train_model``.

        Args:
            data: ``'single'`` for single-solvent inputs; anything else is
                treated as the mixed-solvent layout.
            hidden_dim: Embedding width passed to ``MultiOrderGAT``.
            num_layers: Number of GAT layers.
            num_epochs: Number of full-batch optimisation steps.
            lr: Adam learning rate.
        """
        self.data_mode = data
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_epochs = num_epochs
        self.lr = lr
        self.model = None
        self.graph_cache = {}
        self.drfp_scaler = StandardScaler()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _get_graph(self, smiles):
        """Get or create the graph for a SMILES string (``None`` if unparseable)."""
        if smiles not in self.graph_cache:
            self.graph_cache[smiles] = MoleculeGraph.smiles_to_graph(smiles)
        return self.graph_cache[smiles]

    def _graph_for_solvent(self, solvent):
        """Return the graph for a solvent name, failing loudly if it cannot be built.

        Skipping such a row would leave the graph/DRFP batch shorter than the
        T/RT tensors and misalign predictions with input rows, so an explicit
        error is raised instead.
        """
        smiles = SMILES_DF.loc[solvent, 'solvent smiles']
        graph = self._get_graph(smiles)
        if graph is None:
            raise ValueError(
                f"Could not build a molecular graph for solvent {solvent!r} (SMILES: {smiles!r})"
            )
        return graph

    def _prepare_batch(self, X, fit_scaler=False):
        """Turn a feature frame into model inputs, one entry per row of ``X``.

        Args:
            X: Frame with ``Temperature`` and ``Residence Time`` plus the
                solvent columns of the configured data mode.
            fit_scaler: Fit the DRFP scaler on this batch (training) instead
                of reusing the already fitted one (prediction).

        Returns:
            ``(batch, T, RT, drfp)``: batched graphs, the two condition
            tensors of shape ``(n, 1)``, and the scaled DRFP tensor.

        Raises:
            ValueError: If a solvent's SMILES cannot be converted to a graph.
        """
        # Get reaction conditions
        T = torch.tensor(X['Temperature'].values, dtype=torch.float).reshape(-1, 1)
        RT = torch.tensor(X['Residence Time'].values, dtype=torch.float).reshape(-1, 1)

        # Normalize T and RT with fixed constants
        T = (T - 50) / 50  # Assuming T is around 50-100
        RT = (RT - 5) / 5  # Assuming RT is around 0-10

        graphs = []
        drfp_rows = []
        if self.data_mode == 'single':
            for solvent in X['SOLVENT NAME']:
                graphs.append(self._graph_for_solvent(solvent))
                drfp_rows.append(DRFP_FILTERED.loc[solvent].values)
        else:
            mixture_rows = zip(X['SOLVENT A NAME'], X['SOLVENT B NAME'], X['SolventB%'])
            for solvent_a, solvent_b, pct_b in mixture_rows:
                # Simplified approach: only solvent A's graph represents the mixture
                graphs.append(self._graph_for_solvent(solvent_a))

                # Weighted DRFP of both solvents
                drfp_a = DRFP_FILTERED.loc[solvent_a].values
                drfp_b = DRFP_FILTERED.loc[solvent_b].values
                drfp_rows.append((1 - pct_b) * drfp_a + pct_b * drfp_b)

        batch = Batch.from_data_list(graphs)
        drfp = np.array(drfp_rows)

        # Scale DRFP
        if fit_scaler:
            drfp = self.drfp_scaler.fit_transform(drfp)
        else:
            drfp = self.drfp_scaler.transform(drfp)

        drfp = torch.tensor(drfp, dtype=torch.float)

        return batch, T, RT, drfp

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit a fresh ``MultiOrderGAT`` with full-batch Adam on MSE loss.

        Args:
            train_X: Training inputs (see ``_prepare_batch``).
            train_Y: Training targets; ``.values`` is converted to a float tensor.
            device: Optional device override. When given it is remembered so
                that ``predict`` runs on the same device as the trained model.
            verbose: Print the loss every 50 epochs.
        """
        if device is not None:
            # Remember the override; predict() must use the model's device
            self.device = torch.device(device)
        device = self.device

        # Prepare data
        batch, T, RT, drfp = self._prepare_batch(train_X, fit_scaler=True)
        Y = torch.tensor(train_Y.values, dtype=torch.float)

        # Move to device
        batch = batch.to(device)
        T = T.to(device)
        RT = RT.to(device)
        drfp = drfp.to(device)
        Y = Y.to(device)

        # Create model sized to the DRFP features actually fed to it
        self.model = MultiOrderGAT(
            node_dim=7,
            hidden_dim=self.hidden_dim,
            num_layers=self.num_layers,
            drfp_dim=drfp.shape[1]
        ).to(device)

        # Training
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        criterion = nn.MSELoss()

        self.model.train()
        for epoch in range(self.num_epochs):
            optimizer.zero_grad()
            pred = self.model(batch, T, RT, drfp)
            loss = criterion(pred, Y)
            loss.backward()
            optimizer.step()

            if verbose and (epoch + 1) % 50 == 0:
                print(f"Epoch {epoch+1}/{self.num_epochs}, Loss: {loss.item():.6f}")

    def predict(self, X):
        """Predict the three targets for every row of ``X``.

        Returns:
            Double-precision tensor of shape ``(len(X), 3)`` with negative
            predictions clipped to 0.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.model is None:
            raise RuntimeError("predict() called before train_model()")

        self.model.eval()
        device = self.device

        # Prepare data
        batch, T, RT, drfp = self._prepare_batch(X, fit_scaler=False)

        # Move to device
        batch = batch.to(device)
        T = T.to(device)
        RT = RT.to(device)
        drfp = drfp.to(device)

        with torch.no_grad():
            pred = self.model(batch, T, RT, drfp).cpu().numpy()

        # Clip to non-negative
        pred = np.clip(pred, 0, None)

        return torch.tensor(pred, dtype=torch.double)

print("MultiOrderGATWrapper defined")

MultiOrderGATWrapper defined


In [None]:
# Smoke test: short training run on a small slice, then predict a few rows
X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: {X_single.shape}, {Y_single.shape}")

# First 50 rows only, to keep this check quick
X_small, Y_small = X_single.iloc[:50], Y_single.iloc[:50]

model = MultiOrderGATWrapper(data='single', num_epochs=20)
model.train_model(X_small, Y_small, verbose=True)

preds = model.predict(X_small.iloc[:5])
print(f"\nTest predictions shape: {preds.shape}")
print(f"Sample predictions:\n{preds[:3]}")

In [None]:
# Run CV
import tqdm

def _run_cv_folds(split_generator, data_mode, fold_label, verbose):
    """Train a fresh model on every split and return the per-fold test MSEs.

    Args:
        split_generator: Iterable of ``((train_X, train_Y), (test_X, test_Y))``.
        data_mode: Value passed as ``data=`` to ``MultiOrderGATWrapper``.
        fold_label: Prefix used in the per-fold progress line.
        verbose: Print one line per fold.

    Raises:
        ValueError: If the generator yields no splits (the mean would be NaN).
    """
    fold_mses = []
    for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(split_generator):
        model = MultiOrderGATWrapper(data=data_mode, hidden_dim=64, num_layers=3, num_epochs=200, lr=0.001)
        model.train_model(train_X, train_Y)

        predictions_np = model.predict(test_X).detach().cpu().numpy()

        # MSE averaged over all rows and all three targets of the fold
        mse = np.mean((predictions_np - test_Y.values) ** 2)
        fold_mses.append(mse)
        if verbose:
            print(f"{fold_label} Fold {fold_idx}: MSE = {mse:.6f}")

    if not fold_mses:
        raise ValueError(f"No CV folds were produced for the {fold_label.lower()} data")
    return fold_mses


def compute_cv_score(verbose=True):
    """Compute CV score with MultiOrderGAT.

    Runs leave-one-solvent-out CV on the single-solvent data and
    leave-one-ramp-out CV on the full data. Each score is the unweighted mean
    of its per-fold MSEs; the combined score is the mean of the two.

    Returns:
        Tuple ``(single_cv, full_cv, combined_cv)``.
    """
    # Single solvent CV
    X_single, Y_single = load_data("single_solvent")
    single_mse_list = _run_cv_folds(
        generate_leave_one_out_splits(X_single, Y_single), 'single', "Single", verbose
    )
    single_cv = np.mean(single_mse_list)
    if verbose:
        print(f"\nSingle Solvent CV MSE: {single_cv:.6f}")

    # Full data CV
    X_full, Y_full = load_data("full")
    full_mse_list = _run_cv_folds(
        generate_leave_one_ramp_out_splits(X_full, Y_full), 'full', "Full", verbose
    )
    full_cv = np.mean(full_mse_list)
    if verbose:
        print(f"\nFull Data CV MSE: {full_cv:.6f}")

    combined_cv = (single_cv + full_cv) / 2
    if verbose:
        print(f"\n=== Combined CV MSE: {combined_cv:.6f} ===")

    return single_cv, full_cv, combined_cv

print("Running CV with MultiOrderGAT...")
single_cv, full_cv, combined_cv = compute_cv_score()

In [None]:
# Save results
import json
import os

# Reference score this experiment is compared against
BASELINE_CV = 0.008092
METRICS_PATH = '/home/code/experiments/096_multi_order_gat/metrics.json'

# Relative improvement over the baseline in percent (positive = lower MSE)
improvement_pct = (BASELINE_CV - combined_cv) / BASELINE_CV * 100

results = {
    'cv_score': float(combined_cv),
    'single_cv': float(single_cv),
    'full_cv': float(full_cv),
    'model': 'MultiOrderGATWrapper (Multi-Order GAT with Attention Readout)',
    'baseline_cv': BASELINE_CV,
    'improvement': f"{improvement_pct:.2f}%"
}

# Create the experiment directory first so a finished CV run is not lost
# to a FileNotFoundError when the metrics are written.
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
with open(METRICS_PATH, 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved")
print(f"Combined CV: {combined_cv:.6f}")
print(f"Baseline CV: {BASELINE_CV:.6f}")
print(f"Improvement: {improvement_pct:.2f}%")

## Generate Submission

The following cells follow the official template structure.

**CRITICAL**: The model class in submission cells MUST match the CV computation class (`MultiOrderGATWrapper`).

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MultiOrderGATWrapper(data='single', hidden_dim=64, num_layers=3, num_epochs=200, lr=0.001)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MultiOrderGATWrapper(data='full', hidden_dim=64, num_layers=3, num_epochs=200, lr=0.001)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to standard location
import shutil
import os
os.makedirs('/home/submission', exist_ok=True)
shutil.copy("submission.csv", "/home/submission/submission.csv")
print("Submission saved!")
print(f"Shape: {submission.shape}")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################