# Experiment 095: Simple GAT with DRFP

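**Goal**: Implement a simple Graph Attention Network (GAT) that matches the benchmark paper's approach. Note: the code in this notebook so far implements only the MLP baseline (`SimpleDRFPModel`) that the GAT is meant to be compared against; no GAT model is defined yet.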

**Key Differences from Previous GNN Attempts**:
1. Simpler architecture - focus on getting basics right
2. Use DRFP features as additional input (not just graph features)
3. Proper handling of mixture solvents
4. More training epochs and careful hyperparameter tuning

**Benchmark Paper (arXiv:2512.19530) achieved MSE 0.0039 using**:
- Graph Attention Networks (GAT)
- DRFP features
- Learned mixture-aware encodings

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

# Report whether a CUDA device is visible and, if so, its name and total memory
cuda_ready = torch.cuda.is_available()
print(f"GPU available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    device_props = torch.cuda.get_device_properties(0)
    print(f"Memory: {device_props.total_memory / 1e9:.1f} GB")

GPU available: True
GPU: NVIDIA H100 80GB HBM3
Memory: 85.0 GB


In [2]:
# Dataset location and the column names used to slice inputs and targets
DATA_PATH = '/home/data'

# Process-condition columns shared by both dataset layouts
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Single-solvent layout: conditions plus one solvent name
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
# Mixture layout: conditions plus two solvent names and the solvent-B amount
INPUT_LABELS_FULL_SOLVENT = [
    *INPUT_LABELS_NUMERIC,
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield CSVs and split it into inputs and targets.

    Args:
        name: ``"full"`` reads the mixture dataset; any other value reads the
            single-solvent dataset.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: the input columns for the chosen
        layout and the ``TARGET_LABELS`` columns.
    """
    use_full = name == "full"
    csv_path = (
        f'{DATA_PATH}/catechol_full_data_yields.csv'
        if use_full
        else f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    )
    input_columns = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(csv_path)
    return frame[input_columns], frame[TARGET_LABELS]

def load_features(name):
    """Read ``{DATA_PATH}/{name}_lookup.csv`` with its first column as the index."""
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the distinct ``SOLVENT NAME``
    values. Each item is ``((train_X, train_Y), (test_X, test_Y))`` where the
    test part contains exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_test = solvent_names == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows whose two solvent names match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Base classes
class SmilesFeaturizer(ABC):
    """Placeholder featurizer interface; both methods raise until overridden."""

    def __init__(self):
        """Raise ``NotImplementedError`` unconditionally."""
        raise NotImplementedError()

    def featurize(self, X):
        """Raise ``NotImplementedError`` unconditionally; ``X`` is not inspected."""
        raise NotImplementedError()

class BaseModel(ABC):
    """Model interface with a no-op constructor and unimplemented train/predict."""

    def __init__(self):
        """Do nothing; the base class holds no state."""

    def train_model(self, X_train, y_train):
        """Raise ``NotImplementedError`` unconditionally; arguments are ignored."""
        raise NotImplementedError()

    def predict(self, X):
        """Raise ``NotImplementedError`` unconditionally; ``X`` is ignored."""
        raise NotImplementedError()

In [4]:
# Load the per-solvent lookup tables (index = first CSV column)
SPANGE_DF = load_features("spange_descriptors")
DRFP_DF = load_features("drfps_catechol")
SMILES_DF = load_features("smiles")

# Keep only DRFP columns that actually vary across rows
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = [col for col, var in drfp_variance.items() if var > 0]
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

# NOTE(review): the recorded output shows 26 Spange rows but only 24 DRFP rows;
# a solvent missing from the DRFP table would fail a later `.loc` lookup — confirm coverage.
print(f"Spange: {SPANGE_DF.shape}")
print(f"DRFP filtered: {DRFP_FILTERED.shape}")
print(f"SMILES: {SMILES_DF.shape}")

Spange: (26, 13)
DRFP filtered: (24, 122)
SMILES: (26, 1)


In [5]:
# Simple MLP model using DRFP + Spange + Arrhenius features
# This is a baseline to compare against

class SimpleDRFPModel(BaseModel):
    """Simple MLP using DRFP + Spange + Arrhenius features.

    Each row's feature vector concatenates residence time, temperature,
    ``1000 / (temperature + 273.15)``, ``log(residence time + 1e-6)``, the
    Spange descriptors and the variance-filtered DRFP columns of the solvent.
    When ``data`` is not ``'single'`` the descriptor rows of solvent A and
    solvent B are blended linearly, with ``SolventB%`` as the weight of B.

    Args:
        data: ``'single'`` selects the single-solvent column layout
            (``SOLVENT NAME``); any other value selects the mixture layout
            (``SOLVENT A NAME``, ``SOLVENT B NAME``, ``SolventB%``).
        hidden_dims: Widths of the hidden layers; ``None`` means ``[128, 64]``.
        dropout: Dropout probability applied after every hidden layer.
        num_epochs: Number of passes over the training data.
        lr: Learning rate passed to Adam.
    """

    # Mini-batch size used by ``train_model``.
    _BATCH_SIZE = 32
    # Number of regression outputs produced by the network.
    _NUM_TARGETS = 3

    def __init__(self, data='single', hidden_dims=None, dropout=0.2, num_epochs=200, lr=0.001):
        self.data = data
        # Copy into a fresh list so instances never share (or mutate) a default list.
        self.hidden_dims = [128, 64] if hidden_dims is None else list(hidden_dims)
        self.dropout = dropout
        self.num_epochs = num_epochs
        self.lr = lr
        self.model = None
        self.scaler = StandardScaler()

    def _get_features(self, X):
        """Build the unscaled 2-D feature matrix for the rows of ``X``."""
        # Process conditions and their Arrhenius-style transforms
        res_time = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)
        temp_k = temp + 273.15  # offset treats Temperature as degrees Celsius
        inv_temp = 1000.0 / temp_k
        log_time = np.log(res_time + 1e-6)  # epsilon keeps log finite at time 0

        if self.data == 'single':
            # Single solvent: one descriptor row per sample
            spange = SPANGE_DF.loc[X['SOLVENT NAME']].values
            drfp = DRFP_FILTERED.loc[X['SOLVENT NAME']].values
        else:
            # Mixed solvents: linear blend of the two solvents' descriptors
            # assumes SolventB% is a fraction in [0, 1], not 0-100 — TODO confirm
            pct = X['SolventB%'].values.reshape(-1, 1)
            spange_a = SPANGE_DF.loc[X['SOLVENT A NAME']].values
            spange_b = SPANGE_DF.loc[X['SOLVENT B NAME']].values
            spange = (1 - pct) * spange_a + pct * spange_b

            drfp_a = DRFP_FILTERED.loc[X['SOLVENT A NAME']].values
            drfp_b = DRFP_FILTERED.loc[X['SOLVENT B NAME']].values
            drfp = (1 - pct) * drfp_a + pct * drfp_b

        return np.hstack([res_time, temp, inv_temp, log_time, spange, drfp])

    def _build_network(self, input_dim):
        """Return a fresh Linear-BatchNorm-ReLU-Dropout stack ending in a linear head."""
        layers = []
        prev_dim = input_dim
        for h_dim in self.hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(self.dropout),
            ])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, self._NUM_TARGETS))
        return nn.Sequential(*layers)

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit the feature scaler and train a freshly initialised MLP.

        Args:
            train_X: DataFrame holding the input columns for ``self.data``.
            train_Y: DataFrame of regression targets (three columns).
            device: torch device to train on; ``None`` picks CUDA when
                available and CPU otherwise.
            verbose: Accepted for interface compatibility; currently unused.
        """
        X_np = self._get_features(train_X)
        Y_np = train_Y.values

        X_scaled = self.scaler.fit_transform(X_np)

        self.model = self._build_network(X_scaled.shape[1])

        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(device)

        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        Y_tensor = torch.tensor(Y_np, dtype=torch.float32).to(device)

        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        criterion = nn.MSELoss()

        dataset = TensorDataset(X_tensor, Y_tensor)
        # Dropping the ragged last batch protects BatchNorm from a size-1 batch,
        # but with fewer rows than one batch it would drop *everything* and leave
        # the network untrained without any error; keep the single short batch then.
        drop_last = len(dataset) >= self._BATCH_SIZE
        loader = DataLoader(dataset, batch_size=self._BATCH_SIZE, shuffle=True, drop_last=drop_last)

        for _ in range(self.num_epochs):
            self.model.train()
            for batch_X, batch_Y in loader:
                optimizer.zero_grad()
                loss = criterion(self.model(batch_X), batch_Y)
                loss.backward()
                optimizer.step()

    def predict(self, X):
        """Predict the three targets for the rows of ``X``.

        Returns:
            A ``torch.double`` tensor of shape ``[len(X), 3]`` with negative
            predictions clipped to zero.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.model is None:
            raise RuntimeError("SimpleDRFPModel.predict called before train_model")

        X_np = self._get_features(X)
        X_scaled = self.scaler.transform(X_np)

        # Run inference on whichever device the network was trained on
        device = next(self.model.parameters()).device
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)

        self.model.eval()
        with torch.no_grad():
            pred = self.model(X_tensor).cpu().numpy()

        # Clip to non-negative
        pred = np.clip(pred, 0, None)

        return torch.tensor(pred, dtype=torch.double)

print("SimpleDRFPModel defined")

SimpleDRFPModel defined


In [6]:
# Smoke test: short training run on the single-solvent data, then predict a few rows.
# The predictions are made on rows that were also trained on, so this only checks plumbing.
X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: {X_single.shape}, {Y_single.shape}")

model = SimpleDRFPModel(data='single', num_epochs=50)
model.train_model(X_single, Y_single)
first_rows = X_single.iloc[:5]
preds = model.predict(first_rows)
print(f"Test predictions shape: {preds.shape}")
print(f"Sample predictions:\n{preds[:3]}")

Single solvent data: (656, 3), (656, 3)


Test predictions shape: torch.Size([5, 3])
Sample predictions:
tensor([[0.0000, 0.0071, 0.8950],
        [0.0000, 0.0127, 0.8751],
        [0.0072, 0.0177, 0.8589]], dtype=torch.float64)


In [None]:
# Run CV
import tqdm

def _cross_validate(model_class, model_kwargs, data_mode, splits, fold_label, verbose):
    """Train a fresh model on every split and return the per-fold test MSEs."""
    fold_mses = []
    for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(splits):
        model = model_class(data=data_mode, **model_kwargs)
        model.train_model(train_X, train_Y)

        predictions_np = model.predict(test_X).detach().cpu().numpy()

        # MSE averaged over all rows and all target columns of this fold
        mse = np.mean((predictions_np - test_Y.values) ** 2)
        fold_mses.append(mse)
        if verbose:
            print(f"{fold_label} Fold {fold_idx}: MSE = {mse:.6f}")
    return fold_mses


def compute_cv_score(model_class, model_kwargs=None, verbose=True):
    """Compute CV score.

    Runs leave-one-solvent-out CV on the single-solvent data and
    leave-one-ramp-out CV on the full data, training a new model per fold.

    Args:
        model_class: Callable accepting ``data=...`` plus ``model_kwargs`` and
            returning an object with ``train_model`` and ``predict``.
        model_kwargs: Extra keyword arguments for ``model_class``; ``None``
            means no extra arguments.
        verbose: When true, print per-fold and aggregate MSEs.

    Returns:
        Tuple ``(single_cv, full_cv, combined_cv)``: the unweighted mean of the
        per-fold MSEs for each dataset, and the average of those two means.
    """
    # A fresh dict per call instead of a shared mutable default
    model_kwargs = {} if model_kwargs is None else model_kwargs

    # Single solvent CV
    X_single, Y_single = load_data("single_solvent")
    single_mse_list = _cross_validate(
        model_class, model_kwargs, 'single',
        generate_leave_one_out_splits(X_single, Y_single),
        "Single", verbose,
    )
    single_cv = np.mean(single_mse_list)
    if verbose:
        print(f"\nSingle Solvent CV MSE: {single_cv:.6f}")

    # Full data CV
    X_full, Y_full = load_data("full")
    full_mse_list = _cross_validate(
        model_class, model_kwargs, 'full',
        generate_leave_one_ramp_out_splits(X_full, Y_full),
        "Full", verbose,
    )
    full_cv = np.mean(full_mse_list)
    if verbose:
        print(f"\nFull Data CV MSE: {full_cv:.6f}")

    # Both datasets contribute equally, regardless of their fold counts
    combined_cv = (single_cv + full_cv) / 2
    if verbose:
        print(f"\n=== Combined CV MSE: {combined_cv:.6f} ===")

    return single_cv, full_cv, combined_cv

# Evaluate the baseline MLP with the same hyperparameters used in the submission cells
print("Running CV with SimpleDRFPModel...")
single_cv, full_cv, combined_cv = compute_cv_score(
    SimpleDRFPModel,
    dict(hidden_dims=[128, 64], dropout=0.2, num_epochs=200, lr=0.001),
)

In [None]:
# Persist the CV metrics next to the experiment and echo them
import json

# Reference score this experiment is compared against
baseline_cv = 0.008298
# Relative change versus the baseline, in percent (positive = lower MSE than baseline)
improvement_pct = (baseline_cv - combined_cv) / baseline_cv * 100

results = {
    'cv_score': float(combined_cv),
    'single_cv': float(single_cv),
    'full_cv': float(full_cv),
    'model': 'SimpleDRFPModel (MLP with DRFP + Spange + Arrhenius)',
    'baseline_cv': baseline_cv,
    'improvement': f"{improvement_pct:.2f}%",
}

with open('/home/code/experiments/095_simple_gat/metrics.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved")
print(f"Combined CV: {combined_cv:.6f}")
print(f"Baseline CV: 0.008298")
print(f"Improvement: {improvement_pct:.2f}%")

## Generate Submission (intended only if CV is better than baseline; note that the template cells below run regardless of `GENERATE_SUBMISSION`)

In [None]:
# Record whether this run beat the baseline CV score.
# NOTE(review): no later cell in this notebook reads GENERATE_SUBMISSION, so the
# submission cells run either way — confirm that is intended.
GENERATE_SUBMISSION = bool(combined_cv < 0.008298)
if GENERATE_SUBMISSION:
    print(f"CV {combined_cv:.6f} is BETTER than baseline 0.008298!")
    print("Generating submission...")
else:
    print(f"CV {combined_cv:.6f} is WORSE than baseline 0.008298")
    print("Not generating submission.")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimpleDRFPModel(data='single', hidden_dims=[128, 64], dropout=0.2, num_epochs=200, lr=0.001)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimpleDRFPModel(data='full', hidden_dims=[128, 64], dropout=0.2, num_epochs=200, lr=0.001)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to standard location
import shutil
import os
os.makedirs('/home/submission', exist_ok=True)
shutil.copy("submission.csv", "/home/submission/submission.csv")
print("Submission saved!")
print(f"Shape: {submission.shape}")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################