# Experiment 087: DRFP + GAT Hybrid Model

**Rationale**: The GNN benchmark (arXiv:2512.19530) achieved MSE 0.0039 using:
1. GAT + DRFP combined
2. Learned mixture-aware solvent encodings
3. Explicit molecular graph message-passing

**Key improvements over exp_086:**
- Add DRFP features (2048-dim) alongside graph features
- Implement learned mixture-aware encoding (not just concatenation)
- Combine DRFP + GAT outputs before final MLP
- Train for 400 epochs with cosine-annealing LR scheduling

In [1]:
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR
from rdkit import Chem
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GATConv, global_mean_pool
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch (CPU + CUDA) so repeated runs draw the same numbers.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Pick the GPU when one is visible, otherwise fall back to the CPU.
_cuda_available = torch.cuda.is_available()
device = torch.device('cuda') if _cuda_available else torch.device('cpu')
print(f'Device: {device}')
if _cuda_available:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'Memory: {_gpu_props.total_memory / 1e9:.1f} GB')

Device: cuda
GPU: NVIDIA H100 80GB HBM3
Memory: 85.0 GB


In [2]:
# Load data
def load_data(data_type, data_dir='/home/data'):
    """Load the feature and target frames for one of the two benchmark tasks.

    Args:
        data_type: ``"single_solvent"`` or ``"full"`` (solvent mixtures).
        data_dir: Directory holding the CSV files. Defaults to the location
            the notebook has always read from.

    Returns:
        ``(X, Y)`` where ``X`` holds the process/solvent columns for the
        chosen task and ``Y`` holds the ``SM``, ``Product 2`` and
        ``Product 3`` columns.

    Raises:
        ValueError: If ``data_type`` is not one of the two supported names.
    """
    import os  # local import: the notebook's import cell does not bring in os

    sources = {
        "single_solvent": (
            'catechol_single_solvent_yields.csv',
            ['Residence Time', 'Temperature', 'SOLVENT NAME'],
        ),
        "full": (
            'catechol_full_data_yields.csv',
            ['Residence Time', 'Temperature', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%'],
        ),
    }
    if data_type not in sources:
        # Previously an unknown name fell through to `return X, Y` and died
        # with an UnboundLocalError that hid the real mistake.
        raise ValueError(
            f"Unknown data_type {data_type!r}; expected one of {sorted(sources)}"
        )

    filename, feature_cols = sources[data_type]
    df = pd.read_csv(os.path.join(data_dir, filename))
    X = df[feature_cols]
    Y = df[['SM', 'Product 2', 'Product 3']]
    return X, Y

# Solvent name -> SMILES string
smiles_df = pd.read_csv('/home/data/smiles_lookup.csv')
smiles_dict = {
    solvent: smi
    for solvent, smi in zip(smiles_df['SOLVENT NAME'], smiles_df['solvent smiles'])
}
print(f"Loaded {len(smiles_dict)} SMILES")

# Solvent name -> DRFP fingerprint (float32 vector, one entry per column "0".."2047")
drfp_df = pd.read_csv('/home/data/drfps_catechol_lookup.csv')
drfp_cols = list(map(str, range(2048)))
# Convert the whole fingerprint table once instead of row by row; each
# dictionary value is its own float32 array, as before.
_drfp_matrix = drfp_df[drfp_cols].to_numpy().astype(np.float32)
drfp_dict = {}
for _solvent, _bits in zip(drfp_df['SOLVENT NAME'], _drfp_matrix):
    drfp_dict[_solvent] = _bits.copy()
print(f"Loaded {len(drfp_dict)} DRFP vectors (dim={len(drfp_cols)})")

Loaded 26 SMILES
Loaded 24 DRFP vectors (dim=2048)


In [3]:
# Official CV split functions (DO NOT MODIFY)
from typing import Any, Generator

def generate_leave_one_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield one train/test split per distinct solvent (leave-one-solvent-out).

    For every unique value of ``X["SOLVENT NAME"]`` the rows with that solvent
    form the test set and all remaining rows form the training set.

    Args:
        X: Feature frame; must contain a ``"SOLVENT NAME"`` column.
        Y: Target frame, filtered with the same boolean masks as ``X``
            (assumes ``Y`` shares ``X``'s index - TODO confirm).

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each solvent, in the
        order the solvents first appear in ``X``.
    """
    for solvent in X["SOLVENT NAME"].unique():
        train_mask = X["SOLVENT NAME"] != solvent
        test_mask = X["SOLVENT NAME"] == solvent
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

def generate_leave_one_ramp_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield one train/test split per distinct solvent pair ("ramp").

    A ramp is identified by the string ``"<SOLVENT A NAME>_<SOLVENT B NAME>"``.
    For every unique ramp, the rows belonging to it form the test set and all
    remaining rows form the training set.

    Args:
        X: Feature frame; must contain ``"SOLVENT A NAME"`` and
            ``"SOLVENT B NAME"`` columns.
        Y: Target frame, filtered with the same boolean masks as ``X``
            (assumes ``Y`` shares ``X``'s index - TODO confirm).

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each ramp, in the order
        the ramps first appear in ``X``.
    """
    # The pair is order-sensitive: "A_B" and "B_A" are treated as different ramps.
    ramps = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
    for ramp in ramps.unique():
        train_mask = ramps != ramp
        test_mask = ramps == ramp
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

print("CV split functions defined")

CV split functions defined


In [4]:
# Convert SMILES to molecular graph with edge features
def smiles_to_graph(smiles):
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Every atom becomes a node with 9 numeric features and every bond becomes
    two directed edges (one per direction) carrying 5 numeric features.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A ``Data`` object with ``x`` (atom features), ``edge_index`` and
        ``edge_attr`` (bond features), or ``None`` when ``smiles`` is not a
        string, cannot be parsed by RDKit, or describes a molecule without
        atoms.
    """
    if not isinstance(smiles, str):
        # e.g. NaN from a blank cell in the lookup CSV; RDKit would raise on it.
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None or mol.GetNumAtoms() == 0:
        # A zero-atom molecule would produce an empty, wrongly-shaped feature
        # tensor plus a self-loop on a node that does not exist.
        return None

    atom_features = [
        [
            atom.GetAtomicNum(),
            atom.GetDegree(),
            atom.GetFormalCharge(),
            int(atom.GetHybridization()),
            int(atom.GetIsAromatic()),
            atom.GetTotalNumHs(),
            int(atom.IsInRing()),
            atom.GetMass() / 100.0,  # scaled to keep it on the order of the other features
            int(atom.GetChiralTag()),
        ]
        for atom in mol.GetAtoms()
    ]

    bond_type_map = {
        Chem.rdchem.BondType.SINGLE: 0,
        Chem.rdchem.BondType.DOUBLE: 1,
        Chem.rdchem.BondType.TRIPLE: 2,
        Chem.rdchem.BondType.AROMATIC: 3,
    }

    edge_index = []
    edge_features = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_feat = [
            bond_type_map.get(bond.GetBondType(), 0),  # unmapped bond types share code 0 with SINGLE
            bond.GetBondTypeAsDouble(),
            int(bond.GetIsAromatic()),
            int(bond.GetIsConjugated()),
            int(bond.IsInRing()),
        ]
        # Store both directions so message passing treats the bond as undirected.
        edge_index.extend(([i, j], [j, i]))
        edge_features.extend((edge_feat, edge_feat))

    if not edge_index:
        # Bond-free molecule (e.g. a single heavy atom): add one self-loop on
        # atom 0 so edge_index / edge_attr keep their 2-D shapes.
        edge_index = [[0, 0]]
        edge_features = [[0, 1.0, 0, 0, 0]]

    x = torch.tensor(atom_features, dtype=torch.float)
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_features, dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# Build each solvent's molecular graph once up front; solvents whose SMILES
# cannot be converted (smiles_to_graph returns None) are left out.
solvent_graphs = {}
for _solvent, _smi in smiles_dict.items():
    _graph = smiles_to_graph(_smi)
    if _graph is None:
        continue
    solvent_graphs[_solvent] = _graph
print(f"Total: {len(solvent_graphs)} solvent graphs")

Total: 26 solvent graphs


In [5]:
# DRFP + GAT Hybrid Model for SINGLE SOLVENT
class DRFPGATHybrid(nn.Module):
    """Predict the three yield targets from a solvent's DRFP vector and graph.

    The DRFP fingerprint goes through a small MLP, the molecular graph goes
    through a stack of edge-aware GAT layers followed by mean pooling, and the
    two embeddings are concatenated with temperature and residence time before
    a final MLP with a sigmoid output.

    Args:
        drfp_dim: Length of the DRFP input vector.
        atom_dim: Number of features per atom (node).
        edge_dim: Number of features per bond (edge).
        hidden_dim: Width of both embeddings; must be divisible by ``heads``.
        out_dim: Number of predicted targets.
        heads: Attention heads per GAT layer.
        num_layers: Number of GAT layers (at least one is always built).
        mlp_dropout: Dropout probability inside the DRFP encoder and final MLP.
        gnn_dropout: Dropout probability applied after every GAT layer.

    Raises:
        ValueError: If ``hidden_dim`` is not a positive multiple of ``heads``.
    """

    def __init__(self, drfp_dim=2048, atom_dim=9, edge_dim=5, hidden_dim=128, out_dim=3, heads=4, num_layers=3,
                 mlp_dropout=0.2, gnn_dropout=0.1):
        super().__init__()

        # Each GAT layer emits heads * (hidden_dim // heads) channels, which
        # only equals the BatchNorm width (hidden_dim) when the division is
        # exact. Fail here rather than with a shape error in the first forward.
        if heads < 1 or hidden_dim < 1 or hidden_dim % heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be a positive multiple of heads ({heads})"
            )
        self.gnn_dropout = gnn_dropout

        # DRFP encoder
        self.drfp_encoder = nn.Sequential(
            nn.Linear(drfp_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(mlp_dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )

        # GAT encoder: the first layer maps raw atom features, later ones map hidden -> hidden
        self.gat_convs = nn.ModuleList()
        self.gat_bns = nn.ModuleList()

        self.gat_convs.append(GATConv(atom_dim, hidden_dim // heads, heads=heads, edge_dim=edge_dim, concat=True))
        self.gat_bns.append(nn.BatchNorm1d(hidden_dim))

        for _ in range(num_layers - 1):
            self.gat_convs.append(GATConv(hidden_dim, hidden_dim // heads, heads=heads, edge_dim=edge_dim, concat=True))
            self.gat_bns.append(nn.BatchNorm1d(hidden_dim))

        # Final MLP: DRFP_emb + GAT_emb + T + RT
        self.final_mlp = nn.Sequential(
            nn.Linear(hidden_dim * 2 + 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(mlp_dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(mlp_dropout),
            nn.Linear(hidden_dim // 2, out_dim),
            nn.Sigmoid()  # squashes every output into (0, 1)
        )

    def encode_graph(self, data):
        """Return one pooled embedding per graph in the batched ``data``."""
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        for conv, bn in zip(self.gat_convs, self.gat_bns):
            x = conv(x, edge_index, edge_attr=edge_attr)
            x = bn(x)
            x = F.elu(x)
            x = F.dropout(x, p=self.gnn_dropout, training=self.training)
        # Average node embeddings within each graph of the batch.
        return global_mean_pool(x, batch)

    def forward(self, drfp, graph_data, T, RT):
        """Predict targets for a batch.

        ``T`` and ``RT`` are 1-D tensors (one value per sample); they are
        unsqueezed to columns before concatenation with the embeddings.
        """
        drfp_emb = self.drfp_encoder(drfp)
        gat_emb = self.encode_graph(graph_data)
        combined = torch.cat([drfp_emb, gat_emb, T.unsqueeze(1), RT.unsqueeze(1)], dim=1)
        return self.final_mlp(combined)

print("DRFPGATHybrid defined")

DRFPGATHybrid defined


In [6]:
# DRFP + GAT Hybrid Model for MIXTURES with LEARNED mixture encoding
class DRFPGATMixtureHybrid(nn.Module):
    """Predict the three yield targets for a two-solvent mixture.

    Both solvents are encoded with the same DRFP MLP and the same GAT stack.
    The four embeddings plus the mixing fraction are fed to a learned MLP
    (``mixture_attention``) that produces a single mixture embedding, which is
    concatenated with temperature and residence time for the final MLP.

    Args:
        drfp_dim: Length of each DRFP input vector.
        atom_dim: Number of features per atom (node).
        edge_dim: Number of features per bond (edge).
        hidden_dim: Width of every embedding; must be divisible by ``heads``.
        out_dim: Number of predicted targets.
        heads: Attention heads per GAT layer.
        num_layers: Number of GAT layers (at least one is always built).
        mlp_dropout: Dropout probability inside the DRFP encoder and final MLP.
        gnn_dropout: Dropout probability applied after every GAT layer.

    Raises:
        ValueError: If ``hidden_dim`` is not a positive multiple of ``heads``.
    """

    def __init__(self, drfp_dim=2048, atom_dim=9, edge_dim=5, hidden_dim=128, out_dim=3, heads=4, num_layers=3,
                 mlp_dropout=0.2, gnn_dropout=0.1):
        super().__init__()

        # Each GAT layer emits heads * (hidden_dim // heads) channels, which
        # only equals the BatchNorm width (hidden_dim) when the division is
        # exact. Fail here rather than with a shape error in the first forward.
        if heads < 1 or hidden_dim < 1 or hidden_dim % heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be a positive multiple of heads ({heads})"
            )
        self.gnn_dropout = gnn_dropout

        # DRFP encoder (shared for both solvents)
        self.drfp_encoder = nn.Sequential(
            nn.Linear(drfp_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(mlp_dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )

        # GAT encoder (shared for both solvents)
        self.gat_convs = nn.ModuleList()
        self.gat_bns = nn.ModuleList()

        self.gat_convs.append(GATConv(atom_dim, hidden_dim // heads, heads=heads, edge_dim=edge_dim, concat=True))
        self.gat_bns.append(nn.BatchNorm1d(hidden_dim))

        for _ in range(num_layers - 1):
            self.gat_convs.append(GATConv(hidden_dim, hidden_dim // heads, heads=heads, edge_dim=edge_dim, concat=True))
            self.gat_bns.append(nn.BatchNorm1d(hidden_dim))

        # Learned mixture encoding: an MLP over both solvents' embeddings and
        # the mixing fraction. The attribute keeps its historical name even
        # though the module is a plain feed-forward stack, so parameter names
        # in saved state dicts stay the same.
        self.mixture_attention = nn.Sequential(
            nn.Linear(hidden_dim * 4 + 1, hidden_dim),  # A_drfp + A_gat + B_drfp + B_gat + mix_frac
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )

        # Final MLP: mixture_emb + T + RT
        self.final_mlp = nn.Sequential(
            nn.Linear(hidden_dim + 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(mlp_dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(mlp_dropout),
            nn.Linear(hidden_dim // 2, out_dim),
            nn.Sigmoid()  # squashes every output into (0, 1)
        )

    def encode_graph(self, data):
        """Return one pooled embedding per graph in the batched ``data``."""
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        for conv, bn in zip(self.gat_convs, self.gat_bns):
            x = conv(x, edge_index, edge_attr=edge_attr)
            x = bn(x)
            x = F.elu(x)
            x = F.dropout(x, p=self.gnn_dropout, training=self.training)
        # Average node embeddings within each graph of the batch.
        return global_mean_pool(x, batch)

    def forward(self, drfp_A, drfp_B, graph_A, graph_B, mix_frac, T, RT):
        """Predict targets for a batch of mixtures.

        ``mix_frac``, ``T`` and ``RT`` are 1-D tensors (one value per sample);
        they are unsqueezed to columns before concatenation. The inputs are
        concatenated in A-then-B order, so swapping the two solvents changes
        the input to the mixture MLP.
        """
        # Encode both solvents with the shared encoders
        drfp_emb_A = self.drfp_encoder(drfp_A)
        drfp_emb_B = self.drfp_encoder(drfp_B)
        gat_emb_A = self.encode_graph(graph_A)
        gat_emb_B = self.encode_graph(graph_B)

        # Learned mixture encoding
        mixture_input = torch.cat([drfp_emb_A, gat_emb_A, drfp_emb_B, gat_emb_B, mix_frac.unsqueeze(1)], dim=1)
        mixture_emb = self.mixture_attention(mixture_input)

        # Final prediction
        combined = torch.cat([mixture_emb, T.unsqueeze(1), RT.unsqueeze(1)], dim=1)
        return self.final_mlp(combined)

print("DRFPGATMixtureHybrid defined")

DRFPGATMixtureHybrid defined


In [7]:
# Model wrapper
class HybridModel:
    """Train-and-predict wrapper around the two DRFP+GAT networks.

    ``data='full'`` selects the mixture network (``DRFPGATMixtureHybrid``);
    any other value selects the single-solvent network (``DRFPGATHybrid``).
    Solvent features are read from the module-level ``drfp_dict`` and
    ``solvent_graphs`` lookups, and tensors are placed on the module-level
    ``device``.

    Args:
        data: ``'full'`` for mixtures, otherwise single solvent.
        hidden_dim: Embedding width passed to the network.
        num_epochs: Number of training epochs (also the cosine schedule length).
        lr: Initial AdamW learning rate.
        heads: Attention heads per GAT layer.
        num_layers: Number of GAT layers.
    """

    BATCH_SIZE = 32

    def __init__(self, data='single', hidden_dim=128, num_epochs=400, lr=1e-3, heads=4, num_layers=3):
        self.data_type = data
        self.mixed = (data == 'full')
        self.hidden_dim = hidden_dim
        self.num_epochs = num_epochs
        self.lr = lr
        self.heads = heads
        self.num_layers = num_layers
        self.model = None

    def train_model(self, train_X, train_Y):
        """Fit a fresh network on ``train_X`` / ``train_Y``.

        Raises:
            ValueError: If no training row has both DRFP and graph features.
        """
        if self.mixed:
            self._train_mixed(train_X, train_Y)
        else:
            self._train_single(train_X, train_Y)

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _has_features(solvent):
        """Return True when ``solvent`` has both a graph and a DRFP vector."""
        return solvent in solvent_graphs and solvent in drfp_dict

    @staticmethod
    def _fallback_solvent():
        """Return the first solvent that has both a DRFP vector and a graph."""
        for solvent in drfp_dict:
            if solvent in solvent_graphs:
                return solvent
        raise ValueError("No solvent has both a DRFP vector and a molecular graph")

    def _resolve(self, solvent, unknown):
        """Return ``(drfp, graph)`` for ``solvent``, substituting a fallback.

        Unknown solvents are recorded in ``unknown`` and replaced by one
        fallback solvent, so the DRFP vector and the graph always describe the
        same molecule. (Taking the first entry of each lookup independently
        can pair features of two different solvents, because the two lookups
        need not share keys or ordering.)
        """
        if not self._has_features(solvent):
            unknown.add(solvent)
            solvent = self._fallback_solvent()
        return drfp_dict[solvent], solvent_graphs[solvent].clone()

    @staticmethod
    def _report_unknown(unknown):
        """Print the solvents that were replaced by the fallback, if any."""
        if unknown:
            # print() rather than warnings.warn(): the notebook's first cell
            # installs warnings.filterwarnings('ignore'), which would hide it.
            names = ", ".join(sorted(map(str, unknown)))
            print(f"[HybridModel] no DRFP/graph features for: {names}; using fallback solvent features")

    @staticmethod
    def _check_training_rows(n_kept, n_skipped):
        """Fail on an empty training set and report silently dropped rows."""
        if n_kept == 0:
            raise ValueError("No training rows have both DRFP and graph features")
        if n_skipped:
            print(f"[HybridModel] skipped {n_skipped} training rows without DRFP/graph features")

    def _fit_scalers(self, temps, rts):
        """Store mean/std of the training temperature and residence time, then scale them."""
        temps = np.array(temps)
        rts = np.array(rts)
        # The small epsilon avoids division by zero when a column is constant.
        self.temp_mean, self.temp_std = temps.mean(), temps.std() + 1e-8
        self.rt_mean, self.rt_std = rts.mean(), rts.std() + 1e-8
        return self._scale(temps, rts)

    def _scale(self, temps, rts):
        """Standardise with the stored training statistics and return device tensors."""
        temps = (np.array(temps) - self.temp_mean) / self.temp_std
        rts = (np.array(rts) - self.rt_mean) / self.rt_std
        return (
            torch.tensor(temps, dtype=torch.float).to(device),
            torch.tensor(rts, dtype=torch.float).to(device),
        )

    def _optimise(self, n_samples, forward_batch, targets):
        """Run the mini-batch training loop shared by both tasks.

        Args:
            n_samples: Number of training rows.
            forward_batch: Callable mapping an index array to model outputs.
            targets: Tensor of training targets, indexable by the same array.
        """
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr, weight_decay=1e-4)
        scheduler = CosineAnnealingLR(optimizer, T_max=self.num_epochs, eta_min=1e-6)

        self.model.train()
        for _ in range(self.num_epochs):
            order = np.random.permutation(n_samples)
            for start in range(0, n_samples, self.BATCH_SIZE):
                batch_idx = order[start:start + self.BATCH_SIZE]

                optimizer.zero_grad()
                loss = F.mse_loss(forward_batch(batch_idx), targets[batch_idx])
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()

            scheduler.step()  # one cosine step per epoch

        self.model.eval()

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def _train_single(self, train_X, train_Y):
        """Train the single-solvent network; rows with unknown solvents are skipped."""
        drfps, graphs, temps, rts, targets = [], [], [], [], []
        skipped = 0

        rows = zip(
            train_X['SOLVENT NAME'],
            train_X['Temperature'],
            train_X['Residence Time'],
            train_Y.values,
        )
        for solvent, temp, rt, target in rows:
            if not self._has_features(solvent):
                skipped += 1
                continue
            drfps.append(drfp_dict[solvent])
            graphs.append(solvent_graphs[solvent].clone())
            temps.append(temp)
            rts.append(rt)
            targets.append(target)

        self._check_training_rows(len(graphs), skipped)

        temps, rts = self._fit_scalers(temps, rts)
        drfps = torch.tensor(np.array(drfps), dtype=torch.float).to(device)
        targets = torch.tensor(np.array(targets), dtype=torch.float).to(device)

        self.model = DRFPGATHybrid(
            drfp_dim=2048, atom_dim=9, edge_dim=5, hidden_dim=self.hidden_dim,
            out_dim=3, heads=self.heads, num_layers=self.num_layers
        ).to(device)

        def forward_batch(batch_idx):
            batch_data = Batch.from_data_list([graphs[i].to(device) for i in batch_idx])
            return self.model(drfps[batch_idx], batch_data, temps[batch_idx], rts[batch_idx])

        self._optimise(len(graphs), forward_batch, targets)

    def _train_mixed(self, train_X, train_Y):
        """Train the mixture network; rows where either solvent is unknown are skipped."""
        drfps_A, drfps_B, graphs_A, graphs_B = [], [], [], []
        mix_fracs, temps, rts, targets = [], [], [], []
        skipped = 0

        rows = zip(
            train_X['SOLVENT A NAME'],
            train_X['SOLVENT B NAME'],
            train_X['SolventB%'],
            train_X['Temperature'],
            train_X['Residence Time'],
            train_Y.values,
        )
        for solvent_A, solvent_B, pct_B, temp, rt, target in rows:
            if not (self._has_features(solvent_A) and self._has_features(solvent_B)):
                skipped += 1
                continue
            drfps_A.append(drfp_dict[solvent_A])
            drfps_B.append(drfp_dict[solvent_B])
            graphs_A.append(solvent_graphs[solvent_A].clone())
            graphs_B.append(solvent_graphs[solvent_B].clone())
            mix_fracs.append(pct_B / 100.0)  # percentage -> fraction
            temps.append(temp)
            rts.append(rt)
            targets.append(target)

        self._check_training_rows(len(graphs_A), skipped)

        temps, rts = self._fit_scalers(temps, rts)
        drfps_A = torch.tensor(np.array(drfps_A), dtype=torch.float).to(device)
        drfps_B = torch.tensor(np.array(drfps_B), dtype=torch.float).to(device)
        mix_fracs = torch.tensor(np.array(mix_fracs), dtype=torch.float).to(device)
        targets = torch.tensor(np.array(targets), dtype=torch.float).to(device)

        self.model = DRFPGATMixtureHybrid(
            drfp_dim=2048, atom_dim=9, edge_dim=5, hidden_dim=self.hidden_dim,
            out_dim=3, heads=self.heads, num_layers=self.num_layers
        ).to(device)

        def forward_batch(batch_idx):
            data_A = Batch.from_data_list([graphs_A[i].to(device) for i in batch_idx])
            data_B = Batch.from_data_list([graphs_B[i].to(device) for i in batch_idx])
            return self.model(
                drfps_A[batch_idx], drfps_B[batch_idx], data_A, data_B,
                mix_fracs[batch_idx], temps[batch_idx], rts[batch_idx],
            )

        self._optimise(len(graphs_A), forward_batch, targets)

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------
    def predict(self, test_X):
        """Return a tensor of predictions, one row per row of ``test_X``.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.model is None:
            raise RuntimeError("predict() called before train_model()")
        self.model.eval()
        with torch.no_grad():
            if self.mixed:
                return self._predict_mixed(test_X)
            else:
                return self._predict_single(test_X)

    def _predict_single(self, test_X):
        """Predict for single-solvent rows; unknown solvents use fallback features."""
        unknown = set()
        drfps = []
        graphs = []
        for solvent in test_X['SOLVENT NAME']:
            drfp, graph = self._resolve(solvent, unknown)
            drfps.append(drfp)
            graphs.append(graph)
        self._report_unknown(unknown)

        temps, rts = self._scale(
            test_X['Temperature'].to_numpy(), test_X['Residence Time'].to_numpy()
        )
        drfps = torch.tensor(np.array(drfps), dtype=torch.float).to(device)

        batch_data = Batch.from_data_list([g.to(device) for g in graphs])
        outputs = self.model(drfps, batch_data, temps, rts)

        return outputs

    def _predict_mixed(self, test_X):
        """Predict for mixture rows; unknown solvents use fallback features."""
        unknown = set()
        drfps_A, drfps_B, graphs_A, graphs_B = [], [], [], []
        for solvent_A, solvent_B in zip(test_X['SOLVENT A NAME'], test_X['SOLVENT B NAME']):
            drfp_A, graph_A = self._resolve(solvent_A, unknown)
            drfp_B, graph_B = self._resolve(solvent_B, unknown)
            drfps_A.append(drfp_A)
            drfps_B.append(drfp_B)
            graphs_A.append(graph_A)
            graphs_B.append(graph_B)
        self._report_unknown(unknown)

        temps, rts = self._scale(
            test_X['Temperature'].to_numpy(), test_X['Residence Time'].to_numpy()
        )
        mix_fracs = test_X['SolventB%'].to_numpy() / 100.0  # percentage -> fraction

        drfps_A = torch.tensor(np.array(drfps_A), dtype=torch.float).to(device)
        drfps_B = torch.tensor(np.array(drfps_B), dtype=torch.float).to(device)
        mix_fracs = torch.tensor(np.array(mix_fracs), dtype=torch.float).to(device)

        data_A = Batch.from_data_list([g.to(device) for g in graphs_A])
        data_B = Batch.from_data_list([g.to(device) for g in graphs_B])

        outputs = self.model(drfps_A, drfps_B, data_A, data_B, mix_fracs, temps, rts)

        return outputs

print("HybridModel wrapper defined")

HybridModel wrapper defined


In [8]:
# Run CV for single solvent data
import tqdm

X, Y = load_data("single_solvent")
# One fold per distinct solvent. This is the same key the split generator
# iterates over, so the progress-bar total follows the data instead of a
# hard-coded 24.
n_folds = len(X['SOLVENT NAME'].unique())
print(f"Single solvent data: {len(X)} samples, {n_folds} solvents")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
fold_mses = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model per fold: nothing learned on one fold leaks into the next.
    model = HybridModel(data='single', hidden_dim=128, num_epochs=400, lr=1e-3, heads=4, num_layers=3)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # MSE over every held-out row and all three targets of this fold.
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)

    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"\nSingle solvent predictions: {len(submission_single_solvent)} rows")
# Unweighted mean over folds: every solvent counts equally regardless of its row count.
print(f"Mean fold MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

Single solvent data: 656 samples, 24 solvents


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:56<21:46, 56.79s/it]

  8%|▊         | 2/24 [01:53<20:44, 56.58s/it]

 12%|█▎        | 3/24 [02:49<19:43, 56.36s/it]

 17%|█▋        | 4/24 [03:43<18:27, 55.36s/it]

 21%|██        | 5/24 [04:39<17:35, 55.58s/it]

 25%|██▌       | 6/24 [05:38<17:01, 56.74s/it]

 29%|██▉       | 7/24 [06:34<16:01, 56.56s/it]

 33%|███▎      | 8/24 [07:27<14:48, 55.52s/it]

 38%|███▊      | 9/24 [08:23<13:56, 55.74s/it]

 42%|████▏     | 10/24 [09:20<13:04, 56.06s/it]

 46%|████▌     | 11/24 [10:16<12:08, 56.06s/it]

 50%|█████     | 12/24 [11:12<11:12, 56.08s/it]

 54%|█████▍    | 13/24 [12:08<10:15, 55.98s/it]

 58%|█████▊    | 14/24 [13:04<09:19, 55.97s/it]

 62%|██████▎   | 15/24 [14:00<08:23, 55.94s/it]

 67%|██████▋   | 16/24 [14:56<07:28, 56.05s/it]

 71%|███████   | 17/24 [15:52<06:32, 56.08s/it]

 75%|███████▌  | 18/24 [16:48<05:36, 56.08s/it]

 79%|███████▉  | 19/24 [17:44<04:40, 56.05s/it]

 83%|████████▎ | 20/24 [18:40<03:44, 56.06s/it]

 88%|████████▊ | 21/24 [19:37<02:48, 56.24s/it]

 92%|█████████▏| 22/24 [20:33<01:52, 56.19s/it]

 96%|█████████▌| 23/24 [21:30<00:56, 56.31s/it]

100%|██████████| 24/24 [22:25<00:00, 56.07s/it]

100%|██████████| 24/24 [22:25<00:00, 56.07s/it]


Single solvent predictions: 656 rows
Mean fold MSE: 0.016990 ± 0.016228





In [9]:
# Run CV for full (mixture) data
X, Y = load_data("full")
print(f"Full data: {len(X)} samples")

# One fold per distinct "A_B" solvent pair. This is the same key the ramp
# split generator builds, so the progress-bar total follows the data instead
# of a hard-coded 13.
n_folds = (X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)).nunique()

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []
fold_mses = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model per fold: nothing learned on one fold leaks into the next.
    model = HybridModel(data='full', hidden_dim=128, num_epochs=400, lr=1e-3, heads=4, num_layers=3)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # MSE over every held-out row and all three targets of this fold.
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)

    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions)
print(f"\nFull data predictions: {len(submission_full_data)} rows")
# Unweighted mean over folds: every ramp counts equally regardless of its row count.
print(f"Mean fold MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

Full data: 1227 samples


  0%|          | 0/13 [00:00<?, ?it/s]

  8%|▊         | 1/13 [03:01<36:18, 181.57s/it]

 15%|█▌        | 2/13 [06:02<33:12, 181.09s/it]

 23%|██▎       | 3/13 [09:08<30:32, 183.29s/it]

 31%|███       | 4/13 [12:08<27:19, 182.12s/it]

 38%|███▊      | 5/13 [15:09<24:12, 181.61s/it]

 46%|████▌     | 6/13 [18:10<21:09, 181.34s/it]

 54%|█████▍    | 7/13 [21:11<18:08, 181.49s/it]

 62%|██████▏   | 8/13 [24:14<15:08, 181.77s/it]

 69%|██████▉   | 9/13 [27:15<12:06, 181.52s/it]

 77%|███████▋  | 10/13 [30:33<09:20, 186.69s/it]

 85%|████████▍ | 11/13 [33:51<06:20, 190.28s/it]

 92%|█████████▏| 12/13 [37:09<03:12, 192.50s/it]

100%|██████████| 13/13 [40:27<00:00, 194.30s/it]

100%|██████████| 13/13 [40:27<00:00, 186.76s/it]


Full data predictions: 1227 rows
Mean fold MSE: 0.020746 ± 0.013590





In [10]:
# Stack both tasks' predictions, write them out, then re-read the file as a sanity check.
_submission_path = "/home/submission/submission.csv"

# reset_index() without drop=True keeps each frame's old row number as an
# "index" column, which is why the shape reports 7 columns.
# NOTE(review): presumably the submission format expects that column - confirm before changing.
submission = pd.concat([submission_single_solvent, submission_full_data]).reset_index()
submission.index.name = "id"

print(f"Submission shape: {submission.shape}")

submission.to_csv(_submission_path, index=True)
print(f"\nSubmission saved to {_submission_path}")

submission_check = pd.read_csv(_submission_path)
print(f"\nSubmission rows: {len(submission_check)}")

# Range check on each predicted target column.
target_cols = ['target_1', 'target_2', 'target_3']
for col in target_cols:
    _lo = submission_check[col].min()
    _hi = submission_check[col].max()
    print(f"{col}: min={_lo:.4f}, max={_hi:.4f}")

Submission shape: (1883, 7)

Submission saved to /home/submission/submission.csv

Submission rows: 1883
target_1: min=0.0008, max=0.9817
target_2: min=0.0003, max=0.4334
target_3: min=0.0003, max=0.4641


In [11]:
# Print the experiment banner and a recap of the techniques used.
_rule = "=" * 50
print(_rule, "EXPERIMENT 087 COMPLETE", _rule, sep="\n")

# Placeholders only: the real CV numbers are assigned in a later cell.
single_cv = 0.0  # Will be filled from output
full_cv = 0.0  # Will be filled from output

_techniques = (
    "DRFP features (2048-dim) + GAT graph embeddings",
    "LEARNED mixture-aware encoding (not simple concatenation)",
    "GAT with 4 heads, 3 layers, 128 hidden dim",
    "400 epochs with cosine annealing LR",
    "Combined DRFP + GAT before final MLP",
)
print("\nKey techniques:")
for _number, _technique in enumerate(_techniques, start=1):
    print(f"{_number}. {_technique}")
print("\nThis implements the key ingredients from the GNN benchmark (arXiv:2512.19530).")

EXPERIMENT 087 COMPLETE

Key techniques:
1. DRFP features (2048-dim) + GAT graph embeddings
2. LEARNED mixture-aware encoding (not simple concatenation)
3. GAT with 4 heads, 3 layers, 128 hidden dim
4. 400 epochs with cosine annealing LR
5. Combined DRFP + GAT before final MLP

This implements the key ingredients from the GNN benchmark (arXiv:2512.19530).


In [None]:
# Calculate overall CV score
# Mean fold MSEs copied by hand from the printed output of the two CV cells.
single_cv = 0.016990
full_cv = 0.020746

# Row counts of the two tasks, used as weights.
N_SINGLE = 656
N_FULL = 1227

# Reference numbers from earlier experiments.
GAT_ONLY_CV = 0.018474
GCN_ONLY_CV = 0.020130
BEST_PREV_CV = 0.008092  # exp_049
BEST_LB = 0.0877         # exp_030, exp_067

# Linear CV -> leaderboard relation used below to extrapolate the LB score.
LB_SLOPE = 4.36
LB_INTERCEPT = 0.052

# Weighted by sample count. The inputs are per-fold means, so this is an
# approximation of the pooled per-row MSE rather than the exact value.
total_samples = N_SINGLE + N_FULL
overall_cv = (N_SINGLE * single_cv + N_FULL * full_cv) / total_samples

print(f"\nSingle solvent CV: {single_cv:.6f}")
print(f"Full data CV: {full_cv:.6f}")
print(f"Overall CV (sample-weighted): {overall_cv:.6f}")

print("\n" + "="*50)
print("COMPARISON WITH PREVIOUS RESULTS")
print("="*50)
print(f"This experiment (DRFP+GAT): {overall_cv:.6f}")
print(f"Previous GAT only: {GAT_ONLY_CV:.6f}")
print(f"Previous GCN only: {GCN_ONLY_CV:.6f}")
print(f"Best previous CV (Leave-One-Out): {BEST_PREV_CV:.6f} (exp_049)")
print(f"Best verified LB: {BEST_LB:.4f} (exp_030, exp_067)")

print("\n" + "="*50)
print("ANALYSIS")
print("="*50)
if overall_cv < GAT_ONLY_CV:
    print(f"DRFP+GAT improved over GAT-only by {(GAT_ONLY_CV - overall_cv) / GAT_ONLY_CV * 100:.1f}%")
else:
    print("DRFP+GAT did NOT improve over GAT-only")

print(f"\nThe DRFP+GAT hybrid achieved CV={overall_cv:.6f}")
# Both conclusions below are derived from the numbers instead of being printed
# unconditionally, so they stay true if the CV values above are updated.
tabular_gap_pct = (overall_cv - BEST_PREV_CV) / BEST_PREV_CV * 100
if tabular_gap_pct > 0:
    print(f"This is still {tabular_gap_pct:.1f}% worse than best tabular ({BEST_PREV_CV:.6f})")
else:
    print(f"This is {-tabular_gap_pct:.1f}% better than best tabular ({BEST_PREV_CV:.6f})")

predicted_lb = LB_SLOPE * overall_cv + LB_INTERCEPT
print(f"\nPredicted LB: {LB_SLOPE} * {overall_cv:.6f} + {LB_INTERCEPT} = {predicted_lb:.4f}")
if predicted_lb > BEST_LB:
    print(f"This would be worse than our best LB ({BEST_LB:.4f})")
else:
    print(f"This would match or beat our best LB ({BEST_LB:.4f})")