# Experiment 086: Graph Attention Network (GAT) + DRFP

**Rationale**: The GNN benchmark (arXiv:2512.19530) achieved MSE 0.0039 using:
1. Graph Attention Networks (GAT) - NOT simple GCN
2. DRFP (Differential Reaction Fingerprints)
3. Learned mixture-aware solvent encodings
4. Edge features (bond types, bond orders)

**Key improvements over exp_085:**
- GAT layers with attention mechanism
- Edge features (bond type, bond order, aromaticity, ring membership)
- Better mixture handling: encode BOTH solvents + mixture fraction
- Longer training (300 epochs)
- Larger hidden dimensions (128 channels)
- Cosine annealing LR scheduler

In [1]:
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GATConv, global_mean_pool, global_add_pool
import warnings
warnings.filterwarnings('ignore')

# Seed every random number generator the notebook draws from (NumPy, torch CPU,
# torch CUDA) with the same value.
SEED = 42
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

# Run on the GPU when one is visible, otherwise on the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f'Device: {device}')
if has_cuda:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'Memory: {gpu_props.total_memory / 1e9:.1f} GB')

Device: cuda
GPU: NVIDIA H100 80GB HBM3
Memory: 85.0 GB


In [2]:
# Load data
def load_data(data_type):
    """Load reaction conditions and measured yields for one benchmark task.

    Args:
        data_type: ``"single_solvent"`` to read the single-solvent CSV
            (features: residence time, temperature, solvent name) or
            ``"full"`` to read the mixture CSV (features: residence time,
            temperature, solvent A name, solvent B name, SolventB%).

    Returns:
        A tuple ``(X, Y)`` of DataFrames: ``X`` holds the feature columns
        listed above, ``Y`` holds the ``SM``, ``Product 2`` and ``Product 3``
        columns.

    Raises:
        ValueError: if ``data_type`` is not one of the two supported names.
    """
    sources = {
        "single_solvent": (
            '/home/data/catechol_single_solvent_yields.csv',
            ['Residence Time', 'Temperature', 'SOLVENT NAME'],
        ),
        "full": (
            '/home/data/catechol_full_data_yields.csv',
            ['Residence Time', 'Temperature', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%'],
        ),
    }
    # Reject unknown names up front; otherwise the function would fall through
    # and fail on an unbound local at the return statement.
    if data_type not in sources:
        raise ValueError(
            f"Unknown data_type {data_type!r}; expected one of {sorted(sources)}"
        )

    csv_path, feature_columns = sources[data_type]
    df = pd.read_csv(csv_path)
    X = df[feature_columns]
    Y = df[['SM', 'Product 2', 'Product 3']]
    return X, Y

# Read the solvent-name -> SMILES table and keep it as a plain dict for lookups.
smiles_df = pd.read_csv('/home/data/smiles_lookup.csv')
smiles_dict = {
    solvent: smi
    for solvent, smi in zip(smiles_df['SOLVENT NAME'], smiles_df['solvent smiles'])
}
print(f"Loaded {len(smiles_dict)} SMILES")

Loaded 26 SMILES


In [3]:
# Official CV split functions (DO NOT MODIFY)
from typing import Any, Generator

def generate_leave_one_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield one leave-one-solvent-out train/test split per distinct solvent.

    Folds are produced in the order in which solvents first appear in
    ``X["SOLVENT NAME"]``. For each solvent, the test part contains exactly
    the rows whose solvent name equals it and the train part contains all
    other rows; the same boolean masks are applied to ``X`` and ``Y``.

    Args:
        X: Feature frame; must contain a ``"SOLVENT NAME"`` column.
        Y: Target frame, indexed by the boolean masks built from ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each solvent.
    """
    for solvent in X["SOLVENT NAME"].unique():
        train_mask = X["SOLVENT NAME"] != solvent
        test_mask = X["SOLVENT NAME"] == solvent
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

def generate_leave_one_ramp_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield one leave-one-ramp-out train/test split per solvent pair.

    A "ramp" is identified by the string ``"<SOLVENT A NAME>_<SOLVENT B NAME>"``
    built from each row. Folds are produced in order of first appearance of
    each ramp; the test part holds every row of that ramp and the train part
    holds all remaining rows.

    Args:
        X: Feature frame; must contain ``"SOLVENT A NAME"`` and
            ``"SOLVENT B NAME"`` columns.
        Y: Target frame, indexed by the boolean masks built from ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each ramp.
    """
    # The pair is order-sensitive: "A_B" and "B_A" are treated as different ramps.
    ramps = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
    for ramp in ramps.unique():
        train_mask = ramps != ramp
        test_mask = ramps == ramp
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

print("CV split functions defined")

CV split functions defined


In [4]:
# Convert SMILES to molecular graph with EDGE FEATURES
def smiles_to_graph(smiles):
    """Convert a SMILES string to a PyTorch Geometric ``Data`` object with edge features.

    Node features (9 per atom): atomic number, degree, formal charge,
    hybridization enum value, aromatic flag, total hydrogen count, ring flag,
    atomic mass / 100, chiral-tag enum value.

    Edge features (5 per directed edge): bond-type index (single=0, double=1,
    triple=2, aromatic=3; any other bond type falls back to 0), bond order as
    a float, aromatic flag, conjugated flag, ring flag. Every bond is stored
    in both directions with identical features. A molecule without bonds gets
    one self-loop on atom 0 so that ``edge_index`` and ``edge_attr`` are never
    empty.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        ``Data(x, edge_index, edge_attr)`` with float node/edge features and a
        ``[2, num_edges]`` long ``edge_index``, or ``None`` when ``smiles`` is
        not a string, cannot be parsed, or describes a molecule with no atoms.
    """
    # Non-string input (e.g. NaN from a blank CSV cell) is not a SMILES; report
    # it the same way as an unparseable string instead of failing inside RDKit.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    # A zero-atom molecule cannot carry the self-loop added below (it would
    # reference a non-existent atom 0), so treat it as unparseable as well.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Atom features (expanded)
    atom_features = [
        [
            atom.GetAtomicNum(),  # Atomic number
            atom.GetDegree(),  # Number of bonds
            atom.GetFormalCharge(),  # Formal charge
            int(atom.GetHybridization()),  # Hybridization
            int(atom.GetIsAromatic()),  # Is aromatic
            atom.GetTotalNumHs(),  # Number of hydrogens
            int(atom.IsInRing()),  # Is in ring
            atom.GetMass() / 100.0,  # Atomic mass (normalized)
            int(atom.GetChiralTag()),  # Chirality
        ]
        for atom in mol.GetAtoms()
    ]

    bond_type_map = {
        Chem.rdchem.BondType.SINGLE: 0,
        Chem.rdchem.BondType.DOUBLE: 1,
        Chem.rdchem.BondType.TRIPLE: 2,
        Chem.rdchem.BondType.AROMATIC: 3,
    }

    # Edge index and edge features
    edge_index = []
    edge_features = []
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()

        edge_feat = [
            bond_type_map.get(bond.GetBondType(), 0),
            bond.GetBondTypeAsDouble(),
            int(bond.GetIsAromatic()),
            int(bond.GetIsConjugated()),
            int(bond.IsInRing()),
        ]

        # Add both directions (undirected graph)
        edge_index += [[begin, end], [end, begin]]
        edge_features += [edge_feat, edge_feat]

    if not edge_index:
        # Single atom molecule - add self-loop
        edge_index = [[0, 0]]
        edge_features = [[0, 1.0, 0, 0, 0]]

    x = torch.tensor(atom_features, dtype=torch.float)
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_features, dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# Test
# Smoke test on ethanol: report graph size and feature tensor shapes.
test_graph = smiles_to_graph('CCO')  # Ethanol
n_nodes, n_edges = test_graph.num_nodes, test_graph.num_edges
print(f"Ethanol graph: {n_nodes} nodes, {n_edges} edges")
node_shape, edge_shape = test_graph.x.shape, test_graph.edge_attr.shape
print(f"Node features shape: {node_shape}")
print(f"Edge features shape: {edge_shape}")

Ethanol graph: 3 nodes, 4 edges
Node features shape: torch.Size([3, 9])
Edge features shape: torch.Size([4, 5])


In [5]:
# Pre-compute all solvent graphs
# Build the name -> graph cache once; solvents whose SMILES cannot be converted
# are reported and left out of the cache.
solvent_graphs = {}
for name, smiles in smiles_dict.items():
    graph = smiles_to_graph(smiles)
    if graph is None:
        print(f"WARNING: Could not parse {name}: {smiles}")
        continue
    solvent_graphs[name] = graph
    print(f"{name}: {graph.num_nodes} atoms, {graph.num_edges} edges, edge_attr: {graph.edge_attr.shape}")

print(f"\nTotal: {len(solvent_graphs)} solvent graphs")

Cyclohexane: 6 atoms, 12 edges, edge_attr: torch.Size([12, 5])
Ethyl Acetate: 6 atoms, 10 edges, edge_attr: torch.Size([10, 5])
Acetic Acid: 4 atoms, 6 edges, edge_attr: torch.Size([6, 5])
2-Methyltetrahydrofuran [2-MeTHF]: 6 atoms, 12 edges, edge_attr: torch.Size([12, 5])
1,1,1,3,3,3-Hexafluoropropan-2-ol: 10 atoms, 18 edges, edge_attr: torch.Size([18, 5])
IPA [Propan-2-ol]: 4 atoms, 6 edges, edge_attr: torch.Size([6, 5])
Ethanol: 3 atoms, 4 edges, edge_attr: torch.Size([4, 5])
Methanol: 2 atoms, 2 edges, edge_attr: torch.Size([2, 5])
Ethylene Glycol [1,2-Ethanediol]: 4 atoms, 6 edges, edge_attr: torch.Size([6, 5])
Acetonitrile: 3 atoms, 4 edges, edge_attr: torch.Size([4, 5])
Water: 1 atoms, 1 edges, edge_attr: torch.Size([1, 5])
Diethyl Ether [Ether]: 5 atoms, 8 edges, edge_attr: torch.Size([8, 5])
MTBE [tert-Butylmethylether]: 6 atoms, 10 edges, edge_attr: torch.Size([10, 5])
Dimethyl Carbonate: 6 atoms, 10 edges, edge_attr: torch.Size([10, 5])
tert-Butanol [2-Methylpropan-2-ol]: 5 

In [6]:
# GAT Model with Edge Features
class SolventGAT(nn.Module):
    """Graph attention network mapping one solvent graph plus (T, RT) to three outputs.

    A stack of edge-aware ``GATConv`` layers, each followed by batch
    normalisation, produces per-atom embeddings. These are mean-pooled into
    one vector per graph, concatenated with the temperature and residence-time
    scalars, and passed through an MLP that ends in a sigmoid.

    Args:
        in_channels: Number of features per atom.
        edge_dim: Number of features per edge.
        hidden_channels: Width of each attention head and of the graph embedding.
        out_channels: Number of predicted values per sample.
        heads: Attention heads used by every layer except the last.
        num_layers: Nominal layer count; the stack always contains a first and
            a last layer plus ``num_layers - 2`` middle layers.
    """

    def __init__(self, in_channels=9, edge_dim=5, hidden_channels=128, out_channels=3, heads=4, num_layers=4):
        super().__init__()

        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()

        multi_head_width = hidden_channels * heads
        # One (input width, head count, concatenate heads?) entry per layer:
        # first layer, middle layers, then a single-head last layer that
        # brings the width back to hidden_channels.
        layer_plan = (
            [(in_channels, heads, True)]
            + [(multi_head_width, heads, True)] * (num_layers - 2)
            + [(multi_head_width, 1, False)]
        )
        for in_dim, n_heads, concat in layer_plan:
            self.convs.append(GATConv(in_dim, hidden_channels, heads=n_heads, edge_dim=edge_dim, concat=concat))
            self.bns.append(nn.BatchNorm1d(multi_head_width if concat else hidden_channels))

        # Final MLP: graph embedding + T + RT
        half_width = hidden_channels // 2
        self.lin = nn.Sequential(
            nn.Linear(hidden_channels + 2, hidden_channels),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_channels, half_width),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(half_width, out_channels),
            nn.Sigmoid()  # Output in [0, 1]
        )

    def forward(self, data, T, RT):
        """Predict outputs for a batch of graphs.

        Args:
            data: Batched graph object exposing ``x``, ``edge_index``,
                ``edge_attr`` and ``batch``.
            T: 1-D tensor of temperatures, one per graph.
            RT: 1-D tensor of residence times, one per graph.

        Returns:
            Tensor with one row per graph and ``out_channels`` columns.
        """
        h = data.x
        last_layer = len(self.convs) - 1

        # Message passing with attention; the last layer gets no activation/dropout.
        for depth, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            h = bn(conv(h, data.edge_index, edge_attr=data.edge_attr))
            if depth == last_layer:
                continue
            h = F.dropout(F.elu(h), p=0.1, training=self.training)

        # Global pooling: one embedding per graph
        graph_emb = global_mean_pool(h, data.batch)

        # Concatenate with T and RT
        mlp_input = torch.cat([graph_emb, T.unsqueeze(1), RT.unsqueeze(1)], dim=1)
        return self.lin(mlp_input)

print("SolventGAT defined")

SolventGAT defined


In [7]:
# GAT Model for MIXTURES - encodes BOTH solvents
class MixtureSolventGAT(nn.Module):
    """Graph attention network for two-solvent mixtures.

    Both solvent graphs pass through the same stack of edge-aware ``GATConv``
    + batch-norm layers and are mean-pooled to one embedding each. The two
    embeddings are concatenated with the mixture fraction, temperature and
    residence time and fed to an MLP that ends in a sigmoid.

    Args:
        in_channels: Number of features per atom.
        edge_dim: Number of features per edge.
        hidden_channels: Width of each attention head and of each graph embedding.
        out_channels: Number of predicted values per sample.
        heads: Attention heads used by every layer except the last.
        num_layers: Nominal layer count; the stack always contains a first and
            a last layer plus ``num_layers - 2`` middle layers.
    """

    def __init__(self, in_channels=9, edge_dim=5, hidden_channels=128, out_channels=3, heads=4, num_layers=4):
        super().__init__()

        # Shared GAT encoder for both solvents
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()

        multi_head_width = hidden_channels * heads
        # One (input width, head count, concatenate heads?) entry per layer:
        # first layer, middle layers, then a single-head last layer.
        layer_plan = (
            [(in_channels, heads, True)]
            + [(multi_head_width, heads, True)] * (num_layers - 2)
            + [(multi_head_width, 1, False)]
        )
        for in_dim, n_heads, concat in layer_plan:
            self.convs.append(GATConv(in_dim, hidden_channels, heads=n_heads, edge_dim=edge_dim, concat=concat))
            self.bns.append(nn.BatchNorm1d(multi_head_width if concat else hidden_channels))

        # Mixture combination layer
        # Input: solvent_A_emb (hidden) + solvent_B_emb (hidden) + mixture_fraction (1) + T (1) + RT (1)
        half_width = hidden_channels // 2
        self.lin = nn.Sequential(
            nn.Linear(hidden_channels * 2 + 3, hidden_channels),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_channels, half_width),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(half_width, out_channels),
            nn.Sigmoid()
        )

    def encode_graph(self, data):
        """Encode a batch of solvent graphs into one embedding per graph."""
        h = data.x
        last_layer = len(self.convs) - 1

        # The last layer gets no activation/dropout.
        for depth, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            h = bn(conv(h, data.edge_index, edge_attr=data.edge_attr))
            if depth == last_layer:
                continue
            h = F.dropout(F.elu(h), p=0.1, training=self.training)

        return global_mean_pool(h, data.batch)

    def forward(self, data_A, data_B, mix_frac, T, RT):
        """Predict outputs for a batch of solvent pairs.

        Args:
            data_A: Batched graphs of the first solvent.
            data_B: Batched graphs of the second solvent.
            mix_frac: 1-D tensor with the mixture fraction of each sample.
            T: 1-D tensor of temperatures.
            RT: 1-D tensor of residence times.

        Returns:
            Tensor with one row per sample and ``out_channels`` columns.
        """
        # Encode A first, then B, with the shared encoder
        emb_A = self.encode_graph(data_A)
        emb_B = self.encode_graph(data_B)

        # Concatenate: A_emb, B_emb, mix_frac, T, RT
        scalars = [mix_frac.unsqueeze(1), T.unsqueeze(1), RT.unsqueeze(1)]
        mlp_input = torch.cat([emb_A, emb_B, *scalars], dim=1)
        return self.lin(mlp_input)

print("MixtureSolventGAT defined")

MixtureSolventGAT defined


In [8]:
# GNN Model wrapper for single solvent
class GATModel:
    """Train/predict wrapper around ``SolventGAT`` and ``MixtureSolventGAT``.

    With ``data='full'`` the wrapper trains a ``MixtureSolventGAT`` on solvent
    pairs; with any other value it trains a ``SolventGAT`` on single solvents.
    Solvent graphs are looked up by name in the module-level ``solvent_graphs``
    dict. Temperature and residence time are standardised with statistics
    computed on the training rows and reused at prediction time.

    Args:
        data: ``'full'`` for mixtures, anything else for single solvents.
        hidden_channels: Hidden width passed to the network.
        num_epochs: Number of training epochs (also the cosine schedule length).
        lr: Initial AdamW learning rate.
        heads: Attention heads passed to the network.
        num_layers: Layer count passed to the network.
    """

    def __init__(self, data='single', hidden_channels=128, num_epochs=300, lr=1e-3, heads=4, num_layers=4):
        self.data_type = data
        self.mixed = (data == 'full')
        self.hidden_channels = hidden_channels
        self.num_epochs = num_epochs
        self.lr = lr
        self.heads = heads
        self.num_layers = num_layers
        self.model = None

    def train_model(self, train_X, train_Y):
        """Fit a fresh network on the given training frames."""
        if self.mixed:
            self._train_mixed(train_X, train_Y)
        else:
            self._train_single(train_X, train_Y)

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _as_tensor(values):
        """Convert array-like values to a float tensor on the active device."""
        return torch.tensor(values, dtype=torch.float).to(device)

    def _fit_and_scale(self, temps, rts):
        """Store mean/std of the training T and RT and return both standardised."""
        temps = np.array(temps)
        rts = np.array(rts)
        # The small epsilon keeps the division finite when a column is constant.
        self.temp_mean, self.temp_std = temps.mean(), temps.std() + 1e-8
        self.rt_mean, self.rt_std = rts.mean(), rts.std() + 1e-8
        return self._scale(temps, rts)

    def _scale(self, temps, rts):
        """Standardise T and RT with the statistics stored during training."""
        temps = (np.array(temps) - self.temp_mean) / self.temp_std
        rts = (np.array(rts) - self.rt_mean) / self.rt_std
        return temps, rts

    def _build_network(self, network_cls):
        """Instantiate ``network_cls`` with this wrapper's hyper-parameters."""
        return network_cls(
            in_channels=9, edge_dim=5, hidden_channels=self.hidden_channels,
            out_channels=3, heads=self.heads, num_layers=self.num_layers
        ).to(device)

    @staticmethod
    def _graph_or_fallback(solvent_name):
        """Return a copy of the solvent's graph, or of the first known graph.

        The fallback keeps prediction running for a solvent without a graph by
        substituting the first entry of ``solvent_graphs``; the resulting
        prediction is therefore that of a different solvent.
        """
        if solvent_name in solvent_graphs:
            return solvent_graphs[solvent_name].clone()
        return next(iter(solvent_graphs.values())).clone()

    def _fit(self, graph_lists, extras, targets):
        """Run the optimisation loop on ``self.model``.

        Args:
            graph_lists: One list of graphs per graph argument of the network
                (one list for single solvents, two for mixtures); all lists
                have the same length and are aligned by sample.
            extras: Per-sample 1-D tensors passed to the network after the
                graph batches, in call order.
            targets: Tensor of training targets, one row per sample.
        """
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr, weight_decay=1e-4)
        scheduler = CosineAnnealingLR(optimizer, T_max=self.num_epochs, eta_min=1e-6)

        # Move every graph to the device once instead of once per batch per epoch.
        graph_lists = [[g.to(device) for g in graphs] for graphs in graph_lists]

        self.model.train()
        batch_size = 32
        n_samples = len(graph_lists[0])

        for _ in range(self.num_epochs):
            order = np.random.permutation(n_samples)

            for start in range(0, n_samples, batch_size):
                batch_idx = order[start:start + batch_size]
                batched = [
                    Batch.from_data_list([graphs[i] for i in batch_idx])
                    for graphs in graph_lists
                ]

                # BatchNorm1d cannot compute batch statistics from a single
                # row in training mode and raises; this happens when a
                # trailing one-sample batch holds a single-atom molecule.
                # Skip such a batch instead of aborting the whole run.
                if any(b.x.size(0) < 2 for b in batched):
                    continue

                optimizer.zero_grad()
                outputs = self.model(*batched, *(t[batch_idx] for t in extras))
                loss = F.mse_loss(outputs, targets[batch_idx])
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()

            scheduler.step()

        self.model.eval()

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def _train_single(self, train_X, train_Y):
        """Train a ``SolventGAT`` on single-solvent rows."""
        graphs, temps, rts, targets = [], [], [], []

        rows = zip(
            train_X['SOLVENT NAME'], train_X['Temperature'],
            train_X['Residence Time'], train_Y.to_numpy(),
        )
        for solvent_name, temp, rt, target in rows:
            # Rows whose solvent has no graph cannot be used for training.
            if solvent_name not in solvent_graphs:
                continue
            graphs.append(solvent_graphs[solvent_name].clone())
            temps.append(temp)
            rts.append(rt)
            targets.append(target)

        # Normalize T and RT
        temps, rts = self._fit_and_scale(temps, rts)
        temps = self._as_tensor(temps)
        rts = self._as_tensor(rts)
        targets = self._as_tensor(np.array(targets))

        self.model = self._build_network(SolventGAT)
        self._fit([graphs], [temps, rts], targets)

    def _train_mixed(self, train_X, train_Y):
        """Train a ``MixtureSolventGAT`` on solvent-pair rows."""
        graphs_A, graphs_B = [], []
        mix_fracs, temps, rts, targets = [], [], [], []

        rows = zip(
            train_X['SOLVENT A NAME'], train_X['SOLVENT B NAME'], train_X['SolventB%'],
            train_X['Temperature'], train_X['Residence Time'], train_Y.to_numpy(),
        )
        for solvent_A, solvent_B, percent_B, temp, rt, target in rows:
            # Both solvents need a graph for the row to be usable.
            if solvent_A not in solvent_graphs or solvent_B not in solvent_graphs:
                continue
            graphs_A.append(solvent_graphs[solvent_A].clone())
            graphs_B.append(solvent_graphs[solvent_B].clone())
            mix_fracs.append(percent_B / 100.0)
            temps.append(temp)
            rts.append(rt)
            targets.append(target)

        # Normalize T and RT; the mixture fraction is used as-is.
        temps, rts = self._fit_and_scale(temps, rts)
        temps = self._as_tensor(temps)
        rts = self._as_tensor(rts)
        mix_fracs = self._as_tensor(np.array(mix_fracs))
        targets = self._as_tensor(np.array(targets))

        self.model = self._build_network(MixtureSolventGAT)
        self._fit([graphs_A, graphs_B], [mix_fracs, temps, rts], targets)

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------
    def predict(self, test_X):
        """Predict targets for ``test_X`` with the trained network.

        Returns:
            Tensor on the active device with one row per row of ``test_X``.

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if self.model is None:
            raise RuntimeError("GATModel.predict() called before train_model()")

        self.model.eval()
        with torch.no_grad():
            if self.mixed:
                return self._predict_mixed(test_X)
            else:
                return self._predict_single(test_X)

    def _predict_single(self, test_X):
        """Predict for single-solvent rows."""
        graphs = [self._graph_or_fallback(name) for name in test_X['SOLVENT NAME']]

        temps, rts = self._scale(test_X['Temperature'], test_X['Residence Time'])
        temps = self._as_tensor(temps)
        rts = self._as_tensor(rts)

        batch_data = Batch.from_data_list([g.to(device) for g in graphs])
        outputs = self.model(batch_data, temps, rts)

        return outputs

    def _predict_mixed(self, test_X):
        """Predict for solvent-pair rows."""
        graphs_A = [self._graph_or_fallback(name) for name in test_X['SOLVENT A NAME']]
        graphs_B = [self._graph_or_fallback(name) for name in test_X['SOLVENT B NAME']]
        mix_fracs = [percent_B / 100.0 for percent_B in test_X['SolventB%']]

        temps, rts = self._scale(test_X['Temperature'], test_X['Residence Time'])
        temps = self._as_tensor(temps)
        rts = self._as_tensor(rts)
        mix_fracs = self._as_tensor(np.array(mix_fracs))

        data_A = Batch.from_data_list([g.to(device) for g in graphs_A])
        data_B = Batch.from_data_list([g.to(device) for g in graphs_B])

        outputs = self.model(data_A, data_B, mix_fracs, temps, rts)

        return outputs

print("GATModel wrapper defined")

GATModel wrapper defined


In [None]:
# Run CV for single solvent data
import tqdm

X, Y = load_data("single_solvent")
# One fold per distinct solvent, matching generate_leave_one_out_splits.
n_folds = len(X['SOLVENT NAME'].unique())
print(f"Single solvent data: {len(X)} samples, {n_folds} solvents")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
fold_mses = []

# The progress-bar total is derived from the data rather than hard-coded, so
# it stays correct if the dataset changes.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GATModel(data='single', hidden_channels=128, num_epochs=300, lr=1e-3, heads=4, num_layers=4)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # Guard against silent broadcasting in the MSE below.
    assert predictions_np.shape == test_Y.shape, (
        f"fold {fold_idx}: predictions {predictions_np.shape} vs targets {test_Y.shape}"
    )

    # Calculate fold MSE
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)

    # "row" is the position of the sample inside this fold's test set.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"\nSingle solvent predictions: {len(submission_single_solvent)} rows")
print(f"Mean fold MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

In [None]:
# Run CV for full (mixture) data
X, Y = load_data("full")
print(f"Full data: {len(X)} samples")

# One fold per distinct (solvent A, solvent B) ramp, built the same way as in
# generate_leave_one_ramp_out_splits, so the progress-bar total is derived
# from the data rather than hard-coded.
n_folds = (X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)).nunique()

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []
fold_mses = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GATModel(data='full', hidden_channels=128, num_epochs=300, lr=1e-3, heads=4, num_layers=4)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # Guard against silent broadcasting in the MSE below.
    assert predictions_np.shape == test_Y.shape, (
        f"fold {fold_idx}: predictions {predictions_np.shape} vs targets {test_Y.shape}"
    )

    # Calculate fold MSE
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)

    # "row" is the position of the sample inside this fold's test set.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions)
print(f"\nFull data predictions: {len(submission_full_data)} rows")
print(f"Mean fold MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

In [None]:
# Combine and save submission
# Stack both tasks. reset_index() without drop keeps each frame's previous row
# labels as an "index" column; the fresh running index is then named "id".
submission = pd.concat([submission_single_solvent, submission_full_data]).reset_index()
submission.index.name = "id"

print(f"Submission shape: {submission.shape}")

# Write the file, then read it back to confirm what actually landed on disk.
submission.to_csv("/home/submission/submission.csv", index=True)
print(f"\nSubmission saved to /home/submission/submission.csv")

submission_check = pd.read_csv("/home/submission/submission.csv")
print(f"\nSubmission rows: {len(submission_check)}")

# Report the value range of each predicted target.
target_cols = ['target_1', 'target_2', 'target_3']
for col in target_cols:
    column = submission_check[col]
    lo, hi = column.min(), column.max()
    print(f"{col}: min={lo:.4f}, max={hi:.4f}")

In [None]:
# Calculate overall CV score
print("="*50)
print("EXPERIMENT 086 COMPLETE")
print("="*50)

# Recompute the single-solvent CV score from the stored predictions.
# X and Y were reassigned to the mixture data by the mixture CV cell, so the
# single-solvent table is reloaded here under its own names instead of
# relying on whichever dataset X/Y currently hold.
X_single, Y_single = load_data("single_solvent")
prediction_cols = ['target_1', 'target_2', 'target_3']
yield_cols = ['SM', 'Product 2', 'Product 3']

# Fold f of the leave-one-out split tests the f-th distinct solvent (in order
# of first appearance), and predictions were stored in test-row order, so a
# boolean mask on the solvent name lines the targets up with the predictions.
single_fold_mses = []
for fold, solvent in enumerate(X_single['SOLVENT NAME'].unique()):
    fold_preds = submission_single_solvent.loc[
        submission_single_solvent['fold'] == fold, prediction_cols
    ].values
    fold_truth = Y_single.loc[X_single['SOLVENT NAME'] == solvent, yield_cols].values
    single_fold_mses.append(np.mean((fold_preds - fold_truth) ** 2))

single_cv = np.mean(single_fold_mses)
print(f"\nSingle solvent CV MSE (mean over folds): {single_cv:.6f}")

print(f"\nKey techniques:")
print("1. Graph Attention Network (GAT) with 4 heads")
print("2. Edge features (bond type, order, aromaticity, conjugation, ring)")
print("3. 4 GAT layers with 128 hidden channels")
print("4. Proper mixture handling: encode BOTH solvents + mixture fraction")
print("5. Cosine annealing LR scheduler")
print("6. 300 epochs training")
print("\nThis approach should better capture molecular structure and generalize to unseen solvents.")