In [None]:
# notebooks/02_surrogate_training.ipynb
# ====================================
# Jupyter Notebook: Surrogate MPNN Training for Organic Semiconductors
# - Load dataset (SMILES + HOMO/LUMO)
# - Convert to graph using featurization.py
# - Train MPNN / ensemble
# - Evaluate predictions and visualize results

# %%
# Imports
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader

# PyG imports
from torch_geometric.loader import DataLoader as PyGDataLoader

# Local imports
# Project modules live under the `src` package. When the kernel is started from
# a directory where `src` is not importable, retry once after putting the
# parent of the working directory on sys.path.
try:
    from src.data.featurization import mol_to_graph
    from src.models.mpnn import MPModel, MoleculeDataset, train_one, evaluate, train_ensemble, ensemble_predict
except ImportError:
    # The previous fallback repeated the exact same import and therefore could
    # never succeed. Assumes the kernel's working directory is the notebooks/
    # folder, so its parent is the project root -- TODO confirm.
    import sys

    _project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    if _project_root not in sys.path:
        sys.path.insert(0, _project_root)

    from src.data.featurization import mol_to_graph
    from src.models.mpnn import MPModel, MoleculeDataset, train_one, evaluate, train_ensemble, ensemble_predict

In [None]:
# %%
# 1. Build a tiny demonstration dataset (swap in the real dataset later)
from sklearn.model_selection import train_test_split

# One record per molecule: (SMILES, HOMO, LUMO)
_demo_records = [
    ('c1ccccc1', -6.5, -2.5),
    ('c1ccncc1', -6.2, -2.6),
    ('C1=CC=CC=C1O', -6.8, -2.3),
    ('c1ccoc1', -6.0, -2.0),
]
demo_data = pd.DataFrame(_demo_records, columns=['smiles', 'HOMO', 'LUMO'])

# Hold out a quarter of the rows for validation; the fixed random_state keeps
# the split identical across runs.
train_df, val_df = train_test_split(
    demo_data,
    test_size=0.25,
    random_state=42,
)


In [None]:
# %%
# 2. Wrap each split in a MoleculeDataset and batch it with the PyG loader
train_dataset, val_dataset = MoleculeDataset(train_df), MoleculeDataset(val_df)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = PyGDataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
)
val_loader = PyGDataLoader(
    val_dataset,
    batch_size=2,
    shuffle=False,
)

In [None]:
# %%
# 3. Initialize model
# Seed torch before any weights are created so that a fresh
# "Restart Kernel -> Run All" reproduces the same initialisation (and the same
# shuffling order in loaders that draw from torch's global RNG).
# The value matches the random_state used for the train/validation split.
torch.manual_seed(42)

# Read the feature widths from the first training sample. Fetch it once rather
# than re-indexing the dataset for every dimension.
# NOTE(review): assumes x, edge_attr and y are all 2-D tensors -- confirm
# against MoleculeDataset.
_first_sample = train_dataset[0]
node_dim = _first_sample.x.size(1)
edge_dim = _first_sample.edge_attr.size(1)
out_dim = _first_sample.y.size(1)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MPModel(node_in_dim=node_dim, edge_in_dim=edge_dim, out_dim=out_dim)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# %%
# 4. Train for a fixed number of epochs, validating after every epoch
epochs = 20
for epoch in range(1, epochs + 1):
    # One optimisation pass over the training loader.
    train_loss = train_one(model, train_loader, optimizer, device)

    # evaluate() returns (MAE, predictions, targets); the last two are kept
    # under these names because later cells plot them.
    val_metrics = evaluate(model, val_loader, device)
    val_mae, val_preds, val_trues = val_metrics

    progress_line = f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Val MAE: {val_mae:.4f}"
    print(progress_line)


In [None]:
# %%
# 5. Visualize predictions (parity plot: predicted vs. true)
import matplotlib.pyplot as plt

# Coerce to arrays once so the axis range can be computed from the data.
# NOTE(review): assumes evaluate() returns array-likes that live on the CPU
# with one column per target (HOMO, LUMO) -- confirm against src.models.mpnn.
_true_vals = np.asarray(val_trues)
_pred_vals = np.asarray(val_preds)

# The y = x reference line used to be hard-coded to [-7, -5], which does not
# reach the LUMO values in this notebook (about -2.0 to -2.6 eV). Span whatever
# range the plotted points actually cover, plus a small margin.
_lo = float(min(_true_vals.min(), _pred_vals.min()))
_hi = float(max(_true_vals.max(), _pred_vals.max()))
_pad = 0.05 * (_hi - _lo) or 0.1  # fall back to a fixed margin when all values coincide

plt.figure(figsize=(6,6))
plt.scatter(_true_vals[:,0], _pred_vals[:,0], label='HOMO', color='blue')
plt.scatter(_true_vals[:,1], _pred_vals[:,1], label='LUMO', color='red')
plt.plot([_lo - _pad, _hi + _pad], [_lo - _pad, _hi + _pad], '--', color='gray')
plt.xlabel('True values (eV)')
plt.ylabel('Predicted values (eV)')
plt.title('Surrogate MPNN Predictions')
plt.legend()
plt.show()


In [None]:
# %%
# 5. Visualize predictions
# NOTE(review): this cell repeats the step-5 visualization cell directly above
# it; one of the two can be deleted.
import matplotlib.pyplot as plt

# NOTE(review): assumes evaluate() returns CPU array-likes with one column per
# target (HOMO, LUMO) -- confirm against src.models.mpnn.
_targets = np.asarray(val_trues)
_outputs = np.asarray(val_preds)

# Derive the y = x reference line from the data. The former fixed [-7, -5]
# span missed the LUMO points entirely (their values are around -2 eV here).
_axis_min = float(min(_targets.min(), _outputs.min()))
_axis_max = float(max(_targets.max(), _outputs.max()))
_margin = 0.05 * (_axis_max - _axis_min) or 0.1  # fixed margin if the range is zero
_diagonal = [_axis_min - _margin, _axis_max + _margin]

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(_targets[:, 0], _outputs[:, 0], label='HOMO', color='blue')
ax.scatter(_targets[:, 1], _outputs[:, 1], label='LUMO', color='red')
ax.plot(_diagonal, _diagonal, '--', color='gray')
ax.set_xlabel('True values (eV)')
ax.set_ylabel('Predicted values (eV)')
ax.set_title('Surrogate MPNN Predictions')
ax.legend()
plt.show()


In [None]:
# %%
# 6. Optional: fit a small ensemble
# The ensemble directory is handed to train_ensemble here and read back by the
# ensemble prediction step. It is created up front; an existing directory is
# left untouched.
ensemble_dir = './models_demo'
os.makedirs(ensemble_dir, exist_ok=True)

# Training settings gathered in one place for readability.
_ensemble_options = dict(
    n_models=2,
    epochs=5,
    batch_size=2,
    lr=1e-3,
    device=device,
)
train_ensemble(demo_data, ensemble_dir, **_ensemble_options)

In [None]:
# %%
# 7. Ensemble predictions with uncertainty
# Run ensemble_predict over the full demo set; it returns a mean and a
# standard deviation, and the latter is reported as the uncertainty.
dataset = MoleculeDataset(demo_data)
mean_pred, std_pred = ensemble_predict(ensemble_dir, dataset, device=device)

for _heading, _values in (
    ('Mean predictions:\n', mean_pred),
    ('Prediction std (uncertainty):\n', std_pred),
):
    print(_heading, _values)
