## Notebook 4: State-of-the-Art Model (AttentiveFP)

In this notebook we implement, train, and evaluate an AttentiveFP model, a state-of-the-art graph neural network (GNN) architecture designed for molecular property prediction. Our aim is to surpass the RandomForest baseline and achieve an ROC AUC score greater than 0.81.

### Setup

In [None]:
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm

# RDKit for chemoinformatics
from rdkit import Chem

# PyTorch Geometric
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn.models import AttentiveFP # Import the AttentiveFP model

# Scikit-learn for evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

print("Libraries imported successfully.")

### Load and Prepare Data

In [None]:
# Load the cleaned DILI dataset. Only the file read sits inside the `try`,
# so an unrelated error during cleaning is not mistaken for a missing file.
try:
    df = pd.read_csv('data/processed/dili_data_clean.csv')
except FileNotFoundError:
    print("Error: dili_data_clean.csv not found.")
    # Re-raise: every later cell needs `df`, so continuing would only
    # surface as a confusing NameError further down the notebook.
    raise
else:
    # Rows without a fingerprint or a SMILES string cannot be used downstream.
    df.dropna(subset=['fingerprint', 'smiles'], inplace=True)
    print("Processed data loaded successfully.")

# Graph Conversion Functions 
def get_atom_features(atom):
    """Return the per-atom feature vector used as a graph node attribute.

    The vector is, in order: atomic number, degree, formal charge,
    hybridization (cast to ``int``) and the aromaticity flag.
    """
    return [
        atom.GetAtomicNum(),
        atom.GetDegree(),
        atom.GetFormalCharge(),
        int(atom.GetHybridization()),
        atom.GetIsAromatic(),
    ]

def get_bond_features(bond):
    """Return the six-element feature vector describing a bond.

    The first four entries are a one-hot encoding of the bond type
    (single, double, triple, aromatic); the last two are the conjugation
    flag and the ring-membership flag.
    """
    kind = bond.GetBondType()
    bond_types = Chem.rdchem.BondType
    type_one_hot = [
        kind == candidate
        for candidate in (
            bond_types.SINGLE,
            bond_types.DOUBLE,
            bond_types.TRIPLE,
            bond_types.AROMATIC,
        )
    ]
    return type_one_hot + [bond.GetIsConjugated(), bond.IsInRing()]

# Length of the vector returned by get_bond_features(). It is needed to give
# bond-less molecules an edge_attr tensor of shape (0, width); keep it in sync
# with get_bond_features() if that function changes.
_NUM_BOND_FEATURES = 6


def smiles_to_graph(smiles):
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A ``Data`` object with node features ``x``, a directed
        ``edge_index`` (each bond is stored in both directions) and the
        matching ``edge_attr``; or ``None`` when RDKit cannot parse the
        string or the parsed molecule contains no atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    # An atom-less molecule (e.g. an empty SMILES string) would produce an
    # empty graph, so it is rejected the same way as an unparsable string.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
    x = torch.tensor(atom_features, dtype=torch.float)

    edge_indices, edge_attrs = [], []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        bond_feats = get_bond_features(bond)
        # Store every bond in both directions so the graph is undirected.
        edge_indices.extend([(i, j), (j, i)])
        edge_attrs.extend([bond_feats, bond_feats])

    if edge_indices:
        edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
        edge_attr = torch.tensor(edge_attrs, dtype=torch.float)
    else:
        # Molecules without bonds (single atoms, isolated ions): building the
        # tensors from empty lists would give shape (0,), not (2, 0) and
        # (0, num_bond_features), so construct the empty tensors explicitly.
        edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_attr = torch.empty((0, _NUM_BOND_FEATURES), dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# Create Graph Dataset
print("Converting SMILES to graph objects...")
converted = [smiles_to_graph(smi) for smi in tqdm(df['smiles'])]

# Keep only the rows whose SMILES converted. The indices are positions in the
# DataFrame (not index labels), which is why the labels are taken with .iloc.
successful_indices = [pos for pos, graph in enumerate(converted) if graph is not None]
data_list = [converted[pos] for pos in successful_indices]
labels = df['dili_concern'].iloc[successful_indices].values

# Attach the target to each graph as a one-element float tensor.
for graph, label in zip(data_list, labels):
    graph.y = torch.tensor([label], dtype=torch.float)

print(f"Successfully created {len(data_list)} graph objects.")

### Create Train and Test Sets

In [None]:
# Stratified 80/20 split with a fixed seed so the split is reproducible.
train_data, test_data = train_test_split(
    data_list,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

for split_name, split in (("training", train_data), ("testing", test_data)):
    print(f"Number of {split_name} graphs: {len(split)}")

### Define the AttentiveFP Model

Initialize the `AttentiveFP` model. We'll use hyperparameters that are commonly effective for this architecture.


In [None]:
# Read the node/edge feature widths off the first graph in the dataset.
first_graph = data_list[0]
num_node_features = first_graph.x.size(1)
num_edge_features = first_graph.edge_attr.size(1)

# Hyperparameters are collected in one place so they are easy to scan and tweak.
afp_kwargs = dict(
    in_channels=num_node_features,
    hidden_channels=64,
    out_channels=1,  # Final output is a single value for binary classification
    edge_dim=num_edge_features,
    num_layers=2,
    num_timesteps=2,
    dropout=0.5,
)
model = AttentiveFP(**afp_kwargs)

print("AttentiveFP Model defined:")
print(model)

### Train the Model

In [None]:
# Adam optimizer; the learning rate is halved every 20 scheduler steps.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

# Weight the positive class by the negative/positive ratio to counter class
# imbalance; fall back to a neutral weight of 1 when there are no positives.
neg_count = np.sum(labels == 0)
pos_count = np.sum(labels == 1)
if pos_count > 0:
    pos_weight_value = neg_count / pos_count
else:
    pos_weight_value = 1
pos_weight_tensor = torch.tensor([pos_weight_value], dtype=torch.float)

# The model emits raw logits, so the sigmoid is applied inside the loss.
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

def train():
    """Run one training epoch over ``train_loader``.

    Uses the module-level ``model``, ``optimizer`` and ``criterion``.

    Returns:
        The mean of the per-batch losses for this epoch.
    """
    model.train()
    total_loss = 0
    for data in train_loader:
        optimizer.zero_grad()
        # AttentiveFP.forward is (x, edge_index, edge_attr, batch); pass the
        # last two by keyword so they cannot be swapped positionally.
        out = model(
            data.x,
            data.edge_index,
            edge_attr=data.edge_attr,
            batch=data.batch,
        )
        # Targets are reshaped to (num_graphs, 1) to match the model output.
        loss = criterion(out, data.y.view(-1, 1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

print("Starting AttentiveFP training...")
num_epochs = 100  # Train for 100 epochs
log_every = 10    # Report the loss on every 10th epoch
for epoch in range(1, num_epochs + 1):
    loss = train()
    scheduler.step()
    if epoch % log_every:
        continue
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
print("Training finished.")


### Evaluate the Model

In [None]:
def test(loader, return_probs=False):
    """Evaluate the module-level ``model`` on every batch of ``loader``.

    Args:
        loader: Iterable of batched graphs exposing ``x``, ``edge_index``,
            ``edge_attr``, ``batch`` and ``y``.
        return_probs: When true, also return the predicted positive-class
            probabilities (needed for ranking metrics such as ROC AUC).

    Returns:
        ``(preds, labels)`` as NumPy arrays, where ``preds`` holds 0.0/1.0
        decisions at a 0.5 probability threshold; or
        ``(preds, labels, probs)`` when ``return_probs`` is true.
    """
    model.eval()
    all_probs = []
    all_labels = []
    with torch.no_grad():
        for data in loader:
            # AttentiveFP.forward is (x, edge_index, edge_attr, batch); pass
            # the last two by keyword so they cannot be swapped positionally.
            out = model(
                data.x,
                data.edge_index,
                edge_attr=data.edge_attr,
                batch=data.batch,
            )
            all_probs.extend(torch.sigmoid(out).view(-1).tolist())
            all_labels.extend(data.y.view(-1).tolist())

    probs = np.array(all_probs)
    targets = np.array(all_labels)
    preds = (probs > 0.5).astype(float)
    if return_probs:
        return preds, targets, probs
    return preds, targets


y_pred, y_true, y_prob = test(test_loader, return_probs=True)

# Calculate metrics
afp_accuracy = accuracy_score(y_true, y_pred)
# ROC AUC is a ranking metric: score it on the probabilities. Feeding it the
# thresholded 0/1 predictions collapses the curve to a single operating point.
afp_roc_auc = roc_auc_score(y_true, y_prob)


# ## Step 7: Compare Results and Conclude

# In[7]:
# ROC AUC of the RandomForest baseline, hard-coded here for the comparison table.
rf_roc_auc = 0.761

# Assemble the report first, then emit it line by line.
report_lines = (
    "--- AttentiveFP Model Performance ---",
    f"Accuracy: {afp_accuracy:.3f}",
    f"ROC AUC:  {afp_roc_auc:.3f}",
    "\n--- Comparison ---",
    "Metric         | RandomForest (Baseline) | AttentiveFP Model",
    "----------------|-------------------------|-------------------",
    f"ROC AUC       | {rf_roc_auc:.3f}                   | {afp_roc_auc:.3f}",
)
for line in report_lines:
    print(line)