# Molecular Property Prediction using Graph Neural Networks

This Notebook runs inferrence on a model trained on the PaiNN (Polarizable Atom Interaction Neural Network) architechture with the goal of predicting the QM9 property $U_0$ known as "internal energy at 0K". This notebook has been adapted from the template *minimal_example.py* in https://github.com/jonasvj/02456_painn_project

In [1]:
# Load dependencies
%pip install torch numpy lightning torch-geometric torchvision rdkit scipy tabulate

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
layers = 6
# The path to the model we want to load
path = f'trained_models/{layers}layers.pth'

In [3]:
import torch  # Importing PyTorch for tensor operations and model training
from tqdm import trange  # Importing tqdm for progress bar functionality
import torch.nn.functional as F  # Importing functional interface for neural network operations
from src.data import QM9DataModule  # Importing the data module for QM9 dataset handling
from pytorch_lightning import seed_everything  # Importing function to set random seed for reproducibility
from src.models import PaiNN, AtomwisePostProcessing  # Importing the model and post-processing class
import sys
import torch._dynamo
torch._dynamo.config.suppress_errors = True

# Use GPU if available, otherwise fallback to CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device {device}')



Using device cpu


### Load the model and setup environment

In [4]:
print(f"Loading path: {path}")

# Load model from path
pth = torch.load(path, map_location=torch.device(device))
args = pth["args"]
seed_everything(args.seed)  # Set the random seed for reproducibility of results

if args.use_high_matmul_precision:
    print("Using high precision floats for matrix multiplications")
    torch.set_float32_matmul_precision('high')

  pth = torch.load(path, map_location=torch.device(device))
Seed set to 0


Loading path: trained_models/6layers.pth


### Load data

In [5]:
# Initialize the data module for QM9 dataset
dm = QM9DataModule(
    target=args.target,
    data_dir=args.data_dir,
    batch_size_train=args.batch_size_train,
    batch_size_inference=args.batch_size_inference,
    num_workers=args.num_workers,
    splits=args.splits,
    seed=args.seed,
    subset_size=args.subset_size,
)
dm.prepare_data()  # Prepare the data (download, if necessary)
dm.setup()  # Setup the training and validation/test splits
# Get statistics for the target variable to normalize predictions
y_mean, y_std, atom_refs = dm.get_target_stats(
    remove_atom_refs=True, divide_by_atoms=True
)

Downloading https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip
Extracting data/raw/qm9.zip
Downloading https://ndownloader.figshare.com/files/3195404


### Instantiate and load the model

In [6]:
# Instantiate the PaiNN model with specified parameters
painn = PaiNN(
    num_message_passing_layers=args.num_message_passing_layers,
    num_features=args.num_features,
    num_outputs=args.num_outputs, 
    num_rbf_features=args.num_rbf_features,
    num_unique_atoms=args.num_unique_atoms,
    cutoff_dist=args.cutoff_dist,
)

# Instantiate post-processing to convert atomic contributions to predicted property
post_processing = AtomwisePostProcessing(
    args.num_outputs, y_mean, y_std, atom_refs
)

# Compile 
if args.compile:
    painn = torch.compile(painn)
    post_processing = torch.compile(post_processing)
    print("Compiled PaiNN and AtomwisePostProcessing module")

# Load models from path
painn.load_state_dict(pth["painn"])
post_processing.load_state_dict(pth["post_processing"])

painn.to(device)  # Move the model to the appropriate device (GPU/CPU)
post_processing.to(device)  # Move the post-processing module to the same device


Compiled PaiNN and AtomwisePostProcessing module


OptimizedModule(
  (_orig_mod): AtomwisePostProcessing(
    (atom_refs): Embedding(100, 1)
  )
)

### Run inference with test data set

In [None]:
# Evaluation phase to compute Mean Absolute Error (MAE) on test data
mae = 0
painn.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient computation
    # Iterate through batches of test data
    for batch in dm.test_dataloader():
        batch = batch.to(device)  # Move batch data to the appropriate device

        # Forward pass to compute atomic contributions
        atomic_contributions = painn(
            atoms=batch.z,
            atom_positions=batch.pos,
            graph_indexes=batch.batch,
        )
        # Apply post-processing to obtain predictions from atomic contributions
        preds = post_processing(
            atoms=batch.z,
            graph_indexes=batch.batch,
            atomic_contributions=atomic_contributions,
        )
        # Accumulate Mean Absolute Error
        mae += F.l1_loss(preds, batch.y, reduction='sum')

# Average MAE across the entire test set
mae /= len(dm.data_test)
unit_conversion = dm.unit_conversion[args.target]  # Retrieve unit conversion function for the target
print(f'Test MAE: {unit_conversion(mae):.3f} \n')  # Print the final MAE after unit conversion

