### In this file, we will conduct all of our tests

In [10]:
!pip install nbimporter



In [1]:
import pandas as pd
import numpy as np
import torch
from dgl.dataloading.pytorch import GraphDataLoader
from tqdm.notebook import tqdm
import nbimporter
import dataset as ds
import model as mfile
from score import test

import os

Using backend: pytorch


In [2]:
# Each molecule is its own graph, so samples are loaded one at a time: merging several
# molecules into a single batched graph would not match the structure of an individual molecule.
batch_size = 1

train_dataset = ds.SyntheticDataset()
train_dataloader = GraphDataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [8]:
from sklearn.metrics import mean_absolute_error as MAE
from os.path import exists

import numpy as np

def train(model, epochs, file_name='SavedModels/electron_TAG.pth', output=False, debug_batch_interval=300):
    """Train ``model`` with Adam (lr=0.0005) and an L1 loss, checkpointing the best epoch.

    The function reads the notebook-level globals ``train_dataloader`` and
    ``batch_size``. The best epoch-average loss seen so far is persisted in
    ``SavedModels/bestmae.txt`` so that later runs only overwrite the
    checkpoint when they improve on it.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(graph, atom_feats, bond_feats)``; its output is
        reshaped to a single value per sample.
    epochs : int
        Number of passes over ``train_dataloader``.
    file_name : str
        Path the checkpoint (``state_dict`` and ``optimizer`` entries) is written to.
    output : bool
        When True, print running losses periodically during each epoch.
    debug_batch_interval : int
        Approximate number of progress lines per epoch; a line is printed every
        ``len(train_dataloader) // debug_batch_interval`` batches (at least every batch).

    Raises
    ------
    ValueError
        If a loss evaluates to NaN. The check happens before the optimizer
        step so the offending sample does not corrupt the weights.
    """
    import os

    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
    best_mae_path = 'SavedModels/bestmae.txt'

    # Try to load best_mae
    best_mae = None
    if exists(best_mae_path):
        with open(best_mae_path, 'r') as f:
            stored = f.read().strip()
        # An empty file is treated the same as a missing one
        if stored:
            best_mae = float(stored)
            print("Looking to beat best MAE of", best_mae)

    num_batches = len(train_dataloader)
    # Clamp to 1 so a small dataset (or a large interval) cannot cause a modulo by zero
    log_every = max(1, num_batches // max(1, debug_batch_interval))

    model.train()
    for epoch in tqdm(range(epochs), position=0, desc="Epochs"):

        running, batch_running, ct, batch_ct = 0, 0, 0, 0
        failed_forward = 0
        print('Epoch', epoch+1)
        # The loop iterates over batches, so the progress total is the number of batches
        for batch_idx, (graph, label) in tqdm(enumerate(train_dataloader), position=1, desc="Batches", total=num_batches):
            # Some labels may be none (RDKit errors), so move on
            # Ideally this has been filtered out by this step, but as of now there are still some in the dataset
            if torch.isnan(label).any():
                continue

            optimizer.zero_grad()

            bf = graph.edata['bond_feats'].float()
            af = graph.ndata['atom_feats'].float()

            # A small fraction of samples fail in the forward pass for a reason not yet understood.
            # Keep skipping them (best effort), but only catch Exception so Ctrl-C still stops
            # training, and report the failures instead of hiding them.
            try:
                y_pred = model(graph, af, bf)
            except Exception as err:
                failed_forward += 1
                if failed_forward == 1:
                    print("Skipping sample", batch_idx, "- forward pass failed:", repr(err))
                continue

            # The 23.06 is the same value used in score.py (conversion to kcal/mol)
            # L1 is MAE, L2 is MSE
            loss = torch.nn.functional.l1_loss(y_pred.reshape(1), label) * 23.06
            loss_value = loss.item()
            running += loss_value
            batch_running += loss_value
            ct += 1
            batch_ct += 1

            # Abort before backward/step so a NaN loss never reaches the weights
            if np.isnan(loss_value):
                print("Something went wrong. ABORT! DEBUG INFO:\n")
                print("run, ct", running, ct)
                print("batch_run, batch_ct", batch_running, batch_ct)
                print("label at this value is", label)
                print("pred is", y_pred)
                raise ValueError("NaN output")

            loss.backward()
            optimizer.step()

            # Periodically print the data we've churned through (iterations * data per batch)
            if output and batch_idx % log_every == 0:
                print('Epoch: {} [{}/{} ({:.0f}%)]\tBatch Loss: {:.2f}\tEpoch Loss: {:.2f}'.format(
                          epoch+1, batch_idx * batch_size, num_batches * batch_size,  # current sample num / total num
                          100. * batch_idx / num_batches,  # this batch num's % of total dataset
                          batch_running / batch_ct,  # mean loss since the previous log line
                          running / ct)  # mean loss for the epoch so far
                     )
                batch_running, batch_ct = 0, 0

        if failed_forward:
            print("Skipped", failed_forward, "samples whose forward pass raised an exception")

        # Nothing was trained on this epoch, so there is no loss to report or compare
        if ct == 0:
            print("Epoch", epoch+1, "had no usable samples; no checkpoint written")
            continue

        this_loss = running / ct
        if output:
            print("\nAverage Loss:", round(this_loss * 100) / 100.0, "\n")
        else:
            print("Epoch", epoch+1, "Average Loss:", round(this_loss * 100) / 100.0)

        # Save our model when there is no recorded best yet or when this epoch beats it.
        # `is None` (not truthiness) so a stored best of 0.0 is respected.
        if best_mae is None or this_loss < best_mae:
            best_mae = this_loss
            print("New best model found! Saving with loss of", best_mae)

            for target in (file_name, best_mae_path):
                os.makedirs(os.path.dirname(target) or '.', exist_ok=True)

            # Write the checkpoint first so the recorded best MAE never refers to a model that was not saved
            checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
            torch.save(checkpoint, file_name)

            # Write our best mae so we can keep track every time we retrain
            with open(best_mae_path, 'w') as f:
                f.write(str(best_mae))

#### Create and Train Model

In [4]:
# Every graph in the dataset shares one feature layout, so the first sample is enough
# to read off the width of the node and edge feature matrices.
node_dim = train_dataset[0][0].ndata['atom_feats'].shape[1]
edge_dim = train_dataset[0][0].edata['bond_feats'].shape[1]
print(f"Dimensions: {node_dim} (node), {edge_dim} (edge)")

Dimensions: 6 (node), 5 (edge)


In [5]:
import dgllife
# Instantiate the predictor from the project's model module, sized by the node/edge feature widths.
model = mfile.MPNNPredictor(node_dim, edge_dim)
# Alternative kept for reference: the Electron_MPNN network on its own.
# model = mfile.Electron_MPNN(node_dim, edge_dim)
# TODO: attempt to load saved weights here if electron_mpnn.pth exists (check with os)

#### Our Model

Basic Description: \
Our model follows an architecture similar to the MPNN model. It consists of two linear layers (one at the front, one at the end), a convolution layer, and a GRU layer.

- **fc1**: This linear + ReLU is our first "line of attack," looking for connections in our data before we lose information on individual atoms via convolution
- **gnn1**: This layer uses convolution involving two hidden layers to try and grab information about neighbors in an efficient manner
- **gru**: To be completely honest, I am not entirely sure I understand GRUs. My only understanding is that it serves to mitigate the vanishing-gradient issue, which we could expect to run into after our fc1 and gnn layers. We experimented with removing it and saw that convergence slowed down sooner than with it... so we keep it!
- **gnn2**: This layer is a different flavor of graph convolution. GatedGraphConv was used in the paper on MPNNs (https://arxiv.org/pdf/1704.01212.pdf) before the authors switched to NNConv. Setting aside its time-complexity issues (which we try to mitigate by eliminating many of our useless features; as a side note, we also did a bit of "brain surgery" on our network to trace the least important features), GGC had some positives. I figure that reintroducing it after an NNConv might provide another, different convolution whose composition could be valuable to feed into our readout and prediction functions.
- **gru**: Reuses the same GRU layer as before to preserve the gradient after gnn2
- ~~**fc2**: This fully-connected layer serves as our final decision maker, projecting back into 1 dimension (granted there is only 1 dimension at this point anyways) and trying to make sense of the previously convolved data~~ Removed this as of 13 Nov 21 since we incorporated the MPNN readout and prediction, which perform much better. Also, an fc2 at the end caused a linear regression on the entire thing and caused the model to predict the average output pe

The first of our important modifications that differentiate this model from the MPNN model stems from the negative min_PE output labels. To keep the model from incurring large errors by producing positive results, many ReLUs were stripped from the model, both in the architecture itself and in the forward passes. I experimented with a linear "decision" layer at the very end, but this caused the model to approximate the output labels with their average (minimizing error with a constant). As you can imagine, this is not ideal, so we ended up scrapping this idea. I also later realized that a prediction layer with negative weights could easily bypass this entire concern.

Training Description: \
To train, I have found that after about 8 epochs, the model begins to stabilize. So, the training scheme is planned as follows:

- 3 epochs w/ Adam opt @ 0.1
- 3 epochs w/ Adam opt @ 0.01

This is to help refine the smaller details of the gradient with respect to the weights in our model. This is essentially our own version of momentum because we try to have the model drop mae rather quickly, and then be refined with minute changes in our network.

In [6]:
# Print the model's module hierarchy and layer shapes.
print(model)

MPNNPredictor(
  (gnn): Electron_MPNN(
    (fc1): Sequential(
      (0): Linear(in_features=6, out_features=128, bias=True)
      (1): ReLU()
    )
    (gnn1): NNConv(
      (edge_func): Sequential(
        (0): Linear(in_features=5, out_features=256, bias=True)
        (1): ReLU()
        (2): Linear(in_features=256, out_features=16384, bias=True)
      )
    )
    (gnn2): GatedGraphConv(
      (linears): ModuleList(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): Linear(in_features=128, out_features=128, bias=True)
        (2): Linear(in_features=128, out_features=128, bias=True)
        (3): Linear(in_features=128, out_features=128, bias=True)
        (4): Linear(in_features=128, out_features=128, bias=True)
      )
      (gru): GRUCell(128, 128)
    )
    (gru): GRU(128, 128)
  )
  (readout): Set2Set(
    n_iters=6
    (lstm): LSTM(256, 128, num_layers=3)
  )
  (predict): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1

In [None]:
# Run a single training epoch with periodic loss logging enabled (output=True).
train(model, 1, output=True)

Looking to beat best MAE of 11490.814183533093


Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1


Batches:   0%|          | 0/13480 [00:00<?, ?it/s]



In [7]:
# Re-create the predictor and restore its weights from the "state_dict" entry of the saved checkpoint.
model = mfile.MPNNPredictor(node_dim, edge_dim)
model.load_state_dict(torch.load("SavedModels/20k_custom_mpnn.pth")["state_dict"])

64 5


<All keys matched successfully>

Test to keep jupyterlab running even when computer dies/goes to sleep

In [1]:
# %%capture stored_output

# Upon returning, call
# stored_output.show()

In [None]:
# Output for train on MPNN prebuilt
'''
Epoch: 1 [0/13480 (0%)]	Batch Loss: 15816.00	Epoch Loss: 15816.00
Epoch: 1 [1348/13480 (10%)]	Batch Loss: 21889.00	Epoch Loss: 21884.00
Epoch: 1 [2696/13480 (20%)]	Batch Loss: 21315.00	Epoch Loss: 21600.00
Epoch: 1 [4044/13480 (30%)]	Batch Loss: 20245.00	Epoch Loss: 21148.00
Epoch: 1 [5392/13480 (40%)]	Batch Loss: 18982.00	Epoch Loss: 20607.0
'''

# Output for train on custom MPNN model (note: I increased the logging frequency, so this ~29k error is after only 132 samples)
'''
Epoch: 1 [0/13480 (0%)]	Batch Loss: 36117.00	Epoch Loss: 36117.00
Epoch: 1 [44/13480 (0%)]	Batch Loss: 43399.00	Epoch Loss: 43237.00
Epoch: 1 [88/13480 (1%)]	Batch Loss: 22759.00	Epoch Loss: 33113.00
Epoch: 1 [132/13480 (1%)]	Batch Loss: 20600.00	Epoch Loss: 28974.00
'''

### NOTE TO RONAN 
Save the model with the code below (keep in mind that auto-saving only happens when an epoch completes, and only if the loss is below the minimum loss of any previous model).

In [11]:
# Manual save: store only the model weights (no optimizer state) under the "state_dict" key.
checkpoint = {'state_dict': model.state_dict()}
torch.save(checkpoint, "SavedModels/20k_custom_mpnn.pth")

In [8]:
# Evaluate the current model with the `test` helper imported from the score module.
# NOTE(review): the meaning of the "Electron" argument is defined in score, not visible here.
test(model, "Electron")

0 / 1291
20 / 1291
40 / 1291
60 / 1291
80 / 1291
100 / 1291
120 / 1291
140 / 1291
160 / 1291
180 / 1291
200 / 1291
220 / 1291
240 / 1291
260 / 1291
280 / 1291
300 / 1291
320 / 1291
340 / 1291
360 / 1291
380 / 1291
400 / 1291
420 / 1291
440 / 1291
460 / 1291
480 / 1291
500 / 1291
520 / 1291
540 / 1291
560 / 1291
580 / 1291
600 / 1291
620 / 1291
640 / 1291
660 / 1291
680 / 1291
700 / 1291
720 / 1291
740 / 1291
760 / 1291
780 / 1291
800 / 1291
820 / 1291
840 / 1291
860 / 1291
880 / 1291
900 / 1291
920 / 1291
940 / 1291
960 / 1291
980 / 1291
1000 / 1291
1020 / 1291
1040 / 1291
1060 / 1291
1080 / 1291
1100 / 1291
1120 / 1291
1140 / 1291
1160 / 1291
1180 / 1291
1200 / 1291
1220 / 1291
1240 / 1291
1260 / 1291
1280 / 1291


20628.016402885405

In [None]:
# Completion marker; presumably queued after long-running cells to show they finished (confirm).
print("Done")

In [15]:
# checkpoint = {'state_dict': model.state_dict()}
# torch.save(checkpoint, "electron_mpnn_no_ReLU.pth")

hi


### Load best model

In [5]:
# Build an Electron_MPNN with a single output and load the weights stored in electron_mpnn_v1_ReLU.pth.
best_model = mfile.Electron_MPNN(node_dim, edge_dim, out_dim=1)
best_model.load_state_dict(torch.load("electron_mpnn_v1_ReLU.pth")["state_dict"])

<All keys matched successfully>

In [43]:
# Inspect the weight of the loaded model's fc2 layer.
best_model.fc2.weight

Parameter containing:
tensor([[1458.2877]], requires_grad=True)

# Sources and References

1. https://arxiv.org/pdf/1704.01212.pdf
2. https://arxiv.org/pdf/1806.03146.pdf
3. https://chemrxiv.org/engage/api-gateway/chemrxiv/assets/orp/resource/item/60c7579dbb8c1a48b63dc892/original/a-graph-neural-network-for-predicting-energy-and-stability-of-known-and-hypothetical-crystal-structures.pdf#page=11&zoom=100,76,125
4. 