### In this file, we will conduct all of our tests

In [5]:
!pip install nbimporter

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import pandas as pd
import numpy as np
import torch
from dgl.dataloading.pytorch import GraphDataLoader
from tqdm.notebook import tqdm
import nbimporter
import dataset as ds
import model as mfile
from score import test

import os

Using backend: pytorch


In [2]:
# Graphs are fed to the model one at a time: a batched graph would merge separate
# molecules into a single structure, which is not how an individual molecule is laid out.
batch_size = 1
train_dataset = ds.SyntheticDataset()
train_dataloader = GraphDataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [3]:
from sklearn.metrics import mean_absolute_error as MAE
from os.path import exists

import numpy as np

def train(model, epochs, file_name='SavedModels/rj_electron.pth', output=False, debug_batch_interval=5, lr=0.00005):
    """Train ``model`` on the notebook-level ``train_dataloader`` and checkpoint the best epoch.

    Every sample is pushed through the model on its own. The loss is the L1 error
    between the first element of the model output and the label, scaled by 23.06
    (the same kcal/mol conversion factor used in score.py). After each epoch the
    mean loss is compared with the best value recorded in
    ``SavedModels/bestmae.txt``; when it is lower, or when no record exists yet,
    the model and optimizer state are written to ``file_name`` and the record is
    rewritten.

    Args:
        model: module called as ``model(graph, atom_feats, bond_feats)``.
        epochs: number of passes over ``train_dataloader``.
        file_name: path the best checkpoint is written to.
        output: when True, print progress lines during each epoch.
        debug_batch_interval: approximate number of progress lines per epoch.
        lr: learning rate handed to the Adam optimizer.

    Returns:
        None.

    Note:
        Relies on the global ``train_dataloader`` and assumes it yields a single
        graph per batch (the prediction is reshaped to one value).
    """
    import os  # local import so this cell does not depend on the import cell's ordering

    best_mae_path = 'SavedModels/bestmae.txt'
    kcal_mol_scale = 23.06  # same conversion factor as score.py

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Try to load the best MAE from earlier runs; an empty or corrupt file counts as "no record".
    best_mae = None
    if exists(best_mae_path):
        with open(best_mae_path, 'r') as f:
            try:
                best_mae = float(f.read())
            except ValueError:
                best_mae = None
        if best_mae is not None and np.isnan(best_mae):
            # A NaN record could never be beaten by "<", so discard it.
            best_mae = None
        if best_mae is not None:
            print("Looking to beat best MAE of", best_mae)

    num_batches = len(train_dataloader)
    # Guard against a zero modulus when the loader has fewer batches than the interval.
    log_every = max(1, num_batches // max(1, debug_batch_interval))

    model.train()
    for epoch in tqdm(range(epochs), position=0, desc="Epochs"):

        running, batch_running, ct, batch_ct = 0.0, 0.0, 0, 0
        skipped = 0
        print('Epoch', epoch+1)
        for batch_idx, (graph, label) in tqdm(enumerate(train_dataloader), position=1, desc="Batches", total=num_batches):
            # Some labels may be NaN (RDKit errors), so move on.
            # Ideally these were filtered out earlier, but some remain in the dataset.
            if torch.isnan(torch.as_tensor(label)).any():
                continue

            optimizer.zero_grad()

            bf = graph.edata['bond_feats'].float()
            af = graph.ndata['atom_feats'].float()

            # A small fraction of samples make the forward pass fail for a reason not yet
            # understood. Skip them, but count them and let KeyboardInterrupt / SystemExit
            # propagate instead of swallowing everything with a bare except.
            try:
                y_pred = model(graph, af, bf)[0]
            except Exception:
                skipped += 1
                continue

            # L1 is MAE, L2 is MSE
            loss = torch.nn.functional.l1_loss(y_pred.reshape(1), label) * kcal_mol_scale
            running += loss.item()
            batch_running += loss.item()
            ct += 1
            batch_ct += 1
            loss.backward()
            optimizer.step()

            # Roughly debug_batch_interval times per epoch, report the loss since the last
            # report and the running loss for the epoch (true division keeps the decimals).
            if output and batch_idx % log_every == 0:
                print('Epoch: {} [{}/{} ({:.0f}%)]\tBatch Loss: {:.2f}\tEpoch Loss: {:.2f}'.format(
                          epoch+1, batch_idx, num_batches,    # current sample num / total num
                          100. * batch_idx / num_batches, # this batch num's % of total dataset
                          batch_running / batch_ct, # mean loss since the previous report
                          running / ct) # running mean loss for the epoch
                     )
                batch_running, batch_ct = 0.0, 0

        if skipped:
            print("Skipped", skipped, "samples whose forward pass raised an error")

        if ct == 0:
            # Nothing was trained on, so there is no loss to report or compare.
            print("Epoch", epoch+1, "had no usable samples; nothing to save")
            continue

        this_loss = running / ct
        if output:
            print("\nAverage Loss:", round(this_loss * 100) / 100.0,"\n")
        else:
            print("Epoch", epoch+1, "Average Loss:", round(this_loss * 100) / 100.0)

        # Save the model when there is no record yet or when the record is beaten.
        # "is None" (not truthiness) so that a stored 0.0 still counts as a record.
        if best_mae is None or this_loss < best_mae:
            best_mae = this_loss
            print("New best model found! Saving with loss of", best_mae)

            # Write the checkpoint first so the record never refers to a model that was not saved.
            os.makedirs(os.path.dirname(file_name) or '.', exist_ok=True)
            checkpoint = {'state_dict': model.state_dict(),'optimizer': optimizer.state_dict()}
            torch.save(checkpoint, file_name)

            # Write our best MAE so we can keep track every time we retrain
            os.makedirs(os.path.dirname(best_mae_path), exist_ok=True)
            with open(best_mae_path, 'w') as f:
                f.write(str(best_mae))

#### Create and Train Model

In [4]:
# Every graph in the dataset shares one feature schema, so the first sample is
# enough to read off the per-node and per-edge feature widths.
first_graph = train_dataset[0][0]
node_dim = first_graph.ndata['atom_feats'].shape[1]
edge_dim = first_graph.edata['bond_feats'].shape[1]
print(f"Dimensions: {node_dim} (node), {edge_dim} (edge)")

Dimensions: 16 (node), 5 (edge)


In [5]:
import dgllife
# Build the predictor from the model notebook, passing the node and edge feature
# widths read from the dataset (presumably used to size its input layers — confirm in the model file).
model = mfile.Electron_Predictor_3k(node_dim, edge_dim)

In [6]:
# Restore previously trained weights. map_location="cpu" lets a checkpoint that was
# written on a GPU machine load on a CPU-only one; load_state_dict then copies the
# values into the model's own parameters.
model.load_state_dict(torch.load("SavedModels/4k_ish_rj_electron.pth", map_location="cpu")["state_dict"])

<All keys matched successfully>

#### Our Model

Basic Description: \
Our model follows a similar architecture to the MPNN model. It consists of two linear layers (one at the front, one at the end), a convolution layer, and a GRU layer.

- **fc1**: This linear + relu is our first "line of attack," looking for connections within our data before we lose information on individual atoms via convolution. It also projects our data to a higher dimension. I discuss the tall vs. wide trade-off we learned about in class in our model file.
- **gnn1**: This layer uses convolution involving two hidden layers to try and grab information about neighbors in an efficient manner
- **gru**: To be completely honest, I am not entirely sure I understand GRUs. My only understanding is that it serves to mitigate the vanishing-gradient problem, which we could expect to run into after our fc1 and gnn layers. We experimented with removing it and saw that convergence slowed down more quickly without it than with it... so we keep it!
- **gnn2**: This layer is a different flavor of graph convolution. The GatedGraphConv was referenced in the paper on MPNNs (https://arxiv.org/pdf/1704.01212.pdf) before they switched to NNConv. Ignoring the time complexity issues (which we try to rectify by eliminating many of our useless features; as a quick side note, we also did a bit of "brain surgery" on our network to try to trace the least important features), GGC had some positives to it. I figure that reintroducing it after an NNConv might provide another, differing convolution, which could be a valuable composition to feed to our readout and prediction functions.
- **gru**: Reuses the same GRU layer as before, to preserve the gradient after gnn2.
- ~~**fc2**: This fully-connected layer serves as our final decision maker, projecting back into 1 dimension (granted there is only 1 dimension at this point anyways) and trying to make sense of the previously convolved data~~ Removed as of 13 Nov 21, since we incorporated the MPNN readout and prediction, which perform much better. Also, an fc2 at the end caused a linear regression on the entire thing and caused the model to predict the average output PE.

The first of our important modifications, which differentiate this model from the MPNN model, stems from the negative min_PE output labels. To keep the model from incurring large errors from positive predictions, many ReLUs were stripped from the model, both in the architecture itself and in the forward passes. I experimented with a linear "decision" layer at the very end, but this caused the model to approximate the output labels with a constant, namely their average (which minimizes the error of a constant prediction). As you can imagine, this is not ideal, so we ended up scrapping this idea. I also later realized that a prediction layer with negative weights could easily bypass this entire concern.

Training Description: \
To train, I have found that after about 8 epochs, the model begins to stabilize. So, the training scheme is planned as follows:

- 3 epochs w/ Adam opt @ 0.1
- 3 epochs w/ Adam opt @ 0.01

Lowering the learning rate between stages helps refine the finer details of the weights in our model. This is essentially our own hand-rolled take on momentum: we try to have the model drop MAE rather quickly at first, and then refine it with minute changes to the network.

In [7]:
# Print the module tree of the model.
# NOTE(review): the saved output below names Electron_Predictor with in_features=10, which does
# not match the Electron_Predictor_3k constructor and the 16 node features reported above —
# it looks stale; re-run this cell to refresh it.
print(model)

Electron_Predictor(
  (gnn): Electron_MPNN(
    (fc1): Sequential(
      (0): Linear(in_features=10, out_features=128, bias=True)
      (1): ReLU()
    )
    (gnn1): NNConv(
      (edge_func): Sequential(
        (0): Linear(in_features=5, out_features=512, bias=True)
        (1): ReLU()
        (2): Linear(in_features=512, out_features=256, bias=True)
        (3): ReLU()
        (4): Linear(in_features=256, out_features=16384, bias=True)
      )
    )
    (gru): GRU(128, 128)
  )
  (predict): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=5, bias=True)
  )
)


In [8]:
# Train for 15 epochs with progress lines enabled (output=True). train() only saves a
# checkpoint after an epoch finishes, so interrupting mid-epoch writes nothing.
train(model, 15, output=True)

Looking to beat best MAE of 3293.649390840706


Epochs:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 1


Batches:   0%|          | 0/13478 [00:00<?, ?it/s]



KeyboardInterrupt: 

Test to keep jupyterlab running even when computer dies/goes to sleep

In [1]:
# %%capture stored_output

# Upon returning, call
# stored_output.show()

In [None]:
# Output for train on MPNN prebuilt
'''
Epoch: 1 [0/13480 (0%)]	Batch Loss: 15816.00	Epoch Loss: 15816.00
Epoch: 1 [1348/13480 (10%)]	Batch Loss: 21889.00	Epoch Loss: 21884.00
Epoch: 1 [2696/13480 (20%)]	Batch Loss: 21315.00	Epoch Loss: 21600.00
Epoch: 1 [4044/13480 (30%)]	Batch Loss: 20245.00	Epoch Loss: 21148.00
Epoch: 1 [5392/13480 (40%)]	Batch Loss: 18982.00	Epoch Loss: 20607.0
'''

# Output for train on custom MPNN model (note I upped the log interval... so this 29k error is with only 132 samples)
'''
Epoch: 1 [0/13480 (0%)]	Batch Loss: 36117.00	Epoch Loss: 36117.00
Epoch: 1 [44/13480 (0%)]	Batch Loss: 43399.00	Epoch Loss: 43237.00
Epoch: 1 [88/13480 (1%)]	Batch Loss: 22759.00	Epoch Loss: 33113.00
Epoch: 1 [132/13480 (1%)]	Batch Loss: 20600.00	Epoch Loss: 28974.00
'''

# Output for train on custom MPNN (with high-speed discrete graphics card)
'''
Epoch: 1 [0/13478 (0%)]	Batch Loss: 49005.00	Epoch Loss: 49005.00
Epoch: 1 [2695/13478 (20%)]	Batch Loss: 34606.00	Epoch Loss: 34611.00
Epoch: 1 [5390/13478 (40%)]	Batch Loss: 19776.00	Epoch Loss: 27195.00
Epoch: 1 [8085/13478 (60%)]	Batch Loss: 19633.00	Epoch Loss: 24675.00
Epoch: 1 [10780/13478 (80%)]	Batch Loss: 19374.00	Epoch Loss: 23350.00
Epoch: 1 [13475/13478 (100%)]	Batch Loss: 18702.00	Epoch Loss: 22420.00
Average Loss: 22420.12 

...

Epoch: 5 [0/13478 (0%)]	Batch Loss: 148.00	Epoch Loss: 148.00
Epoch: 5 [2695/13478 (20%)]	Batch Loss: 11514.00	Epoch Loss: 11510.00
Epoch: 5 [5390/13478 (40%)]	Batch Loss: 11640.00	Epoch Loss: 11575.00
Epoch: 5 [8085/13478 (60%)]	Batch Loss: 11229.00	Epoch Loss: 11459.00
Epoch: 5 [10780/13478 (80%)]	Batch Loss: 11225.00	Epoch Loss: 11401.00
Epoch: 5 [13475/13478 (100%)]	Batch Loss: 10779.00	Epoch Loss: 11276.00
Average Loss: 11276.78

...

Epoch: 10 [0/13478 (0%)]	Batch Loss: 4457.00	Epoch Loss: 4457.00
Epoch: 10 [2695/13478 (20%)]	Batch Loss: 7007.00	Epoch Loss: 7006.00
Epoch: 10 [5390/13478 (40%)]	Batch Loss: 6615.00	Epoch Loss: 6811.00
Epoch: 10 [8085/13478 (60%)]	Batch Loss: 6603.00	Epoch Loss: 6741.00
Epoch: 10 [10780/13478 (80%)]	Batch Loss: 6489.00	Epoch Loss: 6678.00
Epoch: 10 [13475/13478 (100%)]	Batch Loss: 6101.00	Epoch Loss: 6563.00
Average Loss: 6562.86 

...

Epoch: 15 [0/13478 (0%)]	Batch Loss: 14334.00	Epoch Loss: 14334.00
Epoch: 15 [2695/13478 (20%)]	Batch Loss: 4727.00	Epoch Loss: 4731.00
Epoch: 15 [5390/13478 (40%)]	Batch Loss: 4701.00	Epoch Loss: 4716.00
Epoch: 15 [8085/13478 (60%)]	Batch Loss: 4882.00	Epoch Loss: 4771.00
Epoch: 15 [10780/13478 (80%)]	Batch Loss: 4562.00	Epoch Loss: 4719.00
Epoch: 15 [13475/13478 (100%)]	Batch Loss: 4519.00	Epoch Loss: 4679.00
Average Loss: 4678.87 

...

Epoch: 20 [0/13478 (0%)]	Batch Loss: 4333.00	Epoch Loss: 4333.00
Epoch: 20 [2695/13478 (20%)]	Batch Loss: 3833.00	Epoch Loss: 3833.00
Epoch: 20 [5390/13478 (40%)]	Batch Loss: 3955.00	Epoch Loss: 3894.00
Epoch: 20 [8085/13478 (60%)]	Batch Loss: 3969.00	Epoch Loss: 3919.00
Epoch: 20 [10780/13478 (80%)]	Batch Loss: 3634.00	Epoch Loss: 3848.00
Epoch: 20 [13475/13478 (100%)]	Batch Loss: 3888.00	Epoch Loss: 3856.00
Average Loss: 3856.48 

'''

# Output for train on reduced features (w/ energy feature)
'''
Epoch: 1 [0/13478 (0%)]	Batch Loss: 44627.00	Epoch Loss: 44627.00
Epoch: 1 [2695/13478 (20%)]	Batch Loss: 33025.00	Epoch Loss: 33030.00
Epoch: 1 [5390/13478 (40%)]	Batch Loss: 22461.00	Epoch Loss: 27747.00
Epoch: 1 [8085/13478 (60%)]	Batch Loss: 20027.00	Epoch Loss: 25174.00
Epoch: 1 [10780/13478 (80%)]	Batch Loss: 16405.00	Epoch Loss: 22982.00
Epoch: 1 [13475/13478 (100%)]	Batch Loss: 12521.00	Epoch Loss: 20890.00
Average Loss: 20891.38 

Epoch: 2 [0/13478 (0%)]	Batch Loss: 3083.00	Epoch Loss: 3083.00
Epoch: 2 [2695/13478 (20%)]	Batch Loss: 11075.00	Epoch Loss: 11072.00
Epoch: 2 [5390/13478 (40%)]	Batch Loss: 10140.00	Epoch Loss: 10606.00
Epoch: 2 [8085/13478 (60%)]	Batch Loss: 9159.00	Epoch Loss: 10124.00
Epoch: 2 [10780/13478 (80%)]	Batch Loss: 8855.00	Epoch Loss: 9807.00
Epoch: 2 [13475/13478 (100%)]	Batch Loss: 8362.00	Epoch Loss: 9518.00
Average Loss: 9517.3 

(capped around 5k)

'''

### NOTE TO RONAN 
Save the model with the code below. Keep in mind that auto-saving only happens when an epoch completes, and only if the loss is below the minimum loss of any previous model.

In [12]:
# Manual save: snapshot only the model weights (no optimizer state) to the
# checkpoint file that the load cell reads.
checkpoint = dict(state_dict=model.state_dict())
torch.save(checkpoint, "SavedModels/4k_ish_rj_electron.pth")

In [7]:
# Score the model with the test() helper imported from the score notebook.
# NOTE(review): the printed number is presumably an MAE in kcal/mol (train() applies the same
# 23.06 conversion that score.py is said to use) — confirm in score.py.
test(model, "Electron")

2519.398824028712

# Sources and References

1. https://arxiv.org/pdf/1704.01212.pdf
2. https://arxiv.org/pdf/1806.03146.pdf
3. https://chemrxiv.org/engage/api-gateway/chemrxiv/assets/orp/resource/item/60c7579dbb8c1a48b63dc892/original/a-graph-neural-network-for-predicting-energy-and-stability-of-known-and-hypothetical-crystal-structures.pdf#page=11&zoom=100,76,125
4. 