Size-extensive neural net

This neural net takes in molecule embeddings of different sizes and predicts each atom's contribution to a size-extensive property.


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
import pandas as pd

np.random.seed(44)

'''
Last Modified: 2024/10/02

Simple architecture of a size-extensive neural network which uses 
latent space of a pretrained model to fine-tune and make predictions

    atomwise_nn
        - the class that will define the neural network 
        architecture for size-extensive activation 
        patching transfer learning 
'''


#ATOMWISE size extensive neural network class
class atomwise_nn(nn.Module):
    '''
    Size-extensive atomwise network for activation-patching transfer learning.

    The same one-hidden-layer network (Linear -> ReLU -> Linear, in double
    precision) is applied to every atom embedding of a molecule and the
    per-atom outputs are summed, so the prediction grows with the number
    of atoms.

        INPUT_SIZE
            - the number of dimensions in the input feature space
        HIDDEN_SIZE
            - the number of units in the single hidden layer
        OUTPUT_SIZE
            - the number of dimensions in the output feature space
              (1 for scalar)

    Returns (from forward):
        sizeext_quantity
            - the final output quantity as predicted by the network
    '''
    def __init__(self, INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE):
        super().__init__()
        self.INPUT_SIZE = INPUT_SIZE
        self.HIDDEN_SIZE = HIDDEN_SIZE
        self.output_size = OUTPUT_SIZE

        # Define layers (double precision)
        self.fc1 = nn.Linear(INPUT_SIZE, HIDDEN_SIZE).double()
        self.fc2 = nn.Linear(HIDDEN_SIZE, OUTPUT_SIZE).double()

    def forward(self, x):
        '''
        Sum the atomwise network outputs over all atoms of one molecule.

            x
                - atom embeddings; the first dimension indexes atoms and
                  the last dimension must have INPUT_SIZE entries
                  (e.g. a tensor of shape (n_atoms, INPUT_SIZE)).
                  A plain sequence of per-atom tensors is also accepted.

        Returns:
            sizeext_quantity
                - sum over the first dimension of the per-atom outputs;
                  a zero tensor when there are no atoms
        '''
        if not torch.is_tensor(x):
            # Keep accepting a sequence of per-atom tensors.
            atoms = list(x)
            if not atoms:
                return self.fc2.bias.new_zeros(self.output_size)
            x = torch.stack(atoms)

        # nn.Linear acts on the last dimension, so all atoms go through
        # the network in one batched call instead of a Python loop.
        atomwise_outputs = self.fc2(F.relu(self.fc1(x)))

        # Summing over the atom dimension gives the size-extensive quantity.
        sizeext_quantity = atomwise_outputs.sum(dim=0)

        return sizeext_quantity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
'''
Training atomwise_nn

This is where you define the exact size of the architecture, 
and other hyperparameters associated 

    INPUT_SIZE
        - number of dimensions in input latent space (embeddings)
    HIDDEN_SIZE
        - number of parameters in the one-layer size-extensive model
    OUTPUT_SIZE
        - number of output dimensions of the final molecular property
    LEARNING_RATE
        - the rate of learning
    NUM_EPOCHS
        - number of epochs to run before the training loop finishes
    NUM_TRAIN_SAMPLES
        - number of training molecules
    NUM_VAL_SAMPLES
        - number of validation molecules
    EMBS_PATH
        - where the latent vectors (embeddings) filepath is located
    MOL_PROPERTY_PATH
        - path to the file containing the molecular property values
'''

# Define hyperparameters
INPUT_SIZE = 128
HIDDEN_SIZE = 200
OUTPUT_SIZE = 1
LEARNING_RATE = 0.001
NUM_EPOCHS = 10000
NUM_TRAIN_SAMPLES = 450
NUM_VAL_SAMPLES = 100

EMBS_PATH = '../data/datasets/embsMP/embslayer5.csv'
N_EMBS_FEATURES = 128
MOL_PROPERTY_PATH = '../data/datasets/embsMP/mps.csv'

model = atomwise_nn(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)

# Define the loss function (criterion) and the optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Read in the latent space data and the final property data to train on
embs = pd.read_csv(EMBS_PATH)
mps_true = pd.read_csv(MOL_PROPERTY_PATH)

# Training and validation molecules are taken as consecutive, disjoint
# index ranges, so the property table must contain enough rows for both.
if NUM_TRAIN_SAMPLES + NUM_VAL_SAMPLES > len(mps_true):
    raise ValueError(
        f"Need {NUM_TRAIN_SAMPLES + NUM_VAL_SAMPLES} molecules for disjoint "
        f"train/validation splits but {MOL_PROPERTY_PATH} has {len(mps_true)}"
    )

# Standardise the embedding features. A freshly constructed BatchNorm1d in
# training mode normalises each column with the statistics of the data it is
# given (its affine weight/bias start at 1/0).
# NOTE(review): the statistics are computed over every row in the file,
# validation molecules included - confirm this is intended.
embs_features = embs.iloc[:, 0:N_EMBS_FEATURES].values
normalize_embs_128 = nn.BatchNorm1d(N_EMBS_FEATURES).double()
embs_featuresnorm = normalize_embs_128(torch.tensor(embs_features))

# Re-attach the non-feature columns; the column right after the features
# (index N_EMBS_FEATURES) is used below to select the atoms of a molecule.
embs_norm = np.hstack((embs_featuresnorm.detach().numpy(), embs.iloc[:, N_EMBS_FEATURES:].values))

print(np.shape(embs_norm))


def _molecule_tensors(molecule_index):
    """Return the (atom embeddings, target property) tensors of one molecule."""
    atom_rows = embs_norm[embs_norm[:, N_EMBS_FEATURES] == molecule_index]
    X = atom_rows[:, 0:N_EMBS_FEATURES]
    y = mps_true.iloc[molecule_index].values
    return torch.tensor(X), torch.tensor(y)


# The per-molecule data does not change between epochs, so build it once
# instead of re-masking and re-converting inside every epoch.
train_data = [_molecule_tensors(i) for i in range(NUM_TRAIN_SAMPLES)]
# Validation uses the molecules that follow the training ones; reusing
# indices 0..NUM_VAL_SAMPLES-1 would validate on training data.
val_data = [
    _molecule_tensors(i)
    for i in range(NUM_TRAIN_SAMPLES, NUM_TRAIN_SAMPLES + NUM_VAL_SAMPLES)
]

# Run Training Loop

# Loss histories live outside the epoch loop so every epoch is kept.
train_loss = []
val_loss = []

for epoch in range(NUM_EPOCHS):
    total_train_loss = 0.0
    model.train()
    for X_tensor, y_tensor in train_data:
        # Zero gradients, forward pass, backward pass, and update weights
        # (one optimizer step per molecule).
        optimizer.zero_grad()
        output = model(X_tensor)

        loss = criterion(output, y_tensor)
        loss.backward()
        optimizer.step()

        # Accumulate loss for this epoch
        total_train_loss += loss.item()

    model.eval()
    # Predictions and targets of the most recent validation pass.
    outputs = []
    truths = []
    total_val_loss = 0.0
    with torch.no_grad():
        for X_tensor, y_tensor in val_data:
            # Forward pass only: no gradients are tracked here.
            output = model(X_tensor)

            loss = criterion(output, y_tensor)

            # Accumulate loss for this epoch
            total_val_loss += loss.item()

            outputs.append(output)
            truths.append(y_tensor.numpy())

    # Print the average training loss for this epoch
    average_train_loss = total_train_loss / NUM_TRAIN_SAMPLES
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {average_train_loss:.4f}")

    # Print the average validation loss for this epoch
    average_val_loss = total_val_loss / NUM_VAL_SAMPLES
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Val Loss: {average_val_loss:.4f}")

    train_loss.append([epoch, average_train_loss])
    val_loss.append([epoch, average_val_loss])