# COVID-19 mRNA Vaccine Degradation Prediction using Graph Neural Networks
## Group 8: Anders Segerlund, Mathias Samuelsson, Pontus Havström, Sam Nehmé

This notebook includes the code used to generate the results presented in the report. The data sets were downloaded from https://www.kaggle.com/c/stanford-covid-vaccine/data.

The code is not very cleanly written, but should give an insight into how the models were implemented for the curious reader.

In [11]:
# Import packages
import json
import pandas as pd  
import numpy as np
from tqdm.notebook import tqdm

import torch
from torch.nn import Linear
from torch.nn import ReLU
import torch.nn.functional as F

## Load data

In [12]:
# Read data
train = pd.read_json('data/train.json', lines=True) 
test = pd.read_json('data/test.json', lines=True) 
# Divide test data into the two subsets: Private Test and Public Test
# seq_length=107 in Public Test while seq_length=130 in Private Test
test_public = test[test["seq_length"] == 107]
test_private = test[test["seq_length"] == 130]

# Print the first sample for testing
df = pd.DataFrame(train)
print(df.iloc[0])

# Optionally, only take training data which have passed the signal-to-noise filter
train_filtered = train[train["SN_filter"] == 1]

# Change apply_SN_filter to True to only train on filtered data, using the SN filter described in the Kaggle challenge (same which is used for public test data)
# As described in the report, we opted to using the provided noise filter rather than using our denoising autoencoder, as this gave better results.
apply_SN_filter = True
# apply_SN_filter = False 
if apply_SN_filter == True:
    train = train_filtered

index                                                                  0
id                                                          id_001f94081
sequence               GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...
structure              .....((((((.......)))).)).((.....((..((((((......
predicted_loop_type    EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...
signal_to_noise                                                    6.894
SN_filter                                                              1
seq_length                                                           107
seq_scored                                                            68
reactivity_error       [0.1359, 0.20700000000000002, 0.1633, 0.1452, ...
deg_error_Mg_pH10      [0.26130000000000003, 0.38420000000000004, 0.1...
deg_error_pH10         [0.2631, 0.28600000000000003, 0.0964, 0.1574, ...
deg_error_Mg_50C       [0.1501, 0.275, 0.0947, 0.18660000000000002, 0...
deg_error_50C          [0.2167, 0.34750000000000003

## Structure adjacency matrix

In [13]:
def get_struct_adj(data = train, sequential_edges = False):
    # Get adjacency matrix from sample structure sequence
    # Include edges between base pairs
    # If sequential_edges == False, do not include edges between sequential bases
    # If sequential_edges == True, add these edges, which correspond to the diagonals -1 and 1 in the adjacency matrix (assuming undirected edges)
    struct_adj = []
    for ix in range(len(data)):
        seq_length = data["seq_length"].iloc[ix]
        structure = data["structure"].iloc[ix]
        sequence = data["sequence"].iloc[ix]

        queue = [] # Store indices corresponding to "(" in queue

        sample_struct_adj = np.zeros([seq_length, seq_length])
        for jx in range(seq_length):
            if structure[jx] == "(":
                queue.append(jx) # Append index of "(" in base pair to queue
            elif structure[jx] == ")":
                start = queue.pop() # Retrieve index of last "(" in queue, corresponding to ")" at jx
                sample_struct_adj[start, jx] = 1 # Add edge from "(" to ")"
                sample_struct_adj[jx, start] = 1 # Add edge from ")" to "(" (assume undirected)

        if sequential_edges == True:
            ones = np.ones(seq_length-1) # Match length of -1 and 1 diagonals in sample_struct_adj
            sample_struct_adj += np.diag(ones,1) # Add sequential edges (i,i+1) 
            sample_struct_adj += np.diag(ones,-1) # Add sequential edges (i+1,i) (assume non-directed)

        struct_adj.append(sample_struct_adj)

    struct_adj = np.array(struct_adj)
    return struct_adj 

## Distance adjacency matrix

In [59]:
# Function for constructing distance adjacency matrix
# Only returns one distance adjacency matrix, since it is identical for all samples (only depends on number of nodes)
def get_dist_adj(data = train, power = 1):
    # Get adjacency matrix from inverse index-based distance between nodes
    # power is the variable p in the expression D(i,j)
    dist_adj = []
    idx = np.arange(data["seq_length"].iloc[0]) # Get number of nodes
    for ix in range(len(idx)):
        d = np.abs(idx[ix] - idx) # Get distance from individual nodes to all other nodes
        dist_adj.append(d)

    # Convert distance to distance measure according to formula    
    dist_adj = np.array(dist_adj) + 1 # Add one to avoid singularity at d=0
    dist_adj = 1/dist_adj # Inverse of distance
    dist_adj = dist_adj**power # Apply the specified power
    return dist_adj 

## Base pair probabilities

In [16]:
# Load the provided base pair probability adjacency matrices for the samples included in the datasets

# Train
Adj_bpps = []
for id in tqdm(train["id"]):
    bpps = np.load(f"data/bpps/{id}.npy")
    Adj_bpps.append(bpps)
Adj_bpps = np.array(Adj_bpps)

# Public test
Adj_bpps_test_public = []
for id in tqdm(test_public["id"]):
    bpps = np.load(f"data/bpps/{id}.npy")
    Adj_bpps_test_public.append(bpps)
Adj_bpps_test_public = np.array(Adj_bpps_test_public)

  0%|          | 0/1589 [00:00<?, ?it/s]

  0%|          | 0/629 [00:00<?, ?it/s]

## Node features

In [17]:
def get_node_features(data = train):
    # Create a node feature matrix for each sample in data
    # Encode feature vectors as one-hot arrays  
    # Included features: 
    #   Base (given by sequence)
    #   Loop type (given by predicted_loop_type)
    # Could also include sequence, i.e. "." "(" and ")", but I don't see how this provides any interesting information if the structure adjacency matrix is used
    X = [] # Stacked node feature matrices for all samples in data
    
    for ix in range(len(data)):
        seq_length = data["seq_length"].iloc[ix]
        sequence = data["sequence"].iloc[ix]
        predicted_loop_type = data["predicted_loop_type"].iloc[ix]

        X_sample = [] # Node feature matrix for current sample

        for jx in range(seq_length):
            # Base one hot
            bases = np.array(['A', 'G', 'U', 'C']) # Different order than reference notebook (A,G,C,U)
            x_base = np.zeros(len(bases))
            x_base[bases == sequence[jx]] = 1 # Set base one-hot to 1 at correct index

            # Predicted Loop Type one hot
            loop_types = np.array(['S', 'M', 'I', 'B', 'H', 'E', 'X'])
            x_loop = np.zeros(len(loop_types))
            x_loop[loop_types == predicted_loop_type[jx]] = 1 # Set loop-type one-hot to 1 at correct index

            x = np.concatenate((x_base,x_loop)) # Concatenate to one node feature vector
            X_sample.append(x) # Append node feature vector to node feature matrix
        X_sample = np.array(X_sample)
        X.append(X_sample) # Append node feature matrix for current graph
    X = np.array(X)
    return X

## Construct and pretrain denoise model and encode targets

In [8]:
from torch.utils.data import DataLoader 
import torch

target_labels = ["reactivity", "deg_Mg_pH10", "deg_Mg_50C", "deg_pH10", "deg_50C"]
error_labels = ["reactivity_error", "deg_error_Mg_pH10", "deg_error_Mg_50C", "deg_error_pH10", "deg_error_50C"]

train_ae = train[train.signal_to_noise > 4].reset_index(drop = True) # remove noisy data
y_train_ae = []

# Construct target labels
for target in target_labels:
    y_ae = np.vstack(train_ae[target]) # Create (n_samples, seq_scored) arrays for each target
    y_train_ae.append(y_ae) # Append array for each target
y_train_ae = np.stack(y_train_ae, axis=2) # Join the target arrays along last axis to match shape of feature arrays
y_train_ae = y_train_ae.reshape(y_train_ae.shape[0],-1).astype(float)
print(y_train_ae.shape)

# Construct error labels
y_error_ae = []
for label in error_labels:
    y_ae = np.vstack(train_ae[label]) # Create (n_samples, seq_scored) arrays for each target
    y_error_ae.append(y_ae) # Append array for each target
y_error_ae = np.stack(y_error_ae, axis=2) # Join the target arrays along last axis to match shape of feature arrays
y_error_ae = y_error_ae.reshape(y_error_ae.shape[0],-1).astype(float) # flatten and cast to float


y_train_loader = DataLoader(y_train_ae, batch_size=16, shuffle=True)
y_error_loader = DataLoader(y_error_ae, batch_size=16, shuffle=True)
print(len(y_train_loader))
print(len(y_error_loader))




(1350, 340)
85
85


In [9]:
import torch
from torch.nn import Linear
from torch.nn import ReLU
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import random

class AE(nn.Module):
  def __init__(self, **kwargs):
    super(AE,self).__init__()
    torch.manual_seed(12345) # For reproducible results

    self.encoder=nn.Sequential(
                  nn.Linear(kwargs["input_shape"],512),
                  nn.ReLU(True),
                  nn.Linear(512,512),
                  nn.ReLU(True),
                  nn.Linear(512,512),
                  #nn.ReLU(True)
        
                  )
    
    self.decoder=nn.Sequential(
                  nn.Linear(512,512),
                  nn.ReLU(True),
                  nn.Linear(512,512),
                  nn.ReLU(True),
                  nn.Linear(512,kwargs["input_shape"]),
                  )
    
 
  def forward(self,x):
    x = F.dropout(x, p=0.4, training=True)
    x=self.encoder(x)
    x=self.decoder(x)
    
    return x 

In [10]:
#Train AE_model

autoencoder = AE(input_shape=340)
mse = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)
EPOCHS = 50
i = 0

for epoch in range(EPOCHS): 
    
    running_loss = 0.0
    for data, label_errors in zip(y_train_loader, y_error_loader):
        # get the inputs; data is a list of [inputs, labels]

        
        
        #construct random tensor [-1,1]
        rand_array = torch.rand(size=(label_errors.shape[0], label_errors.shape[1]))*4-2 
        
        label_errors = torch.mul(label_errors.float(), rand_array.float())
        # zero the parameter gradients
        optimizer.zero_grad()
        x = data + label_errors
        # forward + backward + optimize
        outputs = autoencoder(x.float())
        loss = mse(data.float(), outputs)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        i = i + 1
        if i % 100 == 0:    # print every 2000 mini-batches
            print('[%d, %10d] loss: %.5f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

print('Finished Training')


[2,        101] loss: 0.02427
[3,        201] loss: 0.04576
[4,        301] loss: 0.06072
[5,        401] loss: 0.07357
[6,        501] loss: 0.08695
[8,        601] loss: 0.00552
[9,        701] loss: 0.02099
[10,        801] loss: 0.03492
[11,        901] loss: 0.04768
[12,       1001] loss: 0.06101
[13,       1101] loss: 0.07501
[15,       1201] loss: 0.00830
[16,       1301] loss: 0.02205
[17,       1401] loss: 0.03406
[18,       1501] loss: 0.04649
[19,       1601] loss: 0.05780
[20,       1701] loss: 0.06962
[22,       1801] loss: 0.01159
[23,       1901] loss: 0.02291
[24,       2001] loss: 0.03413
[25,       2101] loss: 0.04524
[26,       2201] loss: 0.05667
[28,       2301] loss: 0.00335
[29,       2401] loss: 0.01395
[30,       2501] loss: 0.02523
[31,       2601] loss: 0.03545
[32,       2701] loss: 0.04580
[33,       2801] loss: 0.05611
[35,       2901] loss: 0.00692
[36,       3001] loss: 0.01726
[37,       3101] loss: 0.02711
[38,       3201] loss: 0.03641
[39,       3301

In [None]:
# Construct target arrays for training data
target_labels = ["reactivity", "deg_Mg_pH10", "deg_Mg_50C", "deg_pH10", "deg_50C"]

y_train = []
seq_length = train["seq_length"].iloc[0] # Get number of nodes (lenght of sequence)
seq_scored = train["seq_scored"].iloc[0] # Get number of nodes with ground truth targets
save_index = []
for i in range(len(train)):
    if float(train["signal_to_noise"].iloc[i]) <= 1:
           save_index.append(i)
for target in target_labels:
    y = np.vstack(train[target]) # Create (n_samples, seq_scored) arrays for each target
    y_train.append(y) # Append array for each target
y_train = np.stack(y_train, axis=2) # Join the target arrays along last axis to match shape of feature arrays
y_train = y_train.astype(np.float32) # Convert to floats to prepare for torch model
print("Shape of targets: ", y_train.shape)

In [12]:
# As described in the report, applying the autoencoder to the targets was ignored when generating the final results. 

autoencoder.eval()
print(y_train.shape)
for i in save_index:
    y_train_tensor = torch.Tensor(y_train[i,:,:].astype(float).flatten())
    
    outputs = autoencoder(y_train_tensor)   
    outputs = outputs.reshape(68,5)
    y_train[i,:,:] = outputs.detach().numpy() 
print(y_train.shape)

(2400, 68, 5)
(2400, 68, 5)


## Construct targets without applying denoising autoencoder

In [24]:
# Construct target arrays for training data
target_labels = ["reactivity", "deg_Mg_pH10", "deg_Mg_50C", "deg_pH10", "deg_50C"]

y_train = []
seq_length = train["seq_length"].iloc[0] # Get number of nodes (lenght of sequence)
seq_scored = train["seq_scored"].iloc[0] # Get number of nodes with ground truth targets
for target in target_labels:
    y = np.vstack(train[target]) # Create (n_samples, seq_scored) arrays for each target
    y_train.append(y) # Append array for each target
y_train = np.stack(y_train, axis=2) # Join the target arrays along last axis to match shape of feature arrays
y_train = y_train.astype(np.float32) # Convert to floats to prepare for torch model
print("Shape of targets: ", y_train.shape)

Shape of targets:  (1589, 68, 5)


## Define loss function

In [26]:
# Define MCRMSE loss function
# Include all 5 targets by default, allow optional argument to calculate MCRMSE of scored targets only.
# Assumes targets are ordered such that the first 3 targets are the scored ones.
# Inputs should have dimensions (n_samples, n_nodes, n_targets)
def MCRMSE(y_true, y_pred, only_scored=False, data = train, all_102 = False):
    # Reshape if input only includes one sample and has dimensions (n_nodes, n_targets)
    if y_true.dim() == 2:
        y_true = y_true[None, :, :]
    if y_pred.dim() == 2:
        y_pred = y_pred[None, :, :]

    # Extract the scored targets
    seq_scored = data["seq_scored"].iloc[0] # Get number of nodes with ground truth targets
    if all_102 == True:
        seq_scored = 102
    y_pred = y_pred[:, :seq_scored, :] 
    # true = y_true[:, :seq_scored, :] # Not necessary since only scored targets are included, could include dummy values instead as in reference notebook

    y_diff = y_pred - y_true
    mse = torch.mean(y_diff**2, axis=1) # Average over nodes in each sample for every target
    rmse = torch.sqrt(mse)
    
    num_scored = 5 # Include all targets by default
    if only_scored == True:
        num_scored = 3 # Include only scored targets if specified by keyword (assumes correct ordering of targets in y_true and y_pred)

    mcrmse = torch.mean(rmse[:, :num_scored], axis=1) # Average over included targets

    return mcrmse

## Load test data

In [53]:
# Load post deadline data
test_postdeadline = pd.read_csv('data/post_deadline_files/private_test_labels.csv') 
# Print the first sample for testing
df_pd = pd.DataFrame(test_postdeadline)
print(df_pd.iloc[0])
print("Number of samples in private_test_labels.csv: ", len(test_postdeadline))
# There are two filters, one is S/N filter, which should be 1 for all samples in test_postdeadline (but not for all samples in test_private, "pre deadline")
# This is because the private test data was changed, they decided to include the filters there as well. 
# To my understanding, S/N filter is 1 for private test data which 
test_postdeadline_SNfiltered = test_postdeadline[test_postdeadline["S/N filter"] == 1]
print("Number of samples in private_test_labels.csv with S/N_filter=1: ", len(test_postdeadline_SNfiltered))
# Note that there is a test_filter variable, which is not 1 for all samples
# Not sure what this filter indicates, could be if all filters are passed
test_postdeadline_filtered = test_postdeadline[test_postdeadline["test_filter"] == 1]
print("Number of samples in private_test_labels.csv with test_filter=1: ", len(test_postdeadline_filtered))

# Set apply_test_filter to true to set private post deadline test data to only be samples which pass the filter
# This should probably not be done, as the private test data is "unseen" data, where we should have no information about the targets
apply_test_filter = False
# apply_test_filter = True
if apply_test_filter == True:
    test_postdeadline = test_postdeadline_filtered

id                                                           id_40f52a81b
ID                                                               10207086
sequence                GGAAAUUUUCGCGGGACGGGCGGCCGGGCGGAGGCGGCGCGAGGGC...
structure               .......(((((.((.((..(.(((..(((...((..((((....(...
seqpos                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
reactivity              [0.6009, 1.3193, 1.5475, 0.5852, 1.566, 0.3387...
deg_Mg_pH10             [0.5866, 1.4956, 1.3765, 0.5714, 2.9199, 0.925...
deg_Mg_50C              [1.2183, 1.718, 0.8737, 0.9644, 2.7502, 0.5862...
errors                  [0.4419, 0.4736, 0.4529, 0.30820000000000003, ...
deg_pH10_Mg_errors      [0.4333, 0.491, 0.4375, 0.3079, 0.5534, 0.3432...
deg_50C_Mg_errors       [0.5922000000000001, 0.5873, 0.434700000000000...
S/N filter                                                              1
predicted_loop_type     EEEEEEESSSSSISSISSIISISSSIISSSIIISSIISSSSIIIIS...
seq_scored                            

In [54]:
# Load the provided base pair probability adjacency matrices for the samples in post deadline private test

# Private test, post deadline
Adj_bpps_test_postdeadline = []
for id in tqdm(test_postdeadline["id"]):
    bpps = np.load(f"data/bpps/{id}.npy")
    Adj_bpps_test_postdeadline.append(bpps)
Adj_bpps_test_postdeadline = np.array(Adj_bpps_test_postdeadline)

  0%|          | 0/2493 [00:00<?, ?it/s]

In [64]:
# Construct target arrays for post deadline private test data
target_labels = ["reactivity", "deg_Mg_pH10", "deg_Mg_50C"]

y_test_postdeadline = []
seq_length = test_postdeadline["seq_length"].iloc[0] # Get number of nodes (lenght of sequence)
seq_scored = test_postdeadline["seq_scored"].iloc[0] # Get number of nodes with ground truth targets
# seq_scored = 102
for target in target_labels:
    y_strings = np.vstack(test_postdeadline[target]) # Create (n_samples, seq_scored) arrays for each target
    y_temp = [] # For converting strings to arrays
    # Post deadline targets are given as strings, iterate over samples and convert strings to arrays
    for ix in range(len(y_strings)): 
        y_strings[ix][0] = y_strings[ix][0].replace('[','') # Remove first bracket character
        y_strings[ix][0] = y_strings[ix][0].replace(']','') # Remove final bracket character
        y_temp.append(y_strings[ix][0].split(",")) # Split the comma separated string
    y_test_postdeadline.append(np.array(y_temp)) # Append array for each target
y_test_postdeadline = np.stack(y_test_postdeadline, axis=2) # Join the target arrays along last axis to match shape of feature arrays
y_test_postdeadline = y_test_postdeadline.astype(np.float32) # Convert to floats to prepare for torch model
# Post deadline data includes more targets (102) than seq_scored indicates (92). Remove the extra values for now, only include up to seq_scored=92
y_test_postdeadline = y_test_postdeadline[:,:seq_scored,:]
print("Shape of targets: ", y_test_postdeadline.shape)


Shape of targets:  (2493, 92, 3)


# Train and evaluate models
The following cells are used to define the models, train them and evaluate them on the test set. Note that the models are overwritten and that most of the cells are copied with minor adjustments.

# GCN

In [184]:
# Construct node features and adjacency matrix for training data
print("Shapes of inputs - Train")

# Feature exctraction for the GCN models
# A includes base pairs and sequential neighbors
# B includes bpps
# No distance matrices used

# Node features
X = get_node_features(data = train)
X = X.astype(np.float32) # Convert to floats to prepare for torch model
print("Node features X: (n_samples, n_nodes, n_node_features) ", X.shape)
# Structure adjacency 
Adj_pairs = get_struct_adj(data = train, sequential_edges=True)
print("Structure adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_pairs.shape)
# Base pair probability adjacency
print("Base pair probability adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_bpps.shape)
# Concatenate adjacency matrices into one array along last dimension
Adj = np.concatenate([Adj_pairs[:,:,:,None], Adj_bpps[:,:,:,None]], axis = 3) # Expand dimensions of adjacency matrices and stack along new dimension 
Adj = Adj.astype(np.float32) # Convert to floats to prepare for torch model
print("Total adjacency matrix: (n_samples, n_nodes, n_nodes, n_edge_features) ", Adj.shape)

Shapes of inputs - Train
Node features X: (n_samples, n_nodes, n_node_features)  (1589, 107, 11)
Structure adjacency matrices: (n_samples, n_nodes, n_nodes)  (1589, 107, 107)
Base pair probability adjacency matrices: (n_samples, n_nodes, n_nodes)  (1589, 107, 107)
Total adjacency matrix: (n_samples, n_nodes, n_nodes, n_edge_features)  (1589, 107, 107, 2)


In [185]:
class myGraphConv(torch.nn.Module):
    """
    Adjuster from the graph neural network operator from the “Weisfeiler and Leman Go 
    Neural: Higher-order Graph Neural Networks” paper

    x' = x_i W_0.T + (Adj_1 x_i) W_1.T + ... +  (Adj_k x_i) W_k.T

    Contributions from additional edge features are added as separate terms

    Arguments:
        in_channels (int): Number of features (size) of each input node
        out_channels (int): Number of features (size) of each output node
        n_edge_features (int): Number of edge features, i.e. Adj.shape[-1]
    
    forward performs the graph neural network operation
    Arguments:
        x (torch tensor): The input node features of shape (n_samples, n_nodes, in_channels) 
        Adj (torch tensor): The adjacency matrix of the graph of shape (n_samples, n_nodes, n_nodes, n_edge_features)
    Returns: 
        x' (torch tensor): Output node feature matrix of shape (n_samples, n_nodes, out_channels)
    """


    def __init__(self, in_channels, out_channels, n_edge_features):
        super(myGraphConv, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels

        self.n_edge_features = n_edge_features # Get number of edge features (number of stacked adjacency matrices)

        self.lin_self = Linear(in_channels, out_channels, bias=True) 
        if self.n_edge_features >= 1:
            self.lin_1 = Linear(in_channels, out_channels, bias=True)  
        if self.n_edge_features >= 2:
            self.lin_2 = Linear(in_channels, out_channels, bias=True)
        if self.n_edge_features >= 3:
            self.lin_3 = Linear(in_channels, out_channels, bias=True)
        if self.n_edge_features >= 4:
            self.lin_4 = Linear(in_channels, out_channels, bias=True)
        if self.n_edge_features >= 5:
            self.lin_5 = Linear(in_channels, out_channels, bias=True)
        if self.n_edge_features >= 6:
            raise ValueError("Number of edge features can not be larger than 5") # "Hard code" up to 5 edge features

        self.reset_parameters()
   
    def reset_parameters(self):
        self.lin_self.reset_parameters()
        if self.n_edge_features >= 1:
            self.lin_1.reset_parameters()
        if self.n_edge_features >= 2:
            self.lin_2.reset_parameters()
        if self.n_edge_features >= 3:
            self.lin_3.reset_parameters()
        if self.n_edge_features >= 4:
            self.lin_4.reset_parameters()
        if self.n_edge_features >= 5:
            self.lin_5.reset_parameters()
        
        

    def forward(self, x, Adj):
        # Shapes of arguments, weight matrices and output
        # x: (n_samples, n_nodes, in_channels) 
        # Adj: (n_samples, n_nodes, n_nodes, n_edge_features)
        # W_1: (in_channels, out_channels)
        # W_2: (in_channels, out_channels)
        # out: (n_samples, n_nodes, out_channels)

        # Confirm that the input variable n_edge_features matches the adjacency matrix
        if self.n_edge_features != Adj.shape[-1]:
            raise ValueError("Specified number of edge features must match last dimensino in adjacency matrix") 

        # Calculate contribution from self (node)
        out = self.lin_self(x)

        # Add contributions from edges
        # Calculate contributions from adjacent nodes
        # Use separate weights for each edge feature
        if self.n_edge_features >= 1:
            out_1 = torch.matmul(Adj[..., 0], x) # This is equivalent to summing over edge weights assuming Adj contains the edge weights
            out_1 = self.lin_1(out_1) # Multiply with weight matrix for adjacent nodes
            out += out_1 # Add contribution from first edge feature
        # Repeat for all edge weights
        if self.n_edge_features >= 2:
            out_2 = torch.matmul(Adj[..., 1], x) 
            out_2 = self.lin_2(out_2) 
            out += out_2 # Add contribution from second edge feature
        if self.n_edge_features >= 3:
            out_3 = torch.matmul(Adj[..., 2], x)
            out_3 = self.lin_3(out_3) 
            out += out_3 # Add contribution from third edge feature
        if self.n_edge_features >= 4:
            out_4 = torch.matmul(Adj[..., 3], x)
            out_4 = self.lin_4(out_4) 
            out += out_4 # Add contribution from fourth edge feature
        if self.n_edge_features >= 5:
            out_5 = torch.matmul(Adj[..., 4], x)
            out_5 = self.lin_5(out_5) 
            out += out_5 # Add contribution from fifth edge feature
        return out

    # The method that returns a printable representation of the operator, copy to match GraphConv source code 
    def __repr__(self):
        return '{}({}, {})'.format(self.__class__.__name__, self.in_channels,
                                   self.out_channels)


In [192]:
# The GCN models, where the number of hidden layers can be varied
# Note that code has to be commented to change the number of layers (sorry)
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, n_edge_features, n_node_features = 11):
        super(GNN, self).__init__()
        torch.manual_seed(12345) # For reproducible results
        self.conv = myGraphConv(n_node_features, hidden_channels, n_edge_features)
        hidden_channels2 = 64
        self.conv2 = myGraphConv(hidden_channels, hidden_channels2, n_edge_features)
        hidden_channels3 = 64
        self.conv3 = myGraphConv(hidden_channels2, hidden_channels3, n_edge_features)
        # hidden_channels4 = 64
        # self.conv4 = myGraphConv(hidden_channels3, hidden_channels4, n_edge_features)
        # hidden_channels5 = 64
        # self.conv5 = myGraphConv(hidden_channels4, hidden_channels5, n_edge_features)
        hidden_channels = hidden_channels3
        self.lin = Linear(hidden_channels, 5) # Map to the 5 output targets with dense layer
        self.relu = ReLU()

    def forward(self, x, Adj):
        # 1. Obtain node embeddings, use GraphConv layers with ReLU for non-linearity
        x = self.conv(x, Adj) # Give adjacency matrix instead of edge_index and edge_weight
        x = self.relu(x)

        x = self.conv2(x, Adj) # Give adjacency matrix instead of edge_index and edge_weight
        x = self.relu(x)

        x = self.conv3(x, Adj) # Give adjacency matrix instead of edge_index and edge_weight
        x = self.relu(x)

        # x = self.conv4(x, Adj) # Give adjacency matrix instead of edge_index and edge_weight
        # x = self.relu(x)

        # x = self.conv5(x, Adj) # Give adjacency matrix instead of edge_index and edge_weight
        # x = self.relu(x)

        # 2. Readout layer
        # No pooling is required, we want target labels for each node, not for the entire graph

        # 3. Apply a final classifier 
        # Use a single layer as classifier to map to the targets
        x = self.lin(x)

        # No LogSoftmax needed, possibly some other function to map to correct targets?

        return x
    

In [193]:
# Instantiate GCN model, optimizer and loss function
model = GNN(hidden_channels=64, n_edge_features = Adj.shape[-1])
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Adjust learning rate
criterion = MCRMSE # Mean column-wise root mean square error (MCRMSE) loss

# Define trainer function for GNN
def run_training(X_data, Adj_data, batch_size = 1, n_epochs = 1):
    model.train()
    for epoch in range(n_epochs):
        print(f"=== Starting epoch {epoch + 1} ===")
        # Get permutation of sample indices for shuffling
        permutation = torch.randperm(len(X_data))
        # permutation = range(len(X_data))
        # Define variable for printing training loss
        running_loss = 0.
        # Run training over mini-batches for current epoch
        for ix in range(0, len(X_data), batch_size):  # Iterate over samples in the training dataset
            batch_indices = permutation[ix:ix+batch_size] # Get shuffled indices for minibatch
            X_batch, Adj_batch = X_data[batch_indices], Adj_data[batch_indices] # Minibatch of X and Adj
            y_batch = y_train[batch_indices] # Minibatch of ground truths
            out = model(X_batch, Adj_batch) # Perform forward pass
            loss = criterion(torch.tensor(y_batch), out)  # Compute the loss
            # As the loss function is defined per sample, we have to reduce the loss for
            # each mini-batch to a singular value in some way.
            # Could for example use mean, sum, or random sample. This is a design choice.
            loss = torch.mean(loss) # Calculate average loss for minibatch
            loss.backward()  # Derive gradients
            optimizer.step()  # Update parameters based on gradients
            optimizer.zero_grad()  # Clear gradients
            
            # Print statistics every print_batch minibatches 
            print_batch = int(len(X_data)/10/batch_size) # Set to print every 1/10 of all samples
            running_loss += loss.item() # Add (average) loss from minibatch
            if int(ix/batch_size) % print_batch == 0 and ix != 0: # Ignore first minibatch
                print('[batch %5d, sample %5d] loss: %.3f' % 
                        (int(ix/batch_size), ix, 
                        running_loss / print_batch)) # Average running loss
                running_loss = 0. # Reset running loss

# Convert training data inputs to pytorch tensors and run training
X_torch = torch.tensor(X)
Adj_torch = torch.tensor(Adj)
run_training(X_torch, Adj_torch, batch_size = 16, n_epochs = 10)


GNN(
  (conv): myGraphConv(11, 64)
  (conv2): myGraphConv(64, 64)
  (conv3): myGraphConv(64, 64)
  (lin): Linear(in_features=64, out_features=5, bias=True)
  (relu): ReLU()
)
=== Starting epoch 1 ===
[batch     9, sample   144] loss: 0.597
[batch    18, sample   288] loss: 0.442
[batch    27, sample   432] loss: 0.403
[batch    36, sample   576] loss: 0.375
[batch    45, sample   720] loss: 0.377
[batch    54, sample   864] loss: 0.363
[batch    63, sample  1008] loss: 0.364
[batch    72, sample  1152] loss: 0.355
[batch    81, sample  1296] loss: 0.371
[batch    90, sample  1440] loss: 0.358
[batch    99, sample  1584] loss: 0.346
=== Starting epoch 2 ===
[batch     9, sample   144] loss: 0.378
[batch    18, sample   288] loss: 0.353
[batch    27, sample   432] loss: 0.335
[batch    36, sample   576] loss: 0.332
[batch    45, sample   720] loss: 0.338
[batch    54, sample   864] loss: 0.331
[batch    63, sample  1008] loss: 0.328
[batch    72, sample  1152] loss: 0.334
[batch    81, s

In [None]:
# Define prediction function
def run_prediction(X_data, Adj_data):
    model.eval()
    y_pred = []
    outs = model(X_data, Adj_data) # Feed the data through the network 
    for yx in outs:
        y_pred.append(yx.detach().numpy())
    y_pred = np.array(y_pred)
    return y_pred

In [189]:
# Construct node features and adjacency matrix for post deadline test data
print("Shapes of inputs - Private test, post deadline")

# GCN
# A includes base pairs and sequential neighbors
# B includes bpps
# No distance matrices used

# Node features
X_test_postdeadline = get_node_features(data = test_postdeadline)
X_test_postdeadline = X_test_postdeadline.astype(np.float32) # Convert to floats to prepare for torch model
print("Node features X: (n_samples, n_nodes, n_node_features) ", X_test_postdeadline.shape)
# Structure adjacency 
Adj_pairs_test_postdeadline = get_struct_adj(data = test_postdeadline, sequential_edges=True)
print("Structure adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_pairs_test_postdeadline.shape)
# Base pair probability adjacency
print("Base pair probability adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_bpps_test_postdeadline.shape)
# Concatenate adjacency matrices into one array along last dimension
Adj_test_postdeadline = np.concatenate([Adj_pairs_test_postdeadline[:,:,:,None], Adj_bpps_test_postdeadline[:,:,:,None]], axis = 3) # Expand dimensions of adjacency matrices and stack along new dimension
Adj_test_postdeadline = Adj_test_postdeadline.astype(np.float32) # Convert to floats to prepare for torch model
print("Total adjacency matrix: (n_samples, n_nodes, n_nodes, n_edge_features) ", Adj_test_postdeadline.shape)

Shapes of inputs - Private test, post deadline
Node features X: (n_samples, n_nodes, n_node_features)  (2493, 130, 11)
Structure adjacency matrices: (n_samples, n_nodes, n_nodes)  (2493, 130, 130)
Distance adjacency matrices: (n_samples, n_nodes, n_nodes)  (2493, 130, 130, 3)
Base pair probability adjacency matrices: (n_samples, n_nodes, n_nodes)  (2493, 130, 130)
Total adjacency matrix: (n_samples, n_nodes, n_nodes, n_edge_features)  (2493, 130, 130, 2)


## Results for 1-layer GCN 

In [71]:
# Run prediction on training data as a test run
y_train_pred = run_prediction(X_torch, Adj_torch)
y_train_pred = y_train_pred.astype(np.float32)

# Calculate score on training data
y_train_torch = torch.tensor(y_train)
y_train_pred_torch = torch.tensor(y_train_pred)
training_score = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=False)
training_score_only_scored = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=True)

print(f"Mean score on training data, all 5 targets: {float(torch.mean(training_score)):.5}")
print(f"Mean score on training data, only scored targets: {float(torch.mean(training_score_only_scored)):.5}")

# Training score
# 1 layer

Mean score on training data, all 5 targets: 0.33861
Mean score on training data, only scored targets: 0.34802


In [69]:
X_test_postdeadline_torch = torch.tensor(X_test_postdeadline)
Adj_test_postdeadline_torch = torch.tensor(Adj_test_postdeadline)
y_postdeadline_pred = run_prediction(X_test_postdeadline_torch, Adj_test_postdeadline_torch)
y_postdeadline_pred = y_postdeadline_pred.astype(np.float32)
# Post deadline data only includes the scored targets. Remove the two unscored targets from predictions when calculating score
y_postdeadline_pred =  y_postdeadline_pred[:,:,:3]# Remove deg_pH10 and deg_50C (unscored targets, to match post deadline test data)

# Calculate score on post deadline test data
y_test_postdeadline_torch = torch.tensor(y_test_postdeadline)
y_postdeadline_pred_torch = torch.tensor(y_postdeadline_pred)
postdeadline_score_only_scored = MCRMSE(y_test_postdeadline_torch, y_postdeadline_pred_torch, only_scored=True, data=test_postdeadline)

print(f"Mean score on private test data, only scored targets: {float(torch.mean(postdeadline_score_only_scored)):.5}")

# Print number of trainable parameters
print("Number of trainable model parameters: ", sum(p.numel() for p in model.parameters() if p.requires_grad))

# Test score
# 5 layers

Mean score on private test data, only scored targets: 0.43396
Number of trainable model parameters:  2629


## Results for 3-layer GCN 

In [116]:
# Run prediction on training data as a test run
y_train_pred = run_prediction(X_torch, Adj_torch)
y_train_pred = y_train_pred.astype(np.float32)

# Calculate score on training data
y_train_torch = torch.tensor(y_train)
y_train_pred_torch = torch.tensor(y_train_pred)
training_score = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=False)
training_score_only_scored = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=True)

print(f"Mean score on training data, all 5 targets: {float(torch.mean(training_score)):.5}")
print(f"Mean score on training data, only scored targets: {float(torch.mean(training_score_only_scored)):.5}")

# Training score
# 3 layers

Mean score on training data, all 5 targets: 0.30659
Mean score on training data, only scored targets: 0.31539


In [117]:
X_test_postdeadline_torch = torch.tensor(X_test_postdeadline)
Adj_test_postdeadline_torch = torch.tensor(Adj_test_postdeadline)
y_postdeadline_pred = run_prediction(X_test_postdeadline_torch, Adj_test_postdeadline_torch)
y_postdeadline_pred = y_postdeadline_pred.astype(np.float32)
# Post deadline data only includes the scored targets. Remove the two unscored targets from predictions when calculating score
y_postdeadline_pred =  y_postdeadline_pred[:,:,:3]# Remove deg_pH10 and deg_50C (unscored targets, to match post deadline test data)

# Calculate score on post deadline test data
y_test_postdeadline_torch = torch.tensor(y_test_postdeadline)
y_postdeadline_pred_torch = torch.tensor(y_postdeadline_pred)
postdeadline_score_only_scored = MCRMSE(y_test_postdeadline_torch, y_postdeadline_pred_torch, only_scored=True, data=test_postdeadline)

print(f"Mean score on private test data, only scored targets: {float(torch.mean(postdeadline_score_only_scored)):.5}")

# Print number of trainable parameters
print("Number of trainable model parameters: ", sum(p.numel() for p in model.parameters() if p.requires_grad))

# Test score
# 3 layers

Mean score on private test data, only scored targets: 0.41284
Number of trainable model parameters:  15109


## Results for 5-layer GCN 

In [140]:
# Run prediction on training data as a test run
y_train_pred = run_prediction(X_torch, Adj_torch)
y_train_pred = y_train_pred.astype(np.float32)

# Calculate score on training data
y_train_torch = torch.tensor(y_train)
y_train_pred_torch = torch.tensor(y_train_pred)
training_score = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=False)
training_score_only_scored = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=True)

print(f"Mean score on training data, all 5 targets: {float(torch.mean(training_score)):.5}")
print(f"Mean score on training data, only scored targets: {float(torch.mean(training_score_only_scored)):.5}")

# Training score
# 5 layers

Mean score on training data, all 5 targets: 0.30503
Mean score on training data, only scored targets: 0.31377


In [139]:
X_test_postdeadline_torch = torch.tensor(X_test_postdeadline)
Adj_test_postdeadline_torch = torch.tensor(Adj_test_postdeadline)
y_postdeadline_pred = run_prediction(X_test_postdeadline_torch, Adj_test_postdeadline_torch)
y_postdeadline_pred = y_postdeadline_pred.astype(np.float32)
# Post deadline data only includes the scored targets. Remove the two unscored targets from predictions when calculating score
y_postdeadline_pred =  y_postdeadline_pred[:,:,:3]# Remove deg_pH10 and deg_50C (unscored targets, to match post deadline test data)

# Calculate score on post deadline test data
y_test_postdeadline_torch = torch.tensor(y_test_postdeadline)
y_postdeadline_pred_torch = torch.tensor(y_postdeadline_pred)
postdeadline_score_only_scored = MCRMSE(y_test_postdeadline_torch, y_postdeadline_pred_torch, only_scored=True, data=test_postdeadline)

print(f"Mean score on private test data, only scored targets: {float(torch.mean(postdeadline_score_only_scored)):.5}")

# Print number of trainable parameters
print("Number of trainable model parameters: ", sum(p.numel() for p in model.parameters() if p.requires_grad))

# Test score
# 5 layers

Mean score on private test data, only scored targets: 0.4087
Number of trainable model parameters:  15109


# MLP

In [None]:
# Construct node features and adjacency matrix for training data
print("Shapes of inputs - Train")

# Feature exctraction for the MLP model

# Node features
X = get_node_features(data = train)
X = X.astype(np.float32) # Convert to floats to prepare for torch model
print("Node features X: (n_samples, n_nodes, n_node_features) ", X.shape)
Adj =  get_struct_adj(data = train, sequential_edges=True) # Not used for the MLP

In [160]:
# The simple MLP model using self-contribution and no neighbourhood aggregation
class MLP(torch.nn.Module):
    def __init__(self, hidden_channels, n_node_features = 11, n_edge_features = None):
        super(MLP, self).__init__()
        torch.manual_seed(12345) # For reproducible results
        self.hidden = Linear(n_node_features, hidden_channels)
        self.lin = Linear(hidden_channels, 5) # Map to the 5 output targets with dense layer
        self.relu = ReLU()

    def forward(self, x, Adj=None):
        # 1. Obtain node embeddings, use GraphConv layers with ReLU for non-linearity
        x = self.hidden(x)
        x = self.relu(x)

        # 2. Readout layer
        # No pooling is required, we want target labels for each node, not for the entire graph

        # 3. Apply a final classifier 
        # Use a single layer as classifier to map to the targets
        x = self.lin(x)

        # No LogSoftmax needed, possibly some other function to map to correct targets?

        return x

In [None]:
# Instantiate MLP model, optimizer and loss function
model = MLP(hidden_channels=64, n_edge_features = 0)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Adjust learning rate
criterion = MCRMSE # Mean column-wise root mean square error (MCRMSE) loss

# Define trainer function for GNN
def run_training(X_data, Adj_data, batch_size = 1, n_epochs = 1):
    model.train()
    for epoch in range(n_epochs):
        print(f"=== Starting epoch {epoch + 1} ===")
        # Get permutation of sample indices for shuffling
        permutation = torch.randperm(len(X_data))
        # permutation = range(len(X_data))
        # Define variable for printing training loss
        running_loss = 0.
        # Run training over mini-batches for current epoch
        for ix in range(0, len(X_data), batch_size):  # Iterate over samples in the training dataset
            batch_indices = permutation[ix:ix+batch_size] # Get shuffled indices for minibatch
            X_batch, Adj_batch = X_data[batch_indices], Adj_data[batch_indices] # Minibatch of X and Adj
            y_batch = y_train[batch_indices] # Minibatch of ground truths
            out = model(X_batch, Adj_batch) # Perform forward pass
            loss = criterion(torch.tensor(y_batch), out)  # Compute the loss
            # As the loss function is defined per sample, we have to reduce the loss for
            # each mini-batch to a singular value in some way.
            # Could for example use mean, sum, or random sample. This is a design choice.
            loss = torch.mean(loss) # Calculate average loss for minibatch
            loss.backward()  # Derive gradients
            optimizer.step()  # Update parameters based on gradients
            optimizer.zero_grad()  # Clear gradients
            
            # Print statistics every print_batch minibatches 
            print_batch = int(len(X_data)/10/batch_size) # Set to print every 1/10 of all samples
            running_loss += loss.item() # Add (average) loss from minibatch
            if int(ix/batch_size) % print_batch == 0 and ix != 0: # Ignore first minibatch
                print('[batch %5d, sample %5d] loss: %.3f' % 
                        (int(ix/batch_size), ix, 
                        running_loss / print_batch)) # Average running loss
                running_loss = 0. # Reset running loss

# Convert training data inputs to pytorch tensors and run training
X_torch = torch.tensor(X)
Adj_torch = torch.tensor(Adj)
run_training(X_torch, Adj_torch, batch_size = 16, n_epochs = 10)


In [None]:
# Define prediction function
def run_prediction(X_data, Adj_data):
    model.eval()
    y_pred = []
    outs = model(X_data, Adj_data) # Feed the data through the network 
    for yx in outs:
        y_pred.append(yx.detach().numpy())
    y_pred = np.array(y_pred)
    return y_pred

In [64]:
# Construct node features and adjacency matrix for post deadline test data
print("Shapes of inputs - Private test, post deadline")

# Node features
X_test_postdeadline = get_node_features(data = test_postdeadline)
X_test_postdeadline = X_test_postdeadline.astype(np.float32) # Convert to floats to prepare for torch model
print("Node features X: (n_samples, n_nodes, n_node_features) ", X_test_postdeadline.shape)
Adj_test_postdeadline =  get_struct_adj(data = test_postdeadline, sequential_edges=True) # Not used for the MLP

Shapes of inputs - Private test, post deadline
Node features X: (n_samples, n_nodes, n_node_features)  (2493, 130, 11)
Structure adjacency matrices: (n_samples, n_nodes, n_nodes)  (2493, 130, 130)
Distance adjacency matrices: (n_samples, n_nodes, n_nodes)  (2493, 130, 130)
Base pair probability adjacency matrices: (n_samples, n_nodes, n_nodes)  (2493, 130, 130)
Total adjacency matrix: (n_samples, n_nodes, n_nodes, n_edge_features)  (2493, 130, 130, 2)


In [52]:
# Run prediction on training data as a test run
y_train_pred = run_prediction(X_torch, Adj_torch)
y_train_pred = y_train_pred.astype(np.float32)

# Calculate score on training data
y_train_torch = torch.tensor(y_train)
y_train_pred_torch = torch.tensor(y_train_pred)
training_score = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=False)
training_score_only_scored = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=True)

print(f"Mean score on training data, all 5 targets: {float(torch.mean(training_score)):.5}")
print(f"Mean score on training data, only scored targets: {float(torch.mean(training_score_only_scored)):.5}")

# Training score
# MLP

Mean score on training data, all 5 targets: 0.33861
Mean score on training data, only scored targets: 0.34802


In [59]:
# Run prediction on test data
X_test_postdeadline_torch = torch.tensor(X_test_postdeadline)
Adj_test_postdeadline_torch = torch.tensor(Adj_test_postdeadline)
y_postdeadline_pred = run_prediction(X_test_postdeadline_torch, Adj_test_postdeadline_torch)
y_postdeadline_pred = y_postdeadline_pred.astype(np.float32)
# Post deadline data only includes the scored targets. Remove the two unscored targets from predictions when calculating score
y_postdeadline_pred =  y_postdeadline_pred[:,:,:3]# Remove deg_pH10 and deg_50C (unscored targets, to match post deadline test data)

# Calculate score on post deadline test data
y_test_postdeadline_torch = torch.tensor(y_test_postdeadline)
y_postdeadline_pred_torch = torch.tensor(y_postdeadline_pred)
postdeadline_score_only_scored = MCRMSE(y_test_postdeadline_torch, y_postdeadline_pred_torch, only_scored=True, data=test_postdeadline)

print(f"Mean score on private test data, only scored targets: {float(torch.mean(postdeadline_score_only_scored)):.5}")
# Test score
# MLP

Mean score on private test data, only scored targets: 0.47487


# SIGN

In [123]:
# Construct node features and adjacency matrix for training data
print("Shapes of inputs - Train")

# Feature extraction for the SIGN model
# A includes base pairs 
# B includes bpps
# D_i includes distance measure, with power i (use several i)

# Node features
X = get_node_features(data = train)
X = X.astype(np.float32) # Convert to floats to prepare for torch model
print("Node features X: (n_samples, n_nodes, n_node_features) ", X.shape)
# Structure adjacency 
Adj_pairs = get_struct_adj(data = train, sequential_edges=False)
print("Structure adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_pairs.shape)
# Distance adjacency
Adj_dist_1 = get_dist_adj(data = train, power = 1)
Adj_dist_2 = get_dist_adj(data = train, power = 2)
Adj_dist_3 = get_dist_adj(data = train, power = 3)
Adj_dist = np.concatenate([Adj_dist_1[:,:,None], Adj_dist_2[:,:,None], Adj_dist_3[:,:,None]], axis = 2)
Adj_dist = Adj_dist[None, :,:,:] # Expand the dimensions of the array to allow stacking matrices for all samples 
Adj_dist = np.repeat(Adj_dist, len(train), axis = 0) # Repeat the distance array for each sample (they are identical, simply to match the data shape)
print("Distance adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_dist.shape)
# Base pair probability adjacency
print("Base pair probability adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_bpps.shape)
# Concatenate adjacency matrices into one array along last dimension
Adj = np.concatenate([Adj_pairs[:,:,:,None], Adj_bpps[:,:,:,None], Adj_dist[:,:,:,:]], axis = 3) # Expand dimensions of adjacency matrices and stack along new dimension
Adj = Adj.astype(np.float32) # Convert to floats to prepare for torch model
print("Total adjacency matrix: (n_samples, n_nodes, n_nodes, n_edge_features) ", Adj.shape)

Shapes of inputs - Train
Node features X: (n_samples, n_nodes, n_node_features)  (1589, 107, 11)
Structure adjacency matrices: (n_samples, n_nodes, n_nodes)  (1589, 107, 107)
Distance adjacency matrices: (n_samples, n_nodes, n_nodes)  (1589, 107, 107, 2)
Base pair probability adjacency matrices: (n_samples, n_nodes, n_nodes)  (1589, 107, 107)
Total adjacency matrix: (n_samples, n_nodes, n_nodes, n_edge_features)  (1589, 107, 107, 4)


In [124]:
class myGraphConv_expand(torch.nn.Module):
    """
    Alternative Graph Convolution operation, used in the SIGN architecture.
    Here, all neighbourhood aggregators are applied in one layer and are then concatenated to a single vector.

    x'_i = [(Adj_1 x_i) W_1.T; (Adj_2 x_i) W_2.T;...]

    where [a;b;c] are the arrays a, b and c stacked contiguously
    """

    def __init__(self, in_channels, out_channels, n_edge_features):
        super(myGraphConv_expand, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels

        self.n_edge_features = n_edge_features # Get number of edge features (number of stacked adjacency matrices)

        self.lin_self = Linear(in_channels, out_channels, bias=True) # bias=False to match GraphConv? Check source code
        if self.n_edge_features >= 1:
            self.lin_1 = Linear(in_channels, out_channels, bias=True)  
        if self.n_edge_features >= 2:
            self.lin_2 = Linear(in_channels, out_channels, bias=True)
        if self.n_edge_features >= 3:
            self.lin_3 = Linear(in_channels, out_channels, bias=True)
        if self.n_edge_features >= 4:
            self.lin_4 = Linear(in_channels, out_channels, bias=True)
        if self.n_edge_features >= 5:
            self.lin_5 = Linear(in_channels, out_channels, bias=True)
        if self.n_edge_features >= 6:
            raise ValueError("Number of edge features can not be larger than 5") # "Hard code" up to 5 edge features

        self.reset_parameters()
   
    def reset_parameters(self):
        self.lin_self.reset_parameters()
        if self.n_edge_features >= 1:
            self.lin_1.reset_parameters()
        if self.n_edge_features >= 2:
            self.lin_2.reset_parameters()
        if self.n_edge_features >= 3:
            self.lin_3.reset_parameters()
        if self.n_edge_features >= 4:
            self.lin_4.reset_parameters()
        if self.n_edge_features >= 5:
            self.lin_5.reset_parameters()
        
        

    def forward(self, x, Adj):
        # Confirm that the input variable n_edge_features matches the adjacency matrix
        if self.n_edge_features != Adj.shape[-1]:
            raise ValueError("Specified number of edge features must match last dimensino in adjacency matrix") 

        # Calculate contribution from self (node)
        out = self.lin_self(x)

        # Add contributions from edges
        # Calculate contributions from adjacent nodes
        # Use separate weights for each edge feature
        if self.n_edge_features >= 1:
            out_1 = torch.matmul(Adj[..., 0], x) # This is equivalent to summing over edge weights assuming Adj contains the edge weights
            out_1 = self.lin_1(out_1) # Multiply with weight matrix for adjacent nodes
            out = torch.cat([out, out_1], axis=-1) # Add contribution from first edge feature
        # Repeat for all edge weights
        if self.n_edge_features >= 2:
            out_2 = torch.matmul(Adj[..., 1], x) 
            out_2 = self.lin_2(out_2) 
            out = torch.cat([out, out_2], axis=-1) # Add contribution from second edge feature
        if self.n_edge_features >= 3:
            out_3 = torch.matmul(Adj[..., 2], x)
            out_3 = self.lin_3(out_3) 
            out = torch.cat([out, out_3], axis=-1) # Add contribution from third edge feature
        if self.n_edge_features >= 4:
            out_4 = torch.matmul(Adj[..., 3], x) 
            out_4 = self.lin_4(out_4) 
            out = torch.cat([out, out_4], axis=-1) # Add contribution from fourth edge feature
        if self.n_edge_features >= 5:
            out_5 = torch.matmul(Adj[..., 4], x)
            out_5 = self.lin_5(out_5) 
            out = torch.cat([out, out_5], axis=-1) # Add contribution from fifth edge feature
        return out

    # The method that returns a printable representation of the operator, copy to match GraphConv source code 
    def __repr__(self):
        return '{}({}, {})'.format(self.__class__.__name__, self.in_channels,
                                   self.out_channels*(1+self.n_edge_features))

In [125]:
# The SIGN model
class GNN_expand(torch.nn.Module):
    def __init__(self, hidden_channels, n_edge_features, n_node_features = 11):
        super(GNN_expand, self).__init__()
        torch.manual_seed(12345) # For reproducible results
        self.conv = myGraphConv_expand(n_node_features, hidden_channels, n_edge_features)
        hidden_channels2 = 64
        self.lin2 = Linear(hidden_channels*(n_edge_features+1), hidden_channels2)#, n_edge_features
        self.lin = Linear(hidden_channels2, 5) # Map to the 5 output targets with dense layer
        self.relu = ReLU()

    def forward(self, x, Adj):
        # 1. Obtain node embeddings, use GraphConv layers with ReLU for non-linearity
        x = self.conv(x, Adj) # Give adjacency matrix instead of edge_index and edge_weight
        x = self.relu(x)

        # 2. Readout layer
        # No pooling is required, we want target labels for each node, not for the entire graph
        x = self.lin2(x)
        x = self.relu(x)
        # 3. Apply a final classifier 
        # Use a single layer as classifier to map to the targets
        x = self.lin(x)

        # No LogSoftmax needed, possibly some other function to map to correct targets?

        return x

In [126]:
# Instantiate SIGN model, optimizer and loss function
model = GNN_expand(hidden_channels=64, n_edge_features = Adj.shape[-1])
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Adjust learning rate
criterion = MCRMSE # Mean column-wise root mean square error (MCRMSE) loss

# Define trainer function for GNN
def run_training(X_data, Adj_data, batch_size = 1, n_epochs = 1):
    model.train()
    for epoch in range(n_epochs):
        print(f"=== Starting epoch {epoch + 1} ===")
        # Get permutation of sample indices for shuffling
        permutation = torch.randperm(len(X_data))
        # permutation = range(len(X_data))
        # Define variable for printing training loss
        running_loss = 0.
        # Run training over mini-batches for current epoch
        for ix in range(0, len(X_data), batch_size):  # Iterate over samples in the training dataset
            batch_indices = permutation[ix:ix+batch_size] # Get shuffled indices for minibatch
            X_batch, Adj_batch = X_data[batch_indices], Adj_data[batch_indices] # Minibatch of X and Adj
            y_batch = y_train[batch_indices] # Minibatch of ground truths
            out = model(X_batch, Adj_batch) # Perform forward pass
            loss = criterion(torch.tensor(y_batch), out)  # Compute the loss
            # As the loss function is defined per sample, we have to reduce the loss for
            # each mini-batch to a singular value in some way.
            # Could for example use mean, sum, or random sample. This is a design choice.
            loss = torch.mean(loss) # Calculate average loss for minibatch
            loss.backward()  # Derive gradients
            optimizer.step()  # Update parameters based on gradients
            optimizer.zero_grad()  # Clear gradients
            
            # Print statistics every print_batch minibatches 
            print_batch = int(len(X_data)/10/batch_size) # Set to print every 1/10 of all samples
            running_loss += loss.item() # Add (average) loss from minibatch
            if int(ix/batch_size) % print_batch == 0 and ix != 0: # Ignore first minibatch
                print('[batch %5d, sample %5d] loss: %.3f' % 
                        (int(ix/batch_size), ix, 
                        running_loss / print_batch)) # Average running loss
                running_loss = 0. # Reset running loss

# Convert training data inputs to pytorch tensors and run training
X_torch = torch.tensor(X)
Adj_torch = torch.tensor(Adj)
run_training(X_torch, Adj_torch, batch_size = 16, n_epochs = 10)


GNN_expand(
  (conv): myGraphConv_expand(11, 320)
  (lin2): Linear(in_features=320, out_features=64, bias=True)
  (lin): Linear(in_features=64, out_features=5, bias=True)
  (relu): ReLU()
)
=== Starting epoch 1 ===
[batch     9, sample   144] loss: 0.541
[batch    18, sample   288] loss: 0.446
[batch    27, sample   432] loss: 0.397
[batch    36, sample   576] loss: 0.372
[batch    45, sample   720] loss: 0.378
[batch    54, sample   864] loss: 0.366
[batch    63, sample  1008] loss: 0.361
[batch    72, sample  1152] loss: 0.371
[batch    81, sample  1296] loss: 0.366
[batch    90, sample  1440] loss: 0.370
[batch    99, sample  1584] loss: 0.362
=== Starting epoch 2 ===
[batch     9, sample   144] loss: 0.404
[batch    18, sample   288] loss: 0.377
[batch    27, sample   432] loss: 0.351
[batch    36, sample   576] loss: 0.347
[batch    45, sample   720] loss: 0.354
[batch    54, sample   864] loss: 0.344
[batch    63, sample  1008] loss: 0.338
[batch    72, sample  1152] loss: 0.353


In [None]:
# Define prediction function
def run_prediction(X_data, Adj_data):
    model.eval()
    y_pred = []
    outs = model(X_data, Adj_data) # Feed the data through the network 
    for yx in outs:
        y_pred.append(yx.detach().numpy())
    y_pred = np.array(y_pred)
    return y_pred

In [52]:
# Construct node features and adjacency matrix for post deadline test data
print("Shapes of inputs - Private test, post deadline")

# Version 3
# A includes base pairs 
# B includes bpps
# D_i includes distance measure, with power i (use several i)

# Node features
X_test_postdeadline = get_node_features(data = test_postdeadline)
X_test_postdeadline = X_test_postdeadline.astype(np.float32) # Convert to floats to prepare for torch model
print("Node features X: (n_samples, n_nodes, n_node_features) ", X_test_postdeadline.shape)
# Structure adjacency 
Adj_pairs_test_postdeadline = get_struct_adj(data = test_postdeadline, sequential_edges=False)
print("Structure adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_pairs_test_postdeadline.shape)
# Base pair probability adjacency
print("Base pair probability adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_bpps_test_postdeadline.shape)
# Distance adjacency
Adj_dist_1_pd = get_dist_adj(data = test_postdeadline, power = 1)
Adj_dist_2_pd = get_dist_adj(data = test_postdeadline, power = 2)
Adj_dist_3_pd = get_dist_adj(data = test_postdeadline, power = 3)
Adj_dist_test_postdeadline = np.concatenate([Adj_dist_1_pd[:,:,None], Adj_dist_2_pd[:,:,None], Adj_dist_3_pd[:,:,None]], axis = 2)
Adj_dist_test_postdeadline = Adj_dist_test_postdeadline[None, :,:,:] # Expand the dimensions of the array to allow stacking matrices for all samples 
Adj_dist_test_postdeadline = np.repeat(Adj_dist_test_postdeadline, len(test_postdeadline), axis = 0) # Repeat the distance array for each sample (they are identical, simply to match the data shape)
print("Distance adjacency matrices: (n_samples, n_nodes, n_nodes) ", Adj_dist_test_postdeadline.shape)
# Concatenate adjacency matrices into one array along last dimension
Adj_test_postdeadline = np.concatenate([Adj_pairs_test_postdeadline[:,:,:,None], Adj_bpps_test_postdeadline[:,:,:,None], Adj_dist_test_postdeadline[:,:,:,:]], axis = 3) # Expand dimensions of adjacency matrices and stack along new dimension
Adj_test_postdeadline = Adj_test_postdeadline.astype(np.float32) # Convert to floats to prepare for torch model
print("Total adjacency matrix: (n_samples, n_nodes, n_nodes, n_edge_features) ", Adj_test_postdeadline.shape)

Shapes of inputs - Private test, post deadline


NameError: name 'test_postdeadline' is not defined

In [166]:
# Run prediction on training data as a test run
y_train_pred = run_prediction(X_torch, Adj_torch)
y_train_pred = y_train_pred.astype(np.float32)

# Calculate score on training data
y_train_torch = torch.tensor(y_train)
y_train_pred_torch = torch.tensor(y_train_pred)
training_score = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=False)
training_score_only_scored = MCRMSE(y_train_torch, y_train_pred_torch, only_scored=True)

print(f"Mean score on training data, all 5 targets: {float(torch.mean(training_score)):.5}")
print(f"Mean score on training data, only scored targets: {float(torch.mean(training_score_only_scored)):.5}")

# Training score
# SIGN

Mean score on training data, all 5 targets: 0.3076
Mean score on training data, only scored targets: 0.31642


In [167]:
X_test_postdeadline_torch = torch.tensor(X_test_postdeadline)
Adj_test_postdeadline_torch = torch.tensor(Adj_test_postdeadline)
y_postdeadline_pred = run_prediction(X_test_postdeadline_torch, Adj_test_postdeadline_torch)
y_postdeadline_pred = y_postdeadline_pred.astype(np.float32)
# Post deadline data only includes the scored targets. Remove the two unscored targets from predictions when calculating score
y_postdeadline_pred =  y_postdeadline_pred[:,:,:3]# Remove deg_pH10 and deg_50C (unscored targets, to match post deadline test data)

# Calculate score on post deadline test data
y_test_postdeadline_torch = torch.tensor(y_test_postdeadline)
y_postdeadline_pred_torch = torch.tensor(y_postdeadline_pred)
postdeadline_score_only_scored = MCRMSE(y_test_postdeadline_torch, y_postdeadline_pred_torch, only_scored=True, data=test_postdeadline)

print(f"Mean score on private test data, only scored targets: {float(torch.mean(postdeadline_score_only_scored)):.5}")

# Print number of trainable parameters
print("Number of trainable model parameters: ", sum(p.numel() for p in model.parameters() if p.requires_grad))
# Test score
# SIGN

Mean score on private test data, only scored targets: 0.41776
Number of trainable model parameters:  29573
