# Implementation of Utility functions required for Protein Structure Prediction

## Importing required Libraries

In [1]:
import collections
import math
import os
from datetime import datetime
import torch
import torch.utils.data
import torch.nn.functional as F
import h5py
import PeptideBuilder
import Bio.PDB
from Bio.Data.IUPACData import protein_letters_1to3
import numpy as np
from torch.nn.utils.rnn import pad_sequence

### Defining Amino Acid Dictionary and PyTorch Tensor

In [2]:
# Maps each of the 20 standard one-letter amino-acid codes (in alphabetical
# order) to an integer id starting at 1.
# NOTE(review): id 0 is unused here - presumably reserved for padding; confirm
# against the data preprocessing.
AA_ID_DICT = {
    symbol: index
    for index, symbol in enumerate("ACDEFGHIKLMNPQRSTVWY", start=1)
}

# Pi as a one-element tensor; the literal is truncated to six decimal places.
PI_TENSOR = torch.tensor([3.141592])

In [3]:
PI_TENSOR

tensor([3.1416])

### Function to create a PyTorch DataLoader for Loading the Protein data from the database

In [4]:
def contruct_dataloader_from_disk(filename, minibatch_size):
    """Build a shuffling DataLoader over the proteins stored in a file.

    NOTE: the misspelled name ("contruct") is kept as-is because it is the
    public name callers use.

    Args:
        filename: Path handed to ``H5PytorchDataset``.
        minibatch_size: Number of samples per mini-batch.

    Returns:
        A ``torch.utils.data.DataLoader`` that shuffles the dataset and
        collates every batch with ``merge_samples_to_minibatch``.
    """
    dataset = H5PytorchDataset(filename)
    loader_options = {
        "batch_size": minibatch_size,
        "shuffle": True,
        "collate_fn": merge_samples_to_minibatch,
    }
    return torch.utils.data.DataLoader(dataset, **loader_options)

### Class to access the Protein database as a map-style Dataset for the DataLoader class

In [6]:
class H5PytorchDataset(torch.utils.data.Dataset):
    """Map-style dataset that reads protein samples from an HDF5 file.

    The file is expected to provide the datasets ``'primary'``, ``'mask'`` and
    ``'tertiary'``, each indexed by protein along the first axis.
    """

    def __init__(self, filename):
        """Open ``filename`` read-only and record the dataset dimensions.

        Args:
            filename: Path of the HDF5 file to open.
        """
        super().__init__()

        # The handle stays open for the lifetime of the dataset object so
        # that __getitem__ can read samples on demand.
        self.h5pyfile = h5py.File(filename, 'r')
        primary_shape = self.h5pyfile['primary'].shape
        self.num_proteins, self.max_sequence_len = primary_shape

    def __getitem__(self, index):
        """Return the ``(prim, tertiary, mask)`` triple for one protein.

        Args:
            index: Position of the protein along the first axis.

        Returns:
            prim: int32 tensor holding the ``'primary'`` entries at positions
                where the mask is set.
            tertiary: The first ``mask.sum()`` rows of the protein's
                ``'tertiary'`` entry.
            mask: Boolean tensor over the full stored row.
        """
        store = self.h5pyfile
        mask = torch.Tensor(store['mask'][index, :]).bool()
        padded_primary = torch.Tensor(store['primary'][index, :]).int()
        prim = torch.masked_select(padded_primary, mask)
        # NOTE(review): this keeps the *leading* rows rather than the masked
        # rows - assumes the stored tertiary data is already packed to the
        # front; confirm against the preprocessing step.
        valid_count = int(mask.sum())
        tertiary = torch.Tensor(store['tertiary'][index][:valid_count])
        return prim, tertiary, mask

    def __len__(self):
        """Return the number of proteins in the file."""
        return self.num_proteins

### Function to merge the Protein Samples into a Mini Batch for Mini-Batch Gradient Descent Algorithm

In [7]:
def merge_samples_to_minibatch(samples):
    """Collate samples into per-field groups, longest sequence first.

    Args:
        samples: Iterable of samples; the length of each sample's first
            element is used as the sort key.

    Returns:
        A ``zip`` iterator yielding one tuple per sample field, with the
        samples ordered by descending length of their first element. Samples
        of equal length keep their incoming order. The input is not modified.
    """
    longest_first = sorted(
        samples,
        key=lambda sample: len(sample[0]),
        reverse=True,
    )
    # Transpose: a list of (field0, field1, ...) samples becomes one tuple
    # per field.
    return zip(*longest_first)

### Function to set the Experiment Id of the current run (built from the timestamp, process id, dataset identifier, learning rate and mini-batch size)

In [8]:
def set_experiment_id(data_set_identifier, learning_rate, minibatch_size):
    """Build an experiment id and store it in the module-level ``experiment_id``.

    The id has the form
    ``<timestamp>-<pid>-<data_set_identifier>-LR<learning_rate>-MB<minibatch_size>``
    where dots in the learning rate are replaced by underscores.

    Args:
        data_set_identifier: String naming the data set.
        learning_rate: Learning rate; rendered with ``str``.
        minibatch_size: Mini-batch size; rendered with ``str``.
    """
    global experiment_id

    timestamp = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    process_tag = str(os.getpid())
    # Dots are swapped for underscores in the learning-rate part of the id.
    learning_rate_tag = "LR" + str(learning_rate).replace(".", "_")
    minibatch_tag = "MB" + str(minibatch_size)

    experiment_id = "-".join(
        [timestamp, process_tag, data_set_identifier, learning_rate_tag, minibatch_tag]
    )

### Function to get the Experiment Id of the current run

In [9]:
def get_experiment_id():
    """Return the module-level ``experiment_id``, or ``None`` if it is unset."""
    module_namespace = globals()
    if "experiment_id" in module_namespace:
        return module_namespace["experiment_id"]
    return None

### Function to save the model into an output file

In [10]:
def write_model_to_disk(model):
    """Save ``model`` with ``torch.save`` under ``output/models/``.

    The file is named ``<experiment_id>.model`` using the module-level
    ``experiment_id``. The target directory is created if it does not exist.

    Args:
        model: Object to serialize with ``torch.save``.

    Returns:
        The relative path the model was written to.

    Raises:
        TypeError: If no experiment id has been set (the stored value is
            ``None`` and cannot be concatenated into the path).
    """
    path = "output/models/" + globals().get("experiment_id") + ".model"
    # Make sure the directory exists so saving does not fail on a fresh
    # working directory.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(model, path)
    return path

### Function to write the prediction data into a file

In [11]:
def write_prediction_data_to_disk(prediction_data):
    """Write ``prediction_data`` to ``output/predictions/<experiment_id>.txt``.

    The file is overwritten if it already exists, and the target directory is
    created if it is missing.

    Args:
        prediction_data: Text to write.

    Raises:
        TypeError: If no experiment id has been set (the stored value is
            ``None`` and cannot be concatenated into the path).
    """
    filepath = "output/predictions/" + globals().get("experiment_id") + ".txt"
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # The context manager closes the handle even if the write raises.
    with open(filepath, 'w') as output_file:
        output_file.write(prediction_data)

### Function to write the summary of the experiment into a file

In [12]:
def write_result_summary(accuracy):
    """Append ``<experiment_id>: <accuracy>`` to the summary file and print it.

    The line is appended to ``output/result_summary.txt``; the ``output``
    directory is created if it is missing. The same line is echoed to stdout.

    Args:
        accuracy: Value to record; rendered with ``str``.

    Raises:
        TypeError: If no experiment id has been set (the stored value is
            ``None`` and cannot be concatenated into the line).
    """
    output_string = globals().get("experiment_id") + ": " + str(accuracy) + "\n"
    os.makedirs("output", exist_ok=True)
    # Leaving the with-block flushes and closes the file, so no explicit
    # flush is needed before printing.
    with open("output/result_summary.txt", "a") as output_file:
        output_file.write(output_string)
    print(output_string, end="")

### Function to convert a sequence of amino acid ids into a list of one-letter amino acid symbols

In [13]:
def protein_id_to_str(protein_id_list):
    """Translate amino-acid ids into their one-letter symbols.

    Args:
        protein_id_list: Iterable whose elements expose ``.item()`` returning
            an id that is a value of ``AA_ID_DICT``.

    Returns:
        A list of one-letter symbols, one per input element (a list, despite
        the function's name).

    Raises:
        KeyError: If an id is not a value of ``AA_ID_DICT``.
    """
    # Invert the symbol -> id table so ids can be looked up directly.
    symbol_by_id = dict(zip(AA_ID_DICT.values(), AA_ID_DICT.keys()))
    return [symbol_by_id[residue.item()] for residue in protein_id_list]