In [7]:
#%pip install biopython
#%pip install scikit-learn
#%pip install pandas
#%pip install numpy
#%pip install torch
#%pip install torchmetrics
%pip install lightning

Collecting lightning
  Downloading lightning-2.2.0.post0-py3-none-any.whl.metadata (56 kB)
     ---------------------------------------- 0.0/56.8 kB ? eta -:--:--
     ---------------------------------------- 56.8/56.8 kB 2.9 MB/s eta 0:00:00
Downloading lightning-2.2.0.post0-py3-none-any.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   -------- ------------------------------- 0.5/2.1 MB 14.2 MB/s eta 0:00:01
   ---------------------------- ----------- 1.5/2.1 MB 18.5 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 16.3 MB/s eta 0:00:00
Installing collected packages: lightning
Successfully installed lightning-2.2.0.post0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import os 
import numpy as np
import pandas as pd
import scipy
import sklearn.metrics as skmetrics

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import lightning as L

import torchmetrics
from torchmetrics.regression import PearsonCorrCoef

In [None]:
dataset_train.df

In [57]:
from Bio.PDB import PDBList, PDBParser, MMCIFParser

def downloadStructure(name):
    pdb_id = name

    pdb_file = PDBList().retrieve_pdb_file(pdb_id)

    return MMCIFParser().get_structure(pdb_id, pdb_file)


In [29]:
from Bio.PDB import NeighborSearch
from Bio.Data.IUPACData import protein_letters_3to1

def removeHeteroatoms(structure):
    model = structure[0]
    chains = model.get_chains()
    chain = next(iter(chains))
    residues = chain.get_residues()
    #print(next(iter(residues)))

    #remove all heteroatoms from chain
    heteroatoms = []
    for residue in chain:
        if(residue.id[0] != ' '):
            #remove residue from chain
            heteroatoms.append(residue.id)

    for heteroatom in heteroatoms:
        chain.detach_child(heteroatom)

    names = []
    #print all residues
    for residue in chain:
        names.append(residue.get_resname())

    string = ''.join([protein_letters_3to1[aa.get_resname()[0]+aa.get_resname()[1:].lower()] for aa in chain])

    return chain, string


In [30]:
dataset_train.df["wt_seq"][0][0]

'SPEVQIAILTEQINNLNEHLRVHKKDHHSRRGLLKMVGKRRRLLAYLRNKDVARYREIVEKLG'

In [91]:
import numpy as np
from Bio.Data.IUPACData import protein_letters_3to1

def getNClosestAAs(chain, index, nClosestAAs = 10, maxRadius = 20.0):
    #distance matix
    # Create a list of all atoms in the structure
    atoms = [atom for atom in chain.get_atoms()]
    
    # Create a NeighborSearch object
    ns = NeighborSearch(atoms)

    residue = chain[index]

    # Calculate the distance between the residues
    neighbors = ns.search(residue["CA"].coord, level="R", radius=maxRadius)
    distances = [(protein_letters_3to1[aa.get_resname()[0]+aa.get_resname()[1:].lower()], residue["CA"]-aa["CA"]) for aa in neighbors]
    #sort by distance
    distances.sort(key=lambda x: x[1])
    # Ensure distance is always nClosestAAs large, if too short, fill with 0
    if len(distances) < nClosestAAs:
        distances.extend([("0", 0)] * (nClosestAAs - len(distances)))
    else:
        distances = distances[:nClosestAAs]

    return distances




In [194]:
aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY' # amino acid alphabet
aa_to_int = {aa: i for i, aa in enumerate(aa_alphabet)} # mapping from amino acid to number

def one_hot_encode(sequence):
    # initialize a zero matrix of shape (len(sequence), len(amino_acids))
    one_hot = torch.zeros(len(sequence), len(aa_alphabet))
    if sequence != "0":
        for i, aa in enumerate(sequence):
            # set the column corresponding to the amino acid to 1
            one_hot[i].scatter_(0, torch.tensor([aa_to_int[aa]]), 1)
    return one_hot

In [144]:
dataset_train.df["mut_type"][0][19*0+0]
#dataset_train.df["mut_type"][0][1194], dataset_train.df["mut_type"][0][-1]


'S1Q'

In [205]:
def getTensorFromProtein(proteinName, wt_seq, mut_types, nClosestAAs = 5):

    ###for each protein
    structure = downloadStructure(proteinName[:4])

    chain, string = removeHeteroatoms(structure)

    print(removeHeteroatoms(structure))
    #wt_seq = dataset_train.df["wt_seq"][0][proteinIndex]

    matching = (string.find(wt_seq),string.find(wt_seq)+len(wt_seq))
    print(matching)




    

    #amount of proteins, len of matching sequence, amount of amino acids, amount of closest AAs, one hot encoding of amino acid + distance
    #len(wt_seq), len(aa_alphabet)
    aaDistances = np.zeros((matching[1]-matching[0]+1, len(aa_alphabet), nClosestAAs, len(aa_alphabet)+1))

    # Iterate over each residue in the chain
    #for index, residue in enumerate(chain):
    for residueIndex in range(len(wt_seq)):#matching[1]-matching[0]):
        distances = getNClosestAAs(chain, matching[0]+residueIndex, nClosestAAs = nClosestAAs)




        #change wt residue to mutation
        for mutationIndex in range((len(aa_alphabet)-1)-2): #19 because 20 amino acids, -1 because wt is already in the list
                ###for each mutation
                #print(residueIndex*(len(aa_alphabet)-1)+mutationIndex)
                mut_type = mut_types[residueIndex*(len(aa_alphabet)-1)+mutationIndex]
                mutation_pos = mut_type[1:-1]
                #print("mut_type:" + str(mut_type))
                #print("mutation_pos:" + str(mutation_pos))
                #print(mut_type)
                newDist = distances
                newDist[0] = (mut_type[-1], 0.0) #mutated AA, distance to itself is 0

                for idx, aa in enumerate(newDist):
                    #print(aa[0],aa[1])
                    #print(proteinIndex,residueIndex,mutationIndex,idx)
                    aaDistances[residueIndex][mutationIndex][idx] = np.append(one_hot_encode(aa[0]),aa[1])
                #for idx, distance in enumerate(distances):
                    
        #aaDistances[residueIndex] = [np.append(one_hot_encode(aa[0]),aa[1]) for aa in distances]
        
    #print(distances, len(distances))
    return aaDistances

In [196]:
#torch.from_numpy(aaDistances)

In [213]:


# sequence data, comes already batched, so treat accordingly in dataloader (batch_size=1)
class SequenceData(Dataset):
    def __init__(self, csv_file, label_col="ddG_ML"):
        """
        Initializes the dataset. 
        input:
            csv_file: path to the relevant data file, eg. "/home/data/mega_train.csv"
        """
        
        self.min_size = 72
        
        self.df = pd.read_csv(csv_file, sep=",")
        self.label_col = label_col
        # only have mutation rows
        self.df = self.df[self.df.mut_type!="wt"]
        # process the mutation row
        self.df["mutation_pos"] = self.df["mut_type"].apply(lambda x: int(x[1:-1])-1) # make position start at zero
        self.df["mutation_to"] = self.df["mut_type"].apply(lambda x: aa_to_int[x[-1]]) # give numerical label to mutation

        # group by wild type
        self.df = self.df.groupby("WT_name").agg(list)
        # get wild type names
        self.wt_names = self.df.index.values
        # precompute one-hot encoding for faster training
        self.encoded_seqs = {}
        for wt_name in self.wt_names:
            # get the correct row
            mut_row = self.df.loc[wt_name]
            seq = mut_row["wt_seq"][0]
            self.encoded_seqs[wt_name] = one_hot_encode(seq)
            for i in range(self.min_size-len(self.encoded_seqs[wt_name])):
                #print(wt_name, i)
                self.encoded_seqs[wt_name] = torch.cat((self.encoded_seqs[wt_name], one_hot_encode("0")),0)
                

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx): #one batch = one protein
        # get the wild type name
        wt_name = self.wt_names[idx]
        # get the correct row
        mut_row = self.df.loc[wt_name]
        # get the wt sequence in one hot encoding
        #sequence_encoding = self.encoded_seqs[wt_name]

        # create mask and target tensors
        #mask = torch.zeros((1, len(sequence_encoding),20)) # will be 1 where we have a measurement
        #target = torch.zeros((1, len(sequence_encoding),20)) # ddg values
        # all mutations from df
        #positions = torch.tensor(mut_row["mutation_pos"])
        #amino_acids = torch.tensor(mut_row["mutation_to"])
        # get the labels
        labels = torch.tensor(mut_row[self.label_col])

        #for i in range(len(sequence_encoding)):
        #    mask[0,i,amino_acids[positions==i]] = 1 # one where we have data
        #    target[0,i,amino_acids[positions==i]] = labels[positions==i] # fill with ddG values
        
        # returns encoded sequence, mask and target sequence 
        wt_seq = self.df["wt_seq"][idx][0]
        mut_type = self.df["mut_type"][idx]
        nClosest = 5
        tensorLabels = torch.empty((len(wt_seq), len(aa_alphabet)))

        labelIndex = 0
        for label in labels:
            #TODO labelIndex is not correct
            tensorLabels[labelIndex][labelIndex] = torch.full((nClosest, len(aa_alphabet)+1), label)
            labelIndex += 1
        
        return {"sequence": getTensorFromProtein(wt_name,wt_seq,mut_type,nClosestAAs=nClosest), "labels": tensorLabels}

In [215]:
# usage
dataset_train = SequenceData('project_data/project_data/mega_train.csv')
dataset_val= SequenceData('project_data/project_data/mega_val.csv')
dataset_test = SequenceData('project_data/project_data/mega_test.csv')

# use batch_size=1 bc we treat each sequence as one batch
dataloader_val = DataLoader(dataset_val, batch_size=1, shuffle=False)
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

In [216]:
dataset_train.__getitem__(2)

RuntimeError: expand(torch.FloatTensor{[5, 21]}, size=[20]): the number of sizes provided (1) must be greater or equal to the number of dimensions in the tensor (2)

In [None]:
dataset_train.df["mut_type"][1]

In [204]:
print(len(dataset_train.df.loc["1A32.pdb"]["ddG_ML"]))
print(getTensorFromProtein("1A32.pdb", dataset_train.df["wt_seq"][0][0], dataset_train.df["mut_type"][0]).size)

1195
Structure exists: 'c:\Users\chtim\Documents\EigeneProjekte\Programmieren\Python\ProteinStabilityNN\a3\1a32.cif' 
(<Chain id=A>, 'LTQERKREIIEQFKVHENDTGSPEVQIAILTEQINNLNEHLRVHKKDHHSRRGLLKMVGKRRRLLAYLRNKDVARYREIVEKLGL')
(21, 84)




134400
