In [7]:
#%pip install biopython
#%pip install scikit-learn
#%pip install pandas
#%pip install numpy
#%pip install torch
#%pip install torchmetrics
%pip install lightning

Collecting lightning
  Downloading lightning-2.2.0.post0-py3-none-any.whl.metadata (56 kB)
     ---------------------------------------- 0.0/56.8 kB ? eta -:--:--
     ---------------------------------------- 56.8/56.8 kB 2.9 MB/s eta 0:00:00
Downloading lightning-2.2.0.post0-py3-none-any.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   -------- ------------------------------- 0.5/2.1 MB 14.2 MB/s eta 0:00:01
   ---------------------------- ----------- 1.5/2.1 MB 18.5 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 16.3 MB/s eta 0:00:00
Installing collected packages: lightning
Successfully installed lightning-2.2.0.post0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import os 
import numpy as np
import pandas as pd
import scipy
import sklearn.metrics as skmetrics

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import lightning as L

import torchmetrics
from torchmetrics.regression import PearsonCorrCoef

In [None]:
dataset_train.df

In [595]:
from Bio.PDB import PDBList, PDBParser, MMCIFParser
PDBList().get_all_entries()

def downloadStructure(name):
    pdb_id = name

    #check if file is available


    pdb_file = PDBList().retrieve_pdb_file(pdb_id, file_format='mmCif')

    if pdb_file is None:
        return None
    
    return MMCIFParser().get_structure(pdb_id, pdb_file)


Retrieving index file. Takes about 27 MB.


In [29]:
from Bio.PDB import NeighborSearch
from Bio.Data.IUPACData import protein_letters_3to1

def removeHeteroatoms(structure):
    model = structure[0]
    chains = model.get_chains()
    chain = next(iter(chains))
    residues = chain.get_residues()
    #print(next(iter(residues)))

    #remove all heteroatoms from chain
    heteroatoms = []
    for residue in chain:
        if(residue.id[0] != ' '):
            #remove residue from chain
            heteroatoms.append(residue.id)

    for heteroatom in heteroatoms:
        chain.detach_child(heteroatom)

    names = []
    #print all residues
    for residue in chain:
        names.append(residue.get_resname())

    string = ''.join([protein_letters_3to1[aa.get_resname()[0]+aa.get_resname()[1:].lower()] for aa in chain])

    return chain, string


In [30]:
dataset_train.df["wt_seq"][0][0]

'SPEVQIAILTEQINNLNEHLRVHKKDHHSRRGLLKMVGKRRRLLAYLRNKDVARYREIVEKLG'

In [572]:
import numpy as np
from Bio.Data.IUPACData import protein_letters_3to1

def getNClosestAAs(chain, index, nClosestAAs = 10, maxRadius = 20.0):
    #distance matix
    # Create a list of all atoms in the structure
    atoms = [atom for atom in chain.get_atoms()]
    
    # Create a NeighborSearch object
    ns = NeighborSearch(atoms)

    #print(next(iter(chain)))
    #print("get residue at index", index)
    residue = chain[index] #because numbering starts at 1
    #print(residue)

    # Calculate the distance between the residues
    neighbors = ns.search(residue["CA"].coord, level="R", radius=maxRadius)
    distances = [(protein_letters_3to1[aa.get_resname()[0]+aa.get_resname()[1:].lower()], residue["CA"]-aa["CA"]) for aa in neighbors]
    #sort by distance
    distances.sort(key=lambda x: x[1])
    # Ensure distance is always nClosestAAs large, if too short, fill with 0
    if len(distances) < nClosestAAs:
        distances.extend([("0", 0)] * (nClosestAAs - len(distances)))
    else:
        distances = distances[:nClosestAAs]

    return distances




In [573]:
aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY' # amino acid alphabet
aa_to_int = {aa: i for i, aa in enumerate(aa_alphabet)} # mapping from amino acid to number

def one_hot_encode(sequence):
    # initialize a zero matrix of shape (len(sequence), len(amino_acids))
    one_hot = torch.zeros(len(sequence), len(aa_alphabet))
    if sequence != "0":
        for i, aa in enumerate(sequence):
            # set the column corresponding to the amino acid to 1
            one_hot[i].scatter_(0, torch.tensor([aa_to_int[aa]]), 1)
    return one_hot

In [574]:
dataset_train.df["mut_type"][0][19*0+0]
#dataset_train.df["mut_type"][0][1194], dataset_train.df["mut_type"][0][-1]


'S1Q'

In [606]:
def getTensorFromProtein(proteinName, wt_seq, mut_types, nClosestAAs = 5):

    print("Getting " + proteinName + " tensor: " + wt_seq + " " + mut_types[0] + " " + mut_types[-1])
    ###for each protein
    structure = downloadStructure(proteinName[:4])

    chain, string = removeHeteroatoms(structure)

    print("Structure sequence: " + string)
    #wt_seq = dataset_train.df["wt_seq"][0][proteinIndex]

    matching = (string.find(wt_seq),string.find(wt_seq)+len(wt_seq))
    print(matching)

    matchingSeq_len = len(wt_seq) # = matching[1]-matching[0]

    print(len(chain))

    

    #len(wt_seq), len(aa_alphabet)
    
    # Iterate over each residue in the chain
    #for index, residue in enumerate(chain):

    #len of matching sequence, max amount of mutations at the position, amount of closest AAs, one hot encoding of amino acid 
    aaDistances = torch.zeros((matchingSeq_len, len(aa_alphabet), nClosestAAs, len(aa_alphabet)))

    for residueIndex in range(1, matchingSeq_len):
        
        distances = getNClosestAAs(chain, matching[0]+residueIndex, nClosestAAs = nClosestAAs)
        #change wt residue to mutation
        
        mut_toThatHaveData = [mut_type[-1] for mut_type in mut_types if str(residueIndex+matching[0]) in mut_type]
        for mutationIndex in range(len(aa_alphabet)-1): #19 because 20 amino acids, -1 because wt is already in the list
               ###for each mutation
               #print(residueIndex*(len(aa_alphabet)-1)+mutationIndex)
            
            #check if mutation is at this position exists
            mut_to = aa_alphabet[mutationIndex]

            if(mut_toThatHaveData.__contains__(mut_to)):
                print("Mutation at position " + str(residueIndex) + " to " + mut_to + " in " + proteinName)
                #mut_type = mut_types[residueIndex*(len(aa_alphabet)-1)+mutationIndex]
                #mutation_pos = mut_type[1:-1]
               #print("mut_type:" + str(mut_type))
               #print("mutation_pos:" + str(mutation_pos))
               #print(mut_type)
                distances[0] = (mut_to, 0.0) #mutated AA, distance to itself is 0
                aaDistances[residueIndex][mutationIndex][0] = one_hot_encode(distances[0][0])*-1

                for idx, aa in enumerate(distances[1:], start=1):
                    # Rest of the code
                #print(aa[0],aa[1])
                #print(proteinIndex,residueIndex,mutationIndex,idx)
                    aaDistances[residueIndex][mutationIndex][idx] = one_hot_encode(aa[0])*aa[1]
               #for idx, distance in enumerate(distances):
            else:
                aaDistances[residueIndex][mutationIndex] = torch.zeros((nClosestAAs, len(aa_alphabet)))
                print("No data for mutation at position " + str(residueIndex) + " to " + mut_to + " in " + proteinName)
                   
       #aaDistances[residueIndex] = [np.append(one_hot_encode(aa[0]),aa[1]) for aa in distances]
       
   #print(distances, len(distances))
    return aaDistances

    #aaDistances = torch.empty(0)
    aaDistances = []
    distances = []
    oldMutationPos = -1

    for mutation in mut_types:
        mutation_pos = int(mutation[1:-1])
        mutatedFrom = mutation[0]
        mutatedTo = mutation[-1]
        print(mutation_pos)
        
        if(mutation_pos != oldMutationPos):
            distances = getNClosestAAs(chain, matching[0]+mutation_pos, nClosestAAs = nClosestAAs)
        
        oldMutationPos = mutation_pos

        distances[0] = (mutatedTo, 0.0) #mutated AA, distance to itself is 0

        #print(one_hot_encode(distances[0][0]))
        aaDistances.append(one_hot_encode(distances[0][0])*-1)
#        aaDistances = torch.cat((aaDistances,one_hot_encode(distances[0][0])*-1))
        #print(torch.flatten(torch.stack([one_hot_encode(aa[0])*aa[1] for aa in distances[1:]])))
        [aaDistances.append(one_hot_encode(aa[0])*aa[1]) for aa in distances[1:]]
#        aaDistances = torch.cat((aaDistances,torch.flatten(torch.stack([one_hot_encode(aa[0])*aa[1] for aa in distances[1:]]))))
        #aaDistances.append(np.append(one_hot_encode(aa[0]),aa[1]) for aa in distances)

    return aaDistances

In [607]:
wt_name = dataset_train.wt_names[1]
# get the correct row
mut_row = dataset_train.df.loc[wt_name]
print("mut_type_len =", len(mut_row["mut_type"]))
getTensorFromProtein(wt_name, mut_row["wt_seq"][0], mut_row["mut_type"])[0][0]

mut_type_len = 1287
Getting 1AOY.pdb tensor: QEELVKAFKALLKEEKFSSQGEIVAALQEQGFDNINQSKVSRMLTKFGAVRTRNAKMEMVYCLPAELGV Q1E V69C
Structure exists: 'c:\Users\chtim\Documents\EigeneProjekte\Programmieren\Python\ProteinStabilityNN\ao\1aoy.cif' 
Structure sequence: MRSSAKQEELVKAFKALLKEEKFSSQGEIVAALQEQGFDNINQSKVSRMLTKFGAVRTRNAKMEMVYCLPAELGVPTT
(6, 75)
78
Mutation at position 1 to A in 1AOY.pdb
Mutation at position 1 to C in 1AOY.pdb
Mutation at position 1 to D in 1AOY.pdb
Mutation at position 1 to E in 1AOY.pdb
Mutation at position 1 to F in 1AOY.pdb
Mutation at position 1 to G in 1AOY.pdb
Mutation at position 1 to H in 1AOY.pdb
Mutation at position 1 to I in 1AOY.pdb
Mutation at position 1 to K in 1AOY.pdb
Mutation at position 1 to L in 1AOY.pdb
Mutation at position 1 to M in 1AOY.pdb
Mutation at position 1 to N in 1AOY.pdb
Mutation at position 1 to P in 1AOY.pdb
Mutation at position 1 to Q in 1AOY.pdb
Mutation at position 1 to R in 1AOY.pdb
Mutation at position 1 to S in 1AOY.pdb
Mutation at p

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [596]:


# sequence data, comes already batched, so treat accordingly in dataloader (batch_size=1)
class SequenceData(Dataset):
    def __init__(self, csv_file, label_col="ddG_ML"):
        """
        Initializes the dataset. 
        input:
            csv_file: path to the relevant data file, eg. "/home/data/mega_train.csv"
        """
        
        self.min_size = 72
        
        self.df = pd.read_csv(csv_file, sep=",")
        self.label_col = label_col
        # only have mutation rows
        self.df = self.df[self.df.mut_type!="wt"]
        # process the mutation row
        self.df["mutation_pos"] = self.df["mut_type"].apply(lambda x: int(x[1:-1])-1) # make position start at zero
        self.df["mutation_to"] = self.df["mut_type"].apply(lambda x: aa_to_int[x[-1]]) # give numerical label to mutation

        # group by wild type
        self.df = self.df.groupby("WT_name").agg(list)
        # get wild type names
        self.wt_names = self.df.index.values
        # precompute one-hot encoding for faster training
        self.encoded_seqs = {}
        for wt_name in self.wt_names:
            # get the correct row
            
            #download structure
            structure = downloadStructure(wt_name[:4])
            if structure is None:
                print("No structure found for " + wt_name)
                #remove from df
                self.df = self.df.drop(wt_name)
                continue
           
            mut_row = self.df.loc[wt_name]
            seq_tensor = getTensorFromProtein(wt_name, mut_row["wt_seq"][0], mut_row["mut_type"])
            #seq = mut_row["wt_seq"][0]
            #self.encoded_seqs[wt_name] = one_hot_encode(seq)
            #for i in range(self.min_size-len(self.encoded_seqs[wt_name])):
            #    #print(wt_name, i)
            #    self.encoded_seqs[wt_name] = torch.cat((self.encoded_seqs[wt_name], one_hot_encode("0")),0)
                

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx): #one batch = one protein
        # get the wild type name
        wt_name = self.wt_names[idx]
        # get the correct row
        mut_row = self.df.loc[wt_name]
        # get the wt sequence in one hot encoding
        #sequence_encoding = self.encoded_seqs[wt_name]

        # create mask and target tensors
        #mask = torch.zeros((1, len(sequence_encoding),20)) # will be 1 where we have a measurement
        #target = torch.zeros((1, len(sequence_encoding),20)) # ddg values
        # all mutations from df
        #positions = torch.tensor(mut_row["mutation_pos"])
        #amino_acids = torch.tensor(mut_row["mutation_to"])
        # get the labels
        labels = torch.tensor(mut_row[self.label_col])
        sequence = torch.stack(getTensorFromProtein(wt_name, mut_row["wt_seq"][0], mut_row["mut_type"])).squeeze(1)
        #for i in range(len(sequence_encoding)):
        #    mask[0,i,amino_acids[positions==i]] = 1 # one where we have data
        #    target[0,i,amino_acids[positions==i]] = labels[positions==i] # fill with ddG values
        
        # returns encoded sequence, mask and target sequence 
#        wt_seq = self.df["wt_seq"][idx][0]
#        mut_type = self.df["mut_type"][idx]
#        nClosest = 5
#        tensorLabels = torch.empty((len(wt_seq), len(aa_alphabet), nClosest, len(aa_alphabet)+1))
#
#        #labelIndex = 0
#        for label in range(labels.size(0)):
#            #TODO labelIndex is not correct
#            posIndex = label//len(aa_alphabet)
#            mutIndex = label%len(aa_alphabet)
#            #print(posIndex, mutIndex)
#            #print(torch.full((nClosest, len(aa_alphabet)+1), labels[label].item()).shape)
#            #print(tensorLabels[posIndex][mutIndex].shape)
#            tensorLabels[posIndex][mutIndex] = torch.full((nClosest, len(aa_alphabet)+1), labels[label].item())
#            #labelIndex += 1
        
        return {"sequence": sequence, "labels": labels}

In [597]:
# usage
dataset_train = SequenceData('project_data/project_data/mega_train.csv')
dataset_val= SequenceData('project_data/project_data/mega_val.csv')
dataset_test = SequenceData('project_data/project_data/mega_test.csv')

# use batch_size=1 bc we treat each sequence as one batch
dataloader_val = DataLoader(dataset_val, batch_size=1, shuffle=False)
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

Structure exists: 'c:\Users\chtim\Documents\EigeneProjekte\Programmieren\Python\ProteinStabilityNN\a3\1a32.cif' 
Getting 1A32.pdb tensor: SPEVQIAILTEQINNLNEHLRVHKKDHHSRRGLLKMVGKRRRLLAYLRNKDVARYREIVEKLG S1Q G63C
Structure exists: 'c:\Users\chtim\Documents\EigeneProjekte\Programmieren\Python\ProteinStabilityNN\a3\1a32.cif' 
Structure sequence: LTQERKREIIEQFKVHENDTGSPEVQIAILTEQINNLNEHLRVHKKDHHSRRGLLKMVGKRRRLLAYLRNKDVARYREIVEKLGL
(21, 84)
85
tensor([-1., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0.])
tensor([-0., -1., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0.])
tensor([-0., -0., -1., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0.])
tensor([-0., -0., -0., -1., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0.])
tensor([-0., -0., -0., -0., -1., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0.,

KeyboardInterrupt: 

In [599]:
print(dataset_train.__getitem__(2)["sequence"].shape) # 5x618x1x20
print(dataset_train.__getitem__(2)["labels"].shape)

Getting 1E0L.pdb tensor: GATAVSEWTEYKTADGKTYYYNNRTLESTWEKPQELK G1Q K37C
Structure exists: 'c:\Users\chtim\Documents\EigeneProjekte\Programmieren\Python\ProteinStabilityNN\e0\1e0l.cif' 
Structure sequence: GATAVSEWTEYKTADGKTYYYNNRTLESTWEKPQELK
(0, 37)
37


KeyError: (' ', 0, ' ')

In [570]:
#dataloader_test.dataset
#batch = next(iter(dataloader_train))
x = batch["sequence"].shape
y = batch["labels"].shape

print(x, y)

torch.Size([1, 6780, 1, 20]) torch.Size([1, 1356])


In [558]:
# your code
class linModel(nn.Module):
    
    # Network Initialisation
    def __init__(self, params):
        
        super(linModel, self).__init__() #initialize parent pytorch module

        # read parameters
        inputSize = params["inputShape"]
        
        #print(inputShape)
        self.fc1 = nn.Linear(int(inputSize), 300)

        self.fc2 = nn.Linear(300, 1)

    def forward(self,X):
        # our network's forward pass
        
        X = F.tanh(self.fc1(X))

        X = self.fc2(X)

        return X

In [567]:
class LitMRIModel(L.LightningModule):
    def __init__(self, model, learning_rate=1e-3):
        super().__init__()
        ######## TODO ##########
        # pass our model 
        self.model = model
        #pass the learning rate
        self.lr = learning_rate
        # define loss function
        self.loss_function = nn.MSELoss() #TODO
        # define accuracy metric (torchmetrics)
        #self.accuracy = torchmetrics.classification.Accuracy(task="multiclass", num_classes=2)
        ########################

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.

        x = batch["sequence"][0]
        y = batch["labels"][0]

        # run data through model
        predictions = self.model(x)
        #print(predictions.shape)

        # compute loss
        loss = self.loss_function(predictions, y)
        ##############################

        # logging the values (will appear in progress bar and on dashboard)
        self.log("train_loss", loss, on_epoch=True, prog_bar=True)

        return loss

    def configure_optimizers(self):
        ############## TODO ################
        # define the optimizer, let's use Adam
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        ####################################
        return optimizer

    def test_step(self, batch, batch_idx):
        # this is the test loop

        ############### TODO #############
        # read from batch
        x = batch["sequence"][0]
        y = batch["labels"][0]

        # run data through model
        predictions = self.model(x)
        #print(predictions.shape)

        # compute loss
        loss = self.loss_function(predictions, y)
        ##############################

        # logging
        self.log("test_loss", loss, prog_bar=True)
        return loss#, acc


    def validation_step(self, batch, batch_idx):
        # this is the validation loop
        ############### TODO #############
        # read from batch
        x = batch["sequence"][0]
        y = batch["labels"][0]

        # run data through model
        predictions = self.model(x)
        #print(predictions.shape)

        # compute loss
        loss = self.loss_function(predictions, y)
        ##############################

        # logging
        self.log("val_loss", loss, on_epoch=True, prog_bar=True)
        #self.log("val_acc", acc, on_epoch=True, prog_bar=True)
        return loss 

In [568]:
# define parameters
# the last dimension of the input tensor will equal in_features of the linear layer
nClosestAAs = 5
seq_len = 20
params_model={
    "inputShape": nClosestAAs*len(aa_alphabet)*seq_len
}

# define computation hardware approach (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Instantiate the model
cnn_model = linModel(params_model)
# moves the model to GPU if available
cnn_model = cnn_model.to(device)

#channel, batch, weight
#inputShape (1, 1024, 768)

In [569]:
# train model
########## TODO #############
# instantiate lightning model with the cnn_model and learning_rate=1e-3
model = LitMRIModel(cnn_model, learning_rate=1e-3)
############################

# instantiate the lightning trainer 
trainer = L.Trainer(max_epochs=20, log_every_n_steps=1)#, callbacks=[FineTuneLearningRateFinder(milestones=(5, 10))])
# train
trainer.fit(model, dataloader_train, dataloader_val)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type     | Params
-------------------------------------------
0 | model         | linModel | 600 K 
1 | loss_function | MSELoss  | 0     
-------------------------------------------
600 K     Trainable params
0         Non-trainable params
600 K     Total params
2.402     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\chtim\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Getting 1I6C.pdb tensor: KLPPGWEKRMSRSSGRVYYFNHITNASQWERPSGNSSSG K1Q G39C
Structure exists: 'c:\Users\chtim\Documents\EigeneProjekte\Programmieren\Python\ProteinStabilityNN\i6\1i6c.cif' 
Structure sequence: KLPPGWEKRMSRSSGRVYYFNHITNASQWERPSGNSSSG
(0, 39)
39
1
get residue at index 1
<Residue LYS het=  resseq=1 icode= >
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
get residue at index 2
<Residue LEU het=  resseq=2 icode= >
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
get residue at index 3
<Residue PRO het=  resseq=3 icode= >
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
4
get residue at index 4
<Residue PRO het=  resseq=4 icode= >
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
get residue at index 5
<Residue GLY het=  resseq=5 icode= >
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
6
get residue at index 6
<Residue TRP het=  resseq=6 icode= >
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
7
get residue at index 7
<Residue GLU het=  resseq=7 icode= >
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
8
get residue at index 8
<Residue LYS het=  resseq=8 icode= >
8
8


RuntimeError: mat1 and mat2 shapes cannot be multiplied (3430x20 and 2000x300)

In [372]:
dataset_train.df["wt_seq"][2][0]
len(dataset_train.df["mutation_to"][2])

len(dataset_train.df["ddG_ML"][2])

618

Getting 1AOY.pdb tensor: QEELVKAFKALLKEEKFSSQGEIVAALQEQGFDNINQSKVSRMLTKFGAVRTRNAKMEMVYCLPAELGV Q1E V69C
Structure exists: 'c:\Users\chtim\Documents\EigeneProjekte\Programmieren\Python\ProteinStabilityNN\ao\1aoy.cif' 




Structure sequence: MRSSAKQEELVKAFKALLKEEKFSSQGEIVAALQEQGFDNINQSKVSRMLTKFGAVRTRNAKMEMVYCLPAELGVPTT
(6, 75)


['s',
 [tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
  tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 3.8356,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000]]),
  tensor([[0.0000, 0.0000, 0.0000, 3.8357, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000]]),
  tensor([[0.0000, 0.0000, 0.0000, 5.1925, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000]]),
  tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           5.7218, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000]])],
 [tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
  tensor([[0.0000, 0.

In [373]:
dataset_train.df.loc["1E0L.pdb"]

Unnamed: 0      [200028, 200029, 200030, 200031, 200032, 20003...
name            [1E0L.pdb_G1Q, 1E0L.pdb_G1E, 1E0L.pdb_G1N, 1E0...
ddG_ML          [-0.134363487967859, -0.2192838778718004, -0.1...
mut_type        [G1Q, G1E, G1N, G1H, G1D, G1R, G1K, G1T, G1S, ...
aa_seq          [QATAVSEWTEYKTADGKTYYYNNRTLESTWEKPQELK, EATAVS...
wt_seq          [GATAVSEWTEYKTADGKTYYYNNRTLESTWEKPQELK, GATAVS...
mutation_pos    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
mutation_to     [13, 3, 11, 6, 2, 14, 8, 16, 15, 0, 10, 9, 17,...
Name: 1E0L.pdb, dtype: object

In [310]:
print(len(dataset_train.df["mut_type"][2]))
dataset_train.df["mut_type"][2]

618


['G1Q',
 'G1E',
 'G1N',
 'G1H',
 'G1D',
 'G1R',
 'G1K',
 'G1T',
 'G1S',
 'G1A',
 'G1M',
 'G1L',
 'G1V',
 'G1I',
 'G1W',
 'G1Y',
 'G1F',
 'G1P',
 'A2Q',
 'A2E',
 'A2N',
 'A2H',
 'A2D',
 'A2R',
 'A2K',
 'A2T',
 'A2S',
 'A2G',
 'A2M',
 'A2L',
 'A2V',
 'A2I',
 'A2W',
 'A2Y',
 'A2F',
 'A2P',
 'T3Q',
 'T3E',
 'T3N',
 'T3H',
 'T3D',
 'T3R',
 'T3K',
 'T3S',
 'T3A',
 'T3G',
 'T3M',
 'T3L',
 'T3V',
 'T3I',
 'T3W',
 'T3Y',
 'T3F',
 'T3P',
 'T3C',
 'A4Q',
 'A4E',
 'A4N',
 'A4H',
 'A4D',
 'A4R',
 'A4K',
 'A4T',
 'A4S',
 'A4G',
 'A4M',
 'A4L',
 'A4V',
 'A4I',
 'A4W',
 'A4Y',
 'A4F',
 'A4P',
 'V5Q',
 'V5E',
 'V5N',
 'V5H',
 'V5D',
 'V5R',
 'V5K',
 'V5T',
 'V5S',
 'V5A',
 'V5G',
 'V5M',
 'V5L',
 'V5I',
 'V5W',
 'V5Y',
 'V5F',
 'V5P',
 'S6Q',
 'S6E',
 'S6N',
 'S6H',
 'S6D',
 'S6R',
 'S6K',
 'S6T',
 'S6A',
 'S6G',
 'S6M',
 'S6L',
 'S6V',
 'S6I',
 'S6W',
 'S6Y',
 'S6F',
 'S6P',
 'S6C',
 'E7Q',
 'E7N',
 'E7H',
 'E7D',
 'E7R',
 'E7K',
 'E7T',
 'E7S',
 'E7A',
 'E7G',
 'E7M',
 'E7L',
 'E7V',
 'E7I',
 'E7W',


In [309]:
print(len(dataset_train.df.loc["1E0L.pdb"]["ddG_ML"]))
print(getTensorFromProtein("1E0L.pdb", dataset_train.df["wt_seq"][0][0], dataset_train.df["mut_type"][0]).size)

618
Getting 1E0L.pdb tensor: SPEVQIAILTEQINNLNEHLRVHKKDHHSRRGLLKMVGKRRRLLAYLRNKDVARYREIVEKLG S1Q G63C
Structure exists: 'c:\Users\chtim\Documents\EigeneProjekte\Programmieren\Python\ProteinStabilityNN\e0\1e0l.cif' 
Structure sequence: GATAVSEWTEYKTADGKTYYYNNRTLESTWEKPQELK
(-1, 62)




KeyError: (' ', 0, ' ')