## Info about training data:


Raw data is given in Numpy compressed file with an array of pdb chain ids (pdbids) and a 3 dimensional array of input output features:
- 1st dimension: samples
- 2nd dimension: sequence position
- 3rd dimension: input features

[0:20] Amino Acids (sparse encoding)

Unknown residues are stored as an all-zero vector

[20:50] hmm profile

[50] Seq mask (1 = seq, 0 = empty)

[51] Disordered mask (0 = disordered, 1 = ordered)

[52] Evaluation mask (For CB513 dataset, 1 = eval, 0 = ignore)
[53] ASA (isolated)

[54] ASA (complexed)

[55] RSA (isolated)

[56] RSA (complexed)

[57:65] Q8 GHIBESTC (Q8 -> Q3: HHHEECCC)

[65:67] Phi+Psi

[67] ASA_max

In [1]:
from numpy import load
import numpy as np
import pandas as pd

In [2]:
data = load('Train_MMseqs.npz', allow_pickle=True)
lst = data.files

In [3]:
import torch

letters = ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y"]

def Prep_data(Subset_name,Subset_points):
    Sequences = []
    ASA_i = []

    for i in range(len(Subset_name)):
        Seq = Subset_points[i]
        index = ""
        for j in range(100):
            try:
                index += str((letters[np.where(Seq[j,0:20] == 1)[0][0]])) + " "
            except:
                    index += "X" + " "

        t = torch.tensor(Subset_points[i][0:100,54])
        ASA_i.append(t.unsqueeze(-1)) 
        Sequences.append(index)
        
    d = {"Subset_name" : Subset_name,"Sequence" : Sequences}
    df = pd.DataFrame(data = d)
    
    return df,ASA_i

In [4]:
# subset
Subset_name = []
Subset_points = []

for i in range(10):
    Rands = np.random.randint(10848,size=50);
    Subset_name += list(data[lst[0]][Rands])
    Subset_points += list(data[lst[1]][Rands])

Subset_name = np.array(Subset_name)

KeyboardInterrupt: 

In [4]:
df_train,targs_train = Prep_data(Subset_name,Subset_points)
df_train.head()

Unnamed: 0,Subset_name,Sequence
0,2wp7-A,M E P P N L Y P V K L Y V Y D L S K G L A R R ...
1,5ayn-A,M K V Q S L L R I E T Q L L L G R L L T R S G ...
2,4r0s-B,M H R S P L A W L R L L L A A V L G A F L L G ...
3,1c0a-A,M R T E Y C G Q L R L S H V G Q Q V T L C G W ...
4,4jzp-B,S X L D V G N A E V K L E E E N R S L K A D L ...


Now the test data:

In [5]:
len(Subset_name)

500

In [6]:
df_train.to_excel("500_train.xlsx")

In [7]:
data_test = load('CASP12_MMseqs.npz', allow_pickle=True)
lst = data.files

In [8]:
Subset_name_test = []
Subset_points_test = []

for i in range(len(data_test[lst[0]])):
    Subset_name_test.append(data_test[lst[0]][i])
    Subset_points_test.append(data_test[lst[1]][i])

Subset_name_test = np.array(Subset_name_test)

In [9]:
df_test,targs_test = Prep_data(Subset_name_test,Subset_points_test)
df_test.head()

Unnamed: 0,Subset_name,Sequence
0,5a7d-L,S L R F T A S T S T P K S G S K I A K R G K K ...
1,5aot-A,M G A E E E D T A I L Y P F T I S G N D R N G ...
2,5d9g-A,G H X A S G P W K L T A S K T H I X K S A D V ...
3,5fhy-A,M G H H H H H H G G S E N L Y F Q G N E D I L ...
4,5g5n-A,G L P V P S P P G T L L P G Q S P D E A F A R ...


In [10]:
df_test.to_excel("CASP12_test.xlsx")

In [16]:
train_targ = torch.concat(targs_train[:-50],1)
valid_targ = torch.concat(targs_train[-50:],1)
test_targ = torch.concat(targs_test,1)

# list of concatenated targets (ASA_i)
targ_all = [train_targ,valid_targ,test_targ]

In [20]:
torch.save(targ_all,"targs_ASA")

In [2]:
! pip install ipywidgets rich seaborn torch datasets transformers tokenizers sentencepiece sacremoses --quiet

%matplotlib inline

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from torch import nn
import math
from functools import partial
from pathlib import Path
from tqdm import tqdm
import rich
from typing import List, Tuple, Optional, Dict, Any
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import transformers
import tokenizers
import datasets
import zipfile
from huggingface_hub import hf_hub_download

sns.set()

# define the device to use
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
rich.print(f"Device: [red]{DEVICE}")

# control verbosity
transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()

In [3]:
from transformers import BertModel, BertTokenizer
import re
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False)
model_embedder = BertModel.from_pretrained("Rostlab/prot_bert_bfd")

In [4]:
df_train = pd.read_excel("500_train.xlsx")

In [5]:
t_data = []
test_data = []

In [6]:
for seq in range(450,500):
    sequence_train = df_train["Sequence"][seq]
    encoded_input = tokenizer(sequence_train, return_tensors='pt')
    output = model_embedder(**encoded_input)
    t_data.append(output[0])

In [24]:
for test_seq in range(len(df_test)):
    sequence_test = df_test["Sequence"][test_seq]
    encoded_input = tokenizer(sequence_test, return_tensors='pt')
    output = model_embedder(**encoded_input)
    test_data.append(output[0])

In [7]:
len(t_data)

50

In [7]:
Old = torch.load('0_450_embedded')

In [8]:
Old.shape

torch.Size([450, 102, 1024])

In [9]:
New = torch.concat(t_data)

In [10]:
torch.cat((Old,New),0).shape

torch.Size([500, 102, 1024])

In [11]:
Final = torch.cat((Old,New),0)
torch.save(Final,"0_500_embedded")

In [49]:
train_data = torch.concat(t_data[:-5])
valid_data = torch.concat(t_data[-5:])
test_data = torch.concat(test_data)

# list of all embeddings for the train, valid and test.
data_embedded = [train_data,valid_data,test_data]

TypeError: concat(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor

In [59]:
# Saving them.
df_train.to_excel("df_train.xlsx")
torch.save(data_embedded,'data_embedded')
torch.save(targ_all,"targs_ASA")