# Data preparation

### Import libraries

In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import pickle

### Functions

In [2]:
def replace_all(text, dic):
    """Return ``text`` after applying every substitution listed in ``dic``.

    Each key of ``dic`` is replaced by its value, one pair after another in
    the dictionary's iteration order, so a later pair also acts on text
    produced by an earlier one.
    """
    replaced = text
    for old, new in dic.items():
        replaced = replaced.replace(old, new)
    return replaced


def string_to_float(a, dic=None):
    """Convert one line of whitespace-separated numbers into a list of floats.

    Parameters
    ----------
    a : str
        The text line to parse.
    dic : dict, optional
        Substitutions applied with ``replace_all`` before the line is split
        (used to strip labels and the trailing newline). When omitted, the
        module-level ``rep`` dictionary is used, so ``rep`` must be defined
        before the first call.

    Returns
    -------
    list of float

    Raises
    ------
    ValueError
        If a token left after the substitutions is not a valid float.
    """
    if dic is None:
        dic = rep
    cleaned = replace_all(a, dic)
    # split() with no argument tolerates repeated, leading and trailing
    # whitespace; split(' ') would yield '' tokens that float() rejects.
    return [float(token) for token in cleaned.split()]


def load_data(data_name):
    """Load and return the object pickled in ``<data_name>.pkl``.

    Parameters
    ----------
    data_name : str
        Path of the pickle file without the ``.pkl`` extension.

    Returns
    -------
    object
        Whatever object was stored in the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the file even if unpickling fails.
    with open(data_name + '.pkl', 'rb') as pkl_file:
        return pickle.load(pkl_file)

# Sequential data

### FASTA sequences, polarity and radicals

In [43]:
#  Read the FASTA file into one string with the newlines removed.
#  The parser below expects records of the form '$' + 4-character name + '%' + sequence.
with open('./train/train.fasta', 'r') as myfile:
    data = myfile.read().replace('\n', '')


#  Dictionaries that assign every amino acid to a class, following the classes from the wiki (polarity and radicals)
rad_dic = {'G': '0', 'L': '0', 'Y': '1', 'S': '2', 'E': '3', 'Q': '4', 'D': '3', 'N': '4', 'F': '1',
       'A': '0', 'K': '5', 'R': '5', 'H': '6', 'C': '7', 'V': '0', 'P': '6', 'W': '6', 'I': '0', 'M': '7', 'T': '2'}
pol_dic = {'G': '0', 'L': '0', 'Y': '1', 'S': '1', 'E': '2', 'Q': '1', 'D': '2', 'N': '1', 'F': '0',
        'A': '0', 'K': '1', 'R': '1', 'H': '3', 'C': '0', 'V': '0', 'P': '0', 'W': '0', 'I': '0', 'M': '0', 'T': '1'}


bad_prot = []       # names of proteins whose sequence holds a character missing from rad_dic
pdb_name = ''       # name taken from the most recent record header
seq = ''
mm = 0              # length of the longest sequence seen
fasta_records = []  # one dict per sequence; turned into a dataframe once, after the loop


#  Record protein name, fasta sequences, radical and polarity classes into dataframe
for i in range(len(data) - 1):

    # A '$' followed five characters later by '%' is a record header.
    # Slicing (instead of indexing) avoids an IndexError close to the end of the data.
    if data[i] == '$' and data[i+5:i+6] == '%':
        pdb_name = data[i+1:i+5]

    if data[i] == '%':
        # The sequence runs up to the next '$', or to the end of the data for the last record.
        end = data.find('$', i + 1)
        if end == -1:
            end = len(data)
        seq = data[i+1:end]

        # Flag the protein if any residue (including the very last one) is not a known amino acid.
        if any(residue not in rad_dic for residue in seq) and pdb_name not in bad_prot:
            bad_prot.append(pdb_name)

        mm = max(mm, len(seq))

        fasta_records.append({'pdb_name': pdb_name, 'FASTA': seq, 'Radical': replace_all(seq, rad_dic),
                              'Polarity': replace_all(seq, pol_dic)})


# Building the dataframe once avoids re-copying it for every record.
fasta_df = pd.DataFrame(fasta_records, columns=['pdb_name', 'FASTA', 'Radical', 'Polarity'])
print("Record FASTA df: OK")

Record FASTA df: OK


In [44]:
# Save the FASTA dataframe; the context manager closes the file even if pickling fails.
with open('fasta1.pkl', 'wb') as output:
    pickle.dump(fasta_df, output)
print("Record data: OK")

Record data: OK


### PSSM matrix

In [3]:
#  Read PSSM matrix and also record it into dataframe
with open('./train/train.pssm', 'r') as file:
    pssm = file.readlines()

# Substrings removed from every matrix row before it is parsed into floats:
# a letter followed by a space, and the trailing newline.
rep = {'G ':'', 'L ':'', 'Y ':'', 'S ':'', 'E ':'', 'Q ':'', 'D ':'', 'N ':'', 'F ':'', 
       'A ':'', 'K ':'', 'R ':'', 'H ':'', 'C ':'', 'V ':'', 'P ':'', 'W ':'', 'I ':'', 'M ':'', 'X ':'', 'T ':'', '\n':''}


# A line starting with '>' opens a new matrix; the lines that follow are its rows.
pssm_matrices = []
gd = None  # rows of the matrix being read; None until the first '>' line is seen

for line in pssm:

    if line.startswith('>'):
        if gd is not None:
            pssm_matrices.append(gd)
        gd = []

    elif gd is not None:
        # Lines that appear before the first '>' header belong to no matrix and are skipped.
        gd.append(string_to_float(line))

# Store the last matrix, which has no following '>' line to close it.
if gd is not None:
    pssm_matrices.append(gd)


# One row per matrix, in file order; each cell holds a list of float rows.
pssm_df = pd.DataFrame({'PSSM': pssm_matrices})
print("Record PSSM df: OK")

Record PSSM df: OK


In [47]:
# Save the PSSM dataframe; the context manager closes the file even if pickling fails.
with open('pssm1.pkl', 'wb') as output:
    pickle.dump(pssm_df, output)
print("Record data: OK")

Record data: OK


### Solvent accessibility and secondary structure classes

In [4]:
# Read solvent accessibility and secondary structure classes
with open('./train/train.acc', 'r') as file:
    acc = file.readlines()


with open('./train/train.ss', 'r') as file:
    ss = file.readlines()


# Only the odd-indexed lines (1, 3, 5, ...) are kept; the even-indexed lines are skipped.
# The same line index is used for both files, so they must be line-aligned.
ss_acc_records = [
    {'SS': ss[i].replace('\n', ''), 'ACC': acc[i].replace('\n', '')}
    for i in range(1, len(ss), 2)
]
ss_acc_df = pd.DataFrame(ss_acc_records, columns=['SS', 'ACC'])


print("Record SS and ACC df: OK")

Record SS and ACC df: OK


In [16]:
# Display the SS/ACC dataframe (columns 'SS' and 'ACC') as the cell output.
ss_acc_df

Unnamed: 0,SS,ACC
0,CECCHHHHHHHHHHCCCCCECCECHHHHHHHHHHHHCCECCCEEEC...,ee-eeee--e--ee-e-eeeee-e--------ee--e-e-e-eeee...
1,CECCHHHHHHHHHHCCCCCECCECHHHHHHHHHHHHCCECCCEEEE...,ee-ee-e--e--eeee-ee-ee-e--------ee--e-e-e-eeee...
2,CECCHHHHHHHHHHCCCCCECCECHHHHHHHHHHHHCCECCCEEEE...,ee-ee-e--e--eeee-ee-ee-e--------ee--e-e-e-eeee...
3,CECCHHHHHHHHHHCCCCCECCECHHHHHHHHHHHHCCECCCEEEC...,ee-eeee--e--ee-e-eeeee-e--------ee--e-e-e-eeee...
4,CCHHHHHHHHHCCEEEEEECCCCCEEEECCEEEECCCCHHHHHHHH...,ee-ee--ee--e-eee--eeeeee-------e--eeeeeee--ee-...
...,...,...
8784,CCCCCCCEEEECCCCCCCCCC,eeeee-------eeeee-eee
8785,CCCCEEEECCCECCCCECCHHHEEEECCCCCCCEHHHHHHHCCCEE...,eee-----eee-eeeee-eee--eeee-eee--e-e--ee-ee-e-...
8786,CCCCEEEEEEEECCCCEEEECCCCCEEEECCCHHHHHHCCCEEEEE...,ee-ee-------eee----eeeeee----e-eeeeeee-------e...
8787,CCCCEEEECCCECCCCECCHHHEEEECCCCCCCEHHHHHHHCCCEE...,eee-----eee-eeeee-eee--eeee-eee--e-e--ee-ee-e-...


In [49]:
# Save the SS/ACC dataframe; the context manager closes the file even if pickling fails.
with open('acc1.pkl', 'wb') as output:
    pickle.dump(ss_acc_df, output)
print("Record data: OK")

Record data: OK


# All to one

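Concatenate the FASTA, SS/ACC and PSSM dataframes into one table, then drop the "bad proteins": those whose sequence contains a character that is not one of the 20 amino acids in the class dictionaries.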

In [70]:
# NOTE(review): two_matrix is only referenced by the disabled check below — confirm it is still needed.
two_matrix = load_data('two_matrix_200')

# Put the three dataframes side by side (rows are matched on the index)
result = pd.concat([fasta_df, ss_acc_df, pssm_df], axis=1, sort=False)
print("Concatenate dfs: OK")

# Disabled variant of the filter, kept for reference:
# #  Delete proteins with untypical aminoacids in sequence or the ones with different sequence size
# bad = []

# for i in range(len(two_matrix)):
#     for j in range(len(result)):

#         if two_matrix[i][0] == result['pdb_name'][j]:

#             if len(two_matrix[i][2]) != len(result.FASTA[j]) or result.pdb_name.iloc[i] in bad_prot:
#                 bad.append(j)

# result = result.drop(bad)
# result = result.reset_index(drop=True)
# print("Drop bad proteins: OK")

# Collect the row numbers of the proteins flagged in bad_prot, then remove those rows
bad = [row for row, name in enumerate(result.pdb_name) if name in bad_prot]
result = result.drop(bad).reset_index(drop=True)
print("Drop bad proteins: OK")

Concatenate dfs: OK
Drop bad proteins: OK


In [64]:
# Remove the row labelled 927 and renumber the index from 0.
# NOTE(review): 927 is a hard-coded label with no explanation in the notebook — confirm why this row is dropped.
# The cell is not idempotent: once the index is renumbered, running it again removes whichever row is now labelled 927.
result = result.drop(index=927).reset_index(drop=True)

In [65]:
# (rows, columns) of the combined table
result.shape

(6812, 7)

In [73]:
# Write the names of the remaining proteins, one per line, without the index.
# NOTE(review): the default of `header` for Series.to_csv differs between pandas versions — confirm whether
# good_prot.csv is expected to start with a 'pdb_name' header line.
result.pdb_name.to_csv('good_prot.csv', index=False)
# Write the full table. `header` takes a bool or a list of column aliases; a bare string is only
# treated as truthy, so True states the same thing explicitly.
result.to_csv('pdb_and_features.csv', header=True)

  result.pdb_name.to_csv('good_prot.csv', index=False)


### Binarization

In [71]:
# The binarizers are fitted once on fixed label sets, so every protein gets the same
# column layout regardless of which labels occur in its own sequence.
fasta_lb = preprocessing.LabelBinarizer().fit(
    ['G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A', 'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T'])
ss_lb = preprocessing.LabelBinarizer().fit(['C', 'H', 'E'])
acc_lb = preprocessing.LabelBinarizer().fit(['e', '-'])
rad_lb = preprocessing.LabelBinarizer().fit(['0', '1', '2', '3', '4', '5', '6', '7'])
pol_lb = preprocessing.LabelBinarizer().fit(['0', '1', '2', '3'])

# asd collects one [pdb_name, feature_matrix] pair per protein.
asd = []
for i in range(len(result)):

    # Each column holds one string per protein; list() splits it into per-residue labels.
    a = fasta_lb.transform(list(result.FASTA[i]))
    b = ss_lb.transform(list(result.SS[i]))
    c = acc_lb.transform(list(result.ACC[i]))
    d = rad_lb.transform(list(result.Radical[i]))
    # Polarity classes are encoded from the Polarity column (not from Radical).
    e = pol_lb.transform(list(result.Polarity[i]))

    # Join the encoded blocks and the PSSM rows column-wise: one row per residue.
    pdb1 = np.concatenate((a, b, c, d, e, result.PSSM[i]), axis=1)
    asd.append([result['pdb_name'][i], pdb1])

### Save to files

In [72]:
#  Save final data into pickle file
output = open('train_data.pkl', 'wb')
pickle.dump(asd, output)
output.close()
print("Record data: OK")

Record data: OK


In [32]:
# NOTE(review): `fasta` is not defined in any cell shown in this notebook — confirm which object
# is meant to be saved here before relying on FASTA.pkl.
# The context manager closes the file even if pickling fails.
with open('FASTA.pkl', 'wb') as output:
    pickle.dump(fasta, output)
print("Record data: OK")

Record data: OK
