In [1]:
from PyBioMed.PyProtein import AAIndex, Autocorrelation
from tqdm import tqdm
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

# AAindex1

In [2]:
def AAindex1_descriptor(sequences: list):
    """Summarise each sequence by its mean value under every AAindex1 table.

    The index names are read from the local ``aaindex1`` file, each table is
    fetched with ``AAIndex.GetAAIndex1``, and every sequence is reduced to the
    mean of the table values of its residues. Residues that are missing from
    a table, or whose table value is ``None``, are ignored.

    Args:
        sequences: List of sequence strings; each character is looked up as a
            key in the AAindex1 tables.

    Returns:
        pandas.DataFrame with one row per sequence (in input order) and one
        column per AAindex1 name. A cell is NaN when no residue of the
        sequence has a usable value in that table.
    """
    # initialize: issue one lookup when 'aaindex1' is not in the working
    # directory. Presumably this makes PyBioMed fetch the file -- TODO confirm.
    if not os.path.exists('aaindex1'):
        AAIndex.GetAAIndex1('ANDN920101')
    # get all AAindex name
    # The context manager closes the file handle once the text has been read.
    with open('aaindex1') as f:
        data = f.read()
    # Entries are delimited by '//'. The chunk after the last delimiter is
    # dropped, and the second whitespace-separated token of every remaining
    # chunk is taken as the index name. The split is done once, not per item.
    idxes = [entry.split()[1] for entry in data.split('//')[:-1]]
    # create AAindex1 dictionary
    print('-'*100)
    print('crating all AAindex1 dictionary ...')
    AAindex1 = {}
    for idx in tqdm(idxes):
        AAindex1[idx] = AAIndex.GetAAIndex1(idx)
    print('completed')
    print('-'*100)
    # calculate AAindex1 descriptor
    print('-'*100)
    print('calculating AAindex1 descriptors ...')
    tables = list(AAindex1.values())
    descriptor_matrix = np.zeros([len(sequences), len(tables)])
    for i, seq in enumerate(tqdm(sequences, total=len(sequences))):
        for j, aaindex1_dict in enumerate(tables):
            # .get() yields None for unknown residues, so a single filter
            # covers both "missing key" and "value is None".
            ss = [v for v in map(aaindex1_dict.get, seq) if v is not None]
            # Explicit NaN instead of np.mean([]), which gives the same value
            # but emits a RuntimeWarning.
            descriptor_matrix[i, j] = np.mean(ss) if ss else np.nan
    print('completed')
    print('-'*100)
    return pd.DataFrame(descriptor_matrix, columns=list(AAindex1.keys()))

# AAindex2, AAindex3
ライブラリ (PyBioMed) によるファイルのパースがうまくいっていないようで、AAindex2 と AAindex3 については算出できない。

# Autocorrelation

In [71]:
def acc_descriptor(sequences: list):
    """Build a table of ``Autocorrelation.CalculateAutoTotal`` values.

    Args:
        sequences: List of sequence strings passed one by one to
            ``Autocorrelation.CalculateAutoTotal``.

    Returns:
        pandas.DataFrame with one row per sequence (in input order) and one
        column per key returned by the reference call on ``'ALANINE'``. Rows
        for which the calculation raises ``KeyError`` are left as NaN.
    """
    # The column set is taken from a reference call on a fixed sequence, and
    # the matrix width follows it rather than a hard-coded 720.
    columns = list(Autocorrelation.CalculateAutoTotal('ALANINE').keys())
    descriptor_matrix = np.full((len(sequences), len(columns)), np.nan)
    for i, seq in enumerate(tqdm(sequences, total=len(sequences))):
        try:
            disc = Autocorrelation.CalculateAutoTotal(seq)
            # Align by column name so the row does not depend on the order in
            # which the result mapping happens to list its keys.
            descriptor_matrix[i] = [disc[name] for name in columns]
        except KeyError:
            # Deliberate best-effort: the row keeps its NaN fill.
            # Presumably triggered by residue letters the library does not
            # know -- TODO confirm.
            pass
    return pd.DataFrame(descriptor_matrix, columns=columns)

### 

# DeepLoc data set

In [4]:
# Load the DeepLoc table; its first column is used as the sequence column.
df = pd.read_csv('../data/DeepLoc/DeepLocAll.csv')
# Strip every whitespace character from each entry of the first column, then
# write the cleaned strings back into that column.
sequences = [''.join(raw.split()) for raw in df.iloc[:, 0].tolist()]
df.iloc[:, 0] = sequences

In [6]:
# Compute the AAindex1 descriptor table for the current `sequences` list
# (one row per sequence, in the same order).
aaindex = AAindex1_descriptor(sequences)

----------------------------------------------------------------------------------------------------
crating all AAindex1 dictionary ...


100%|██████████| 566/566 [00:21<00:00, 26.67it/s]


completed
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
calculating AAindex1 descriptors ...


100%|██████████| 8464/8464 [09:51<00:00, 14.30it/s]

completed
----------------------------------------------------------------------------------------------------





In [9]:
# Place the AAindex1 descriptor columns next to the original table columns
# (column-wise concat) and write the result to CSV without the index.
pd.concat([df, aaindex], axis=1).to_csv('../data/DeepLoc/DeepLocDescriptorAAindex.csv',index=False)

In [72]:
# Compute the autocorrelation descriptor table for the current `sequences`
# list, then write it next to the original table columns as CSV (no index).
# NOTE(review): the output file name is spelled "Aurocorrelation"; left as is
# because other code may read this exact path -- confirm before renaming.
autocrr = acc_descriptor(sequences)
pd.concat([df, autocrr], axis=1).to_csv('../data/DeepLoc/DeepLocDescriptorAurocorrelation.csv',index=False)

100%|██████████| 8464/8464 [12:51<00:00, 10.97it/s]


# DeepPPI dataset

In [75]:
from Bio import SeqIO

fasta_path = '../data/DeepPPI/DeepPPI.fasta'
# Read all records up front so the progress bar length follows the file
# content instead of a hard-coded record count.
records = list(SeqIO.parse(fasta_path, 'fasta'))
seq_dict = {}
for record in tqdm(records):
    # The record id is split on '|' and its second field becomes the key.
    # A later entry with the same key overwrites the earlier one.
    Id = record.id.split('|')[1]
    seq_dict[Id] = str(record.seq)

100%|██████████| 4424/4424 [00:00<00:00, 47944.56it/s]


In [77]:
# NOTE: `sequences` is rebound here to the values of seq_dict; any list bound
# to that name earlier is no longer reachable through it.
sequences = list(seq_dict.values())
# Both descriptor tables are computed on the same list, so their rows share
# one ordering.
aaindex = AAindex1_descriptor(sequences)
autocrr = acc_descriptor(sequences)

----------------------------------------------------------------------------------------------------
crating all AAindex1 dictionary ...


100%|██████████| 566/566 [00:20<00:00, 27.09it/s]


completed
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
calculating AAindex1 descriptors ...


100%|██████████| 4424/4424 [04:43<00:00, 15.62it/s]


completed
----------------------------------------------------------------------------------------------------


100%|██████████| 4424/4424 [05:59<00:00, 12.31it/s]


In [83]:
# Prefix the AAindex1 descriptor columns with a 'UniprotID' column built from
# the keys of seq_dict, then write the table to CSV without the index.
df_result = pd.concat(
    [pd.DataFrame(seq_dict.keys(), columns=['UniprotID']), aaindex],
    axis=1,
)
df_result.to_csv('../data/DeepPPI/DeepPPIDescriptorAAindex.csv', index=False)

In [84]:
# Prefix the autocorrelation descriptor columns with a 'UniprotID' column
# built from the keys of seq_dict, then write the table to CSV without the
# index.
df_result = pd.concat(
    [pd.DataFrame(seq_dict.keys(), columns=['UniprotID']), autocrr],
    axis=1,
)
df_result.to_csv('../data/DeepPPI/DeepPPIDescriptorAutocorrelation.csv', index=False)