In [76]:
from Bio import SeqIO, pairwise2
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.pairwise2 import format_alignment

import pandas as pd
import numpy as np
from tqdm import tqdm

1. iLearnPlusで記述子を計算するには、すべての配列長が揃っていなければならない  
2. そのため、配列にpaddingをかけないとiLearnPlusでは記述子を算出できない  
3. FASTAのdescriptionが入っていないとエラーが出る
4. iLearnPlusでCSV出力するときは、ファイル名に.csvを付けないと正しく保存されない

### iLearn sample

load

In [54]:
# Read a small sample (the first `sample_limit` records) of the DeepPPI FASTA file.
sample_limit = 100
fasta_path = '../data/DeepPPI/DeepPPI.fasta'
seq_dict = {}  # second '|'-separated field of the record id -> sequence string

records = SeqIO.parse(fasta_path, 'fasta')
for i, record in enumerate(records):
    # Guard clause: stop as soon as `sample_limit` records have been stored.
    if i == sample_limit:
        break
    # The record id is split on '|' and its second field is used as the key.
    Id, seq = str(record.id.split('|')[1]), str(record.seq)
    seq_dict[Id] = seq

 sequence padding

In [114]:
# Materialise the sampled sequences and their ids as NumPy string arrays.
sequences = np.array(list(map(str, seq_dict.values())))
# NOTE: Ids keeps the dictionary order; it is not reordered together with
# the sequences sorted below.
Ids = np.array(list(map(str, seq_dict.keys())))

# Longest-first ordering, obtained by arg-sorting the negated lengths.
descending_order = np.argsort([-len(s) for s in sequences])
sequences_sorted = sequences[descending_order]

# Right-pad every sequence with '-' up to the length of the first
# (i.e. longest) sequence; sequences already at that length pass through.
sequences_align = []
for seq in tqdm(sequences_sorted):
    missing = len(sequences_sorted[0]) - len(seq)
    sequences_align.append((seq + '-' * missing) if missing > 0 else seq)

100%|██████████| 100/100 [00:00<00:00, 45959.94it/s]


save

In [115]:
# Write the padded sample sequences to a single FASTA file.
f_name = '../data/DeepPPI/DeepPPI_sample.fasta'
# `with` closes the handle even if writing one of the records raises.
with open(f_name, 'w') as f:
    for i, seq in enumerate(sequences_align):
        # The running index is used as the record id. A description is always
        # set because, per the notes at the top of this notebook, iLearnPlus
        # errors out when the FASTA description is missing.
        record = SeqRecord(Seq(seq), id=str(i), description='DeepPPI')
        SeqIO.write(record, f, "fasta")

# DeepPPI padding

In [138]:
# Load every DeepPPI record and right-pad all sequences with '-' to one
# common length.
fasta_path = '../data/DeepPPI/DeepPPI.fasta'
# Minimum padded length, kept from the original hard-coded value
# (presumably the longest DeepPPI sequence -- TODO confirm).
min_pad_length = 4910

seq_dict = {}  # second '|'-separated field of the record id -> sequence string
# total=4424 only sizes the progress bar; it does not limit how many records are read.
for record in tqdm(SeqIO.parse(fasta_path, 'fasta'), total=4424):
    Id = record.id.split('|')[1]
    seq_dict[str(Id)] = str(record.seq)

# Pad to at least `min_pad_length`, but grow the target if any sequence is
# longer: previously such a sequence was silently left at its own length,
# producing a file with mixed sequence lengths.
pad_length = max([min_pad_length] + [len(s) for s in seq_dict.values()])
seq_dict = {key: s.ljust(pad_length, '-') for key, s in seq_dict.items()}

# Split the padded DeepPPI records over several FASTA files
# (DeepPPI_Padd_0.fasta, DeepPPI_Padd_1.fasta, ...).
# NOTE: despite its name, `chunk_size` is the NUMBER of output files,
# not the number of records per file.
chunk_size = 5
# Ceil division: records are spread as evenly as possible, and only the
# last file may be shorter.
records_per_file = -(-len(seq_dict) // chunk_size)
items = list(seq_dict.items())

# A single manually-updated progress bar spans all output files.
with tqdm(total=len(items)) as progress:
    for file_no in range(chunk_size):
        start = file_no * records_per_file
        chunk = items[start:start + records_per_file]
        if not chunk:
            # Fewer records than files: do not create empty files.
            break
        f_name = f'../data/DeepPPI/DeepPPI_Padd_{file_no}.fasta'
        # Each file is opened in its own `with`, so no handle from an earlier
        # cell is needed and the file is closed even if a write fails.
        with open(f_name, 'w') as f:
            for Id, seq in chunk:
                record = SeqRecord(Seq(seq), id=Id, description='DeepPPI')
                SeqIO.write(record, f, "fasta")
                progress.update(1)

100%|██████████| 4424/4424 [00:00<00:00, 41693.29it/s]
100%|██████████| 4424/4424 [00:00<00:00, 11475.21it/s]


# DeepLoc padding

In [139]:
# Read the DeepLoc table; the first column holds the sequences.
df = pd.read_csv('../data/DeepLoc/DeepLocAll.csv')
first_column = df.iloc[:, 0]
# Strip every whitespace character from each sequence.
sequences = [''.join(entry.split()) for entry in first_column]
# Length of the longest cleaned sequence (the padding target).
max_length = len(max(sequences, key=len))

# Pad the DeepLoc sequences to `max_length` and split them over several
# FASTA files (DeepLoc_Padd_0.fasta, DeepLoc_Padd_1.fasta, ...).
# NOTE: despite its name, `chunk_size` is the NUMBER of output files,
# not the number of records per file.
chunk_size = 5
# Ceil division: records are spread as evenly as possible, and only the
# last file may be shorter.
records_per_file = -(-len(sequences) // chunk_size)

# A single manually-updated progress bar spans all output files.
with tqdm(total=len(sequences)) as progress:
    for file_no in range(chunk_size):
        start = file_no * records_per_file
        chunk = sequences[start:start + records_per_file]
        if not chunk:
            # Fewer records than files: do not create empty files.
            break
        f_name = f'../data/DeepLoc/DeepLoc_Padd_{file_no}.fasta'
        # Each file is opened in its own `with`, so no handle from an earlier
        # cell is needed and the file is closed even if a write fails.
        with open(f_name, 'w') as f:
            # `i` is the position in the full `sequences` list, so record ids
            # stay unique across all output files.
            for i, seq in enumerate(chunk, start=start):
                # Right-pad with '-' up to the longest sequence.
                seq = seq.ljust(max_length, '-')
                record = SeqRecord(Seq(seq), id=str(i), description='DeepLoc')
                SeqIO.write(record, f, "fasta")
                progress.update(1)

100%|██████████| 8464/8464 [00:01<00:00, 6568.92it/s]
