In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import multiprocessing as mp
import json

In [2]:
train_prot_file = "../../../DeepConv-DTI/epp_examples/training_dataset/training_protein.csv"
valid_prot_file = "../../../DeepConv-DTI/epp_examples/validation_dataset/validation_protein.csv" 
test_prot_file = "../../../DeepConv-DTI/epp_examples/test_dataset/kegg_protein.csv"

In [3]:
# protein pairwise similarity calculating function
from Bio import pairwise2
from Bio.Align import substitution_matrices
from itertools import starmap


def get_protein_similarity(seq1, seq2, matrix="BLOSUM62", gap_open=-10, gap_extend=-0.5):
    mat = substitution_matrices.load(name=matrix)
    alns = pairwise2.align.globalds(seq1, seq2, mat, gap_open, gap_extend)
    top_aln = alns[0]
    aln_human, aln_mouse, score, begin, end = top_aln
    return score/len(seq1)


def get_protein_bulk_similarity(seq1, list_of_seqs, matrix="BLOSUM62", gap_open=-10, gap_extend=-0.5):
    iterable = [(seq1, seq, matrix, gap_open, gap_extend) for seq in list_of_seqs]
    pool = mp.Pool(mp.cpu_count())
    scores = pool.starmap(get_protein_similarity, iterable)
    return np.array(list(scores))

In [4]:
# trying to map 1st validation protein similarity to all others

def create_prot_sim_dict(train_file, valid_file, output_file):
    df_train_ = pd.read_csv(train_file, usecols=[1,2], index_col=0)
    df_valid_ = pd.read_csv(valid_file, usecols=[1,2], index_col=0).iloc[:2, ]
    
    # get rid of illegal sequences
    illegal_proteins = list(df_train_.loc[df_train_.Sequence.str.contains("B|J|X|Z|O|U", regex=True)].index)
    df_train = df_train_.loc[~df_train_.index.isin(illegal_proteins)]
    
    illegal_proteins_valid = list(df_valid_.loc[df_valid_.Sequence.str.contains("B|J|X|Z|O|U", regex=True)].index)
    df_valid = df_valid_.loc[~df_valid_.index.isin(illegal_proteins_valid)]
    
    # creating the dictionary
    all_train_prots = list(df_train.index)
    all_train_seq = df_train.values.flatten()
    prot_sim_dict = dict()

    for prot, seq_info in tqdm(df_valid.iterrows()):
        seq = seq_info.values[0]
        sim_scores = get_protein_bulk_similarity(seq, all_train_seq)
        prot_sim_dict[prot] = {train_prot:sim_score for train_prot,sim_score in zip(all_train_prots, sim_scores)}
    
    # saving the dictionary as json
    with open(output_file, 'w') as fp:
        json.dump(prot_sim_dict, fp, indent=4)
    return output_file

In [5]:
create_prot_sim_dict(train_prot_file, valid_prot_file, "prot_sim_valid_dict.json")

2it [53:38, 1609.50s/it]


'prot_sim_valid_dict.json'