# Using pantherdb to get the orthologs from ref proteom

In [1]:
import pandas as pd
import Bio
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [2]:
experiment_name = "FOCAL_MOUSE2HUMAN"
back_path = "../../"
orth_file = "MOUSE_HUMAN_ref.tab"
exp_protein_file = "MOUSE_focal.txt"
FROM="MOUSE"
TO="HUMAN"

In [3]:
with open(back_path+'05_Output/'+experiment_name+"/"+experiment_name+'_matches2refproteom.pickle', 'rb') as f:
    uniprot2seq = pickle.load(f)

# Retrieving the orthologs

In [4]:
orthologs_table = pd.read_csv(back_path+'01_Reference/'+experiment_name+"/"+orth_file,sep='\t',header=None)

orthologs_table.head()

Unnamed: 0,0,1,2,3,4
0,HUMAN|HGNC=11477|UniProtKB=Q15528,MOUSE|MGI=MGI=98446|UniProtKB=Q62276,LDO,Euarchontoglires,PTHR12434
1,HUMAN|HGNC=28143|UniProtKB=Q53S58,MOUSE|MGI=MGI=1913593|UniProtKB=Q8BPE4,LDO,Euarchontoglires,PTHR21824
2,HUMAN|HGNC=3042|UniProtKB=Q9UKA8,MOUSE|MGI=MGI=1858220|UniProtKB=Q9JKK0,LDO,Euarchontoglires,PTHR10300
3,HUMAN|HGNC=3040|UniProtKB=P53805,MOUSE|MGI=MGI=1890564|UniProtKB=Q9JHG6,LDO,Euarchontoglires,PTHR10300
4,HUMAN|HGNC=3041|UniProtKB=Q14206,MOUSE|MGI=MGI=1858219|UniProtKB=Q9JHG2,LDO,Euarchontoglires,PTHR10300


In [5]:
d = {}
for i in orthologs_table.index:
    if TO in orthologs_table[0][i]:
        for el in orthologs_table[0][i].split("|"):
            if "UniProtKB" in el:
                to = el.split("UniProtKB=")[1]
        for el in orthologs_table[1][i].split("|"):
            if "UniProtKB" in el:
                var_from = el.split("UniProtKB=")[1]
    else:
        for el in orthologs_table[1][i].split("|"):
            if "UniProtKB" in el:
                to = el.split("UniProtKB=")[1]
        for el in orthologs_table[0][i].split("|"):
            if "UniProtKB" in el:
                var_from = el.split("UniProtKB=")[1]
    store = (to,orthologs_table[2][i])
    if var_from in d:
        d[var_from].append(store)
    else:
        d[var_from] = [store]

In [6]:
len(d)

19050

In [7]:
d['Q62276']

[('Q15528', 'LDO')]

# Matching from old uniprot id to orthologs 

In [8]:
orthologs_found = {}
orthologs_not_found = {}

for key, values in uniprot2seq.items():
    found = False
    for value in values:
        from_ref_id = value.name.split("|")[1]
        if from_ref_id not in d:
            continue
        else:
            to_ref_id = ""
            b=True
            for el in d[from_ref_id]:
                if el[1] == 'LDO':
                    to_ref_id = el[0]
                    b=False
                    break
                else:
                    to_ref_id = el[0]
            if b :
                print(key,from_ref_id,to_ref_id)
            orthologs_found[key] = to_ref_id
            found = True
            break
            
    if not found :
        orthologs_not_found[key] = value

Q8R4V3 Q80X81 Q9BWD1
Q3UL78 P60766 P60953
Q60872 Q60872 P47813
P01899 P01897 P01893
P01901 P01901 P01893
Q8C622 Q8CGP2 Q99877
Q3U5K8 Q64282 P09914
Q3TIN2 Q8BML9 P47897


In [9]:
len(orthologs_found)

1143

In [10]:
len(orthologs_not_found)

11

# lets check how many lines we get from the filtered set of proteins

In [12]:
filtered_prots = pd.read_csv(back_path+"00_InputData/"+experiment_name+"/"+exp_protein_file, sep="\t",header=None)
filtered_prots

Unnamed: 0,0
0,Q9CSS6
1,Q9D2R0
2,Q3UD67
3,P61222
4,Q99LE6
...,...
1150,P68510
1151,P68254
1152,P63101
1153,Q3U0F2


In [13]:
list_of_found_indices = []
list_of_not_found_indices = []
for i in filtered_prots.index:
    s = filtered_prots[0][i]
    l = s.split(";")
    for el in l:
        if el in orthologs_found:
            list_of_found_indices.append(i)
            break
    if list_of_found_indices[-1] != i:
        list_of_not_found_indices.append(i)

In [14]:
len(list_of_found_indices)

1143

In [15]:
len(list_of_not_found_indices)

12

In [16]:
len(list_of_not_found_indices)+len(list_of_found_indices)

1155

In [17]:
print(list_of_not_found_indices)

[31, 193, 339, 386, 423, 426, 469, 474, 539, 859, 1013, 1018]


In [18]:
found_orthologs_table = pd.DataFrame([[key,val] for key, val in orthologs_found.items()],columns= [FROM+"_UniProtKB",TO+"_UniprotKB"])
found_orthologs_table

Unnamed: 0,MOUSE_UniProtKB,HUMAN_UniprotKB
0,Q9CSS6,E9PRG8
1,Q9D2R0,Q86V21
2,Q3UD67,P49588
3,P61222,P61221
4,Q99LE6,Q9UG63
...,...,...
1138,P68510,Q04917
1139,P68254,P27348
1140,P63101,P63104
1141,Q3U0F2,O00488


In [19]:
not_found_orthologs_table = pd.DataFrame([[key,np.nan] for key, val in orthologs_not_found.items()],columns= [FROM+"_UniProtKB",TO+"_UniprotKB"])
not_found_orthologs_table

Unnamed: 0,MOUSE_UniProtKB,HUMAN_UniprotKB
0,Q3URZ6,
1,Q9CPQ1,
2,P62862,
3,Q9DAS9,
4,P43274,
5,Q61635,
6,Q9QZ85,
7,Q922Q8,
8,Q6ZWZ4,
9,P37804,


In [20]:
matches = pd.concat([found_orthologs_table,not_found_orthologs_table])
matches

Unnamed: 0,MOUSE_UniProtKB,HUMAN_UniprotKB
0,Q9CSS6,E9PRG8
1,Q9D2R0,Q86V21
2,Q3UD67,P49588
3,P61222,P61221
4,Q99LE6,Q9UG63
...,...,...
6,Q9QZ85,
7,Q922Q8,
8,Q6ZWZ4,
9,P37804,


In [21]:
matches.to_csv(back_path+"05_Output/"+experiment_name+"/"+experiment_name+"_matches_uniprotKB.tsv",sep="\t",index=None)