In [3]:
from pathlib import Path
import os
import pandas as pd
import urllib.parse
import urllib.request
os.chdir(Path.home() / "master_thesis_folder")

In [4]:
#Function converts ids and returns a dataframe using the UNIPROT ID Mapper.
def id_converter(From, To, Query, id_names):
    url = 'https://www.uniprot.org/uploadlists/'

    params = {
        'from' : From,
        'to' : To,
        'format' : 'tab',
        'query': Query
    }

    data = urllib.parse.urlencode(params)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
        response = f.read()

    h_ensembl = response.decode('utf-8')
    ensembl_file = open('h_ensembl.tsv', 'w')
    ensembl_file.write(h_ensembl)
    ensembl_file.close()

    hp_ids_df = pd.read_csv("h_ensembl.tsv", sep= '\t', names = id_names)
    return hp_ids_df

Mouse orthologues are searched for a list of human proteins (4981) used in the tissue predictor.
The human proteins are stored in a list variable called 'h_prots_uniprot'. These are uniprot ID's.

First, BioMart is used to identify human-mouse orthologues. This file is stored in a panda dataframe.
The proteins however are identified as ensembl_ids.

- (KEGG)
- BioMart


In [5]:
h_prots_file = open("used_features.txt", "r")
h_prots = h_prots_file.read().split(", ")
h_prots_uniprot = [prot[1:-1] for prot in h_prots]
len(h_prots_uniprot)

4981

In [6]:
hmo_df = pd.read_csv("mart_export.csv")
hmo_df = hmo_df.rename(columns={
    "Protein stable ID" : "human_eid",
    "Mouse protein or transcript stable ID" : "mouse_eid",
    "Mouse orthology confidence [0 low, 1 high]": "Orthology confidence"
})
hmo_df.drop('Query protein or transcript ID', inplace=True, axis=1)
hmo_df.head()

Unnamed: 0,Orthology confidence,human_eid,mouse_eid
0,0,ENSP00000354687,ENSMUSP00000080991
1,1,ENSP00000355046,ENSMUSP00000080992
2,1,ENSP00000354499,ENSMUSP00000080993
3,1,ENSP00000354876,ENSMUSP00000080994
4,0,ENSP00000355265,ENSMUSP00000080995


In [7]:
hmo_df.shape

(47031, 3)

The database identifier mapping ('Retrieve/ID mapping') of UNIPROT was used to convert the uniprot_id of the human proteins into an ensembl_id. <br>
This is stored into a dataframe called `hp_ids_df` and has 12194 rows.

In [8]:
hp_ids_df = id_converter('ACC+ID', 'ENSEMBL_PRO_ID', " ".join(h_prots_uniprot), ["h_uniprot", "human_eid"])

hp_ids_df.shape

(12194, 2)

In [9]:
hp_ids_df.head()

Unnamed: 0,h_uniprot,human_eid
0,From,To
1,A0A075B6H7,ENSP00000374782
2,A0A075B6H7,ENSP00000487957
3,A0A075B6H8,ENSP00000374813
4,A0A075B6H9,ENSP00000374817


One uniprot_id is linked with several ensembl_ids. <br>
However, **20** ensembl ids were not found. These uniprot_ids are stored in the variable '`eid_not_found`'.

In [10]:
eid_not_found = []
for prot in h_prots_uniprot:
    if prot not in hp_ids_df.h_uniprot.values:
        eid_not_found.append(prot)
print(len(eid_not_found))

20


Now the ensembl_ids of the uniprot entries are matched with the orthologues of the BioMart file. <br>
The mouse orthologues are converted to a string in order to access the database identifier mapping ('Retrieve/ID mapping') of UNIPROT again. <br>
Following this, the 2 dataframes are merged once more and the orthologues are linked by uniprot ID.

In [11]:
hmo_df_1 = hp_ids_df.merge(hmo_df, on = "human_eid", how = "inner")
hmo_df_1.shape

(11210, 4)

In [12]:
hmo_df_1.head()

Unnamed: 0,h_uniprot,human_eid,Orthology confidence,mouse_eid
0,A0A075B6H7,ENSP00000374782,0,ENSMUSP00000100174
1,A0A075B6H8,ENSP00000374813,0,ENSMUSP00000100124
2,A0A075B6H8,ENSP00000374813,0,ENSMUSP00000142805
3,A0A075B6H9,ENSP00000374817,0,ENSMUSP00000100465
4,A0A075B6I1,ENSP00000374819,0,ENSMUSP00000100465


In [13]:
mouse_orthologue_serie = hmo_df_1.mouse_eid
mouse_orth_string = " ".join(mouse_orthologue_serie.values)

mo_ids_df = id_converter("ENSEMBL_PRO_ID", "ACC", mouse_orth_string, ["mouse_eid", "mouse_uniprot"])

mo_ids_df.shape

(5003, 2)

In [14]:
hmo_df_2 = mo_ids_df.merge(hmo_df_1, on = "mouse_eid", how = "inner")
hmo_df_2

Unnamed: 0,mouse_eid,mouse_uniprot,h_uniprot,human_eid,Orthology confidence
0,ENSMUSP00000100174,A0A075B5M9,A0A075B6H7,ENSP00000374782,0
1,ENSMUSP00000100174,A0A075B5M9,A0A087WSY6,ENSP00000403672,0
2,ENSMUSP00000100174,A0A075B5M9,A0A0C4DH25,ENSP00000374805,0
3,ENSMUSP00000100174,A0A075B5M9,P01619,ENSP00000418649,0
4,ENSMUSP00000100124,A0A0B4J1I1,A0A075B6H8,ENSP00000374813,0
...,...,...,...,...,...
11217,ENSMUSP00000094176,Q69ZX6,Q9Y6X9,ENSP00000380763,1
11218,ENSMUSP00000123354,Q8C5W4,Q9Y6X9,ENSP00000215862,0
11219,ENSMUSP00000123354,Q8C5W4,Q9Y6X9,ENSP00000380763,0
11220,ENSMUSP00000023918,Q920Q8,Q9Y6Y0,ENSP00000356468,1


`hmo_uniprot_id_df` is a subset containing the uniprot_ids of `hmo_df_2` which is the linked dataframe.

In [15]:
hmo_uniprot_id_df = hmo_df_2[["h_uniprot", "mouse_uniprot", "Orthology confidence"]]
hmo_uniprot_id_df.head()

Unnamed: 0,h_uniprot,mouse_uniprot,Orthology confidence
0,A0A075B6H7,A0A075B5M9,0
1,A0A087WSY6,A0A075B5M9,0
2,A0A0C4DH25,A0A075B5M9,0
3,P01619,A0A075B5M9,0
4,A0A075B6H8,A0A0B4J1I1,0


For 4308 human proteins (86.5%), at least 1 mouse orthologue is found with a high orthology confidence (as stated by BioMart). <br>

In [16]:
hmo_uniprot_id_df.loc[hmo_uniprot_id_df["Orthology confidence"] == 1].groupby(["h_uniprot"]).mouse_uniprot.count()

h_uniprot
A0A0A0MS15    1
A0A0U1RQS6    1
A0AVT1        2
A0MZ66        5
A0PJW6        1
             ..
Q9Y6X5        1
Q9Y6X8        1
Q9Y6X9        2
Q9Y6Y0        1
Q9Y6Y8        1
Name: mouse_uniprot, Length: 4308, dtype: int64

If the orthology score is not taken into account, 4704 (94.4%) human-mouse orthologues are found.

In [17]:
hmo_uniprot_id_df.groupby(["h_uniprot"]).mouse_uniprot.count()

h_uniprot
A0A075B6H7    1
A0A075B6H8    2
A0A075B6H9    1
A0A075B6I1    1
A0A087WSY6    1
             ..
Q9Y6X5        1
Q9Y6X8        1
Q9Y6X9        4
Q9Y6Y0        1
Q9Y6Y8        1
Name: mouse_uniprot, Length: 4704, dtype: int64

The uniprot_ids whereof no orthologue was found are stored in `no_orthologue_found`.

In [18]:
no_orthologue_found = []
for uid in h_prots_uniprot:
    if uid not in hmo_uniprot_id_df.h_uniprot.values:
        no_orthologue_found.append(uid)
for i in eid_not_found:
    no_orthologue_found.remove(i)
len(no_orthologue_found)

257

To choose 1 protein of the different matched orthologues, several options can be considered: <br>
- at random (not preferable)
- %id. target mouse gene identical to query gene (if the human gene is larger than the mouse gene)
- %id. query gene identical to target mouse gene (if the mouse gene is larger than the human gene) 
<br>

The last two criteria usually correlate pretty well with each other