Here we will take the cadherome interactome data and generate the output require to combine with the mouse lysine acylation data

In [83]:
import os
import collections

from bioservices import UniProt

import pandas as pd
import numpy as np

import CGAT.IOTools as IOTools

In [94]:
raw_base = "../raw/"
results_dir = "./results/"

In [135]:
orthologs_inf = os.path.join(raw_base, 'Homo_sapiens-Mus_musculus.txt')
mmUni2Ens_inf = os.path.join(raw_base, 'MOUSE_10090_idmapping_selected.tab.gz')
hgUni2Ens_inf = os.path.join(raw_base, 'HUMAN_9606_idmapping_selected.tab.gz')

high_confidence_cadherin_interactome = os.path.join(raw_base, 'Cadherin_adhesome.csv')

ecadherin_glass_inf = os.path.join(raw_base, 'ECadherin_interactome_glass.csv')
ecadherin_biotin_inf = os.path.join(raw_base, 'ECadherin_interactome.csv')

Below we generate the required dictionaries to map human ensembl ids to mouse uniprot ids

In [154]:
mouse2humanEnsPro = collections.defaultdict(set)
human2mouseEnsPro = collections.defaultdict(set)


with IOTools.openFile(orthologs_inf, 'r') as inf:
    for line in inf:
        line = line.strip().split('\t')
        if line[5] == 'ortholog':
            
            if line[1] == 'Homo sapiens':
                human_id = line[0]
                mouse_id = line[2]

            elif line[1] == 'Mus musculus':
                human_id = line[2]
                mouse_id = line[0]

            else:
                raise ValueError(line)

            mouse2humanEnsPro[mouse_id].add(human_id)
            human2mouseEnsPro[human_id].add(mouse_id)

# some mouse IDs map to multiple human IDs. In many cases these will both map to the same uniprot ID
# so this is not an issue. We can track this following conversion
print(mouse2humanEnsPro['ENSMUSP00000107237'])


mouseUni2Ens = {}
mouseEns2Uni = collections.defaultdict(set)
no_id = 0
yes_id = 0
with IOTools.openFile(mmUni2Ens_inf, 'r') as inf:
    for line in inf:
        line = line.strip().split("\t")
        uni_id = line[0]
        try:
            yes_id += 1
            ens_ids = line[20].split('; ')
            mouseUni2Ens[uni_id] = ens_ids
            for ens_id in ens_id:
                mouseUni2Ens[ens_id] = uni_id
        except:
            no_id += 1

print(no_id)
print(yes_id)

humanUni2Ens = {}
humanEns2Uni = collections.defaultdict(set)
no_id = 0
yes_id = 0
with IOTools.openFile(hgUni2Ens_inf, 'r') as inf:
    for line in inf:
        line = line.strip().split("\t")
        uni_id = line[0]
        try:
            yes_id += 1
            ens_ids = line[20].split('; ')
            humanUni2Ens[uni_id] = ens_ids
            for ens_id in ens_ids:
                humanEns2Uni[ens_id].add(uni_id)
        except:
            no_id += 1

print(no_id)
print(yes_id)


{'ENSP00000319690', 'ENSP00000365370'}
5242
81545
28098
157433


In [156]:
# This is working all OK for PKM - succesffuly converted from mouse to human ensembl and checked via human uniprot

# P52480 = mouse PKM
# P14618 = human PKM
print("\n\n----P52480----\n")
for ens_id in mouseUni2Ens['P52480']:
    if len(mouse2humanEnsPro[ens_id]) > 0:
        print(ens_id)
        print(mouse2humanEnsPro[ens_id])
    for human_ens in mouse2humanEnsPro[ens_id]:
        print(human_ens)
        print(humanEns2Uni[human_ens])

print("\n\n----P47915 (Mouse RL29) ----\n")
for ens_id in mouseUni2Ens['P47915']:
    print(ens_id)
    if len(mouse2humanEnsPro[ens_id]) > 0:
        print(ens_id)
        print(mouse2humanEnsPro[ens_id])
    for human_ens in mouse2humanEnsPro[ens_id]:
        print(human_ens)
        print(humanEns2Uni[human_ens])
        
        
print("\n\n----P47914 (Human RL29)----\n")
for ens_id in humanUni2Ens['P47914']:
    print(ens_id)
    if len(human2mouseEnsPro[ens_id]) > 0:
        print(ens_id)
        print(human2mouseEnsPro[ens_id])
    for mouse_ens in human2mouseEnsPro[ens_id]:
        print(mouse_ens)
        print(mosueEns2Uni[mouse_ens])
        



----P52480----

ENSMUSP00000034834
{'ENSP00000320171'}
ENSP00000320171
{'P14618'}


----P47915 (Mouse RL29) ----

ENSMUSP00000080203
ENSMUSP00000096592
ENSMUSP00000117834


----P47914 (Human RL29)----

ENSP00000294189
ENSP00000418868
ENSP00000417048
ENSP00000418153
ENSP00000418346
ENSP00000420673


OK so there are some orthologs which aren't covered so we will lose some data this way, typically conversion issues!

In [44]:
def ConvertMouseUniprot2HumanUniprot(mm_proteins, mouseUni2Ens, mouse2humanEnsPro, humanEns2Uni, one2one=True):
    mmUni2Ens_matched, mm2hgUni_matched, hgEns2Uni_matched, proteins_matched  = (0, 0, 0, 0)
    
    new_mm_proteins = []
    new_hg_proteins = []
    for protein in mm_proteins:
        matches = set()
        try:
            ens_ids = mouseUni2Ens[protein]
            if len(ens_ids) > 0 :
                mmUni2Ens_matched +=1 
                for ens_id in ens_ids:
                    human_ens = mouse2humanEnsPro[ens_id]

                    if len(human_ens) > 0:
                        mm2hgUni_matched += 1

                        for human_en in human_ens:
                            humanUnis = humanEns2Uni[human_en]
                            #print(human_en)
                            #print(humanUnis)

                            if len(humanUnis) > 0:
                                hgEns2Uni_matched += 1
                                matches.update(humanUnis)
                                #print(matches)
                    else:
                        #print(ens_ids)
                        #print(ens_id)
                        #print(human_ens)
                        #break
                        pass

                if len(matches) > 0:
                    if one2one and len(matches) > 1:
                        continue
                    proteins_matched += 1
                    for match in matches:
                        new_hg_proteins.append(match)
                        new_mm_proteins.append(protein)
            
        except KeyError:
            pass
            #print('protein not found in uniprot mapping file: ', protein)

    print(len(mm_proteins), mmUni2Ens_matched, mm2hgUni_matched, hgEns2Uni_matched, proteins_matched)
    
    return new_mm_proteins, new_hg_proteins


In [149]:
print(ConvertMouseUniprot2HumanUniprot(['P47915'], mouseUni2Ens, mouse2humanEnsPro, humanEns2Uni))

1 1 0 0 0
([], [])


In [25]:
ConvertMouseUniprot2HumanUniprot(['P52480'], mouseUni2Ens, mouse2humanEnsPro, humanEns2Uni, one2one=True)

1 1 1 1 1


(['P52480'], ['P14618'])

Now we have a convertion function we need to provide a sensible list of mouse ids to convert. We'll use the full swiss-prot collection as a startpoint.

In [28]:
u = UniProt()

u_results = u.search("organism:10090+and+reviewed:yes", columns="id,entry name", limit=-1)

In [31]:
uniprot_ids_df = pd.DataFrame.from_records(
        [x.split() for x in u_results.strip().split("\n")[1:]],
        columns=["uniprot_id", "name"])

print(uniprot_ids_df.head())
print(uniprot_ids_df.shape)

  uniprot_id         name
0     Q02248  CTNB1_MOUSE
1     Q62226    SHH_MOUSE
2     Q01705  NOTC1_MOUSE
3     P22725  WNT5A_MOUSE
4     P10417   BCL2_MOUSE
(16853, 2)


In [79]:
mm_proteins, hg_proteins = ConvertMouseUniprot2HumanUniprot(
    uniprot_ids_df['uniprot_id'].tolist(),
    mouseUni2Ens, mouse2humanEnsPro, humanEns2Uni, one2one=True)

# need to remove instances where the same hg protein is apparently a one-2-one ortholog of multiple mouse proteins
# this occurs because we previous;y only checked for 1-2-1 in mouse->human direction

mm2hg = {x:y for x, y in zip(mm_proteins, hg_proteins)}
hg2mm = {y:x for x, y in zip(mm_proteins, hg_proteins)}

print(len(mm2hg))
print(len(hg2mm))

for hg_protein, count in collections.Counter(hg_proteins).most_common():
    if count > 1:
        mm2hg.pop(hg2mm[hg_protein])
        hg2mm.pop(hg_protein)
    else:
        # using most_common so break as soon as we've reached the '1s'
        break
        
print(len(mm2hg))
print(len(hg2mm))

16853 16673 13101 13330 12317
12317
12165
12233
12081


so we have 16853 mouse uniprot ids converted to 12317 human uniprot ids via a mouse2human ensembl id map. The major step at which we lost proteins is the conversion from mouse to human ensembl protein ids (16673 --> 13101 = 78.6%). This is fine since this is the 1:1 ortholog conversion. The other steps resulted in only modest loss of ids.

In [98]:
cadherin_adhesome_df = pd.read_table(high_confidence_cadherin_interactome, sep="\t")


mm_uniprot_ids = []

for protein in cadherin_adhesome_df['UniProt ID']:
    protein = protein.replace(" ", "")
    if protein in hg2mm:
        mm_uniprot_ids.append(hg2mm[protein])
    else:
        mm_uniprot_ids.append(np.nan)

cadherin_adhesome_df['Mouse_uniprot_IDs'] = mm_uniprot_ids

print(cadherin_adhesome_df.shape)
print(sum(cadherin_adhesome_df['Mouse_uniprot_IDs'].notnull()))

cadherin_adhesome_df[['Mouse_uniprot_IDs']].to_csv(
    os.path.join(results_dir, "cadherine_proteins.tsv"), index=False)

(173, 11)
130


In [165]:
def parseECadherinInfile(infile, abundance_column=None):
    assert abundance_column is not None
    ecadherin_df = pd.read_table(infile, sep='\t')

    mm_uniprot_ids = []
    for protein_ids in ecadherin_df['Protein IDs']:
        matches = set()
        for protein in protein_ids.split(';'):
            if protein in hg2mm:
                matches.add(hg2mm[protein])
        if len(matches) <1:
            mm_uniprot_ids.append(np.nan)
        else:
            mm_uniprot_ids.append(';'.join(matches))

    ecadherin_df['Mouse_uniprot_IDs'] = mm_uniprot_ids

    ecadherin_df["rank"] = ecadherin_df[abundance_column].rank()
    ecadherin_df.rename(columns={abundance_column: "interactome_abundance",
                                 "Protein IDs": "Human_uniprot_IDs"},
                        inplace=True)

    ecadherin_df =  ecadherin_df[
        ['Mouse_uniprot_IDs', "Human_uniprot_IDs", "interactome_abundance", "rank"]]
    #ecadherin_df[]
    return ecadherin_df

    

In [169]:
ecadherin_glass_df = parseECadherinInfile(ecadherin_glass_inf, '%iBAQ')
ecadherin_biotin_df = parseECadherinInfile(ecadherin_biotin_inf, '(iBAQ ALL/iBAQ ALL sum)*100 (%)')
print(collections.Counter([len(str(x).split(";")) for x in ecadherin_glass_df['Mouse_uniprot_IDs']]))
print(collections.Counter([len(str(x).split(";")) for x in ecadherin_biotin_df['Mouse_uniprot_IDs']]))

print(ecadherin_glass_df.head())
ecadherin_glass_df.to_csv(os.path.join(results_dir, "ecadherin_glass.tsv"), index=False)
ecadherin_biotin_df.to_csv(os.path.join(results_dir, "ecadherin_biotin.tsv"), index=False)

Counter({1: 1841, 2: 105, 3: 20, 4: 8, 7: 3, 6: 2, 5: 1})
Counter({1: 511, 2: 36, 3: 11, 4: 1, 6: 1, 7: 1})
  Mouse_uniprot_IDs                                  Human_uniprot_IDs  \
0            P10126                        P68104;Q5VTE0;Q5JR01;A6PW80   
1            P68372                                             P68371   
2            P68373  F5H5D3;Q9BQE3;F8VVB9;F8VQQ4;F8VS66;F8VRZ4;F8VW...   
3            Q9D8N0                               P26641;B4DTG2;E7EMT2   
4            P62827          B5MDF5;P62826;J3KQE5;F5H018;H0YFC6;B4DV51   

   interactome_abundance    rank  
0               6.333808  1980.0  
1               2.546583  1979.0  
2               2.252746  1978.0  
3               1.795667  1977.0  
4               1.763019  1975.5  
