# cross-species protein - phenotype association prediction

### Mouse Protein - Mouse phenotype

### Mouse Protein - Human Phenotype

In [1]:
import os
import pandas as pd

In [2]:
input_dir = "/home/ec2-user/data/mouse_protein"

In [3]:
filepath = os.path.join(input_dir, "HMD_HumanPhenotype.rpt")

In [4]:
df = {"Human Marker Symbol":[], "Human Entrez Gene ID":[] ,"Mouse Marker Symbol":[], "MGI Marker Accession ID":[], "Mouse Phenotype ID":[]}
mouse_protein = []
with open(filepath, "r") as f:
    for line in f.readlines():
        elements = line.split("\t")
        df["Human Marker Symbol"].append(elements[0])
        df["Human Entrez Gene ID"].append(elements[1])
        df["Mouse Marker Symbol"].append(elements[2])
        df["MGI Marker Accession ID"].append(elements[3])
        df["Mouse Phenotype ID"].append(elements[4])

In [5]:
df = pd.DataFrame(df)

In [6]:
df

Unnamed: 0,Human Marker Symbol,Human Entrez Gene ID,Mouse Marker Symbol,MGI Marker Accession ID,Mouse Phenotype ID
0,A1BG,1,A1bg,MGI:2152878,
1,A1CF,29974,A1cf,MGI:1917115,"MP:0005367, MP:0005369, MP:0005370, MP:0005376..."
2,A2M,2,A2m,MGI:2449119,
3,A3GALT2,127550,A3galt2,MGI:2685279,
4,A4GALT,53947,A4galt,MGI:3512453,"MP:0005376, MP:0005386, MP:0010768"
...,...,...,...,...,...
29681,ZYG11A,440590,Zyg11a,MGI:2446208,
29682,ZYG11B,79699,Zyg11b,MGI:2685277,"MP:0005386, MP:0010768"
29683,ZYX,7791,Zyx,MGI:103072,MP:0005384
29684,ZZEF1,23140,Zzef1,MGI:2444286,"MP:0005367, MP:0005378, MP:0005386, MP:0005390..."


In [7]:
df.to_csv(os.path.join(input_dir,"human_protein_mouse_protein_mouse_phenotype.csv"),index=False)

In [8]:
df["MGI Marker Accession ID"].to_csv(os.path.join(input_dir,"mgi_index.csv"),index=False)

In [9]:
import csv

def parse_txt_to_csv(input_file, output_file):
    terms = []
    current_term = {}

    with open(input_file, 'r') as f:
        lines = f.readlines()

    for line in lines:
        line = line.strip()
        if line.startswith('[Term]'):
            if current_term:
                terms.append(current_term.copy())
                current_term.clear()
        elif line.startswith('id:'):
            current_term['id'] = line.split(': ', 1)[1]
        elif line.startswith('name:'):
            current_term['name'] = line.split(': ', 1)[1]
        elif line.startswith('def:'):
            current_term['def'] = line.split('"', 1)[1].rsplit('"', 1)[0]
        elif line.startswith('synonym:'):
            current_term.setdefault('synonyms', []).append(line.split('"', 1)[1].rsplit('"', 1)[0])
        elif line.startswith('is_obsolete:'):
            current_term['is_obsolete'] = line.split(': ', 1)[1]

    if current_term:
        terms.append(current_term)

    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['id', 'name', 'def', 'synonyms', 'is_obsolete']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for term in terms:
            writer.writerow(term)

In [10]:
parse_txt_to_csv(os.path.join(input_dir, "MPheno_OBO.ontology"), os.path.join(input_dir, "mouse_phenotype.csv"))

In [11]:
# merge all data to build the test set
df_mouse_phenotype = pd.read_csv(os.path.join(input_dir, "mouse_phenotype.csv"))

In [12]:
df_mouse_protein_phenotype = pd.read_csv(os.path.join(input_dir, "human_protein_mouse_protein_mouse_phenotype.csv"))

In [13]:
df_mouse_protein = pd.read_csv(os.path.join(input_dir, "mouse_mgi2protein.tsv"), sep="\t")

In [14]:
df_mouse_protein = df_mouse_protein[["From","Sequence"]].rename(columns={"From":"MGI"})

In [15]:
df_mouse_protein["Sequence"].isnull().sum()

0

In [16]:
df_mouse_protein_phenotype = df_mouse_protein_phenotype[["MGI Marker Accession ID","Mouse Phenotype ID"]].dropna().reset_index(drop=True).rename(
columns={"MGI Marker Accession ID":"MGI"})

In [17]:
df_mouse_protein_phenotype = df_mouse_protein_phenotype.merge(df_mouse_protein, on="MGI", how="left")

In [18]:
df_mouse_protein_phenotype = df_mouse_protein_phenotype[df_mouse_protein_phenotype["Sequence"].notnull()].reset_index(drop=True)

In [19]:
df_mouse_protein_phenotype = df_mouse_protein_phenotype.reset_index(drop=True)

In [20]:
df_mouse_protein_phenotype

Unnamed: 0,MGI,Mouse Phenotype ID,Sequence
0,MGI:1917115,"MP:0005367, MP:0005369, MP:0005370, MP:0005376...",MESNHKSGDGLSGTQKEAALRALVQRTGYSLVQENGQRKYGGPPPG...
1,MGI:3512453,"MP:0005376, MP:0005386, MP:0010768",MGISCSHLEETMSKPPDCLLRMLRGTPRQRVFTFFIISFKFMFLIS...
2,MGI:2143261,"MP:0002006, MP:0005381, MP:0005384, MP:0005385...",MLKEIYLSLSLVLVFACGLLYQLTMRSQCFFACLPPFSFPQGLDGL...
3,MGI:2443767,"MP:0005378, MP:0005386, MP:0005389",MCSLGLFPPPPPRGQVTLYEHNNELVTGNSYESPPPDFRGQWINLP...
4,MGI:1926144,MP:0005386,MSKLARLEREEIMECQVMWEPDSKKDTQMDRFRAAVGTACGLALGN...
...,...,...,...
12946,MGI:1915264,"MP:0005376, MP:0005378, MP:0005380, MP:0005385...",MWSRMNRAAEEFYARLRQEFNEEKKGASKDPFIYEADVQVQLISKG...
12947,MGI:1289227,"MP:0005376, MP:0005380, MP:0005384, MP:0005386...",MADAEKNAVAEKNNAVATKEVLAEAAAILEPVGLQEEAELPAKIME...
12948,MGI:2685277,"MP:0005386, MP:0010768",MPEDQAHAAMEEASPYSLLDICLSFLTTNLEKFCSARQDGTLCLQE...
12949,MGI:103072,MP:0005384,MAAPRPPPAISVSVSAPAFYAPQKKFAPVVAPKPKVNPFRPGDSEP...


In [21]:
df_mouse_phenotype = df_mouse_phenotype[["id","name","def"]].rename(columns={"id":"MPI","name":"MP_name","def":"MP_def"})

In [22]:
df_mouse_protein_phenotype["Mouse Phenotype ID"] = df_mouse_protein_phenotype["Mouse Phenotype ID"].apply(lambda x: [x_.strip() for x_ in x.split(",")])

In [23]:
df_mouse_protein_phenotype = df_mouse_protein_phenotype.rename(columns={"Mouse Phenotype ID":"MPI"})

In [24]:
df_mouse_protein_phenotype.to_csv("MGI_MPI_sequence.csv",index=False)

In [25]:
df_mouse_phenotype["MP_def"] = df_mouse_phenotype["MP_name"] + ": " + df_mouse_phenotype["MP_def"]

In [26]:
df_mouse_phenotype.to_csv("MP.csv",index=False)

In [27]:
df = pd.read_csv(os.path.join(input_dir, "human_protein_mouse_protein_mouse_phenotype.csv"))

In [28]:
df_human_protein = pd.read_csv("/home/ec2-user/data/Processed/protein.csv")

In [29]:
df_human_protein

Unnamed: 0,node_index,node_id,node_type,node_name,node_source,sequence
0,0,9796,gene/protein,PHYHIP,NCBI,MELLSTPHSIEINNITCDSFRISWAMEDSDLERVTHYFIDLNKKEN...
1,1,7918,gene/protein,GPANK1,NCBI,MSRPLLITFTPATDPSDLWKDGQQQPQPEKPESTLDGAAARAFYEA...
2,2,8233,gene/protein,ZRSR2,NCBI,MAAPEKMTFPEKPSHKKYRAALKKEKRKKRRQELARLRDSGLSQKE...
3,3,4899,gene/protein,NRF1,NCBI,MEEHGVTQTEHMATIEAHAVAQQVQQVHVATYTEHSMLSADEDSPS...
4,4,5297,gene/protein,PI4KA,NCBI,MAAAPARGGGGGGGGGGGCSGSGSSASRGFYFNTVLSLARSLAVQR...
...,...,...,...,...,...,...
19157,83735,100133251,gene/protein,PRR23D2,NCBI,MYGYRRLRSPRDSQTEPQNDNEGETSLATTQMNPPKRRQVEQGPST...
19158,83735,100133251,gene/protein,PRR23D2,NCBI,MYGYRRLRSPRDSQTEPQNDNEGETSLATTQMNPPKRRQVEQGPST...
19159,83740,389649,gene/protein,C8orf86,NCBI,MRPLGKGLLPAEELIRSNLGVGRSLRDCLSQSGKLAEELGSKRLKP...
19160,83746,343990,gene/protein,CRACDL,NCBI,MISTRVMDIKLREAAEGLGEDSTGKKKSKFKTFKKFFGKKKRKESP...


In [30]:
df = df[df["Human Marker Symbol"].isin(df_human_protein["node_name"])]

In [31]:
# build mouse protein - human protein
df = df.rename(columns = {"Human Marker Symbol":"Human_Protein_Name","MGI Marker Accession ID":"MGI", "Mouse Phenotype ID":"MPI"}).reset_index(drop=True)

In [32]:
# get human protein to human phenotype
df_human_protein = df[["Human_Protein_Name"]].drop_duplicates()

In [33]:
df_human_protein = df_human_protein.reset_index(drop=True)

In [34]:
df_kg = pd.read_csv("/home/ec2-user/data/PrimeKG/kg.csv")

  df_kg = pd.read_csv("/home/ec2-user/data/PrimeKG/kg.csv")


In [35]:
df_human_protein_phenotype = df_kg[(df_kg["x_type"] == "gene/protein") & (df_kg["y_type"] == "effect/phenotype")].reset_index(drop=True)

In [36]:
df_human_protein_phenotype = df_human_protein.rename(columns={"Human_Protein_Name":"x_name"}).merge(df_human_protein_phenotype)

In [45]:
df_human_protein_phenotype["y_id"] = df_human_protein_phenotype["y_id"].apply(lambda x: "HP:"+str(x).rjust(7,'0'))

In [46]:
df_human_protein_phenotype = df_human_protein_phenotype[["x_name","y_id"]].groupby("x_name")["y_id"].apply(list).reset_index()

In [47]:
df_human_protein_phenotype

Unnamed: 0,x_name,y_id
0,A1BG,[HP:0002240]
1,ABAT,"[HP:0001254, HP:0010547, HP:0001252, HP:000125..."
2,ABCA3,[HP:0002092]
3,ABCB1,"[HP:0011157, HP:0002013, HP:0002018]"
4,ABCB11,[HP:0000989]
...,...,...
1578,ZMIZ1,"[HP:0001250, HP:0011968, HP:0000152, HP:0001507]"
1579,ZNF142,"[HP:0001250, HP:0001337]"
1580,ZNF292,"[HP:0001250, HP:0011968, HP:0003808, HP:000027..."
1581,ZNF462,[HP:0005487]


In [48]:
df_human_protein_phenotype = df_human_protein_phenotype.rename(columns={"x_name":"human_protein_name","y_id":"human_phenotype_id"})

In [53]:
df_human_protein_phenotype.to_csv("/home/ec2-user/data/mouse_protein/processed/human_protein_phenotype.csv",index=False)

In [3]:
# load 
df = pd.read_csv("/home/ec2-user/data/mouse_protein/processed/human_protein_mouse_protein_mouse_phenotype.csv")

In [4]:
df_human = pd.read_csv("/home/ec2-user/data/mouse_protein/processed/human_protein_phenotype.csv")

In [5]:
df_human

Unnamed: 0,human_protein_name,human_phenotype_id
0,A1BG,['HP:0002240']
1,ABAT,"['HP:0001254', 'HP:0010547', 'HP:0001252', 'HP..."
2,ABCA3,['HP:0002092']
3,ABCB1,"['HP:0011157', 'HP:0002013', 'HP:0002018']"
4,ABCB11,['HP:0000989']
...,...,...
1578,ZMIZ1,"['HP:0001250', 'HP:0011968', 'HP:0000152', 'HP..."
1579,ZNF142,"['HP:0001250', 'HP:0001337']"
1580,ZNF292,"['HP:0001250', 'HP:0011968', 'HP:0003808', 'HP..."
1581,ZNF462,['HP:0005487']


In [6]:
df = df.rename(columns={"Human Marker Symbol":"human_protein_name"})

In [7]:
df = df.merge(df_human, on="human_protein_name")

In [8]:
df = df[["MGI Marker Accession ID", "human_phenotype_id"]].rename(columns= {"MGI Marker Accession ID":"MGI"})

In [10]:
df.to_csv("/home/ec2-user/data/mouse_protein/processed/mouse_protein_human_phenotype.csv", index=False)

In [62]:
all_hpo = df["human_phenotype_id"].map(eval).explode().unique().tolist()

In [63]:
import requests

In [64]:
all_hpo

['HP:0002240',
 'HP:0001254',
 'HP:0010547',
 'HP:0001252',
 'HP:0001250',
 'HP:0008000',
 'HP:0001284',
 'HP:0003438',
 'HP:0007359',
 'HP:0002092',
 'HP:0011157',
 'HP:0002013',
 'HP:0002018',
 'HP:0000989',
 'HP:0001410',
 'HP:0002910',
 'HP:0000625',
 'HP:0010867',
 'HP:0010871',
 'HP:0002078',
 'HP:0002311',
 'HP:0002070',
 'HP:0012232',
 'HP:0000846',
 'HP:0002313',
 'HP:0012393',
 'HP:0004324',
 'HP:0001337',
 'HP:0000365',
 'HP:0002936',
 'HP:0000762',
 'HP:0000855',
 'HP:0012418',
 'HP:0003498',
 'HP:0012592',
 'HP:0011675',
 'HP:0001699',
 'HP:0100602',
 'HP:0100603',
 'HP:0003251',
 'HP:0001342',
 'HP:0012735',
 'HP:0006685',
 'HP:0005268',
 'HP:0002354',
 'HP:0002080',
 'HP:0002345',
 'HP:0030186',
 'HP:0002174',
 'HP:0002322',
 'HP:0200085',
 'HP:0002912',
 'HP:0010783',
 'HP:0001511',
 'HP:0002268',
 'HP:0002451',
 'HP:0000377',
 'HP:0000307',
 'HP:0000637',
 'HP:0000343',
 'HP:0000219',
 'HP:0002553',
 'HP:0001257',
 'HP:0000421',
 'HP:0001892',
 'HP:0001824',
 'HP:00043

In [26]:
# response = requests.get("https://clinicaltables.nlm.nih.gov/api/hpo/v3/search?terms=Renal%20cyst&df=id,name,definition&maxList=1")

In [34]:
# response = requests.get(f"https://clinicaltables.nlm.nih.gov/api/hpo/v3/search?terms={term}&df=id,name,definition&maxList=1")

[['HP:0000107', 'Renal cyst', 'A fluid filled sac in the kidney']]

In [65]:
parsed_hpo = {}
for hpo in all_hpo:
    term = hpo.replace(" ","%20").lower()
    response = requests.get(f"https://clinicaltables.nlm.nih.gov/api/hpo/v3/search?terms={term}&df=id,name,definition&sf=id")
    try:
        parsed = eval(response.text.split("null")[-1][2:-2])
    except:
        print(f"{hpo} not found")
    parsed_hpo[hpo] = parsed

In [69]:
df_hpo = {
    "HPO":[],
    "name":[],
    "definition":[],
    "human_phenotype_id":[]
}
for k, v in parsed_hpo.items():
    df_hpo["HPO"].append(v[0])
    df_hpo["human_phenotype_id"].append(k)
    df_hpo["name"].append(v[1])
    df_hpo["definition"].append(v[2])

In [70]:
df_hpo = pd.DataFrame(df_hpo)

In [72]:
df_hpo[(df_hpo["HPO"] != df_hpo["human_phenotype_id"])]

Unnamed: 0,HPO,name,definition,human_phenotype_id


In [75]:
df_hpo.drop(["human_phenotype_id"],axis=1).to_csv("/home/ec2-user/data/mouse_protein/processed/human_phenotype.csv", index=False)

In [76]:
df_hpo

Unnamed: 0,HPO,name,definition,human_phenotype_id
0,HP:0002240,Hepatomegaly,Abnormally increased size of the liver,HP:0002240
1,HP:0001254,Lethargy,"A state of disinterestedness, listlessness, an...",HP:0001254
2,HP:0010547,Muscle flaccidity,A type of paralysis in which a muscle becomes ...,HP:0010547
3,HP:0001252,Hypotonia,Hypotonia is an abnormally low muscle tone (th...,HP:0001252
4,HP:0001250,Seizure,A seizure is an intermittent abnormality of ne...,HP:0001250
...,...,...,...,...
348,HP:0001289,Confusion,"Lack of clarity and coherence of thought, perc...",HP:0001289
349,HP:0000737,Irritability,"A proneness to anger, i.e., a condition of bei...",HP:0000737
350,HP:0003218,Oroticaciduria,An increased concentration of orotic acid in t...,HP:0003218
351,HP:0000324,Facial asymmetry,An abnormal difference between the left and ri...,HP:0000324
