In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Directory holding the raw downloads from each source.
DATA_DIR = 'integration_data'
# Directory where every processed table produced by this notebook is written.
OUTPUT_DATA_DIR = 'integration_data_processed'
# Create the output directory (and any missing parents); do nothing if it already exists.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

In [2]:
# Load the identifier-mapping tables used by every section below.

# Gene table: later cells read its 'NCBI Gene ID(supplied by NCBI)',
# 'Approved symbol' and 'CUI_gene' columns.
gene_mapping_filepath = os.path.join(DATA_DIR, 'gene_mapping.csv')
gene_mapping_df = pd.read_csv(gene_mapping_filepath)

# GO term -> CUI table (joined later on its 'GO' column).
go_to_cui_filepath = os.path.join(DATA_DIR, 'GO_to_CUI.csv')
go_to_cui_df = pd.read_csv(go_to_cui_filepath)

# HPO term -> CUI table (joined later on its 'HPO' column).
hpo_to_cui_filepath = os.path.join(DATA_DIR, 'HPO_to_CUI.csv')
hpo_to_cui_df = pd.read_csv(hpo_to_cui_filepath)

# Format DisGeNET

Format disease-gene interactions from DisGeNET (https://www.disgenet.org/downloads). Standardize diseases and genes to their UMLS CUIs.

In [3]:
# Raw DisGeNET curated gene-disease associations (tab-separated).
input_filepath = os.path.join(DATA_DIR, 'curated_gene_disease_associations.tsv')
# Destination of the processed disease-gene table.
output_filepath = os.path.join(OUTPUT_DATA_DIR, 'disease_gene_relationships.csv')

In [4]:
# Load DisGeNET, expose its disease identifier as CUI_disease, and attach the
# gene CUI by joining the Entrez gene id against the gene mapping table.
# The merge uses the default inner join, so genes without a mapping are dropped.
df = (
    pd.read_csv(input_filepath, sep='\t')
    .rename(columns={'diseaseId': 'CUI_disease'})
    .merge(
        gene_mapping_df[['NCBI Gene ID(supplied by NCBI)', 'CUI_gene']].drop_duplicates().dropna(),
        left_on='geneId',
        right_on='NCBI Gene ID(supplied by NCBI)',
    )
)

In [5]:
# Preview the merged DisGeNET frame to sanity-check the gene-CUI join.
df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,CUI_disease,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source,NCBI Gene ID(supplied by NCBI),CUI_gene
0,1,A1BG,0.7,0.538,C0019209,Hepatomegaly,phenotype,C23;C06,Finding,0.3,1.0,2017.0,2017.0,1,0,CTD_human,1.0,C1412045
1,1,A1BG,0.7,0.538,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.3,1.0,2015.0,2015.0,1,0,CTD_human,1.0,C1412045
2,2,A2M,0.529,0.769,C0002395,Alzheimer's Disease,disease,C10;F03,Disease or Syndrome,0.5,0.769,1998.0,2018.0,3,0,CTD_human,2.0,C1412046
3,2,A2M,0.529,0.769,C0007102,Malignant tumor of colon,disease,C06;C04,Neoplastic Process,0.31,1.0,2004.0,2019.0,1,0,CTD_human,2.0,C1412046
4,2,A2M,0.529,0.769,C0009375,Colonic Neoplasms,group,C06;C04,Neoplastic Process,0.3,1.0,2004.0,2004.0,1,0,CTD_human,2.0,C1412046


In [6]:
# Keep only the relationship columns, de-duplicate, require both CUIs,
# and replace any remaining missing values (names / score) with 0.
df = (
    df[['CUI_disease', 'CUI_gene', 'diseaseName', 'geneSymbol', 'score']]
    .drop_duplicates()
    .dropna(subset=['CUI_disease', 'CUI_gene'])
    .fillna(0)
)
df.to_csv(output_filepath, index=False)

# Report the number of exported relationships and show a sample.
print(len(df))
df.head()

100782


Unnamed: 0,CUI_disease,CUI_gene,diseaseName,geneSymbol,score
0,C0019209,C1412045,Hepatomegaly,A1BG,0.3
1,C0036341,C1412045,Schizophrenia,A1BG,0.3
2,C0002395,C1412046,Alzheimer's Disease,A2M,0.5
3,C0007102,C1412046,Malignant tumor of colon,A2M,0.31
4,C0009375,C1412046,Colonic Neoplasms,A2M,0.3


# Format DGIdb
Format drug-gene interactions from DGIdb (https://www.dgidb.org/downloads). Standardize drugs and genes to their official UMLS CUI.

In [7]:
# Raw DGIdb drug-gene interaction export (tab-separated).
input_filepath = os.path.join(DATA_DIR, 'drug_gene_interactions.tsv')
# Destination of the processed drug-gene table.
output_filepath = os.path.join(OUTPUT_DATA_DIR, 'drug_gene_relationships.csv')

In [8]:
# Load the DGIdb interactions; `df` is reused as the working frame for this section.
df = pd.read_csv(input_filepath, sep='\t')

In [9]:
# Convert genes to CUI: the join key is the Entrez gene id (`entrez_id`),
# matched against the NCBI gene id column of the gene mapping table.
# Inner join (the default), so interactions whose gene has no mapping are dropped.
df = (
    df.merge(
        gene_mapping_df[['NCBI Gene ID(supplied by NCBI)', 'CUI_gene']].drop_duplicates().dropna(),
        left_on='entrez_id',
        right_on='NCBI Gene ID(supplied by NCBI)',
    )
    .drop_duplicates()
)

In [10]:
# Count distinct (gene CUI, drug name) pairs where both values are present.
print('number of gene-drug interactions from DGIdb')
len(
    df[['CUI_gene', 'drug_claim_primary_name']]
    .dropna()
    .drop_duplicates()
)

number of gene-drug interactions from DGIdb


104534

In [11]:
# Build a name -> CUI lookup table from the processed SemRep relationships file.
# Both ends of each relationship contribute (info, identifier) pairs.
# NOTE(review): this path is not built from DATA_DIR like the other inputs — confirm it is intentional.
literature_df = pd.read_csv('../data/semrep_relationships_processed.csv')
id_to_cui_1 = literature_df[['info1', 'identifier1']].rename(columns={'info1': 'info', 'identifier1': 'identifier'})
id_to_cui_2 = literature_df[['info2', 'identifier2']].rename(columns={'info2': 'info', 'identifier2': 'identifier'})

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
id_to_cui = pd.concat([id_to_cui_1, id_to_cui_2]).drop_duplicates()

# A cell may pack several ';'-separated values. `.str.split` splits strings exactly
# like `x.split(';')` but leaves missing values as NaN instead of raising.
id_to_cui['info'] = id_to_cui['info'].str.split(';')
id_to_cui = id_to_cui.explode('info')

id_to_cui['identifier'] = id_to_cui['identifier'].str.split(';')
id_to_cui = id_to_cui.explode('identifier')

# Exploding can re-create identical (info, identifier) pairs; keep one of each.
id_to_cui = id_to_cui.drop_duplicates()

In [12]:
# Lower-case the drug names, then look each one up in the name -> CUI table.
# NOTE(review): the join is case-sensitive, so this assumes `info` in id_to_cui
# is already lower-case — confirm against the SemRep file.
df['drug_claim_primary_name'] = df['drug_claim_primary_name'].str.lower()
df = (
    df.merge(
        id_to_cui,
        left_on='drug_claim_primary_name',
        right_on='info',
        how='left',  # keep drugs that have no CUI match
    )
    .rename(columns={'identifier': 'CUI_drug'})
)

In [13]:
# Preview the DGIdb frame after the gene and drug CUI joins.
df.head()

Unnamed: 0,gene_name,gene_claim_name,entrez_id,interaction_claim_source,interaction_types,drug_claim_name,drug_claim_primary_name,drug_name,drug_concept_id,interaction_group_score,PMIDs,NCBI Gene ID(supplied by NCBI),CUI_gene,info,CUI_drug
0,CDK7,CDK7,1022.0,CancerCommons,inhibitor,SNS-032,sns-032,BMS-387032,chembl:CHEMBL296468,0.82,,1022.0,C1413288,,
1,CDK7,Cyclin-dependent kinase 7,1022.0,TTD,,Oxazolyl methylthiothiazole derivative 1,oxazolyl methylthiothiazole derivative 1,,,,,1022.0,C1413288,,
2,CDK7,CAK,1022.0,ChemblInteractions,inhibitor,CHEMBL1944698,tg-02,ZOTIRACICLIB,chembl:CHEMBL1944698,0.2,,1022.0,C1413288,,
3,CDK7,BE0001913,1022.0,DrugBank,,DB05969,sns-032,BMS-387032,chembl:CHEMBL296468,0.82,,1022.0,C1413288,,
4,CDK7,CDK7,1022.0,MyCancerGenome,inhibitor,SELICICLIB,seliciclib,SELICICLIB,chembl:CHEMBL14762,0.87,,1022.0,C1413288,,


In [14]:
# Keep the relationship columns, de-duplicate, and require a drug name and a gene CUI.
# CUI_drug is NOT required: drugs without a CUI match stay in the table, and the
# final fillna(0) writes 0 into their CUI_drug (and into missing scores).
df = (
    df[['CUI_drug', 'CUI_gene', 'drug_claim_primary_name', 'gene_name', 'interaction_group_score']]
    .drop_duplicates()
    .dropna(subset=['drug_claim_primary_name', 'CUI_gene'])
    .fillna(0)
)
df.to_csv(output_filepath, index=False)

# Report the number of exported relationships and show a sample.
print(len(df))
df.head()

99414


Unnamed: 0,CUI_drug,CUI_gene,drug_claim_primary_name,gene_name,interaction_group_score
0,0,C1413288,sns-032,CDK7,0.82
1,0,C1413288,oxazolyl methylthiothiazole derivative 1,CDK7,0.0
2,0,C1413288,tg-02,CDK7,0.2
4,0,C1413288,seliciclib,CDK7,0.87
6,0,C1413288,bms-387032,CDK7,0.82


# Format STRING
Format protein-protein interactions from STRING (https://string-db.org/cgi/download?species_text=Homo+sapiens). Standardize proteins to their gene CUI.

In [15]:
# STRING protein-protein links for taxon 9606 (read below with a space separator).
input_filepath = os.path.join(DATA_DIR, '9606.protein.links.v11.0.txt')
# STRING protein info table, used to map protein ids to preferred names.
string_info_filepath = os.path.join(DATA_DIR, '9606.protein.info.v11.0.txt')
# Destination of the processed gene-gene table.
output_filepath = os.path.join(OUTPUT_DATA_DIR, 'gene_gene_relationships.csv')

In [16]:
# Load the tab-separated STRING protein info table.
info_df = pd.read_csv(string_info_filepath, sep="\t")

In [17]:
# Keep only PPIs whose combined_score is at least THRESHOLD (scores are out of 1000).
# The cutoff in effect is 400: the saved output of this notebook retains
# interactions with scores such as 418 and 490.
THRESHOLD = 400
interactions_df = pd.read_csv(input_filepath, sep=" ")
interactions_df = interactions_df[interactions_df['combined_score'] >= THRESHOLD]

In [18]:
# Convert both protein ids to their preferred names by joining the STRING info
# table twice, once per interaction partner. The info table's remaining columns
# are carried along both times and receive pandas' default _x / _y suffixes.
df = (
    interactions_df
    .merge(info_df, how='left', left_on='protein1', right_on='protein_external_id')
    .drop('protein_external_id', axis=1)
    .rename(columns={'preferred_name': 'preferred_name1'})
    .merge(info_df, how='left', left_on='protein2', right_on='protein_external_id')
    .drop('protein_external_id', axis=1)
    .rename(columns={'preferred_name': 'preferred_name2'})
)


In [19]:
# Convert each partner's gene symbol to a CUI via the gene mapping table.
# A symbol can map to more than one CUI, in which case the row is repeated
# once per CUI (left join, so unmapped symbols are kept with a missing CUI).
df = (
    df
    .merge(
        gene_mapping_df[['Approved symbol', 'CUI_gene']].dropna().drop_duplicates(),
        how='left',
        left_on='preferred_name1',
        right_on='Approved symbol',
    )
    .drop('Approved symbol', axis=1)
    .rename(columns={'CUI_gene': 'CUI_gene_1'})
    .merge(
        gene_mapping_df[['Approved symbol', 'CUI_gene']].dropna().drop_duplicates(),
        how='left',
        left_on='preferred_name2',
        right_on='Approved symbol',
    )
    .drop('Approved symbol', axis=1)
    .rename(columns={'CUI_gene': 'CUI_gene_2'})
)

In [20]:
# Preview the STRING frame after the symbol and CUI joins.
df.head()

Unnamed: 0,protein1,protein2,combined_score,preferred_name1,protein_size_x,annotation_x,preferred_name2,protein_size_y,annotation_y,CUI_gene_1,CUI_gene_2
0,9606.ENSP00000000233,9606.ENSP00000272298,490,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...,CALM2,149,"Calmodulin 2 (phosphorylase kinase, delta); EF...",C1412510,C1413089
1,9606.ENSP00000000233,9606.ENSP00000418915,606,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...,CDKN2A,167,Cyclin-dependent kinase inhibitor 2A; Acts as ...,C1412510,C0525037
2,9606.ENSP00000000233,9606.ENSP00000418915,606,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...,CDKN2A,167,Cyclin-dependent kinase inhibitor 2A; Acts as ...,C1412510,C0384826
3,9606.ENSP00000000233,9606.ENSP00000418915,606,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...,CDKN2A,167,Cyclin-dependent kinase inhibitor 2A; Acts as ...,C1412510,C0249880
4,9606.ENSP00000000233,9606.ENSP00000356737,418,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...,GORAB,394,"Golgin, RAB6 interacting",C1412510,C1822658


In [21]:
# Reduce to the relationship columns.
df = df[['CUI_gene_1', 'CUI_gene_2', 'preferred_name1', 'preferred_name2', 'combined_score']]

# One row per (CUI_gene_1, CUI_gene_2) pair, then drop rows with any missing value.
# The trailing fillna(0) has nothing left to fill after the unrestricted dropna().
df = (
    df.drop_duplicates(subset=['CUI_gene_1', 'CUI_gene_2'])
    .dropna()
    .fillna(0)
)
df.to_csv(output_filepath, index=False)

# Report the number of exported relationships and show a sample.
print(len(df))
df.head()

2143493


Unnamed: 0,CUI_gene_1,CUI_gene_2,preferred_name1,preferred_name2,combined_score
0,C1412510,C1413089,ARF5,CALM2,490
1,C1412510,C0525037,ARF5,CDKN2A,606
2,C1412510,C0384826,ARF5,CDKN2A,606
3,C1412510,C0249880,ARF5,CDKN2A,606
4,C1412510,C1822658,ARF5,GORAB,418


# Format Uniprot
Format protein/gene to gene ontology (GO) relationships from Uniprot (https://www.uniprot.org/uniprot/?query=*&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes). Standardize proteins to their gene CUIs and gene ontology to CUI.

In [22]:
# UniProt export of reviewed human entries (tab-separated).
input_filepath = os.path.join(DATA_DIR, 'uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_+AND+review--.tab')
# Destination of the processed gene-GO table.
output_filepath = os.path.join(OUTPUT_DATA_DIR, 'gene_GO_relationships.csv')

In [23]:
def split_by(x, sep):
    """Split a delimited cell value into a list of clean, non-empty tokens.

    Args:
        x: The cell value. It is converted with ``str()`` first, so a missing
            value (NaN) becomes the text ``"nan"`` and yields an empty list.
        sep: The separator string to split on.

    Returns:
        A list of tokens with surrounding whitespace removed. Empty tokens,
        whitespace-only tokens and the literal token ``"nan"`` are omitted.
    """
    # Strip BEFORE filtering so that whitespace-only pieces (e.g. the tail of
    # "a; b; ") are recognised as empty instead of surviving as "".
    stripped = (piece.strip() for piece in str(x).split(sep))
    return [token for token in stripped if token and token != "nan"]

In [24]:
# Load the UniProt export with every column read as text.
# NOTE(review): the next cell reads the same file again, so this load looks redundant.
df = pd.read_csv(input_filepath, sep="\t", dtype=str)

In [25]:
# Load the UniProt export (all columns as text) and turn the two multi-valued
# columns into Python lists so they can be exploded into one row per value.
df = pd.read_csv(input_filepath, sep="\t", dtype=str)
df['GO_ID'] = df['Gene ontology IDs'].apply(split_by, args=(";",))  # GO ids are ';'-separated
df['Gene name'] = df['Gene names'].apply(split_by, args=(" ",))  # gene names are space-separated

# split_by always returns a list (possibly empty), so this dropna is not expected
# to remove anything here; rows lacking a GO id or gene name are removed by the
# dropna that follows the explode step.
df = df.dropna(subset=['GO_ID', 'Gene name'])

In [26]:
# One row per (gene name, GO id) combination. Exploding an empty list produces
# a missing value, so entries lacking either field are dropped afterwards.
df = (
    df.explode('Gene name')
    .explode('GO_ID')
    .dropna(subset=['GO_ID', 'Gene name'])
)

In [27]:
# GO to CUI: attach the CUI of each GO id (left join keeps unmapped GO ids for now).
df = (
    df.merge(
        go_to_cui_df.dropna().drop_duplicates(),
        how='left',
        left_on='GO_ID',
        right_on='GO',
    )
    .rename(columns={'CUI': "CUI_GO"})
)

In [28]:
# Map each gene name to its gene CUI by matching it against the approved symbols
# of the gene mapping table (left join keeps unmatched names for now).
df = df.merge(
    gene_mapping_df[['Approved symbol', 'CUI_gene']].dropna().drop_duplicates(),
    left_on='Gene name',
    right_on='Approved symbol',
    how='left',
)


In [29]:
# Preview the UniProt frame after the GO and gene CUI joins.
df.head()

Unnamed: 0,Entry,Entry name,Status,Gene names,Gene ontology (biological process),Gene ontology (cellular component),Gene ontology (GO),Gene ontology (molecular function),Gene ontology IDs,Cross-reference (GeneID),...,Gene names (synonym ),Protein names,Organism,Ensembl transcript,GO_ID,Gene name,GO,CUI_GO,Approved symbol,CUI_gene
0,O95825,QORL1_HUMAN,reviewed,CRYZL1 4P11,quinone metabolic process [GO:1901661],cytosol [GO:0005829],cytosol [GO:0005829]; NADP binding [GO:0050661...,NADP binding [GO:0050661]; NADPH:quinone reduc...,GO:0003960; GO:0005829; GO:0050661; GO:1901661,9946;,...,4P11,Quinone oxidoreductase-like protein 1 (EC 1.-....,Homo sapiens (Human),ENST00000381554 [O95825-1];ENST00000420072 [O9...,GO:0003960,CRYZL1,GO:0003960,C1151392,CRYZL1,C1413743
1,O95825,QORL1_HUMAN,reviewed,CRYZL1 4P11,quinone metabolic process [GO:1901661],cytosol [GO:0005829],cytosol [GO:0005829]; NADP binding [GO:0050661...,NADP binding [GO:0050661]; NADPH:quinone reduc...,GO:0003960; GO:0005829; GO:0050661; GO:1901661,9946;,...,4P11,Quinone oxidoreductase-like protein 1 (EC 1.-....,Homo sapiens (Human),ENST00000381554 [O95825-1];ENST00000420072 [O9...,GO:0003960,CRYZL1,GO:0003960,C2257368,CRYZL1,C1413743
2,O95825,QORL1_HUMAN,reviewed,CRYZL1 4P11,quinone metabolic process [GO:1901661],cytosol [GO:0005829],cytosol [GO:0005829]; NADP binding [GO:0050661...,NADP binding [GO:0050661]; NADPH:quinone reduc...,GO:0003960; GO:0005829; GO:0050661; GO:1901661,9946;,...,4P11,Quinone oxidoreductase-like protein 1 (EC 1.-....,Homo sapiens (Human),ENST00000381554 [O95825-1];ENST00000420072 [O9...,GO:0003960,CRYZL1,GO:0003960,C2257369,CRYZL1,C1413743
3,O95825,QORL1_HUMAN,reviewed,CRYZL1 4P11,quinone metabolic process [GO:1901661],cytosol [GO:0005829],cytosol [GO:0005829]; NADP binding [GO:0050661...,NADP binding [GO:0050661]; NADPH:quinone reduc...,GO:0003960; GO:0005829; GO:0050661; GO:1901661,9946;,...,4P11,Quinone oxidoreductase-like protein 1 (EC 1.-....,Homo sapiens (Human),ENST00000381554 [O95825-1];ENST00000420072 [O9...,GO:0005829,CRYZL1,GO:0005829,C1383501,CRYZL1,C1413743
4,O95825,QORL1_HUMAN,reviewed,CRYZL1 4P11,quinone metabolic process [GO:1901661],cytosol [GO:0005829],cytosol [GO:0005829]; NADP binding [GO:0050661...,NADP binding [GO:0050661]; NADPH:quinone reduc...,GO:0003960; GO:0005829; GO:0050661; GO:1901661,9946;,...,4P11,Quinone oxidoreductase-like protein 1 (EC 1.-....,Homo sapiens (Human),ENST00000381554 [O95825-1];ENST00000420072 [O9...,GO:0050661,CRYZL1,GO:0050661,C1323255,CRYZL1,C1413743


In [30]:
# Keep the relationship columns, de-duplicate, and require both CUIs.
df = (
    df[['CUI_GO', 'CUI_gene', 'GO_ID', 'Gene name']]
    .drop_duplicates()
    .dropna(subset=['CUI_GO', 'CUI_gene'])
)
df.to_csv(output_filepath, index=False)

# Report the number of exported relationships and show a sample.
print(len(df))
df.head()

580380


Unnamed: 0,CUI_GO,CUI_gene,GO_ID,Gene name
0,C1151392,C1413743,GO:0003960,CRYZL1
1,C2257368,C1413743,GO:0003960,CRYZL1
2,C2257369,C1413743,GO:0003960,CRYZL1
3,C1383501,C1413743,GO:0005829,CRYZL1
4,C1323255,C1413743,GO:0050661,CRYZL1


# Format HPO
Format gene-phenotype and disease-phenotype relationships from the Human Phenotype Ontology (HPO) (https://hpo.jax.org/app/download/annotation). Standardize phenotypes, genes, and diseases to CUI.

### Format gene-phenotype relationships

In [31]:
# HPO genes-to-phenotype annotation file (tab-separated).
input_filepath = os.path.join(DATA_DIR, 'genes_to_phenotype.txt')
# Destination of the processed gene-phenotype table.
output_filepath = os.path.join(OUTPUT_DATA_DIR, 'gene_phenotype_relationships.csv')

In [32]:
# Skip the file's first line and read everything as text; the column names are
# assigned explicitly below instead of being taken from the file.
df = pd.read_csv(input_filepath, sep="\t", skiprows=1, header=None, dtype=str)
df.columns = [
    "entrez-gene-id",
    "entrez-gene-symbol",
    "HPO-Term-ID",
    "HPO-Term-Name",
    "Frequency-Raw",
    "Frequency-HPO",
    "Additional Info from G-D source",
    "G-D source",
    "disease-ID for link",
]


In [33]:
# HPO to CUI: attach the CUI of each HPO term (left join keeps unmapped terms for now).
df = (
    df.merge(
        hpo_to_cui_df,
        how='left',
        left_on='HPO-Term-ID',
        right_on='HPO',
    )
    .rename(columns={'CUI': 'CUI_HPO'})
)

In [34]:
# Convert gene to CUI. The ids were read as text, so cast them to float first;
# the mapping table's NCBI id column shows float values (e.g. 8192.0) in the
# saved output, and the join keys must compare equal.
df['entrez-gene-id'] = df['entrez-gene-id'].astype(float)
df = df.merge(
    gene_mapping_df[['NCBI Gene ID(supplied by NCBI)', 'CUI_gene']].drop_duplicates().dropna(),
    left_on='entrez-gene-id',
    right_on='NCBI Gene ID(supplied by NCBI)',
    how='left',
)


In [35]:
# Preview the gene-phenotype frame after the HPO and gene CUI joins.
df.head()

Unnamed: 0,entrez-gene-id,entrez-gene-symbol,HPO-Term-ID,HPO-Term-Name,Frequency-Raw,Frequency-HPO,Additional Info from G-D source,G-D source,disease-ID for link,HPO,CUI_HPO,NCBI Gene ID(supplied by NCBI),CUI_gene
0,8192.0,CLPP,HP:0000013,Hypoplasia of the uterus,-,,-,mim2gene,OMIM:614129,HP:0000013,C0266399,8192.0,C1413504
1,8192.0,CLPP,HP:0000786,Primary amenorrhea,-,,-,mim2gene,OMIM:614129,HP:0000786,C0232939,8192.0,C1413504
2,8192.0,CLPP,HP:0004322,Short stature,-,HP:0040283,-,mim2gene,OMIM:614129,HP:0004322,C0349588,8192.0,C1413504
3,8192.0,CLPP,HP:0000007,Autosomal recessive inheritance,-,,-,mim2gene,OMIM:614129,HP:0000007,C0441748,8192.0,C1413504
4,8192.0,CLPP,HP:0000007,Autosomal recessive inheritance,-,,-,mim2gene,OMIM:614129,HP:0000007,C4020899,8192.0,C1413504


In [36]:
# Keep the relationship columns, de-duplicate, and require both CUIs.
df = (
    df[['CUI_HPO', 'CUI_gene', 'HPO-Term-Name', 'entrez-gene-symbol']]
    .drop_duplicates()
    .dropna(subset=['CUI_HPO', 'CUI_gene'])
)
df.to_csv(output_filepath, index=False)

# Report the number of exported relationships and show a sample.
print(len(df))
df.head()

293134


Unnamed: 0,CUI_HPO,CUI_gene,HPO-Term-Name,entrez-gene-symbol
0,C0266399,C1413504,Hypoplasia of the uterus,CLPP
1,C0232939,C1413504,Primary amenorrhea,CLPP
2,C0349588,C1413504,Short stature,CLPP
3,C0441748,C1413504,Autosomal recessive inheritance,CLPP
4,C4020899,C1413504,Autosomal recessive inheritance,CLPP


### Format disease-phenotype relationships

In [37]:
# HPO phenotype annotation file (tab-separated).
input_filepath = os.path.join(DATA_DIR, 'phenotype_annotation.tab')
# Destination of the processed disease-phenotype table.
output_filepath = os.path.join(OUTPUT_DATA_DIR, 'disease_phenotype_relationships.csv')

In [38]:
# Read the annotations as text and restrict them to rows whose disease database is OMIM.
df = pd.read_csv(input_filepath, sep="\t", dtype=str)
is_omim = df["#disease-db"] == "OMIM"
df = df[is_omim]

In [39]:
# Disease mapping table; its 'xref' and 'OMIM' columns are used below to map OMIM ids to CUIs.
disease_mapping_filepath = os.path.join(DATA_DIR, 'doid_formatted.tsv')
# NOTE(review): the file has a .tsv extension but is parsed with read_csv's default
# comma separator. The saved preview shows the columns split correctly, so the file
# appears to be comma-delimited — confirm.
disease_mapping_df = pd.read_csv(disease_mapping_filepath)

# add CUI column to disease_mapping_df
def get_cui(xref):
    """Return the first UMLS CUI listed in a comma-separated xref string.

    Args:
        xref: Cross-reference text such as ``"MESH:D004194, UMLS_CUI:C0012634"``,
            or a missing value (NaN).

    Returns:
        The text after ``UMLS_CUI:`` of the first matching entry, stripped of
        whitespace, or ``np.nan`` when the value is missing or has no such entry.
    """
    # Missing values stringify to 'nan'; there is nothing to parse in that case.
    if str(xref) == 'nan':
        return np.nan

    for entry in xref.split(","):
        fields = entry.split(":")
        if fields[0].strip() == 'UMLS_CUI':
            return fields[1].strip()
    return np.nan

# Derive the disease CUI of every row from its xref text (missing when no UMLS_CUI entry exists).
disease_mapping_df['CUI_disease'] = disease_mapping_df['xref'].apply(get_cui)

In [40]:
# Preview the disease mapping table with the newly added CUI_disease column.
disease_mapping_df.head()

Unnamed: 0,id,name,alt_id,children,comment,created_by,creation_date,definition,disjoint_from,parent_ids,subset,synonym,xref,OMIM,MESH,ICD10CM,ICD9CM,CUI_disease
0,DOID:4,disease,,"DOID:0014667, DOID:0050117, DOID:0080015, DOID...",,,,A disease is a disposition (i) to undergo path...,,,NCIthesaurus,,"MESH:D004194, NCI:C2991, SNOMEDCT_US_2020_03_0...",,D004194,,,C0012634
1,DOID:0014667,disease of metabolism,,"DOID:0060158, DOID:655",,,,A disease that involving errors in metabolic p...,,DOID:4,"DO_AGR_slim, DO_GXD_slim, NCIthesaurus",metabolic disease EXACT [],"ICD10CM:E88.9, ICD9CM:277.9, MESH:D008659, NCI...",,D008659,E88.9,277.9,C0025517
2,DOID:0050117,disease by infectious agent,"DOID:10115, DOID:11078, DOID:1304, DOID:1321, ...","DOID:0050720, DOID:0060000, DOID:1003, DOID:10...",DO:wk,,,A disease that is the consequence of the prese...,,DOID:4,"DO_GXD_slim, DO_MGI_slim, NCIthesaurus",infectious disease EXACT [],"ICD9CM:079.0, UMLS_CUI:C0001485",,,,79.0,C0001485
3,DOID:0080015,physical disorder,,"DOID:0050328, DOID:0050534, DOID:0050545, DOID...",,,,A disease that has_material_basis_in a genetic...,,DOID:4,"DO_AGR_slim, DO_GXD_slim, DO_MGI_slim",congenital disorder EXACT [],,,,,,
4,DOID:14566,disease of cellular proliferation,DOID:0000818,"DOID:0060071, DOID:0060072, DOID:162",,,,A disease that is characterized by abnormally ...,,DOID:4,DO_FlyBase_slim,"cell process disease EXACT [], ""neoplasm"" EXAC...",,,,,,


In [41]:
# OMIM to CUI: join the annotations to the disease mapping table on the OMIM id.
# Inner join (the default), so annotations whose OMIM id is absent from the
# mapping table are dropped.
df = df.merge(
    disease_mapping_df,
    left_on='disease-identifier',
    right_on='OMIM',
)

In [42]:
# HPO to CUI: attach the CUI of each HPO term (left join keeps unmapped terms for now).
df = (
    df.merge(
        hpo_to_cui_df,
        how='left',
        left_on='HPO-ID',
        right_on='HPO',
    )
    .rename(columns={'CUI': 'CUI_HPO'})
)


In [43]:
# Preview the disease-phenotype frame after the OMIM and HPO joins.
df.head()

Unnamed: 0,#disease-db,disease-identifier,disease-name,negation,HPO-ID,reference,evidence-code,onset,frequencyHPO,modifier,...,subset,synonym,xref,OMIM,MESH,ICD10CM,ICD9CM,CUI_disease,HPO,CUI_HPO
0,OMIM,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",,HP:0000028,OMIM:100050,IEA,,,,...,,,OMIM:100050,100050,,,,,HP:0000028,C0010417
1,OMIM,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",,HP:0000049,OMIM:100050,IEA,,,,...,,,OMIM:100050,100050,,,,,HP:0000049,C1858539
2,OMIM,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",,HP:0000175,OMIM:100050,TAS,,,,...,,,OMIM:100050,100050,,,,,HP:0000175,C2981150
3,OMIM,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",,HP:0000202,OMIM:100050,IEA,,,,...,,,OMIM:100050,100050,,,,,HP:0000202,C0158646
4,OMIM,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",,HP:0000202,OMIM:100050,IEA,,,,...,,,OMIM:100050,100050,,,,,HP:0000202,C4021813


In [44]:
# Keep the relationship columns, de-duplicate, and require both CUIs.
# 'name' comes from the disease mapping table; 'disease-name' from the HPO annotations.
df = (
    df[['CUI_HPO', 'CUI_disease', 'name', 'disease-name']]
    .drop_duplicates()
    .dropna(subset=['CUI_HPO', 'CUI_disease'])
)
df.to_csv(output_filepath, index=False)

# Report the number of exported relationships and show a sample.
print(len(df))
df.head()

29611


Unnamed: 0,CUI_HPO,CUI_disease,name,disease-name
59,C0443147,C0162871,abdominal aortic aneurysm,"AORTIC ANEURYSM, ABDOMINAL"
60,C0441748,C0162871,abdominal aortic aneurysm,"AORTIC ANEURYSM, ABDOMINAL"
61,C4020899,C0162871,abdominal aortic aneurysm,"AORTIC ANEURYSM, ABDOMINAL"
62,C0600599,C0162871,abdominal aortic aneurysm,"AORTIC ANEURYSM, ABDOMINAL"
63,C4732735,C0162871,abdominal aortic aneurysm,"AORTIC ANEURYSM, ABDOMINAL"
