In [1]:
import pandas as pd
import numpy as np
import os

DATA_DIR = 'integration_data' # data from sources
OUTPUT_DATA_DIR = 'integration_data_processed' # processed data from sources (where output of the code will be stored)

# Every section below writes a CSV into OUTPUT_DATA_DIR; create it up front so a
# fresh checkout does not fail at the first to_csv call. exist_ok keeps re-runs harmless.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

In [2]:
# Mapping tables used throughout the notebook to translate source identifiers to CUIs
gene_mapping_filepath, go_to_cui_filepath, hpo_to_cui_filepath = (
    os.path.join(DATA_DIR, filename)
    for filename in ('gene_mapping.csv', 'GO_to_CUI.csv', 'HPO_to_CUI.csv')
)

# gene symbol ('Alias symbol') -> CUI
gene_mapping_df = pd.read_csv(gene_mapping_filepath)
# GO id -> CUI
go_to_cui_df = pd.read_csv(go_to_cui_filepath)
# HPO id -> CUI
hpo_to_cui_df = pd.read_csv(hpo_to_cui_filepath)

# Format DisGeNET

Format disease-gene interactions from DisGeNET (https://www.disgenet.org/downloads). Standardize diseases and genes to their UMLS CUIs.

In [3]:
# DisGeNET: curated gene-disease associations in, disease-gene relationships out
input_filepath, output_filepath = (
    os.path.join(DATA_DIR, 'curated_gene_disease_associations.tsv'),
    os.path.join(OUTPUT_DATA_DIR, 'disease_gene_relationships.csv'),
)

In [4]:
# Load the DisGeNET table; its diseaseId column is carried through as the disease CUI.
df = pd.read_csv(input_filepath, sep='\t').rename(columns={'diseaseId': 'CUI_disease'})

# Inner join on gene symbol: associations whose symbol is not in the mapping are dropped.
df = df.merge(
    gene_mapping_df[['Alias symbol', 'CUI']],
    left_on='geneSymbol',
    right_on='Alias symbol',
)
df = df.rename(columns={'CUI': 'CUI_gene'})

# Keep one row per complete (disease, gene, score) triple and write it out.
df = df[['CUI_disease', 'CUI_gene', 'score']].drop_duplicates().dropna()
df.to_csv(output_filepath, index=False)

# Format DGIdb
Format drug-gene interactions from DGIdb (https://www.dgidb.org/downloads). Standardize drugs and genes to their official UMLS CUI.

In [5]:
# DGIdb: drug-gene interactions in, drug-gene relationships out
input_filepath, output_filepath = (
    os.path.join(DATA_DIR, 'drug_gene_interactions.tsv'),
    os.path.join(OUTPUT_DATA_DIR, 'drug_gene_relationships.csv'),
)

In [6]:
# Load the tab-separated DGIdb interactions table
df = pd.read_csv(
    input_filepath,
    sep='\t',
)

In [7]:
# convert gene symbol to CUI
# Inner join on gene_name against the alias table, so rows with an unmapped
# gene name are dropped; exact duplicate rows are then removed.
df = (
    df.merge(
        gene_mapping_df[['Alias symbol', 'CUI']],
        left_on='gene_name',
        right_on='Alias symbol',
    )
    .rename(columns={'CUI': 'CUI_gene'})
    .drop_duplicates()
)

In [8]:
# convert drug name to CUI
# Build a name ('info') -> identifier lookup from both sides of the literature
# relationships table.
literature_df = pd.read_csv('../data/semrep_relationships_processed.csv')
id_to_cui_1 = literature_df[['info1', 'identifier1']].rename(columns={'info1': 'info', 'identifier1': 'identifier'})
id_to_cui_2 = literature_df[['info2', 'identifier2']].rename(columns={'info2': 'info', 'identifier2': 'identifier'})

id_to_cui = (
    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0
    pd.concat([id_to_cui_1, id_to_cui_2])
    .drop_duplicates()
    # a missing name can never match a drug and would make the split below raise
    .dropna(subset=['info'])
    # 'info' holds ';'-separated names: split them, then give each name its own row
    .assign(info=lambda frame: frame['info'].map(lambda names: names.split(';')))
    .explode('info')
)

In [9]:
# Match drugs by lower-cased primary name against the name -> identifier lookup.
df['drug_claim_primary_name'] = df['drug_claim_primary_name'].str.lower()
df = df.merge(
    id_to_cui,
    how='left',
    left_on='drug_claim_primary_name',
    right_on='info',
)

# Left join + dropna: interactions whose drug name found no identifier are removed here.
df = df[['identifier', 'CUI_gene', 'interaction_group_score']].drop_duplicates().dropna()
df = df.rename(columns={'identifier': 'CUI_drug'})
df.to_csv(output_filepath, index=False)

# Format STRING
Format protein-protein interactions from STRING (https://string-db.org/cgi/download?sessionId=%24input-%3E%7BsessionId%7D&species_text=Homo+sapiens). Standardize proteins to their gene CUI.

In [10]:
# STRING: protein links and protein info in, gene-gene relationships out
input_filepath, string_info_filepath, output_filepath = (
    os.path.join(DATA_DIR, '9606.protein.links.v11.0.txt'),
    os.path.join(DATA_DIR, '9606.protein.info.v11.0.txt'),
    os.path.join(OUTPUT_DATA_DIR, 'gene_gene_relationships.csv'),
)

In [11]:
# Tab-separated protein info table (used below to look up preferred names by protein id)
info_df = pd.read_csv(
    string_info_filepath,
    sep="\t",
)

In [12]:
# Keep only PPIs whose combined score is above 700 (out of 1000)
THRESHOLD = 700
interactions_df = pd.read_csv(input_filepath, sep=" ")
interactions_df = interactions_df.loc[interactions_df['combined_score'].gt(THRESHOLD)]

In [13]:
# convert ENSP protein ID to gene symbol
# Join only the id -> preferred name columns. Merging the whole info table twice
# would copy every other info column onto each interaction row (with _x/_y
# suffixes), and none of those columns is used afterwards.
df = (
    interactions_df
    .merge(
        info_df[['protein_external_id', 'preferred_name']],
        how='left',
        left_on='protein1',
        right_on='protein_external_id',
    )
    .drop('protein_external_id', axis=1)
    .rename(columns={'preferred_name': 'preferred_name1'})
    .merge(
        info_df[['protein_external_id', 'preferred_name']],
        how='left',
        left_on='protein2',
        right_on='protein_external_id',
    )
    .drop('protein_external_id', axis=1)
    .rename(columns={'preferred_name': 'preferred_name2'})
)


In [14]:
# convert gene symbol to CUI
# Complete, de-duplicated symbol -> CUI pairs, shared by both sides of the interaction
gene_symbol_to_cui_df = gene_mapping_df[['Alias symbol', 'CUI']].dropna().drop_duplicates()

df = df.merge(gene_symbol_to_cui_df, how='left', left_on='preferred_name1', right_on='Alias symbol')
df = df.drop('Alias symbol', axis=1).rename(columns={'CUI': 'CUI_gene_1'})

df = df.merge(gene_symbol_to_cui_df, how='left', left_on='preferred_name2', right_on='Alias symbol')
df = df.drop('Alias symbol', axis=1).rename(columns={'CUI': 'CUI_gene_2'})

# Interactions with an unmapped partner carry a missing CUI and are removed by dropna
df = df[['CUI_gene_1', 'CUI_gene_2', 'combined_score']].drop_duplicates().dropna()
df.to_csv(output_filepath, index=False)


# Format UniProt
Format protein/gene to Gene Ontology (GO) relationships from UniProt (https://www.uniprot.org/uniprot/?query=*&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes). Standardize proteins to their gene CUIs and GO terms to their CUIs.

In [15]:
# UniProt: export of Homo sapiens entries in, gene-GO relationships out
input_filepath, output_filepath = (
    os.path.join(DATA_DIR, 'uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_+AND+review--.tab'),
    os.path.join(OUTPUT_DATA_DIR, 'gene_GO_relationships.csv'),
)

In [16]:
def split_by(x, sep):
    """Split a delimited field into a list of clean, non-empty tokens.

    Parameters
    ----------
    x : object
        Raw cell value. It is converted with ``str`` before splitting, so a
        float NaN becomes the token "nan" and is discarded; ``None`` is
        treated as missing.
    sep : str
        Delimiter passed to ``str.split``.

    Returns
    -------
    list of str
        Tokens with surrounding whitespace removed. Empty tokens and the
        literal token "nan" are omitted.
    """
    if x is None:
        return []
    # Strip BEFORE filtering so whitespace-only tokens (e.g. after a trailing "; ")
    # are dropped instead of being returned as empty strings.
    tokens = (token.strip() for token in str(x).split(sep))
    return [token for token in tokens if token and token != "nan"]

In [17]:
# Load the UniProt export with every column as text, then turn the two delimited
# fields into lists: GO ids are split on ";" and gene names on " ".
df = pd.read_csv(input_filepath, sep="\t", dtype=str)
df['GO_ID'] = df['Gene ontology IDs'].apply(split_by, args=(";",))
df['Gene name'] = df['Gene names'].apply(split_by, args=(" ",))

In [18]:
# One row per (gene name, GO id) combination: expand the gene-name lists first, then the GO-id lists
df = (
    df
    .explode('Gene name')
    .explode('GO_ID')
)

In [19]:
# GO to CUI (left join: GO ids without a mapping get a missing CUI_GO)
df = df.merge(go_to_cui_df, how='left', left_on='GO_ID', right_on='GO')
df = df.rename(columns={'CUI': "CUI_GO"})

# map gene name to gene CUI (left join again; unmapped names get a missing CUI_gene)
df = df.merge(
    gene_mapping_df[['Alias symbol', 'CUI']],
    how='left',
    left_on='Gene name',
    right_on='Alias symbol',
)
df = df.rename(columns={'CUI': "CUI_gene"})

# keep unique, fully mapped GO-gene pairs
df = df[['CUI_GO', 'CUI_gene']].drop_duplicates().dropna()
df.to_csv(output_filepath, index=False)

# Format HPO
Format gene-phenotype and disease-phenotype relationships from the Human Phenotype Ontology (HPO) (https://hpo.jax.org/app/download/annotation). Standardize phenotypes, genes, and diseases to CUI.

In [20]:
# format gene-phenotype relationships
# HPO genes_to_phenotype annotations in, gene-phenotype relationships out
input_filepath, output_filepath = (
    os.path.join(DATA_DIR, 'genes_to_phenotype.txt'),
    os.path.join(OUTPUT_DATA_DIR, 'gene_phenotype_relationships.csv'),
)

In [21]:
# The file's first line is skipped and the column names are assigned by hand;
# every value is read as text.
df = pd.read_csv(input_filepath, sep="\t", skiprows=1, header=None, dtype=str)
df.columns = [
    "entrez-gene-id",
    "entrez-gene-symbol",
    "HPO-Term-ID",
    "HPO-Term-Name",
    "Frequency-Raw",
    "Frequency-HPO",
    "Additional Info from G-D source",
    "G-D source",
    "disease-ID for link",
]


In [22]:
# HPO to CUI (left join: terms without a mapping get a missing CUI_HPO)
df = df.merge(
    hpo_to_cui_df,
    how='left',
    left_on='HPO-Term-ID',
    right_on='HPO',
)
df = df.rename(columns={'CUI': 'CUI_HPO'})

In [23]:
# convert gene to CUI (left join on the gene symbol)
df = df.merge(
    gene_mapping_df[['Alias symbol', 'CUI']],
    how='left',
    left_on='entrez-gene-symbol',
    right_on='Alias symbol',
)
df = df.rename(columns={'CUI': 'CUI_gene'})

# keep unique, fully mapped phenotype-gene pairs
df = df[['CUI_HPO', 'CUI_gene']].drop_duplicates().dropna()
df.to_csv(output_filepath, index=False)

In [24]:
# format disease-phenotype relationships
# HPO phenotype annotations in, disease-phenotype relationships out
input_filepath, output_filepath = (
    os.path.join(DATA_DIR, 'phenotype_annotation.tab'),
    os.path.join(OUTPUT_DATA_DIR, 'disease_phenotype_relationships.csv'),
)

In [25]:
# Read every column as text and keep only annotations whose disease database is OMIM
df = pd.read_csv(input_filepath, sep="\t", dtype=str)
df = df.loc[df["#disease-db"].eq("OMIM")]

In [26]:
# Disease mapping table, parsed with pandas' default comma delimiter.
# NOTE(review): the file name ends in .tsv — confirm the file really is comma-separated.
disease_mapping_filepath = os.path.join(DATA_DIR, 'doid_formatted.tsv')
disease_mapping_df = pd.read_csv(disease_mapping_filepath, sep=',')

# add CUI column to disease_mapping_df
def get_cui(xref):
    """Return the first UMLS CUI listed in a comma-separated xref string.

    Parameters
    ----------
    xref : str or missing value
        Cross-references written as ``PREFIX:value`` entries separated by commas.

    Returns
    -------
    str or float
        The value of the first ``UMLS_CUI`` entry with surrounding whitespace
        removed, or ``np.nan`` when ``xref`` is not a string or contains no
        such entry.
    """
    # Missing cells arrive as NaN (a float) or None; neither can be split.
    if not isinstance(xref, str):
        return np.nan
    for entry in xref.split(","):
        parts = entry.split(":")
        # Require a value after the colon so a bare "UMLS_CUI" token is skipped
        # instead of raising IndexError.
        if len(parts) > 1 and parts[0].strip() == 'UMLS_CUI':
            return parts[1].strip()
    return np.nan

# Derive the CUI column by extracting the UMLS_CUI entry from each row's xref
disease_mapping_df['CUI'] = disease_mapping_df['xref'].apply(get_cui)

In [27]:
# OMIM to CUI (inner join: annotations whose disease id is not in the mapping are dropped)
df = df.merge(
    disease_mapping_df,
    left_on='disease-identifier',
    right_on='OMIM',
)
df = df.rename(columns={'CUI': 'CUI_disease'})

In [28]:
# HPO to CUI (left join: terms without a mapping get a missing CUI_HPO)
df = df.merge(
    hpo_to_cui_df,
    how='left',
    left_on='HPO-ID',
    right_on='HPO',
)
df = df.rename(columns={'CUI': 'CUI_HPO'})

# keep unique, fully mapped phenotype-disease pairs
df = df[['CUI_HPO', 'CUI_disease']].drop_duplicates().dropna()
df.to_csv(output_filepath, index=False)
