In [1]:
import sys
import pandas 

# Base URL for raw GitHub file content. Later cells append
# '<owner>/<repo>/' + commit + '/<path>' to it; this appears to stand in for
# the utils.rawgit() helper referenced in the commented-out calls.
rawgit = 'https://raw.githubusercontent.com/'
# Put ./data/ first on the module search path so local modules can be imported
sys.path.insert(0, './data/')

### Read DO Slim

In [2]:
# Disease Ontology slim terms, read from a pinned commit of dhimmel/disease-ontology.
commit = '72614ade9f1cc5a5317b8f6836e1e464b31d5587'
url = ''.join([rawgit, 'dhimmel/', 'disease-ontology/', commit, '/data/slim-terms.tsv'])
# Keep only the identifier and name, renamed to the column names shared by all sources.
disease_df = (
    pandas.read_table(url)
    .rename(columns={'doid': 'doid_id', 'name': 'doid_name'})[['doid_id', 'doid_name']]
)
disease_df.head(2)

Unnamed: 0,doid_id,doid_name
0,DOID:2531,Hematologic cancer
1,DOID:1319,Brain cancer


### Read Entrez Gene

In [3]:
# Entrez Gene is read from a local checkout. The remote equivalent (currently
# unused) is dhimmel/entrez-gene at commit
# 6e133f9ef8ce51a4c5387e58a6cc97564a66cec8, file data/genes-human.tsv.
gene_df = (
    pandas.read_table('../entrez-gene/data/genes-human.tsv')
    # restrict to protein-coding genes
    .loc[lambda genes: genes['type_of_gene'] == 'protein-coding']
    .rename(columns={'GeneID': 'entrez_gene_id', 'Symbol': 'gene_symbol'})[['entrez_gene_id', 'gene_symbol']]
)
gene_df.head(2)

Unnamed: 0,entrez_gene_id,gene_symbol
0,1,A1BG
1,2,A2M


### Read Datasets

In [4]:
# Orphanet disease-gene associations, annotated with the Entrez GeneID of the
# matching gene symbol. The left join keeps rows whose symbol has no match
# (their entrez_gene_id is NaN).
orphanet_df = pandas.merge(
    pandas.read_table('../data/orphanet-disease-gene.tsv'),
    gene_df,
    how='left',
    on='gene_symbol',
)
orphanet_df.head(2)

Unnamed: 0,orphanet_code,orphanet_term,orphanet_gene_id,gene_orphanumber,gene_name,gene_type,gene_symbol,dga_type,dga_status,entrez_gene_id
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",20160,268061,kinesin family member 7,gene with protein product,KIF7,Disease-causing germline mutation(s) in,Assessed,374654.0
1,93,Aspartylglucosaminuria,15470,119513,aspartylglucosaminidase,gene with protein product,AGA,Disease-causing germline mutation(s) in,Assessed,175.0


In [5]:
# Row counts of the Entrez gene table and of the merged Orphanet table
'{} genes in entrez and {} genes in orphanet'.format(
    gene_df.shape[0],
    orphanet_df.shape[0],
)

'20458 genes in entrez and 6806 genes in orphanet'

In [6]:
# Distinct values of the two Orphanet columns used for filtering later on
print('gene_types: {}'.format(orphanet_df['gene_type'].unique()))
print('dga_status: {}'.format(orphanet_df['dga_status'].unique()))

gene_types: ['gene with protein product' 'Non-coding RNA' 'locus']
dga_status: ['Assessed' 'Not yet assessed']


In [7]:
# Check for missing values: replace every NaN in the merged Orphanet frame
# with the string 'missing', then list the rows whose entrez_gene_id holds
# that sentinel.
# NOTE(review): fillna is applied to all columns, not only entrez_gene_id.
orpha_na = orphanet_df.fillna('missing')
orpha_na.query('entrez_gene_id == "missing"')

Unnamed: 0,orphanet_code,orphanet_term,orphanet_gene_id,gene_orphanumber,gene_name,gene_type,gene_symbol,dga_type,dga_status,entrez_gene_id
41,550,MELAS,16471,123512,mitochondrially encoded cytochrome c oxidase I,gene with protein product,MT-CO1,Disease-causing germline mutation(s) in,Assessed,missing
42,550,MELAS,16472,123516,mitochondrially encoded cytochrome c oxidase II,gene with protein product,MT-CO2,Disease-causing germline mutation(s) in,Assessed,missing
43,550,MELAS,16473,123520,mitochondrially encoded cytochrome c oxidase III,gene with protein product,MT-CO3,Candidate gene tested in,Not yet assessed,missing
44,550,MELAS,16478,123537,mitochondrially encoded NADH:ubiquinone oxidor...,gene with protein product,MT-ND1,Disease-causing germline mutation(s) in,Assessed,missing
45,550,MELAS,16481,123552,mitochondrially encoded NADH:ubiquinone oxidor...,gene with protein product,MT-ND4,Disease-causing germline mutation(s) in,Assessed,missing
46,550,MELAS,16483,123562,mitochondrially encoded NADH:ubiquinone oxidor...,gene with protein product,MT-ND5,Disease-causing germline mutation(s) in,Assessed,missing
47,550,MELAS,16484,123567,mitochondrially encoded NADH:ubiquinone oxidor...,gene with protein product,MT-ND6,Disease-causing germline mutation(s) in,Assessed,missing
48,550,MELAS,16841,138895,mitochondrially encoded tRNA leucine 1 (UUA/G),Non-coding RNA,MT-TL1,Disease-causing germline mutation(s) in,Assessed,missing
49,550,MELAS,16844,138906,mitochondrially encoded tRNA glutamine,Non-coding RNA,MT-TQ,Candidate gene tested in,Not yet assessed,missing
50,550,MELAS,17473,160307,mitochondrially encoded tRNA tryptophan,Non-coding RNA,MT-TW,Disease-causing germline mutation(s) in,Assessed,missing


In [8]:
# Number of Orphanet rows without an Entrez GeneID
'NA GeneID: {}'.format(
    orpha_na.query('entrez_gene_id == "missing"').shape[0]
)

'NA GeneID: 197'

In [9]:
# Number of rows without an Entrez GeneID that Orphanet labels as a gene with protein product
'NA GeneID for protein codeing: {}'.format(
    orpha_na.query('entrez_gene_id == "missing" and gene_type == "gene with protein product"').shape[0]
)

'NA GeneID for protein codeing: 88'

In [10]:
# First two unmapped rows that Orphanet labels as a gene with protein product
orpha_na.query('entrez_gene_id == "missing" and gene_type == "gene with protein product"').iloc[:2]

Unnamed: 0,orphanet_code,orphanet_term,orphanet_gene_id,gene_orphanumber,gene_name,gene_type,gene_symbol,dga_type,dga_status,entrez_gene_id
41,550,MELAS,16471,123512,mitochondrially encoded cytochrome c oxidase I,gene with protein product,MT-CO1,Disease-causing germline mutation(s) in,Assessed,missing
42,550,MELAS,16472,123516,mitochondrially encoded cytochrome c oxidase II,gene with protein product,MT-CO2,Disease-causing germline mutation(s) in,Assessed,missing


In [11]:
# Look up one of the unmapped Orphanet symbols (MT-CO1) in the Entrez gene table
gene_df.query('gene_symbol == "MT-CO1"')

Unnamed: 0,entrez_gene_id,gene_symbol


In [12]:
# Look up Entrez GeneID 4512 directly to see which symbol the Entrez table uses for it
gene_df.query('entrez_gene_id == 4512')

Unnamed: 0,entrez_gene_id,gene_symbol
3622,4512,COX1


In [13]:
# The problem is that on the website the official symbol for entrez:4512 is 'MT-CO1',
# while in the downloaded file it is 'COX1' => try the mapping using biothings.

In [14]:
# Remove associations whose gene symbol could not be mapped to an Entrez GeneID.
# The rows are dropped from the numeric merged frame rather than from the
# 'missing'-filled copy, so entrez_gene_id stays numeric and can be cast back
# to an integer. That keeps it compatible with the integer GeneID columns of
# the other sources when the frames are concatenated and merged later.
orpha_df = (
    orphanet_df
    .dropna(subset=['entrez_gene_id'])
    .astype({'entrez_gene_id': 'int64'})
)
print('genes pre-removal: {}'.format(len(orphanet_df)))
print('genes post-removal: {}'.format(len(orpha_df)))

genes pre-removal: 6806
genes post-removal: 6609


In [15]:
# Reformat to the shared column names and prefix the Orphanet code with
# 'ORPHA:' so it reads like the identifiers used by the other sources.
orpha_df = (
    orpha_df
    .rename(columns={'orphanet_code': 'doid_id', 'orphanet_term': 'doid_name'})
    .assign(doid_id=lambda frame: 'ORPHA:' + frame['doid_id'].astype(str))
)
orpha_df.head(2)

Unnamed: 0,doid_id,doid_name,orphanet_gene_id,gene_orphanumber,gene_name,gene_type,gene_symbol,dga_type,dga_status,entrez_gene_id
0,ORPHA:166024,"Multiple epiphyseal dysplasia, Al-Gazali type",20160,268061,kinesin family member 7,gene with protein product,KIF7,Disease-causing germline mutation(s) in,Assessed,374654
1,ORPHA:93,Aspartylglucosaminuria,15470,119513,aspartylglucosaminidase,gene with protein product,AGA,Disease-causing germline mutation(s) in,Assessed,175


In [16]:
# DISEASES: merged slim table from a pinned commit of dhimmel/diseases
commit = 'e0089ef89a56348d7d4e0684a9c51c5747b16237'
url = ''.join([rawgit, 'dhimmel/', 'diseases/', commit, '/data/merged-slim.tsv'])
diseases_df = pandas.read_table(url)
diseases_df.head(n=2)

Unnamed: 0,doid_id,doid_name,entrez_gene_id,gene_symbol,score_text,score_knowledge,score_cosmic,score_distild,score_integrated_no_distild,score_integrated
0,DOID:13223,uterine fibroid,60,ACTB,0.8,,,,0.8,0.8
1,DOID:13223,uterine fibroid,71,ACTG1,0.8,,,,0.8,0.8


In [17]:
# DOAF: read from a pinned commit of dhimmel/doaf and rename the identifier
# columns to the shared names
commit = 'bbe1c326aa385416e36d02b144e89e2b99e700b6'
url = ''.join([rawgit, 'dhimmel/', 'doaf/', commit, '/data/doaf.tsv'])
doaf_df = pandas.read_table(url).rename(
    columns={'doid_code': 'doid_id', 'GeneID': 'entrez_gene_id'},
)
doaf_df.head(2)

Unnamed: 0,doid_id,doid_name,entrez_gene_id,Symbol,count
0,DOID:0001816,angiosarcoma,302,ANXA2,1
1,DOID:0001816,angiosarcoma,595,CCND1,1


In [18]:
# DisGeNET: consolidated table from a pinned commit of dhimmel/disgenet, with
# the identifier columns renamed to the shared names
commit = 'fdc5f42f2da745cbf71d7b4cc5021de5685e4a11'
url = ''.join([rawgit, 'dhimmel/', 'disgenet/', commit, '/data/consolidated.tsv'])
disgenet_df = pandas.read_table(url).rename(
    columns={'doid_code': 'doid_id', 'geneId': 'entrez_gene_id'},
)
disgenet_df.head(2)

Unnamed: 0,doid_id,doid_name,entrez_gene_id,geneSymbol,count,pubmeds_max,score_max,score_mean,associationType,source
0,DOID:0050156,idiopathic pulmonary fibrosis,729238,SFTPA2,1.0,1.0,0.620284,0.620284,Biomarker|GeneticVariation,BeFree|CLINVAR|CTD_human|UNIPROT
1,DOID:0050156,idiopathic pulmonary fibrosis,7015,TERT,1.0,10.0,0.422153,0.422153,Biomarker|GeneticVariation,BeFree|CLINVAR|CTD_human|GAD|LHGDN


In [19]:
# hetio GWAS: gene associations from a pinned commit of dhimmel/gwas-catalog,
# with the identifier columns renamed to the shared names
commit = '0617ea7ea8268f21f5ca1b8dbe487dd12671fc7b'
url = ''.join([rawgit, 'dhimmel/', 'gwas-catalog/', commit, '/data/gene-associations.tsv'])
gwas_df = pandas.read_table(url).rename(
    columns={'doid_code': 'doid_id', 'gene': 'entrez_gene_id'},
)
gwas_df.head(2)

Unnamed: 0,doid_id,doid_name,locus,high_confidence,primary,status,entrez_gene_id,symbol
0,DOID:9970,obesity,0,1,1,HC-P,3953,LEPR
1,DOID:9970,obesity,14,1,1,HC-P,4094,MAF


### Filters

In [20]:
# Apply one filter per source; each frame is replaced by its filtered subset.
# Orphanet: keep rows whose gene_type is "gene with protein product" OR whose
# dga_status is "Assessed".
# NOTE(review): the Orphanet row count is 6609 both before this filter and in
# the provenance counts after it, so the `or` condition removes nothing here.
# Confirm whether `and` was intended.
orpha_df = orpha_df.query('gene_type == "gene with protein product" or dga_status == "Assessed"')
# DISEASES: keep rows with score_integrated_no_distild of at least 2
diseases_df = diseases_df.query('score_integrated_no_distild >= 2')
# DOAF: keep rows whose `count` column is at least 3
doaf_df = doaf_df.query('count >= 3')
# DisGeNET: keep rows with score_max of at least 0.06
disgenet_df = disgenet_df.query('score_max >= 0.06')
# GWAS: keep rows with status 'HC-P' (presumably high-confidence primary — TODO confirm)
gwas_df = gwas_df[gwas_df.status == 'HC-P']

### Combine

In [21]:
# Tag every association with the source it came from and that source's license.
# The frames are the result of query/boolean filtering, so adding columns by
# item assignment raises SettingWithCopyWarning. `assign` returns a new frame
# instead, and the cell can safely be re-run.
# DOAF gets an empty license string, which `condense` later discards.
orpha_df = orpha_df.assign(provenance='Orphanet', license='CC BY-ND 3.0')
diseases_df = diseases_df.assign(provenance='DISEASES', license='CC BY 4.0')
doaf_df = doaf_df.assign(provenance='DOAF', license='')
disgenet_df = disgenet_df.assign(provenance='DisGeNET', license='ODbL 1.0')
gwas_df = gwas_df.assign(provenance='GWAS Catalog', license='CC BY 4.0')

In [22]:
# Stack the four DO-based sources on their shared columns, then annotate the
# result with gene symbols and disease names. Both merges use pandas' default
# inner join on the common column.
dfs = [
    source_df[['doid_id', 'entrez_gene_id', 'provenance', 'license']]
    for source_df in (diseases_df, doaf_df, disgenet_df, gwas_df)
]
concat_df = (
    pandas.concat(dfs)
    .pipe(gene_df.merge)
    .pipe(disease_df.merge)
)
concat_df['provenance'].value_counts()

DisGeNET        7552
DISEASES        4990
DOAF            1649
GWAS Catalog    1283
Name: provenance, dtype: int64

In [23]:
# Append the Orphanet associations to the DO-based ones.
orpha_df = orpha_df[['doid_id', 'doid_name', 'entrez_gene_id', 'gene_symbol', 'provenance', 'license']]
# Drop any Orphanet rows already in concat_df before appending, so that
# re-running this cell does not add them a second time.
concat_df = pandas.concat([concat_df[concat_df.provenance != 'Orphanet'], orpha_df])
concat_df.provenance.value_counts()

DisGeNET        7552
Orphanet        6609
DISEASES        4990
DOAF            1649
GWAS Catalog    1283
Name: provenance, dtype: int64

In [24]:
def condense(df):
    """Consolidate multiple associations into a single Series.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of one group. It must provide a ``provenance`` and a
        ``license`` column.

    Returns
    -------
    pandas.Series
        Two entries, in this order:

        ``sources``
            The ``provenance`` values joined with ``'|'`` in row order.
        ``license``
            The single distinct non-empty license of the group, or ``None``
            when the group has no non-empty license or more than one
            distinct license.
    """
    licenses = set(df.license)
    # An empty string marks a source without a stated license; ignore it.
    licenses.discard('')
    license_value = licenses.pop() if len(licenses) == 1 else None
    # Build the Series in one step with an explicit dtype: a dtype-less empty
    # pandas.Series() emits a DeprecationWarning on pandas 1.x.
    return pandas.Series(
        {'sources': '|'.join(df.provenance), 'license': license_value},
        dtype=object,
    )

# Collapse to one row per (disease, gene) pair, then re-attach gene symbols
# and disease names through default inner merges.
# NOTE(review): disease_df holds the DO slim terms ('DOID:...' ids), so the
# inner merge with it presumably drops every 'ORPHA:...' row — confirm that
# leaving the Orphanet associations out of short_df is intended.
short_df = (
    concat_df
    .groupby(['doid_id', 'entrez_gene_id'])
    .apply(condense)
    .reset_index()
    .pipe(gene_df.merge)
    .pipe(disease_df.merge)
)
short_df.head(5)

Unnamed: 0,doid_id,doid_name,entrez_gene_id,gene_symbol,sources,license
0,DOID:2531,Hematologic cancer,25,ABL1,DISEASES|DisGeNET,
1,DOID:2531,Hematologic cancer,27,ABL2,DisGeNET,ODbL 1.0
2,DOID:2531,Hematologic cancer,54,ACP5,DISEASES,CC BY 4.0
3,DOID:2531,Hematologic cancer,113,ADCY7,DisGeNET,ODbL 1.0
4,DOID:2531,Hematologic cancer,142,PARP1,DISEASES|DisGeNET,


In [25]:
# Write the condensed associations to a tab-separated file, without the index column
short_df.to_csv('DaG-association.tsv', sep='\t', index=False)

In [26]:
# Recompute the condensed table straight from concat_df, this time without
# the gene/disease merges. This replaces the short_df that was written to disk.
short_df = (
    concat_df
    .groupby(['doid_id', 'entrez_gene_id'])
    .apply(condense)
    .reset_index()
)

In [28]:
# Preview the first rows only instead of rendering the whole frame
short_df.head(10)

Unnamed: 0,doid_id,entrez_gene_id,sources,license
0,DOID:0050156,1832.0,DisGeNET,ODbL 1.0
1,DOID:0050156,2335.0,DISEASES,CC BY 4.0
2,DOID:0050156,2355.0,DisGeNET,ODbL 1.0
3,DOID:0050156,4088.0,DISEASES,CC BY 4.0
4,DOID:0050156,5328.0,DisGeNET,ODbL 1.0
5,DOID:0050156,6440.0,DISEASES|DisGeNET,
6,DOID:0050156,6441.0,DISEASES,CC BY 4.0
7,DOID:0050156,7015.0,DISEASES|DOAF|DisGeNET|GWAS Catalog,
8,DOID:0050156,7040.0,DISEASES,CC BY 4.0
9,DOID:0050156,10144.0,DisGeNET,ODbL 1.0
