# Read and crop input data

In [131]:
import pandas as pd 
import requests

integrated =  pd.read_csv('../raw/human_disease_integrated_full.tsv',sep = '\t')
integrated.columns = ['geneID','geneName','diseaseID','diseaseName','integrated_confidenceScore']
# integrated = integrated.drop(columns=['geneName','diseaseName'])

text_mining = pd.read_csv('../raw/human_disease_textmining_filtered.tsv',sep = '\t')
text_mining.columns= ['geneID','geneName','diseaseID','diseaseName','zScore','confidenceScore','sourceUrl']

knowlege = pd.read_csv('../raw/human_disease_knowledge_filtered.tsv',sep = '\t')
knowlege.columns= ['geneID','geneName','diseaseID','diseaseName','sourceDB','evidenceType','confidenceScore']

tiga = pd.read_csv('../raw/tiga_gene-trait_stats.tsv',sep='\t')
tiga = tiga[['ensemblId','efoId','trait','n_snp','n_snpw']]
tiga = tiga.drop_duplicates(subset=['ensemblId','trait'])

human_do = pd.read_csv('../raw/HumanDO.tsv',sep = '\t')
human_do = human_do.drop_duplicates(subset='label')
human_do = human_do[['id','label']]

tiga_do = tiga.merge(human_do,left_on='trait',right_on='label',how='inner',validate='m:1')

# Map geneIDs in knowledge channel from ENSP to ENSG format


In [None]:
r = requests.post(
url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
json={
    'organism':'hsapiens',
    'target':'ENSG',
    'query': list(knowlege['geneID']),
    }
)

results = r.json()['result']
mapping ={}
for x in results:
    if x['converted'] != 'None':
        mapping.update({x['incoming']:x['converted']})

mapping_df = pd.DataFrame.from_dict(mapping.items())
mapping_df.columns = ['ENSP','ENSG']
knowlege_mapped = knowlege.merge(mapping_df,left_on='geneID',right_on='ENSP',how ='inner',validate='m:1')
knowlege_mapped = knowlege_mapped.sort_values('confidenceScore',ascending=False).drop_duplicates(subset = ['ENSG','diseaseID'],keep=False).sort_index()

7769


# Map geneIDs in text mining channel from ENSP to ENSG format

In [None]:
r = requests.post(
url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
json={
    'organism':'hsapiens',
    'target':'ENSG',
    'query': list(text_mining['geneID']),
    }
)
results = r.json()['result']
mapping ={}
for x in results:
    if x['converted'] != 'None':
        mapping.update({x['incoming']:x['converted']})

mapping_df = pd.DataFrame.from_dict(mapping.items())
mapping_df.columns = ['ENSP','ENSG']
text_mining_mapped = text_mining.merge(mapping_df,left_on='geneID',right_on='ENSP',how ='inner',validate='m:1')
text_mining_mapped = text_mining_mapped.sort_values('confidenceScore',ascending=False).drop_duplicates(subset = ['ENSG','diseaseID'],keep=False).sort_index()

# Map geneIDs in integrated channel from ENSP to ENSG format ?

In [None]:
from more_itertools import chunked


# r = requests.post(
# url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
# json={
#     'organism':'hsapiens',
#     'target':'ENSG',
#     'query': list(integrated['geneID']),
#     }
# )
# results = r.json()['result']
# mapping ={}
# for x in results:
#     if x['converted'] != 'None':
#         mapping.update({x['incoming']:x['converted']})

# mapping_df = pd.DataFrame.from_dict(mapping.items())
# mapping_df.columns = ['ENSP','ENSG']
# integrated_mapped = integrated.merge(mapping_df,left_on='geneID',right_on='ENSP',how ='inner',validate='m:1')
# integrated_mapped = integrated_mapped.sort_values('confidenceScore',ascending=False).drop_duplicates(subset = ['ENSG','diseaseID'],keep=False).sort_index()

In [164]:

import statistics as stats

integrated = integrated[~integrated['diseaseName'].str.contains('ICD10')]
integrated_group = integrated.groupby('diseaseName')
integrated_dict = {k:v for k,v in integrated_group}
integrated_count = {x:len(integrated_dict[x]) for x in integrated_dict.keys()}


print(stats.quantiles(integrated_count.values()))

integrated_threshold = {k:v  for (k,v) in integrated_count.items() if (v > 100 and v <400)}
tst = integrated.loc[integrated['diseaseName'].isin(list(integrated_threshold.keys()))]

print(len(tst))


# print(integrated_count)

[45.0, 151.0, 439.0]
696441


# Merge TIGA data with knowlege and experimental channel datasets

In [34]:
tiga_knowledge = knowlege_mapped.merge(tiga_do,left_on=['ENSG','diseaseID'],right_on=['ensemblId','id'],how='inner',validate='1:1')
tiga_knowledge = tiga_knowledge[['ENSP','ENSG','geneName','trait','efoId','diseaseID','confidenceScore','n_snp','n_snpw']]

tiga_text = text_mining_mapped.merge(tiga_do,left_on=['ENSG','diseaseID'],right_on=['ensemblId','id'],how='inner',validate='1:1')
tiga_text = tiga_text[['ENSP','ENSG','geneName','trait','efoId','diseaseID','confidenceScore','n_snp','n_snpw']]


# Select highest confidence scores for each channel

In [169]:
txt = tiga_text[['ENSG','diseaseID','confidenceScore','n_snp','n_snpw','trait']]
kn = tiga_knowledge[['ENSG','diseaseID','confidenceScore','n_snp','n_snpw','trait']]

inner = txt.merge(kn,on=['ENSG','diseaseID'],how='inner')
inner['confidenceScore'] = inner.apply(lambda x: max(x.confidenceScore_x,x.confidenceScore_y),axis=1)
inner = inner.rename(columns={'n_snp_x':'n_snp','n_snpw_x':'n_snpw','trait_x':'trait'})
inner = inner[['ENSG','diseaseID','confidenceScore','n_snp','n_snpw','trait']]

txt_only = txt.merge(kn,on=['ENSG','diseaseID'],how='left')
txt_only = txt_only[txt_only['confidenceScore_y'].isna()]
txt_only = txt_only.rename(columns={'confidenceScore_x':'confidenceScore','n_snp_x':'n_snp','n_snpw_x':'n_snpw','trait_x':'trait'})
txt_only = txt_only[['ENSG','diseaseID','confidenceScore','n_snp','n_snpw','trait']]

kn_only = txt.merge(kn,on=['ENSG','diseaseID'],how='right')
kn_only = kn_only[kn_only['confidenceScore_x'].isna()]
kn_only = kn_only.rename(columns={'confidenceScore_y':'confidenceScore','n_snp_y':'n_snp','n_snpw_y':'n_snpw','trait_y':'trait'})
kn_only = kn_only[['ENSG','diseaseID','confidenceScore','n_snp','n_snpw','trait']]

df_list = [inner,txt_only,kn_only]
inputs = pd.concat(df_list)

print(len(inputs))

2435


# Select TIGA data with no associated DISEASE data

In [None]:
tiga_only = inputs.merge(tiga_do,left_on=['ENSG','diseaseID'],right_on=['ensemblId','id'],how = 'right')
tiga_only = tiga_only[tiga_only['ENSG'].isna()]



# Generate quantile stats

In [None]:
import statistics as stats

inputs_group = inputs.groupby('trait')
inputs_dict = {k:v for k,v in inputs_group}
inputs_count = {x:len(inputs_dict[x]) for x in inputs_dict.keys()}


print('count quantiles: ',stats.quantiles(inputs_count.values()))
print('score quantiles: ',stats.quantiles(inputs['confidenceScore']))


296


# Threshold inputs based on quantiles

In [168]:
inputs_score_threshold = inputs.loc[(inputs['confidenceScore']>=4)]

input_score_group = inputs.groupby('trait')
input_score_dict = {k:v for k,v in input_score_group}
input_score_count = {x:len(input_score_dict[x]) for x in input_score_dict.keys()}
inputs_count_threshold = {k:v  for (k,v) in input_score_count.items() if (v > 10)}


inputs_combined_threshold = inputs_score_threshold.loc[inputs_score_threshold['trait'].isin(list(inputs_count_threshold.keys()))]

# print(len(inputs_combined_threshold))
# print(inputs_combined_threshold)

print(len(inputs_count_threshold))
for i in inputs_count_threshold:
    print(i,inputs_count_threshold[i])

41
COVID-19 80
Crohn's disease 40
alcohol dependence 15
allergic disease 107
alopecia 15
alopecia areata 15
ankylosing spondylitis 13
asthma 203
atrial fibrillation 21
attention deficit hyperactivity disorder 109
autism spectrum disorder 26
bipolar disorder 87
cancer 141
cholelithiasis 11
chronic obstructive pulmonary disease 21
cleft lip 15
coronary artery disease 189
diabetes mellitus 54
endometriosis 40
glaucoma 29
gout 23
hypertension 99
hypothyroidism 39
inflammatory bowel disease 106
keratoconus 12
kidney disease 11
leprosy 18
melanoma 20
multiple sclerosis 20
nicotine dependence 15
obesity 23
osteoarthritis 13
osteoporosis 22
polycystic ovary syndrome 19
psoriasis 34
refractive error 12
restless legs syndrome 11
rheumatoid arthritis 48
schizophrenia 296
substance abuse 36
systemic scleroderma 34
