# Read and crop input data

In [1]:
import pandas as pd 
import requests

text_mining = pd.read_csv('../raw/human_disease_textmining_filtered.tsv',sep = '\t')
text_mining.columns= ['geneID','geneName','diseaseID','diseaseName','zScore','confidenceScore','sourceUrl']

knowlege = pd.read_csv('../raw/human_disease_knowledge_filtered.tsv',sep = '\t')
knowlege.columns= ['geneID','geneName','diseaseID','diseaseName','sourceDB','evidenceType','confidenceScore']

tiga = pd.read_csv('../raw/tiga_gene-trait_stats.tsv',sep='\t')
tiga = tiga[['ensemblId','efoId','trait','n_snp','n_snpw']]
tiga = tiga.drop_duplicates(subset=['ensemblId','trait'])

human_do = pd.read_csv('../raw/HumanDO.tsv',sep = '\t')
human_do = human_do.drop_duplicates(subset='label')
human_do = human_do[['id','label']]

tiga_do = tiga.merge(human_do,left_on='trait',right_on='label',how='inner',validate='m:1')

# Map geneIDs in knowledge channel from ENSP to ENSG format


In [2]:
r = requests.post(
url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
json={
    'organism':'hsapiens',
    'target':'ENSG',
    'query': list(knowlege['geneID']),
    }
)

results = r.json()['result']
mapping ={}
for x in results:
    if x['converted'] != 'None':
        mapping.update({x['incoming']:x['converted']})

mapping_df = pd.DataFrame.from_dict(mapping.items())
mapping_df.columns = ['ENSP','ENSG']
knowlege_mapped = knowlege.merge(mapping_df,left_on='geneID',right_on='ENSP',how ='inner',validate='m:1')
knowlege_mapped = knowlege_mapped.sort_values('confidenceScore',ascending=False).drop_duplicates(subset = ['ENSG','diseaseID'],keep=False).sort_index()

# Map geneIDs in text mining channel from ENSP to ENSG format

In [3]:
r = requests.post(
url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
json={
    'organism':'hsapiens',
    'target':'ENSG',
    'query': list(text_mining['geneID']),
    }
)
results = r.json()['result']
mapping ={}
for x in results:
    if x['converted'] != 'None':
        mapping.update({x['incoming']:x['converted']})

mapping_df = pd.DataFrame.from_dict(mapping.items())
mapping_df.columns = ['ENSP','ENSG']
text_mining_mapped = text_mining.merge(mapping_df,left_on='geneID',right_on='ENSP',how ='inner',validate='m:1')
text_mining_mapped = text_mining_mapped.sort_values('confidenceScore',ascending=False).drop_duplicates(subset = ['ENSG','diseaseID'],keep=False).sort_index()

# Select highest confidence scores for each channel

In [4]:


inner = text_mining_mapped.merge(knowlege_mapped,on=['ENSG','diseaseID'],how='inner')
inner['confidenceScore'] = inner.apply(lambda x: max(x.confidenceScore_x,x.confidenceScore_y),axis=1)
inner = inner.rename(columns={'ENSP_x':'ENSP','geneName_x':'geneName','diseaseName_x':'diseaseName','geneID_x':'geneID'})
inner = inner[['ENSG','ENSP','geneName','diseaseID','diseaseName','confidenceScore']]


txt_only = text_mining_mapped.merge(knowlege_mapped,on=['ENSG','diseaseID'],how='left')
txt_only = txt_only[txt_only['confidenceScore_y'].isna()]
txt_only = txt_only.rename(columns={'confidenceScore_x':'confidenceScore','ENSP_x':'ENSP','geneName_x':'geneName','diseaseName_x':'diseaseName'})
txt_only = txt_only[['ENSG','ENSP','geneName','diseaseID','diseaseName','confidenceScore']]


kn_only = text_mining_mapped.merge(knowlege_mapped,on=['ENSG','diseaseID'],how='right')
kn_only = kn_only[kn_only['confidenceScore_x'].isna()]
kn_only = kn_only.rename(columns={'confidenceScore_y':'confidenceScore','ENSP_y':'ENSP','geneName_y':'geneName','diseaseName_y':'diseaseName'})
kn_only = kn_only[['ENSG','ENSP','geneName','diseaseID','diseaseName','confidenceScore']]


df_list = [inner,txt_only,kn_only]
inputs = pd.concat(df_list)


# Generate quantile stats

In [5]:
import statistics as stats

inputs_group = inputs.groupby('diseaseName')
inputs_dict = {k:v for k,v in inputs_group}
inputs_count = {x:len(inputs_dict[x]) for x in inputs_dict.keys()}

print('count quantiles: ',stats.quantiles(inputs_count.values()))
print('score quantiles: ',stats.quantiles(inputs['confidenceScore']))

count quantiles:  [2.0, 5.0, 20.0]
score quantiles:  [1.643, 1.841, 2.165]


# Threshold inputs based on quantiles

In [None]:
inputs_score_threshold = inputs.loc[(inputs['confidenceScore']>=4)]

input_score_group = inputs.groupby('diseaseName')
input_score_dict = {k:v for k,v in input_score_group}
input_score_count = {x:len(input_score_dict[x]) for x in input_score_dict.keys()}

inputs_count_threshold = {k:v  for (k,v) in input_score_count.items() if (v > 10)}

inputs_combined_threshold = inputs_score_threshold.loc[inputs_score_threshold['diseaseName'].isin(list(inputs_count_threshold.keys()))]
print(inputs_combined_threshold)

# Map input IDs to STRING IDs

In [None]:
string_api_url = "https://version-12-0.string-db.org/api"
output_format = "tsv-no-header"
method = "get_string_ids"
str_params = {
    "identifiers" : "\r".join(list(inputs_combined_threshold['ENSP'])), 
    "species" : 9606, 
    "echo_query" : 1, 
}
request_url = "/".join([string_api_url, output_format, method])
string_results = requests.post(request_url, data=str_params)

string_map = {}
for line in string_results.text.strip().split("\n"):
    l = line.split("\t")
    string_map.update({l[0]:l[2]})
string_df = pd.DataFrame.from_dict(string_map.items())
string_df.columns = ['ENSP','str_id']

inputs_string_df = inputs_combined_threshold.merge(string_df,on='ENSP',how ='inner')

# Threshold TIGA data by gene count per disease

In [7]:
tiga_filtered = tiga_do[tiga_do['id'].isin(inputs_combined_threshold['diseaseID'])]

tiga_group = tiga_filtered.groupby('trait')
tiga_dict = {k:v for k,v in tiga_group}
tiga_count = {x:len(tiga_dict[x]) for x in tiga_dict.keys()}
tiga_count_threshold = {k:v  for (k,v) in tiga_count.items() if (v > 10)}

tiga_threshold = tiga_filtered.loc[tiga_filtered['trait'].isin(list(tiga_count_threshold.keys()))]

# Map TIGA to STRING IDs

In [8]:
string_api_url = "https://version-12-0.string-db.org/api"
output_format = "tsv-no-header"
method = "get_string_ids"
str_params = {
    "identifiers" : "\r".join(list(tiga_threshold['ensemblId'])), 
    "species" : 9606, 
    "echo_query" : 1, 
}
request_url = "/".join([string_api_url, output_format, method])
string_results = requests.post(request_url, data=str_params)

string_map = {}
for line in string_results.text.strip().split("\n"):
    l = line.split("\t")
    string_map.update({l[0]:l[2]})
string_df = pd.DataFrame.from_dict(string_map.items())
string_df.columns = ['ENSP','str_id']

tiga_string_df = tiga_threshold.merge(string_df,left_on = 'ensemblId',right_on = 'ENSP',how ='inner')

# Map inputs (txt and knowledge) to string IDs

In [9]:
string_api_url = "https://version-12-0.string-db.org/api"
output_format = "tsv-no-header"
method = "get_string_ids"
str_params = {
    "identifiers" : "\r".join(list(inputs_combined_threshold['ENSP'])), 
    "species" : 9606, 
    "echo_query" : 1, 
}
request_url = "/".join([string_api_url, output_format, method])
string_results = requests.post(request_url, data=str_params)

string_map = {}
for line in string_results.text.strip().split("\n"):
    l = line.split("\t")
    string_map.update({l[0]:l[2]})
string_df = pd.DataFrame.from_dict(string_map.items())
string_df.columns = ['ENSP','str_id']

inputs_string_df = inputs_combined_threshold.merge(string_df,on='ENSP',how ='inner')

# Generate input prize files

In [10]:
tiga_prizes = tiga_string_df.groupby('trait')
tiga_prize_dict = {k:v for k,v in tiga_prizes}

for disease in tiga_prize_dict.keys():
    df = tiga_prize_dict[disease]
    df = df[['str_id','n_snpw']]
    df = df.rename(columns={'str_id':'NODEID','n_snpw':'prize'})
    df.to_csv(f"../prize_files/{disease.replace(' ','_')}_prizes.txt",sep = '	',index=False)



# Process STRING interactome 

In [31]:
string = pd.read_csv('../raw/9606.protein.links.v12.0.txt',sep = ' ',skiprows=[0],header=None)
string = string[string.iloc[:,2]>900]
string = string.iloc[:,[0,1]]
string[len(string.columns)] = 1
string.to_csv('string_interactome.txt',sep = ' ',index=False,header=None)

# Generate gold standard files 

In [53]:
inputs_string_df = inputs_string_df[inputs_string_df['diseaseID'].isin(tiga_string_df['id'])]

inputs_combined_group = inputs_string_df.groupby('diseaseName')
inputs_combined_dict = {k:v for k,v in inputs_combined_group}

for disease in inputs_combined_dict.keys():
    df = inputs_combined_dict[disease]
    df = df[['str_id']]
    df.to_csv(f"../GS_files/{disease.replace(' ','_')}_GS.txt",sep = '	',index=False,header=None)