# Install Python Dependencies (including the NCATS MVP Module Library)

In [7]:
# Locate the Python site-packages directory. The Jupyter shell directive below reports
# the path of the `python` executable; in that path, change the 'bin' component to 'lib'
# and append the suffix "/site-packages". Use the resulting path to find the location
# at which you can complete 'Step 1' in the following cell.
# NOTE(review): many installations place site-packages under lib/pythonX.Y/ rather than
# directly under lib/ - confirm the actual directory before deleting anything.
!type python

python is /anaconda3/bin/python


In [8]:
'''
Assuming that we've checked out the NCATS mvp-module-library 
alongside our ncats-translator-workflows project, this should work
'''
import sys
sys.path.append("../mvp-module-library")
#
# Hack to get around problematic updating of the distutils-installed PyYAML, and a slightly older pandas requiring a compatible numpy
#
# Step 1 - MANUALLY remove the PyYAML and numpy module directories from the site-packages directory noted above, then...
#
# Step 2 - Install the dependencies listed in requirements.txt with the interpreter that runs this kernel
#          (NOTE(review): presumably this also installs the newer PyYAML and a compatible numpy - confirm).
# NOTE(review): pip's --ignore-installed option may make the manual removal in Step 1 unnecessary - confirm.
!{sys.executable} -m pip install -r requirements.txt



In [9]:
from BioLink.biolink_client import BioLinkWrapper
import pandas as pd

# Workflow II Rare Disease Candidates

In [10]:
from Modules.Mod0_lookups import LookUp

# The workflow input is a disease identifier; create the lookup module object
# that the following cells use to load that input.
lu = LookUp()

Mod O DiseaseGeneLookup metadata:
{'data_type': 'disease',
 'input_type': {'complexity': 'single', 'id_type': ['MONDO', 'DO', 'OMIM']},
 'limit': None,
 'output_type': {'complexity': 'set', 'id_type': 'HGNC'},
 'predicate': 'blm:gene associated with condition',
 'source': 'Monarch Biolink',
 'taxon': 'human'}


In [11]:
# Workflow input: the disease to investigate, plus the lookup parameters
input_disease = 'MONDO:0008667'  # Von Hippel Lindau (VHL)
input_object = {
    'input': input_disease,
    'parameters': {'taxon': 'human', 'threshold': None},
}

# Hand the specification to the lookup module, then continue with the
# input object the module itself holds after loading
lu.load_input_object(input_object=input_object)
input_object = lu.input_object

{'description': 'Von Hippel-Lindau disease (VHL) is a familial cancer '
                'predisposition syndrome associated with a variety of '
                'malignant and benign neoplasms, most frequently retinal, '
                'cerebellar, and spinal hemangioblastoma, renal cell carcinoma '
                '(RCC), and pheochromocytoma.',
 'id': 'MONDO:0008667',
 'label': 'von Hippel-Lindau disease'}


In [12]:
# get genes associated with the disease from Biolink
disease_associated_genes = lu.disease_geneset_lookup()
# create list of gene curies (one {'hit_id', 'hit_symbol'} record per row) for downstream module input
input_curie_set = disease_associated_genes[['hit_id', 'hit_symbol']].to_dict(orient='records')
# tag every row with the module that produced it; note the column is named 'modules' here
# (plural), whereas the Mod1A/Mod1B/Mod1E result tables below use 'module'
disease_associated_genes['modules'] = 'Mod0'
# show the disease associated genes
disease_associated_genes

Unnamed: 0,input_id,input_symbol,hit_id,hit_symbol,relation,sources,modules
0,MONDO:0008667,von Hippel-Lindau disease,HGNC:12687,VHL,pathogenic_for_condition,"ctd, omim, orphane, clinvar",Mod0
1,MONDO:0008667,von Hippel-Lindau disease,HGNC:1582,CCND1,contributes to,"omim, ctd",Mod0
2,MONDO:0008667,von Hippel-Lindau disease,HGNC:23057,BRK1,pathogenic_for_condition,clinvar,Mod0


# Mod1A Functional Similarity
## Find similar genes based on GO functional annotations using OntoBio Jaccard similarity

In [14]:
from Modules.Mod1A_functional_sim import FunctionalSimilarity

## Mod1A_human

In [15]:
# Mod1A specification for human genes: the disease gene curies as input
# and a Jaccard index threshold of 0.75
mod1a_input_object_human = {
    'input': input_curie_set,
    'parameters': {'taxon': 'human', 'threshold': 0.75},
}

func_sim_human = FunctionalSimilarity()

Mod1A Functional Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:macromolecular machine to biological process association',
               'macromolecular machine to molecular activity association'],
 'source': 'Monarch Biolink'}


In [16]:
# Pass the Mod1A specification to the module, then load its gene set
# (presumably built from the input curies - confirm in Mod1A_functional_sim)
func_sim_human.load_input_object(mod1a_input_object_human)
func_sim_human.load_gene_set() 

In [17]:
# Load the module's associations (presumably the GO functional annotations
# named in the section heading - confirm in Mod1A_functional_sim)
func_sim_human.load_associations()

In [18]:
# Run the similarity computation and keep the raw results for tabulation in the next cell
mod1a_results = func_sim_human.compute_similarity()

In [24]:
# Tabulate the Mod1A hits, drop genes already associated with the disease,
# rank by descending score and tag each row with its source module
Mod1A_results_human = (
    pd.DataFrame(mod1a_results)
    .loc[lambda hits: ~hits['hit_id'].isin(disease_associated_genes['hit_id'].tolist())]
    .sort_values('score', ascending=False)
    .assign(module='Mod1A')
)
Mod1A_results_human

Unnamed: 0,hit_id,hit_symbol,input_id,input_symbol,score,module
3,HGNC:7666,NCKAP1,HGNC:23057,BRK1,0.835714,Mod1A


# Mod1B Phenotype Similarity
## Find similar genes based on OwlSim calculated Phenotype Similarity

## Mod1B Human

In [25]:
from Modules.Mod1B1_phenotype_similarity import PhenotypeSimilarity

In [26]:
# Mod1B specification for human genes: the disease gene curies as input
# and a similarity threshold of 0.5
mod1b_input_object_human = {
    'input': input_curie_set,
    'parameters': {'taxon': 'human', 'threshold': 0.5},
}

pheno_sim_human = PhenotypeSimilarity()

Mod1B Phenotype Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:has phenotype'],
 'source': 'Monarch Biolink'}


In [27]:
# Pass the Mod1B specification to the module, then load its gene set
# (presumably built from the input curies - confirm in Mod1B1_phenotype_similarity)
pheno_sim_human.load_input_object(mod1b_input_object_human)
pheno_sim_human.load_gene_set()

In [28]:
# Load the module's associations (presumably gene-to-phenotype associations,
# given the 'blm:has phenotype' predicate in the metadata - confirm)
pheno_sim_human.load_associations()

In [29]:
# Run the similarity computation and keep the raw results for tabulation in the next cell
mod1b_results = pheno_sim_human.compute_similarity()

In [30]:
# Tabulate the Mod1B hits, drop genes already associated with the disease,
# rank by descending score and tag each row with its source module
Mod1B_results = (
    pd.DataFrame(mod1b_results)
    .loc[lambda hits: ~hits['hit_id'].isin(disease_associated_genes['hit_id'].tolist())]
    .sort_values('score', ascending=False)
    .assign(module='Mod1B')
)
Mod1B_results

Unnamed: 0,hit_id,hit_symbol,input_id,input_symbol,score,module
7,HGNC:5477,IGH,HGNC:1582,CCND1,1.0,Mod1B
5,HGNC:6913,MAX,HGNC:12687,VHL,0.647482,Mod1B
0,HGNC:26034,SDHAF2,HGNC:12687,VHL,0.629371,Mod1B
4,HGNC:6971,MDH2,HGNC:12687,VHL,0.572727,Mod1B
1,HGNC:16636,KIF1B,HGNC:12687,VHL,0.559557,Mod1B


# Mod1E Protein Interaction

## Mod1E Human

In [31]:
from Modules.Mod1E_interactions import GeneInteractions

In [32]:
# Mod1E specification for human genes: the disease gene curies as input,
# with no threshold set
mod1E_input_object_human = {
    'input': input_curie_set,
    'parameters': {'taxon': 'human', 'threshold': None},
}

interactions_human = GeneInteractions()

Mod1E Interaction Network metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:interacts with'],
 'source': 'Monarch Biolink'}


In [33]:
# Pass the Mod1E specification to the module, then load its gene set
# (presumably built from the input curies - confirm in Mod1E_interactions)
interactions_human.load_input_object(mod1E_input_object_human)
interactions_human.load_gene_set()

In [34]:
# Fetch the interactions for the loaded gene set and keep the raw results for tabulation below
mod1e_results = interactions_human.get_interactions()

In [35]:
# Tabulate the raw Mod1E interaction results
Mod1E_results_human = pd.DataFrame(mod1e_results)

In [36]:
# Count how many result rows each hit symbol appears in, as a two-column
# table ('unique_values' = symbol, 'counts' = number of rows)
counts = (
    Mod1E_results_human['hit_symbol']
    .value_counts()
    .rename_axis('unique_values')
    .to_frame('counts')
    .reset_index()
)
# Symbols that appear in more than 12 rows
high_counts = counts.loc[counts['counts'] > 12, 'unique_values'].tolist()

In [37]:
# Keep only the interaction rows whose hit symbol is in the high-count list
Mod1E_results_final = pd.DataFrame(
    Mod1E_results_human.loc[Mod1E_results_human['hit_symbol'].isin(high_counts)]
)

In [38]:
# Tag the retained interaction rows with their source module
Mod1E_results_final['module'] = 'Mod1E'

In [39]:
# Preview the first rows of the filtered Mod1E table
Mod1E_results_final.head()

Unnamed: 0,hit_id,hit_symbol,input_id,input_symbol,score,module


In [40]:
# Combine the Mod1A and Mod1B result tables into a single table.
# NOTE(review): Mod1E_results_final is built above but is not included here - confirm whether that is intentional.
all_results = pd.concat([Mod1A_results_human, Mod1B_results])

In [41]:
from Modules.StandardOutput import StandardOutput

In [42]:
# Build the standard output from the combined results (one dict per result row).
# input_object is the object read back from lu.input_object in the disease-input cell.
so = StandardOutput(results=all_results.to_dict(orient='records'), input_object=input_object)

In [43]:
# Keep the output object produced by StandardOutput; it is posted to the RTX service further below
std_api_response_json = so.output_object

In [44]:
# Display the full output object
std_api_response_json

{'context': 'https://raw.githubusercontent.com/biolink/biolink-model/master/context.jsonld',
 'datetime': '2019-02-19 08:16:22.141878',
 'id': '',
 'message': '6 results found',
 'n_results': 6,
 'original_question_text': 'What genes are functionally similar to genes associated with MONDO:0008667',
 'query_type_id': 'query_id',
 'reasoner_id': 'Orange',
 'response_code': 'OK',
 'restated_question_text': 'What genes are functionally similar to genes associated with MONDO:0008667',
 'result_list': [{'confidence': 0.8357142857142857,
   'essence': 'gene, functional similarity',
   'id': 'Mod1A',
   'reasoner_id': 'orange',
   'result_graph': {'edge_list': [{'is_defined_by': 'orange',
      'provided_by': 'BioLink',
      'source_id': 'HGNC:23057',
      'target_id': 'HGNC:7666',
      'type': 'functially_similar_to'}],
    'node_list': [{'description': 'gene',
      'id': 'HGNC:7666',
      'name': 'NCKAP1',
      'type': 'gene',
      'uri': ''},
     {'description': 'gene',
      'id': 

In [46]:
import requests

# Seconds to wait for the RTX service before giving up, so that an
# unresponsive server cannot hang the kernel indefinitely
RTX_REQUEST_TIMEOUT = 60

# get the URL for these results displayed in the RTX UI
RTX_UI_REQUEST_URL = "https://rtx.ncats.io/api/rtx/v1/response/process"
# Ask the service to store the response and return the id it was stored under
to_post = {"options": ["Store", "ReturnResponseId"], "responses": [std_api_response_json]}
# NOTE: despite its name, ui_url holds the HTTP response object; the next cell reads it again
ui_url = requests.post(RTX_UI_REQUEST_URL, json=to_post, timeout=RTX_REQUEST_TIMEOUT)
# Fail here with the HTTP error instead of an opaque JSON decoding error or KeyError below
ui_url.raise_for_status()
print("Please visit the following website: https://rtx.ncats.io/?r=%s" % ui_url.json()['response_id'])

Please visit the following website: https://rtx.ncats.io/?r=1308


In [47]:
# Retrieving Details
# ui_url is the HTTP response from the POST in the previous cell; its JSON body carries the 'response_id'

print("Please visit the following link to retrieve JSON results: https://rtx.ncats.io/api/rtx/v1/response/%s" % ui_url.json()['response_id'])

Please visit the following link to retrieve JSON results: https://rtx.ncats.io/api/rtx/v1/response/1308
