1 change: 1 addition & 0 deletions cohd/cohd_trapi.py
@@ -69,6 +69,7 @@ def operate(self):
batch_size_limit = 100 # max length of any IDs list
limit_max_results = 500
json_inf_replacement = 999 # value to replace +/-Infinity with in JSON
+mcq_score_scaling = 0.75 # magic number to adjust normalized MCQ score
supported_query_methods = ['relativeFrequency', 'obsExpRatio', 'chiSquare']
supported_operation = 'lookup_and_score'

48 changes: 27 additions & 21 deletions cohd/cohd_trapi_15.py
@@ -42,7 +42,7 @@ class CohdTrapi150(CohdTrapi):
edge_types_negative = ['biolink:negatively_correlated_with']
default_negative_predicate = edge_types_negative[0]

-tool_version = f'{CohdTrapi._SERVICE_NAME} 6.5.3'
+tool_version = f'{CohdTrapi._SERVICE_NAME} 6.5.4'
schema_version = '1.5.0'
biolink_version = bm_version

@@ -600,15 +600,15 @@ def _interpret_query(self):
if self._concept_1_set_interpretation == 'BATCH':
ids = list(set(concept_1_qnode['ids'])) # remove duplicate CURIEs
elif self._concept_1_set_interpretation == 'MANY':
-member_ids = concept_1_qnode.get('member_ids')
-if not member_ids:
+self._mcq_member_ids = concept_1_qnode.get('member_ids')
+if not self._mcq_member_ids:
# Missing required member_ids for MCQ
self._valid_query = False
description = 'set_interpretation: MANY but no member_ids'
response = self._trapi_mini_response(TrapiStatusCode.MISSING_MEMBER_IDS, description)
self._invalid_query_response = response, 200
return self._valid_query, self._invalid_query_response
-ids = list(set(concept_1_qnode['member_ids'])) # remove duplicate CURIEs
+ids = list(set(self._mcq_member_ids)) # remove duplicate CURIEs

# Get the MCQ set ID
self._mcq_set_id = concept_1_qnode['ids'][0]
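
For orientation, a minimal sketch of the MANY-style query node this branch handles (the set ID and member CURIEs below are invented for illustration; the field names follow the TRAPI 1.5 schema used above):

```python
# Hypothetical MCQ query node: all IDs are illustrative, not from this PR
concept_1_qnode = {
    'ids': ['uuid:example-set-node'],   # single ID naming the set itself
    'set_interpretation': 'MANY',
    'member_ids': ['MONDO:0005148', 'MONDO:0004975', 'MONDO:0005148'],
    'categories': ['biolink:Disease'],
}

mcq_member_ids = concept_1_qnode.get('member_ids')  # kept as self._mcq_member_ids
ids = list(set(mcq_member_ids))                     # deduplicated member CURIEs
mcq_set_id = concept_1_qnode['ids'][0]              # ID that results are bound to
```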
@@ -999,31 +999,36 @@ def operate_mcq(self):
# categories (domains)
for domain_id, concept_class_id in self._domain_class_pairs:
new_results = query_cohd_mysql.query_trapi_mcq(concept_ids=self._concept_1_omop_ids,
-                                                dataset_id=self._dataset_id,
-                                                domain_id=domain_id,
-                                                concept_class_id=concept_class_id,
-                                                ln_ratio_sign=self._association_direction,
-                                                confidence=self._confidence_interval,
-                                                bypass=self._bypass_cache)
+                                                n_member_ids=len(self._mcq_member_ids),
+                                                score_scaling=CohdTrapi.mcq_score_scaling,
+                                                dataset_id=self._dataset_id,
+                                                domain_id=domain_id,
+                                                concept_class_id=concept_class_id,
+                                                ln_ratio_sign=self._association_direction,
+                                                confidence=self._confidence_interval,
+                                                bypass=self._bypass_cache)
new_set_results, new_single_results = new_results
if new_set_results:
set_results.extend(new_set_results)
single_results.update(new_single_results)
else:
# No category (domain) was specified for Node 2. Query the associations between Node 1 and all
# domains
-new_results = query_cohd_mysql.query_trapi_mcq(concept_id_1=self._concept_1_omop_ids,
-                                               dataset_id=self._dataset_id, domain_id=None,
-                                               ln_ratio_sign=self._association_direction,
-                                               confidence=self._confidence_interval,
-                                               bypass=self._bypass_cache)
+new_results = query_cohd_mysql.query_trapi_mcq(concept_ids=self._concept_1_omop_ids,
+                                               n_member_ids=len(self._mcq_member_ids),
+                                               score_scaling=CohdTrapi.mcq_score_scaling,
+                                               dataset_id=self._dataset_id,
+                                               domain_id=None,
+                                               ln_ratio_sign=self._association_direction,
+                                               confidence=self._confidence_interval,
+                                               bypass=self._bypass_cache)
new_set_results, new_single_results = new_results
if new_set_results:
set_results.extend(new_set_results)
single_results.update(new_single_results)

# Results within each query call should be sorted, but still need to be sorted across query calls
-new_set_results = sort_cohd_results(new_set_results, sort_field='ln_ratio_score')
+set_results = sort_cohd_results(set_results, sort_field='mcq_score')

# Convert results from COHD format to Translator Reasoner standard
self._add_mcq_results_to_trapi(set_results, single_results)
@@ -1169,8 +1174,8 @@ def _add_mcq_result(self, set_result, single_results, criteria):
kg_node_2, kg_set_edge, kg_set_edge_id = self._add_kg_set_edge(node_2, is_subject, set_result)

# Add to results
-score = set_result['ln_ratio_score']
-self._add_result(self._mcq_set_id, concept_2_curie, kg_set_edge_id, score)
+score = set_result['mcq_score']
+self._add_result(self._mcq_set_id, concept_2_curie, kg_set_edge_id, score, mcq=True)

# Add single result edges and auxiliary graphs
support_graphs = list()
@@ -1196,7 +1201,7 @@ def _add_mcq_result(self, set_result, single_results, criteria):
"value": support_graphs
})

-def _add_result(self, kg_node_1_id, kg_node_2_id, kg_edge_id, score):
+def _add_result(self, kg_node_1_id, kg_node_2_id, kg_edge_id, score, mcq=False):
""" Adds a knowledge graph edge to the results list

Parameters
@@ -1205,6 +1210,7 @@ def _add_result(self, kg_node_1_id, kg_node_2_id, kg_edge_id, score):
kg_node_2_id: Object node ID
kg_edge_id: edge ID
score: result score
+mcq: bool - True if the result comes from an MCQ (multi-curie query) analysis

Returns
-------
@@ -1231,7 +1237,7 @@ def _add_result(self, kg_node_1_id, kg_node_2_id, kg_edge_id, score):
}]
},
'score': score,
-'scoring_method': 'Lower bound of biolink:ln_ratio_confidence_interval',
+'scoring_method': 'COHD set-input query scoring, range: [0,1]' if mcq else 'Lower bound of biolink:ln_ratio_confidence_interval',
}
]
}
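
Taken together with the new `mcq` flag, the analysis entry assembled here comes out shaped roughly as sketched below. Only `score` and `scoring_method` mirror the diff; the edge-binding key, infores CURIE, and values are placeholders.

```python
# Rough shape of one analysis entry built by _add_result (placeholder values)
kg_edge_id, score, mcq = 'ke_42', 0.83, True        # illustrative inputs
analysis = {
    'resource_id': 'infores:cohd',                  # stands in for CohdTrapi._INFORES_ID
    'edge_bindings': {'e00': [{'id': kg_edge_id}]}, # 'e00' is a placeholder key
    'score': score,                                 # mcq_score lies in [0, 1] when mcq=True
    'scoring_method': ('COHD set-input query scoring, range: [0,1]' if mcq
                       else 'Lower bound of biolink:ln_ratio_confidence_interval'),
}
```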
@@ -1913,7 +1919,7 @@ def _add_kg_set_edge(self, node_2, is_subject, set_result):
'value_type_id': 'EDAM:data_1772', # Score
'attribute_source': CohdTrapi._INFORES_ID,
'description': 'Observed-expected frequency ratio.'
-},
+},
{
'attribute_type_id': 'biolink:supporting_data_set', # Database ID
'original_attribute_name': 'dataset_id',
26 changes: 21 additions & 5 deletions cohd/query_cohd_mysql.py
@@ -1,6 +1,7 @@
import pymysql
from flask import jsonify
from scipy.stats import chisquare
+import numpy as np
from numpy import argsort
import logging
import pandas as pd
@@ -15,6 +16,7 @@
DATASET_ID_DEFAULT = 1
DATASET_ID_DEFAULT_HIER = 3
DEFAULT_CONFIDENCE = 0.99
+DEFAULT_MCQ_SCORE_SCALING = 0.75

# OXO API configuration
URL_OXO_SEARCH = 'https://www.ebi.ac.uk/spot/oxo/api/search'
@@ -1132,7 +1134,11 @@ def query_db(service, method, args):
elif type(concept_ids) is not list:
concept_ids = [concept_ids]

-set_results, single_results = query_trapi_mcq(concept_ids, dataset_id, domain_id, bypass=True)
+set_results, single_results = query_trapi_mcq(concept_ids=concept_ids,
+                                              n_member_ids=len(concept_ids),
+                                              dataset_id=dataset_id,
+                                              domain_id=domain_id,
+                                              bypass=True)
json_return = {
'set_results': set_results,
'single_results': single_results
Expand Down Expand Up @@ -1837,7 +1843,7 @@ def _get_weighted_statistics(cur=None,dataset_id=None,domain_id = None,concept_i
concept_list_1_w_df= pd.DataFrame({'concept_id_1':concept_id_1})
concept_list_1_w_df['w'] = 1

-# Calculate the weights based on Jaccard index between input concep
+# Calculate the weights based on Jaccard index between input concepts
pair_count_q1 = pd.DataFrame(get_pair_concept_count(cur=cur,dataset_id=dataset_id,domain_id=domain_id, concept_id_list_1=concept_id_1,concept_id_list_2=concept_id_1))
if pair_count_q1.shape[0] > 0:
# Sum of Jaccard index
@@ -1849,6 +1855,7 @@
# Weight = 1/(1 + sum(Jaccards))
concept_list_1_w_df['w'] = 1/concept_list_1_w_df['w']
concept_list_1_w_df = concept_list_1_w_df[['concept_id_1','w']]
+total_weights = concept_list_1_w_df.w.sum()

# Multiply the scores by the weights
pair_count_df = pair_count_df.merge(concept_list_1_w_df)
@@ -1858,7 +1865,7 @@
# Group by concept_id_2. Sum the scores and combine concept_id_1 into a list
gb = pair_count_df.groupby('concept_id_2')
weighted_stats = gb[json_key].agg('sum')
-return weighted_stats.reset_index()
+return weighted_stats.reset_index(), total_weights
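
A toy walkthrough of this weighting, with invented Jaccard values and scores: two near-duplicate input concepts (pairwise Jaccard 0.8) are each downweighted to about 0.56, so together they contribute little more than one independent concept to the per-`concept_id_2` sum.

```python
import pandas as pd

# Invented inputs: concepts 101 and 102 are highly similar, 103 is unrelated
w = pd.DataFrame({'concept_id_1': [101, 102, 103]})
w['w'] = 1 + pd.Series([0.8, 0.8, 0.0])  # 1 + sum of Jaccard with other inputs
w['w'] = 1 / w['w']                      # weights: [0.556, 0.556, 1.0]
total_weights = w['w'].sum()             # ~2.11, returned alongside the stats

# Each pair score is multiplied by its weight, then summed per concept_id_2
pair_scores = pd.DataFrame({'concept_id_1': [101, 102, 103],
                            'concept_id_2': [500, 500, 500],
                            'ln_ratio_score': [2.0, 2.0, 2.0]})
weighted = pair_scores.merge(w)
weighted['ln_ratio_score'] *= weighted['w']
print(weighted.groupby('concept_id_2')['ln_ratio_score'].sum())  # ~4.22, vs 6.0 unweighted
```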


def _get_ci_scores(r, score_col):
Expand All @@ -1871,14 +1878,17 @@ def _get_ci_scores(r, score_col):


@cache.memoize(timeout=86400, unless=_bypass_cache)
-def query_trapi_mcq(concept_ids, dataset_id=None, domain_id=None, concept_class_id=None,
+def query_trapi_mcq(concept_ids, n_member_ids, score_scaling=DEFAULT_MCQ_SCORE_SCALING,
+                    dataset_id=None, domain_id=None, concept_class_id=None,
ln_ratio_sign=0, confidence=DEFAULT_CONFIDENCE, bypass=False):
""" Query for TRAPI Multicurie Query. Calculates weighted scores using methods similar to linkage disequilibrium to
downweight contributions from input concepts that are similar to each other

Parameters
----------
concept_ids: list of OMOP concept IDs
+n_member_ids: number of input IDs in set node
+score_scaling: linear scaling of ln_ratio_score prior to logistic normalization
dataset_id: (optional) String - COHD dataset ID
domain_id: (optional) String - OMOP domain ID
concept_class_id: (optional) String - OMOP concept class ID
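
A hypothetical direct call under the new signature (the OMOP concept IDs and counts are invented; `n_member_ids` can exceed `len(concept_ids)` when some input CURIEs fail to map to OMOP):

```python
# Illustrative only: 3 member CURIEs submitted, 2 mapped to OMOP concepts
set_results, single_results = query_trapi_mcq(concept_ids=[201826, 443735],
                                              n_member_ids=3,
                                              score_scaling=0.75,
                                              dataset_id=3,
                                              domain_id='Condition')
```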
@@ -1912,13 +1922,19 @@ def query_trapi_mcq(concept_ids, dataset_id=None, domain_id=None, concept_class_

# Adjust the scores by weights
concept_list_1 = list(set(associations['concept_id_1'].tolist()))
-weighted_ln_ratio = _get_weighted_statistics(cur=cur, dataset_id=dataset_id, domain_id=domain_id,
+weighted_ln_ratio, total_weights = _get_weighted_statistics(cur=cur, dataset_id=dataset_id, domain_id=domain_id,
concept_id_1=concept_list_1, pair_count_df=associations,
json_key = 'ln_ratio_score')
# weighted_log_odds = _get_weighted_statistics(cur=cur, dataset_id=dataset_id, domain_id=domain_id,
# concept_id_1=concept_list_1, pair_count_df=associations,
# json_key = 'log_odds_score')

+# For TRAPI result score, normalize the score relative to the number of input CURIEs and
+# scale the score range to [0-1] using a scaled logistic function
+n_mapped_ids = len(concept_list_1)
+weighted_ln_ratio['mcq_score'] = weighted_ln_ratio['ln_ratio_score'] / total_weights * n_mapped_ids / n_member_ids
+weighted_ln_ratio['mcq_score'] = (1/(1+np.exp(-np.abs(weighted_ln_ratio['mcq_score']*score_scaling)))-0.5) * 2

# Add list of single associations
single_associations = dict()
for i, row in weighted_ln_ratio.iterrows():
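
To make the two normalization steps concrete, a small numeric sketch with invented intermediates (a weighted ln_ratio sum of 3.2, total weights of 2.11, and 3 of 4 input CURIEs mapped to OMOP):

```python
import numpy as np

score_scaling = 0.75                        # DEFAULT_MCQ_SCORE_SCALING
ln_ratio_sum, total_weights = 3.2, 2.11     # invented intermediates
n_mapped_ids, n_member_ids = 3, 4           # 3 of 4 input CURIEs mapped to OMOP

raw = ln_ratio_sum / total_weights * n_mapped_ids / n_member_ids  # ~1.137
mcq_score = (1 / (1 + np.exp(-np.abs(raw * score_scaling))) - 0.5) * 2
print(round(float(mcq_score), 3))           # 0.402
```

Because the logistic is recentered and doubled, a raw score of 0 maps to 0 and large raw scores approach 1; a larger `score_scaling` steepens the curve, pushing moderate associations closer to 1.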