In [25]:
# Parameter inputs
aragorn_submit_url = "https://aragorn-u24.apps.renci.org/robokop/query"
trapi_submit_url = "http://automat-u24.apps.renci.org/robokopkg/1.3/query"

automat_cypher_submit_url = 'https://automat.renci.org/robokopkg/cypher'
robokopkg_bolt_url = "bolt://robokopkg.renci.org:7687"

input_search_string = 'ppara'
output_search_string = 'liver fibrosis'

In [27]:
import requests
import os
import json
import pandas as pd
import pprint
pp = pprint.PrettyPrinter(indent=5)

from datetime import datetime
from pathlib import Path
from collections import Counter

In [1]:
# Initializing directory to write
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d_%H%M%S")
write_dir_trapi_automat = Path("output/compare/"+str(dt_string)+"/trapi/automat")
write_dir_trapi_automat.mkdir(parents=True, exist_ok=True)

write_dir_trapi_ara = Path("output/compare/"+str(dt_string)+"/trapi/ara")
write_dir_trapi_ara.mkdir(parents=True, exist_ok=True)

write_dir_cypher_automat = Path("output/compare/"+str(dt_string)+"/cypher/automat")
write_dir_cypher_automat.mkdir(parents=True, exist_ok=True)

write_dir_cypher_robokopkg = Path("output/compare/"+str(dt_string)+"/cypher/robokopkg")
write_dir_cypher_robokopkg.mkdir(parents=True, exist_ok=True)

In [28]:
# # Pathway results to load and write directory to specify (on David's work computer from previous run)
# write_dir_cypher_automat = "output/compare/2023-07-07_153233/cypher/automat"
# write_dir_cypher_robokopkg = "output/compare/2023-07-07_153233/cypher/robokopkg"
# ARA_results_file = "output/compare/2023-07-07_153233/trapi/ara/results_ARA.csv"
# TRAPI_results_file = "output/compare/2023-07-07_153233/trapi/automat/results_TRAPI.csv"
# Cypher_ROBOKOPKG_results_file = "output/compare/2023-07-07_153233/cypher/robokopkg/results.csv"
# Cypher_Automat_results_file = "output/compare/2023-07-07_153233/cypher/automat/results.csv"

# Pathway results to load
ARA_results_file = "output/compare/"+str(dt_string)+"/trapi/ara/results_ARA.csv"
TRAPI_results_file = "output/compare/"+str(dt_string)+"/trapi/automat/results_TRAPI.csv"
Cypher_ROBOKOPKG_results_file = "output/compare/"+str(dt_string)+"/cypher/robokopkg/results.csv"
Cypher_Automat_results_file = "output/compare/"+str(dt_string)+"/cypher/automat/results.csv"

In [29]:
# Initializing Neo4j connection class
user = 'neo4j'
pw = ''

from neo4j import GraphDatabase
class Neo4jConnection:
    
    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)
        
    def close(self):
        if self.__driver is not None:
            self.__driver.close()
        
    def query(self, query, db=None):
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try: 
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query))
        except Exception as e:
            print("Query failed:", e)
        finally: 
            if session is not None:
                session.close()
        return response

## Generating data for comparison

### Getting "Input/Output" CURIES

In [30]:
results = requests.post(f'https://name-resolution-sri.renci.org/lookup?string={input_search_string}&offset=0&limit=100')
results_json = results.json()
#print(json.dumps(results_json,indent=4))
input_node_id_list = list(results_json.keys())
print(input_node_id_list)
print(f"Number of 'input' IDs: {len(input_node_id_list)}")

['UniProtKB:P37230', 'UniProtKB:Q07869', 'UniProtKB:Q95N78', 'PR:000013056', 'UniProtKB:P23204', 'NCBIGene:19013', 'NCBIGene:25747', 'NCBIGene:5465', 'NCBIGene:557714', 'NCBIGene:30755', 'NCBIGene:563298', 'UMLS:C0166415', 'NCBIGene:10891', 'NCBIGene:133522', 'UMLS:C2984537', 'NCBIGene:400931', 'PR:000040325', 'UMLS:C1868415', 'MESH:C000630914', 'MESH:C000634429', 'UMLS:C5226508', 'UMLS:C5197094', 'UMLS:C5417797', 'REACT:R-SSC-400204', 'REACT:R-BTA-400204', 'UMLS:C1518805', 'REACT:R-DRE-400204', 'REACT:R-HSA-400204', 'REACT:R-MMU-400204', 'REACT:R-BTA-9734475', 'REACT:R-SSC-9734475', 'REACT:R-HSA-879724', 'REACT:R-DRE-9734475', 'REACT:R-HSA-9734475', 'REACT:R-CFA-400204', 'REACT:R-MMU-9734475', 'REACT:R-RNO-400204', 'REACT:R-XTR-400204', 'REACT:R-CFA-9734475', 'REACT:R-RNO-9734475', 'REACT:R-XTR-9734475', 'REACT:R-DME-400204', 'REACT:R-HSA-1989781', 'REACT:R-BTA-4341070', 'REACT:R-DME-9734475', 'REACT:R-BTA-400143', 'REACT:R-DRE-400143', 'REACT:R-DRE-4341070', 'REACT:R-MMU-400143', 'RE

In [31]:
results = requests.post(f'https://name-resolution-sri.renci.org/lookup?string={output_search_string}&offset=0&limit=100')
results_json = results.json()
#print(json.dumps(results_json,indent=4))
output_node_id_list = list(results_json.keys())
print(output_node_id_list)
print(f"Number of 'output' IDs: {len(output_node_id_list)}")

['HP:0001395', 'UMLS:C4227681', 'UMLS:C4034373', 'UMLS:C5189427', 'UMLS:C0544816', 'MONDO:0100430', 'MONDO:0018840', 'UMLS:C1397317', 'UMLS:C4068302', 'UMLS:C4481250', 'UMLS:C2827436', 'UMLS:C4321337', 'UMLS:C4695229', 'UMLS:C0494791', 'UMLS:C0400961', 'UMLS:C3864238', 'UMLS:C1960658', 'UMLS:C4695228', 'UMLS:C5563662', 'UMLS:C1407032', 'UMLS:C0400925', 'UMLS:C4749320', 'UMLS:C5548949', 'UMLS:C4533463', 'UMLS:C5689517', 'UMLS:C5689516', 'UMLS:C4722044', 'UMLS:C1856310', 'UMLS:C5439238', 'UMLS:C4722043', 'UMLS:C5548946', 'UMLS:C4533767', 'UMLS:C3277942', 'UMLS:C1385044', 'UMLS:C4070891', 'UMLS:C3873179', 'UMLS:C4070890', 'UMLS:C1954436', 'UMLS:C4070622', 'UMLS:C4036765', 'UMLS:C5215514', 'UMLS:C0451713', 'UMLS:C5686432', 'UMLS:C3275636', 'UMLS:C4750548', 'UMLS:C5549445', 'UMLS:C5549441', 'UMLS:C2184113', 'UMLS:C5190480', 'UMLS:C5171263', 'UMLS:C5171261', 'UMLS:C5171262', 'UMLS:C2751577', 'UMLS:C1869017', 'UMLS:C4030819', 'UMLS:C2749679', 'UMLS:C5697513', 'UMLS:C4732266', 'UMLS:C3869480',

### TRAPI methods

In [6]:
# Initializing TRAPI query and extracting results for ARA and Automat
query={
    "message": {
      "query_graph": {
        "edges": {
          "e00": {
            "subject": "n00",
              "object": "n01",
          "predicates":["biolink:related_to"]
          },
          "e01": {
            "subject": "n01",
              "object": "n02",
          "predicates":["biolink:related_to"]
          }
        },
        "nodes": {
          "n00": {
            "ids": input_node_id_list, #['NCBIGene:5465'], #
            "categories": ["biolink:GeneOrGeneProduct"]
          },
          "n01": {
              "categories": ["biolink:BiologicalEntity"]
          },
          "n02": {
            "ids": output_node_id_list, #["HP:0001395"],
            "categories": ["biolink:DiseaseOrPhenotypicFeature"]
          }
        }
      }
    }
  }


In [7]:
response_trapi = requests.post(trapi_submit_url,json=query)
print(response_trapi.status_code)
number_pathway_results_trapi = len(response_trapi.json()['message']['results'])
print(len(response_trapi.json()['message']['results']))

200
201


In [9]:
kg_trapi = response_trapi.json()['message']['knowledge_graph']
results_trapi = response_trapi.json()['message']['results']

In [None]:
import pandas as pd
import os

cols = []
for node in sorted(results_trapi[0]['node_bindings'].keys()):
    cols.append(node)
    cols.append(node + '_name')
results_trapi_df = pd.DataFrame(columns = cols)

results_trapi_list = []
for result in results_trapi:
    result_dict = {}
    for node in sorted(result['node_bindings'].keys()):
        node_id = result['node_bindings'][node][0]['id']
        result_dict[node] = node_id
        result_dict[node + '_name'] = kg_trapi['nodes'][node_id]['name']

    results_trapi_list.append(pd.DataFrame([result_dict]))
results_trapi_df = pd.concat(results_trapi_list)
display(results_trapi_df)
# results_trapi_df.to_csv(os.path.join(write_dir_trapi_automat,'results_TRAPI.csv'), index=False)
results_trapi_df.to_csv(TRAPI_results_file, index=False)

combined_node_list = ["_".join([row[1].replace(" ", "_"), row[3].replace(" ", "_"), row[5].replace(" ", "_")]) for row in results_trapi_df[cols].to_numpy()]
pp.pprint(combined_node_list)

In [None]:
from collections import Counter
import json
import pprint
pp = pprint.PrettyPrinter(indent=5)

for i in range(number_pathway_results_trapi):
    print(f"Pathway result: {combined_node_list[i]}")
    edge_bindings = results_trapi[i]['edge_bindings']

    edge_ids = []
    for edge_name, edge_list in edge_bindings.items():
        edge_ids.append({edge_name: [x['id'] for x in edge_list]})

    string_out_list = []
    for edge_dict in edge_ids:
        for edge_name, edge_list in edge_dict.items():
            for edge_id in edge_list:
                subject_id = kg_trapi['edges'][edge_id]['subject']
                subject = kg_trapi['nodes'][subject_id]['name']
                predicate = kg_trapi['edges'][edge_id]['predicate']
                object_id = kg_trapi['edges'][edge_id]['object']
                object = kg_trapi['nodes'][object_id]['name']
                string_out = f"{subject} -> {predicate} -> {object}"
                string_out_list.append(string_out)
    string_out_dict = dict(Counter(string_out_list).items())
    pp.pprint(string_out_dict)
    print("")
    
    with open(os.path.join(write_dir_trapi_automat,combined_node_list[i]+".txt"), 'a') as convert_file:
        convert_file.write(json.dumps(string_out_dict))
        

In [13]:
response_ara = requests.post(aragorn_submit_url,json=query)
print(response_ara.status_code)
number_pathway_results_ara = len(response_ara.json()['message']['results'])
print(len(response_ara.json()['message']['results']))

200
201


In [14]:
kg_ara = response_ara.json()['message']['knowledge_graph']
results_ara = response_ara.json()['message']['results']

In [None]:
import pandas as pd
import os

cols = []
for node in sorted(results_ara[0]['node_bindings'].keys()):
    cols.append(node)
    cols.append(node + '_name')
results_ara_df = pd.DataFrame(columns = cols)

results_ara_list = []
for result in results_ara:
    result_dict = {}
    for node in sorted(result['node_bindings'].keys()):
        node_id = result['node_bindings'][node][0]['id']
        result_dict[node] = node_id
        result_dict[node + '_name'] = kg_ara['nodes'][node_id]['name']

    results_ara_list.append(pd.DataFrame([result_dict]))
results_ara_df = pd.concat(results_ara_list)
display(results_ara_df)
# results_ara_df.to_csv(os.path.join(write_dir_trapi_ara,'results_ARA.csv'), index=False)
results_ara_df.to_csv(ARA_results_file, index=False)

combined_node_list = ["_".join([row[1].replace(" ", "_"), row[3].replace(" ", "_"), row[5].replace(" ", "_")]) for row in results_ara_df[cols].to_numpy()]
pp.pprint(combined_node_list)

In [None]:
from collections import Counter
import json
import pprint
pp = pprint.PrettyPrinter(indent=5)

for i in range(number_pathway_results_ara):
    print(f"Pathway result: {combined_node_list[i]}")
    edge_bindings = results_ara[i]['edge_bindings']

    edge_ids = []
    for edge_name, edge_list in edge_bindings.items():
        edge_ids.append({edge_name: [x['id'] for x in edge_list]})

    string_out_list = []
    for edge_dict in edge_ids:
        for edge_name, edge_list in edge_dict.items():
            for edge_id in edge_list:
                subject_id = kg_ara['edges'][edge_id]['subject']
                subject = kg_ara['nodes'][subject_id]['name']
                predicate = kg_ara['edges'][edge_id]['predicate']
                object_id = kg_ara['edges'][edge_id]['object']
                object = kg_ara['nodes'][object_id]['name']
                string_out = f"{subject} -> {predicate} -> {object}"
                string_out_list.append(string_out)
    string_out_dict = dict(Counter(string_out_list).items())
    pp.pprint(string_out_dict)
    print("")
    
    with open(os.path.join(write_dir_trapi_ara,combined_node_list[i]+".txt"), 'a') as convert_file:
        convert_file.write(json.dumps(string_out_dict))
        

### Cypher methods

In [20]:
# Initializing Cypher query and extracting results from Automat using Cypher

nodes = ['n00','n01','n02']
cols = []
for node in nodes:
    cols.append(node)
    cols.append(node+"_name")

results_df = pd.DataFrame(columns = cols)
results_dict_list = []

i = 0
for input_node_id in input_node_id_list:
    if i > 0:
        break
    for output_node_id in output_node_id_list:
        if i > 0:
            break
        cypher = f"""MATCH ({nodes[0]}:`biolink:GeneOrGeneProduct`)-[r0_0]-({nodes[1]}:`biolink:BiologicalEntity`)-[r1_0]-({nodes[2]}:`biolink:DiseaseOrPhenotypicFeature`) 
        WHERE {nodes[0]}.id IN ['{input_node_id}'] AND {nodes[2]}.id IN ['{output_node_id}'] 
        RETURN [startNode(r0_0),[type(r0_0),properties(r0_0)],endNode(r0_0)] as edge_1, 
        [startNode(r1_0),[type(r1_0),properties(r1_0)],endNode(r1_0)] as edge_2, 
        [{nodes[0]}.name, {nodes[1]}.name, {nodes[2]}.name] as node_names,
        [{nodes[0]}.id, {nodes[1]}.id, {nodes[2]}.id] as node_ids LIMIT 100"""
        # print(f"{input_node_id}-{output_node_id}")
        j = {'query': cypher}
        results = requests.post(automat_cypher_submit_url,json=j, timeout=(40,200))
        # print(results.status_code)
        
        try:
            results_json = results.json()
        except json.decoder.JSONDecodeError:
            print(f"JSON Decode Error: {input_node_id}-{output_node_id}")
            
        string_out_list = []
        # pp.pprint(results_json)
        for result in results_json['results'][0]['data']:
            for item in result['row'][0:2]:
                string_out = f"{item[0]['name']} -> {item[1][0]} -> {item[2]['name']}||{item[1][1]}"
                if string_out not in string_out_list:
                    string_out_list.append(string_out)
        if len(results_json['results'][0]['data']) > 0:
            combined_node_list = "_".join(results_json['results'][0]['data'][0]['row'][2]).replace(" ", "_")
            # print(combined_node_list)

        string_out_list = [i.split('||', 1)[0] for i in string_out_list]

        string_out_dict = dict(Counter(string_out_list).items())
        if len(string_out_dict.keys()) == 0:
            pass
            # print(f"None found for {input_node_id}-{output_node_id}")
        else:
            combined_node_list = "_".join(results_json['results'][0]['data'][0]['row'][2]).replace(" ", "_")
            print(combined_node_list)

            string_out_list = [i.split('||', 1)[0] for i in string_out_list]

            string_out_dict = dict(Counter(string_out_list).items())
            pp.pprint(string_out_dict)
            # i = i + 1

            with open(os.path.join(write_dir_cypher_automat,combined_node_list+".txt"), 'a') as convert_file:
                convert_file.write(json.dumps(string_out_dict))
            
            result_dict = {}
            for j in range(len(nodes)):
                node_id = results_json['results'][0]['data'][0]['row'][3][j]
                node_name = results_json['results'][0]['data'][0]['row'][2][j]
                result_dict[nodes[j]] = node_id
                result_dict[nodes[j]+"_name"] = node_name
            pp.pprint(result_dict)

            results_dict_list.append(result_dict)

results_df = pd.concat([results_df,pd.DataFrame.from_records(results_dict_list)])
print(results_df.shape)
# results_df.to_csv(os.path.join(write_dir_cypher_automat,'results.csv'))
results_df.to_csv(Cypher_Automat_results_file)

PPARA_CCN2_Hepatic_fibrosis
{    'ABCB4 -> biolink:genetically_associated_with -> Hepatic fibrosis': 1,
     'AGT -> biolink:affects -> PPARA': 1,
     'AGT -> biolink:genetically_associated_with -> Hepatic fibrosis': 1,
     'ALB -> biolink:affects -> PPARA': 1,
     'ALB -> biolink:directly_physically_interacts_with -> PPARA': 1,
     'ALB -> biolink:genetically_associated_with -> Hepatic fibrosis': 1,
     'ARNT -> biolink:affects -> PPARA': 1,
     'ARNT -> biolink:genetically_associated_with -> Hepatic fibrosis': 1,
     'ARRB1 -> biolink:affects -> PPARA': 1,
     'ARRB1 -> biolink:genetically_associated_with -> Hepatic fibrosis': 1,
     'CCL2 -> biolink:affects -> PPARA': 1,
     'CCL2 -> biolink:genetically_associated_with -> Hepatic fibrosis': 1,
     'CCN2 -> biolink:affects -> PPARA': 1,
     'CCN2 -> biolink:genetically_associated_with -> Hepatic fibrosis': 1,
     'CNR2 -> biolink:affects -> PPARA': 1,
     'CNR2 -> biolink:genetically_associated_with -> Hepatic fibrosis'

ConnectionError: HTTPSConnectionPool(host='automat.renci.org', port=443): Max retries exceeded with url: /robokopkg/cypher (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001ED3CA39F10>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [21]:
# Saving results if above failed
results_df = pd.DataFrame(columns = cols)
results_df = pd.concat([results_df,pd.DataFrame.from_records(results_dict_list)])
print(results_df.shape)
# results_df.to_csv(os.path.join(write_dir_cypher_automat,'results.csv'))
results_df.to_csv(Cypher_Automat_results_file)

(10, 6)


In [19]:
# Initializing Cypher query and extracting results from ROBOKOPKG using Bolt protocol method
conn = Neo4jConnection(uri=robokopkg_bolt_url, user = user, pwd = pw)

nodes = ['n00','n01','n02']

cols = []
for node in nodes:
    cols.append(node)
    cols.append(node+"_name")

results_df = pd.DataFrame(columns = cols)
results_dict_list = []

i = 0
for input_node_id in input_node_id_list:
    if i > 0:
        break
    for output_node_id in output_node_id_list:
        if i > 0:
            break
        cypher = f"""MATCH ({nodes[0]}:`biolink:GeneOrGeneProduct`)-[r0_0]-({nodes[1]}:`biolink:BiologicalEntity`)-[r1_0]-({nodes[2]}:`biolink:DiseaseOrPhenotypicFeature`) 
        WHERE {nodes[0]}.id IN ['{input_node_id}'] AND {nodes[2]}.id IN ['{output_node_id}'] 
        RETURN [startNode(r0_0),[type(r0_0),properties(r0_0)],endNode(r0_0)] as edge_1, 
        [startNode(r1_0),[type(r1_0),properties(r1_0)],endNode(r1_0)] as edge_2, 
        [{nodes[0]}.name, {nodes[1]}.name, {nodes[2]}.name] as node_names,
        [{nodes[0]}.id, {nodes[1]}.id, {nodes[2]}.id] as node_ids LIMIT 100"""
        record_list = conn.query(cypher)
        
        string_out_list = []
        for record in record_list: 
            record_data = record.data()
            #only grab the edge information and skip the node names and IDs
            record_data_first2 = {k: record_data[k] for k in list(record_data)[:2]}
            for label, data in record_data_first2.items():
                string_out = f"{label} - {data[0]['name']} -> {data[1][0]} -> {data[2]['name']}||{data[1][1]}"
                if string_out not in string_out_list:
                    string_out_list.append(string_out)

        if len(record_list) > 0:
            combined_node_list = "_".join(list(record_list[0].data('node_names').values())[0])
            print(combined_node_list)

            string_out_list = [i.split('||', 1)[0] for i in string_out_list]

            string_out_dict = dict(Counter(string_out_list).items())
            pp.pprint(string_out_dict)

            with open(os.path.join(write_dir_cypher_robokopkg,combined_node_list+".txt"), 'a') as convert_file:
                convert_file.write(json.dumps(string_out_dict))
        
        for record in record_list:
            # print(f"{input_node_id} - {output_node_id}")
            record_data = record.data()
            #only grab the node labels and IDs
            record_data_last2 = {k: record_data[k] for k in list(record_data)[2:]}

            result_dict = {}
            for label, data in record_data_last2.items():
                # print(f"{label}: {data}")
                for j in range(len(nodes)):
                    if "ids" in label:
                        node_id = data[j]
                        result_dict[nodes[j]] = node_id
                    elif "names" in label:
                        node_name = data[j]
                        result_dict[nodes[j]+"_name"] = node_name
            # pp.pprint(result_dict)

            results_dict_list.append(result_dict)

results_df = pd.concat([results_df,pd.DataFrame.from_records(results_dict_list)])
print(results_df.shape)
display(results_df)
# results_df.to_csv(os.path.join(write_dir_cypher_robokopkg,'results.csv'))
results_df.to_csv(Cypher_ROBOKOPKG_results_file)
conn.close()

PPARA_STAT1_Hepatic fibrosis
{    'edge_1 - AGT -> biolink:regulates -> PPARA': 1,
     'edge_1 - ALB -> biolink:regulates -> PPARA': 1,
     'edge_1 - ARNT -> biolink:regulates -> PPARA': 1,
     'edge_1 - ARRB1 -> biolink:regulates -> PPARA': 1,
     'edge_1 - CCL2 -> biolink:regulates -> PPARA': 1,
     'edge_1 - CCN2 -> biolink:regulates -> PPARA': 1,
     'edge_1 - CNR2 -> biolink:regulates -> PPARA': 1,
     'edge_1 - CP -> biolink:regulates -> PPARA': 1,
     'edge_1 - CTNNB1 -> biolink:regulates -> PPARA': 1,
     'edge_1 - F2R -> biolink:regulates -> PPARA': 1,
     'edge_1 - HGF -> biolink:regulates -> PPARA': 1,
     'edge_1 - IL6 -> biolink:regulates -> PPARA': 1,
     'edge_1 - LGALS1 -> biolink:regulates -> PPARA': 1,
     'edge_1 - MMP2 -> biolink:regulates -> PPARA': 1,
     'edge_1 - NFE2L2 -> biolink:regulates -> PPARA': 1,
     'edge_1 - NPC1 -> biolink:regulates -> PPARA': 1,
     'edge_1 - PLAU -> biolink:regulates -> PPARA': 1,
     'edge_1 - PPARA -> biolink:cont

Unnamed: 0,n00,n00_name,n01,n01_name,n02,n02_name
0,NCBIGene:5465,PPARA,NCBIGene:6772,STAT1,HP:0001395,Hepatic fibrosis
1,NCBIGene:5465,PPARA,NCBIGene:183,AGT,HP:0001395,Hepatic fibrosis
2,NCBIGene:5465,PPARA,NCBIGene:5244,ABCB4,HP:0001395,Hepatic fibrosis
3,NCBIGene:5465,PPARA,NCBIGene:2147,F2,HP:0001395,Hepatic fibrosis
4,NCBIGene:5465,PPARA,MONDO:0005154,liver disorder,HP:0001395,Hepatic fibrosis
...,...,...,...,...,...,...
122,NCBIGene:133522,PPARGC1B,NCBIGene:6927,HNF1A,MONDO:0018840,isolated congenital hepatic fibrosis
123,NCBIGene:133522,PPARGC1B,NCBIGene:5972,REN,MONDO:0018840,isolated congenital hepatic fibrosis
124,NCBIGene:80205,CHD9,NCBIGene:183,AGT,HP:0001395,Hepatic fibrosis
125,NCBIGene:80205,CHD9,NCBIGene:5244,ABCB4,HP:0001395,Hepatic fibrosis


In [19]:
import pandas as pd

def get_summary_stats(results_df):
    results_df['Freq'] = 1

    occur_n00=(results_df
      .groupby(
           results_df[["n00"]]
           .apply(lambda x: str(sorted(x)), axis=1)
           )
      .agg({"n00": "first", "n00_name": "first", "Freq": "sum"}).sort_values(['Freq'], ascending = False)
      .reset_index(drop=True)
    )
    occur_n02=(results_df
      .groupby(
           results_df[["n02"]]
           .apply(lambda x: str(sorted(x)), axis=1)
           )
      .agg({"n02": "first","n02_name": "first", "Freq": "sum"}).sort_values(['Freq'], ascending = False)
      .reset_index(drop=True)
    )
    
    # Counting pairwise results
    occur_pair=(results_df
      .groupby(
           results_df[["n00", "n02"]]
           .apply(lambda x: str(sorted(x)), axis=1)
           )
      .agg({"n00": "first", "n00_name": "first", "n02": "first", "n02_name": "first", "Freq": "sum"}).sort_values(['Freq'], ascending = False)
      .reset_index(drop=True)
    )

    print("\nFrequency of n00 IDs")
    display(occur_n00)
    print("\nFrequency of n02 IDs")
    display(occur_n02)
    print("\nFrequency of n00-n02 pair IDs")
    display(occur_pair)
    
    print("\nMost frequent pair")
    display(occur_pair.head(1))
    return(occur_pair.head(1))
    
def compare_summary_stats(results_df1, results_df2, label1, label2):
    print(f"Summary stats for {label1}")
    most_frequent_pair1 = get_summary_stats(results_df1)
    most_frequent_pair1['label'] = label1
    print(f"\nSummary stats for {label2}")
    most_frequent_pair2 = get_summary_stats(results_df2)
    most_frequent_pair2['label'] = label2
    frames = [most_frequent_pair1,most_frequent_pair2]
    most_frequent_pairs = pd.concat(frames)
    display(most_frequent_pairs)

In [33]:
results_df_Cypher_Automat = pd.read_csv(Cypher_Automat_results_file)
display(results_df_Cypher_Automat)
most_common_pair_test = get_summary_stats(results_df_Cypher_Automat)

Unnamed: 0.1,Unnamed: 0,n00,n00_name,n01,n01_name,n02,n02_name
0,0,NCBIGene:5465,PPARA,NCBIGene:1490,CCN2,HP:0001395,Hepatic fibrosis
1,1,NCBIGene:5465,PPARA,MONDO:0005154,liver disorder,MONDO:0100430,fibrotic liver disease
2,2,NCBIGene:5465,PPARA,NCBIGene:5314,PKHD1,MONDO:0018840,isolated congenital hepatic fibrosis
3,3,NCBIGene:5465,PPARA,MONDO:0005154,liver disorder,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis
4,4,NCBIGene:10891,PPARGC1A,NCBIGene:213,ALB,HP:0001395,Hepatic fibrosis
5,5,NCBIGene:10891,PPARGC1A,NCBIGene:2200,FBN1,MONDO:0018840,isolated congenital hepatic fibrosis
6,6,NCBIGene:133522,PPARGC1B,NCBIGene:6347,CCL2,HP:0001395,Hepatic fibrosis
7,7,NCBIGene:133522,PPARGC1B,NCBIGene:183,AGT,MONDO:0018840,isolated congenital hepatic fibrosis
8,8,NCBIGene:80205,CHD9,NCBIGene:183,AGT,HP:0001395,Hepatic fibrosis
9,9,NCBIGene:80205,CHD9,NCBIGene:183,AGT,MONDO:0018840,isolated congenital hepatic fibrosis



Frequency of n00 IDs


Unnamed: 0,n00,n00_name,Freq
0,NCBIGene:5465,PPARA,4
1,NCBIGene:10891,PPARGC1A,2
2,NCBIGene:133522,PPARGC1B,2
3,NCBIGene:80205,CHD9,2



Frequency of n02 IDs


Unnamed: 0,n02,n02_name,Freq
0,HP:0001395,Hepatic fibrosis,4
1,MONDO:0018840,isolated congenital hepatic fibrosis,4
2,MONDO:0100430,fibrotic liver disease,1
3,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,1



Frequency of n00-n02 pair IDs


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,1
1,NCBIGene:133522,PPARGC1B,HP:0001395,Hepatic fibrosis,1
2,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,1
3,NCBIGene:80205,CHD9,HP:0001395,Hepatic fibrosis,1
4,NCBIGene:10891,PPARGC1A,MONDO:0018840,isolated congenital hepatic fibrosis,1
5,NCBIGene:133522,PPARGC1B,MONDO:0018840,isolated congenital hepatic fibrosis,1
6,NCBIGene:5465,PPARA,MONDO:0018840,isolated congenital hepatic fibrosis,1
7,NCBIGene:80205,CHD9,MONDO:0018840,isolated congenital hepatic fibrosis,1
8,NCBIGene:5465,PPARA,MONDO:0100430,fibrotic liver disease,1
9,NCBIGene:5465,PPARA,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,1



Most frequent pair


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,1


## TRAPI comparison results
ARA vs TRAPI

In [21]:
results_df_ARA = pd.read_csv(ARA_results_file)
results_df_TRAPI = pd.read_csv(TRAPI_results_file)
compare_summary_stats(results_df_ARA,results_df_TRAPI,"ARA","TRAPI (automat)")

Summary stats for ARA

Frequency of n00 IDs


Unnamed: 0,n00,n00_name,Freq
0,NCBIGene:5465,PPARA,112
1,UniProtKB:Q07869-1,peroxisome proliferator-activated receptor alp...,39
2,NCBIGene:10891,PPARGC1A,22
3,NCBIGene:133522,PPARGC1B,18
4,UniProtKB:Q9UBK2-1,peroxisome proliferator-activated receptor gam...,5
5,NCBIGene:80205,CHD9,4
6,UniProtKB:Q3L8U1-2,chromodomain-helicase-DNA-binding protein 9 is...,1



Frequency of n02 IDs


Unnamed: 0,n02,n02_name,Freq
0,HP:0001395,Hepatic fibrosis,133
1,MONDO:0018840,isolated congenital hepatic fibrosis,60
2,HP:0001405,Periportal fibrosis,2
3,HP:0012852,Hepatic bridging fibrosis,2
4,MONDO:0100430,fibrotic liver disease,2
5,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,2



Frequency of n00-n02 pair IDs


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,68
1,NCBIGene:5465,PPARA,MONDO:0018840,isolated congenital hepatic fibrosis,36
2,UniProtKB:Q07869-1,peroxisome proliferator-activated receptor alp...,HP:0001395,Hepatic fibrosis,33
3,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,14
4,NCBIGene:133522,PPARGC1B,HP:0001395,Hepatic fibrosis,12
5,NCBIGene:10891,PPARGC1A,MONDO:0018840,isolated congenital hepatic fibrosis,8
6,UniProtKB:Q07869-1,peroxisome proliferator-activated receptor alp...,MONDO:0018840,isolated congenital hepatic fibrosis,6
7,NCBIGene:133522,PPARGC1B,MONDO:0018840,isolated congenital hepatic fibrosis,6
8,UniProtKB:Q9UBK2-1,peroxisome proliferator-activated receptor gam...,HP:0001395,Hepatic fibrosis,3
9,NCBIGene:80205,CHD9,MONDO:0018840,isolated congenital hepatic fibrosis,2



Most frequent pair


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,68



Summary stats for TRAPI (automat)

Frequency of n00 IDs


Unnamed: 0,n00,n00_name,Freq
0,NCBIGene:5465,PPARA,112
1,UniProtKB:Q07869-1,peroxisome proliferator-activated receptor alp...,39
2,NCBIGene:10891,PPARGC1A,22
3,NCBIGene:133522,PPARGC1B,18
4,UniProtKB:Q9UBK2-1,peroxisome proliferator-activated receptor gam...,5
5,NCBIGene:80205,CHD9,4
6,UniProtKB:Q3L8U1-2,chromodomain-helicase-DNA-binding protein 9 is...,1



Frequency of n02 IDs


Unnamed: 0,n02,n02_name,Freq
0,HP:0001395,Hepatic fibrosis,133
1,MONDO:0018840,isolated congenital hepatic fibrosis,60
2,HP:0001405,Periportal fibrosis,2
3,HP:0012852,Hepatic bridging fibrosis,2
4,MONDO:0100430,fibrotic liver disease,2
5,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,2



Frequency of n00-n02 pair IDs


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,68
1,NCBIGene:5465,PPARA,MONDO:0018840,isolated congenital hepatic fibrosis,36
2,UniProtKB:Q07869-1,peroxisome proliferator-activated receptor alp...,HP:0001395,Hepatic fibrosis,33
3,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,14
4,NCBIGene:133522,PPARGC1B,HP:0001395,Hepatic fibrosis,12
5,NCBIGene:10891,PPARGC1A,MONDO:0018840,isolated congenital hepatic fibrosis,8
6,UniProtKB:Q07869-1,peroxisome proliferator-activated receptor alp...,MONDO:0018840,isolated congenital hepatic fibrosis,6
7,NCBIGene:133522,PPARGC1B,MONDO:0018840,isolated congenital hepatic fibrosis,6
8,UniProtKB:Q9UBK2-1,peroxisome proliferator-activated receptor gam...,HP:0001395,Hepatic fibrosis,3
9,NCBIGene:80205,CHD9,MONDO:0018840,isolated congenital hepatic fibrosis,2



Most frequent pair


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,68


Unnamed: 0,n00,n00_name,n02,n02_name,Freq,label
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,68,ARA
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,68,TRAPI (automat)


## Cypher comparison results
Automat vs Bolt

In [22]:
results_df_Cypher_Automat = pd.read_csv(Cypher_Automat_results_file)
results_df_Cypher_ROBOKOPKG = pd.read_csv(Cypher_ROBOKOPKG_results_file)
compare_summary_stats(results_df_Cypher_Automat,results_df_Cypher_ROBOKOPKG,"Cypher (automat)","Cypher (ROBOKOPKG)")

Summary stats for Cypher (automat)

Frequency of n00 IDs


Unnamed: 0,n00,n00_name,Freq
0,NCBIGene:5465,PPARA,4
1,NCBIGene:10891,PPARGC1A,2
2,NCBIGene:133522,PPARGC1B,2
3,NCBIGene:80205,CHD9,2



Frequency of n02 IDs


Unnamed: 0,n02,n02_name,Freq
0,HP:0001395,Hepatic fibrosis,4
1,MONDO:0018840,isolated congenital hepatic fibrosis,4
2,MONDO:0100430,fibrotic liver disease,1
3,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,1



Frequency of n00-n02 pair IDs


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,1
1,NCBIGene:133522,PPARGC1B,HP:0001395,Hepatic fibrosis,1
2,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,1
3,NCBIGene:80205,CHD9,HP:0001395,Hepatic fibrosis,1
4,NCBIGene:10891,PPARGC1A,MONDO:0018840,isolated congenital hepatic fibrosis,1
5,NCBIGene:133522,PPARGC1B,MONDO:0018840,isolated congenital hepatic fibrosis,1
6,NCBIGene:5465,PPARA,MONDO:0018840,isolated congenital hepatic fibrosis,1
7,NCBIGene:80205,CHD9,MONDO:0018840,isolated congenital hepatic fibrosis,1
8,NCBIGene:5465,PPARA,MONDO:0100430,fibrotic liver disease,1
9,NCBIGene:5465,PPARA,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,1



Most frequent pair


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,1



Summary stats for Cypher (ROBOKOPKG)

Frequency of n00 IDs


Unnamed: 0,n00,n00_name,Freq
0,NCBIGene:5465,PPARA,80
1,NCBIGene:10891,PPARGC1A,23
2,NCBIGene:133522,PPARGC1B,21
3,NCBIGene:80205,CHD9,3



Frequency of n02 IDs


Unnamed: 0,n02,n02_name,Freq
0,HP:0001395,Hepatic fibrosis,99
1,MONDO:0018840,isolated congenital hepatic fibrosis,24
2,MONDO:0100430,fibrotic liver disease,2
3,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,2



Frequency of n00-n02 pair IDs


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,60
1,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,19
2,NCBIGene:133522,PPARGC1B,HP:0001395,Hepatic fibrosis,18
3,NCBIGene:5465,PPARA,MONDO:0018840,isolated congenital hepatic fibrosis,16
4,NCBIGene:10891,PPARGC1A,MONDO:0018840,isolated congenital hepatic fibrosis,4
5,NCBIGene:133522,PPARGC1B,MONDO:0018840,isolated congenital hepatic fibrosis,3
6,NCBIGene:80205,CHD9,HP:0001395,Hepatic fibrosis,2
7,NCBIGene:5465,PPARA,MONDO:0100430,fibrotic liver disease,2
8,NCBIGene:5465,PPARA,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,2
9,NCBIGene:80205,CHD9,MONDO:0018840,isolated congenital hepatic fibrosis,1



Most frequent pair


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,60


Unnamed: 0,n00,n00_name,n02,n02_name,Freq,label
0,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,1,Cypher (automat)
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,60,Cypher (ROBOKOPKG)


## Automat comparison results
TRAPI vs Cypher

In [23]:
results_df_TRAPI = pd.read_csv(TRAPI_results_file)
results_df_Cypher_Automat = pd.read_csv(Cypher_Automat_results_file)
compare_summary_stats(results_df_TRAPI,results_df_Cypher_Automat,"TRAPI (automat)","Cypher (automat)")

Summary stats for TRAPI (automat)

Frequency of n00 IDs


Unnamed: 0,n00,n00_name,Freq
0,NCBIGene:5465,PPARA,112
1,UniProtKB:Q07869-1,peroxisome proliferator-activated receptor alp...,39
2,NCBIGene:10891,PPARGC1A,22
3,NCBIGene:133522,PPARGC1B,18
4,UniProtKB:Q9UBK2-1,peroxisome proliferator-activated receptor gam...,5
5,NCBIGene:80205,CHD9,4
6,UniProtKB:Q3L8U1-2,chromodomain-helicase-DNA-binding protein 9 is...,1



Frequency of n02 IDs


Unnamed: 0,n02,n02_name,Freq
0,HP:0001395,Hepatic fibrosis,133
1,MONDO:0018840,isolated congenital hepatic fibrosis,60
2,HP:0001405,Periportal fibrosis,2
3,HP:0012852,Hepatic bridging fibrosis,2
4,MONDO:0100430,fibrotic liver disease,2
5,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,2



Frequency of n00-n02 pair IDs


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,68
1,NCBIGene:5465,PPARA,MONDO:0018840,isolated congenital hepatic fibrosis,36
2,UniProtKB:Q07869-1,peroxisome proliferator-activated receptor alp...,HP:0001395,Hepatic fibrosis,33
3,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,14
4,NCBIGene:133522,PPARGC1B,HP:0001395,Hepatic fibrosis,12
5,NCBIGene:10891,PPARGC1A,MONDO:0018840,isolated congenital hepatic fibrosis,8
6,UniProtKB:Q07869-1,peroxisome proliferator-activated receptor alp...,MONDO:0018840,isolated congenital hepatic fibrosis,6
7,NCBIGene:133522,PPARGC1B,MONDO:0018840,isolated congenital hepatic fibrosis,6
8,UniProtKB:Q9UBK2-1,peroxisome proliferator-activated receptor gam...,HP:0001395,Hepatic fibrosis,3
9,NCBIGene:80205,CHD9,MONDO:0018840,isolated congenital hepatic fibrosis,2



Most frequent pair


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,68



Summary stats for Cypher (automat)

Frequency of n00 IDs


Unnamed: 0,n00,n00_name,Freq
0,NCBIGene:5465,PPARA,4
1,NCBIGene:10891,PPARGC1A,2
2,NCBIGene:133522,PPARGC1B,2
3,NCBIGene:80205,CHD9,2



Frequency of n02 IDs


Unnamed: 0,n02,n02_name,Freq
0,HP:0001395,Hepatic fibrosis,4
1,MONDO:0018840,isolated congenital hepatic fibrosis,4
2,MONDO:0100430,fibrotic liver disease,1
3,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,1



Frequency of n00-n02 pair IDs


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,1
1,NCBIGene:133522,PPARGC1B,HP:0001395,Hepatic fibrosis,1
2,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,1
3,NCBIGene:80205,CHD9,HP:0001395,Hepatic fibrosis,1
4,NCBIGene:10891,PPARGC1A,MONDO:0018840,isolated congenital hepatic fibrosis,1
5,NCBIGene:133522,PPARGC1B,MONDO:0018840,isolated congenital hepatic fibrosis,1
6,NCBIGene:5465,PPARA,MONDO:0018840,isolated congenital hepatic fibrosis,1
7,NCBIGene:80205,CHD9,MONDO:0018840,isolated congenital hepatic fibrosis,1
8,NCBIGene:5465,PPARA,MONDO:0100430,fibrotic liver disease,1
9,NCBIGene:5465,PPARA,UMLS:C2827436,Liver Disease Associated with Cystic Fibrosis,1



Most frequent pair


Unnamed: 0,n00,n00_name,n02,n02_name,Freq
0,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,1


Unnamed: 0,n00,n00_name,n02,n02_name,Freq,label
0,NCBIGene:5465,PPARA,HP:0001395,Hepatic fibrosis,68,TRAPI (automat)
0,NCBIGene:10891,PPARGC1A,HP:0001395,Hepatic fibrosis,1,Cypher (automat)
