In [22]:
# Import libraries
import yaml
import warnings
import pprint

from controller.smartapi import SmartAPI
from utils.metakg import parser


# Ignore all warnings
warnings.filterwarnings("ignore")


In [23]:
# Shared MetaKG parser instance; later cells call its
# get_non_TRAPI_metadatas() on each parsed SmartAPI document.
mkg_parser = parser.MetaKGParser()

---

**Tracking Key Frequencies in Non-TRAPI API and Query Operation Metadata from SmartAPI Documents**    


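- This cell processes SmartAPI documents and extracts the keys of the `query_operation` section (`edge['api']['bte']['query_operation']`) of every non-TRAPI MetaKG edge, **updating a set of unique keys found and tracking the frequency of each key across all edges.** The keys found, their count, and the underlying `query_operation` data are also stored in a dictionary keyed by document ID; because that entry is rewritten for every edge, it reflects the last edge processed for each document.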

In [4]:
import yaml
from collections import defaultdict

# Accumulators populated by the scan below.
mkg_non_trapi_dict = {}                 # SmartAPI document id -> query_operation summary
keys_found_no_map_full_set = set()      # every distinct query_operation key encountered
key_frequency_dict = defaultdict(int)   # key -> number of edges whose query_operation has it
edge_ct = 0                             # total number of edges visited

for smartapi in SmartAPI.get_all(1000):
    # Decode the stored raw bytes and parse them as YAML.
    raw_data = smartapi.raw.decode('utf-8')
    source_data = yaml.safe_load(raw_data)

    # Non-TRAPI MetaKG edges for this document; skip documents that yield none.
    metakg = mkg_parser.get_non_TRAPI_metadatas(data=source_data)
    if not metakg:
        continue

    for edge in metakg:
        query_op = edge['api']['bte']['query_operation']
        edge_list = list(query_op.keys())
        edge_ct += 1

        # Fold this edge's keys into the global set and the per-key tallies.
        keys_found_no_map_full_set.update(edge_list)
        for key in edge_list:
            key_frequency_dict[key] += 1

        # NOTE: assigned once per edge, so a document with several edges ends
        # up holding only the summary of the last edge visited.
        mkg_non_trapi_dict[smartapi._id] = {
            'query_operation': {
                'keys_found': edge_list,
                'key_ct': len(edge_list),
                'data': query_op,
            }
        }

In [5]:
# Display the total number of edges visited by the query_operation scan.
edge_ct

5754

In [6]:
# Display, for each query_operation key, how many edges contained it.
key_frequency_dict

defaultdict(int,
            {'params': 5752,
             'request_body': 5665,
             'path': 5754,
             'method': 5754,
             'server': 5754,
             'support_batch': 5670,
             'path_params': 14,
             'input_separator': 18})

**Tracking Filtered Response Mapping Key Frequencies in Non-TRAPI Metadata from SmartAPI Documents**

- This cell processes SmartAPI documents and extracts the keys from the `response_mapping` section of each non-TRAPI MetaKG edge, drops keys that consist solely of uppercase ASCII letters (e.g. `MESH`), and tracks the frequency of the remaining keys. It stores the filtered keys and their count in a dictionary keyed by document ID, while also maintaining a set of the unique keys found and a dictionary of their frequencies.

In [25]:
import yaml
from collections import defaultdict
import re

# Fresh accumulators for the response_mapping scan. The first two names are
# rebound here, so whatever was previously stored under them is discarded.
mkg_non_trapi_dict = {}                              # SmartAPI document id -> response_mapping summary
keys_found_no_map_full_set = set()                   # every distinct retained key
response_mapping_frequency_dict = defaultdict(int)   # retained key -> occurrence count
doc_count = 0                                        # number of documents iterated over

for doc_count, smartapi in enumerate(SmartAPI.get_all(1000), start=1):
    # Decode the stored raw bytes and parse them as YAML.
    raw_data = smartapi.raw.decode('utf-8')
    source_data = yaml.safe_load(raw_data)

    # Non-TRAPI MetaKG edges for this document; skip documents that yield none.
    metakg = mkg_parser.get_non_TRAPI_metadatas(data=source_data)
    if not metakg:
        continue

    for edge in metakg:
        response_mapping = edge['api']['bte']['response_mapping']

        for map_key, field_map in response_mapping.items():
            # Keep only keys that are not made up entirely of the letters A-Z.
            keys = [
                key for key in field_map.keys()
                if re.fullmatch(r'[A-Z]+', key) is None
            ]

            keys_found_no_map_full_set.update(keys)
            for key in keys:
                response_mapping_frequency_dict[key] += 1

            # NOTE: rewritten for every mapping entry of every edge, so the
            # stored keys_found / key_ct are those of the last entry processed
            # for the document, while 'data' is that edge's whole mapping.
            doc_entry = mkg_non_trapi_dict.setdefault(smartapi._id, {})
            doc_entry['response_mapping'] = {
                'keys_found': keys,
                'key_ct': len(keys),
                'data': response_mapping,
            }


In [8]:
# Report every response-mapping key together with its frequency, sorted by key.
for key, count in sorted(response_mapping_frequency_dict.items()):
    print(f" {key}: {count}")

# Report how many SmartAPI documents the scan iterated over.
print(f"Total number of documents processed: {doc_count}")


 CHEMBL.COMPOUND: 18
 ComplexPortal: 5
 ICD11: 10
 KEGG.PATHWAY: 1
 KEGG_COMPOUND: 11
 NCBIGene: 462
 PHARMGKB.DISEASE: 6
 PHARMGKB.DRUG: 11
 PHARMGKB.GENE: 7
 PHARMGKB.PATHWAYS: 4
 PUBCHEM.COMPOUND: 14
 PUBCHEM_COMPOUND: 11
 TTD.DRUG: 10
 TTD.TARGET: 4
 UniProtKB: 37
 biolink:OMOP: 24
 biolink:chi_squared_statistic: 8
 biolink:evidence_count: 2
 biolink:frequency_qualifier: 2
 biolink:has_count: 26
 biolink:has_quotient: 10
 biolink:has_total: 10
 biolink:max_research_phase: 2
 biolink:name: 24
 biolink:p_value: 8
 biolink:supporting_text: 5029
 edge-attributes: 292
 evidence: 3
 foodb.food: 1
 inTaxon: 1
 input_name: 5434
 name: 7
 ncats.bioplanet: 2
 orphanet: 2
 output_name: 5459
 pubmed: 3
 qualifiers: 3
 ref_clinicaltrials: 8
 ref_doi: 4
 ref_isbn: 2
 ref_pmcid: 8
 ref_pmid: 5125
 ref_url: 23
 source_url: 9
 trapi_sources: 306
Total number of documents processed: 271


In [9]:
# Keys treated as valid response-mapping fields.
response_map_keys = [
    "source_url",
    "edge-attributes",
    "trapi_sources",
    "ref_input",
    "ref_output",
    "input_name",
    "output_name",
]

# Report the frequency of each valid key, sorted by key; all other keys are skipped.
for key, count in sorted(response_mapping_frequency_dict.items()):
    if key not in response_map_keys:
        continue
    print(f" {key}: {count}")

# Report how many SmartAPI documents the scan iterated over.
print(f"Total number of documents processed: {doc_count}")


 edge-attributes: 292
 input_name: 5434
 output_name: 5459
 source_url: 9
 trapi_sources: 306
Total number of documents processed: 271


In [10]:
# Keys treated as valid response-mapping fields.
response_map_keys = [
    "source_url",
    "edge-attributes",
    "trapi_sources",
    "ref_input",
    "ref_output",
    "input_name",
    "output_name",
]

# Report the frequency of each valid key, sorted by key; all other keys are skipped.
for key, count in sorted(response_mapping_frequency_dict.items()):
    if key not in response_map_keys:
        continue
    print(f" {key}: {count}")

# Report how many SmartAPI documents the scan iterated over.
print(f"Total number of documents processed: {doc_count}")


 edge-attributes: 292
 input_name: 5434
 output_name: 5459
 source_url: 9
 trapi_sources: 306
Total number of documents processed: 271


In [None]:
# Attach each document's raw `x-bte-response-mapping` component to its entry in
# mkg_non_trapi_dict under the 'response_mapping_raw' key.
#
# Documents lacking that component are reported (their component keys and id
# are printed) and then skipped; previously the loop fell through to the lookup
# and aborted with a KeyError on the first such document.
for smartapi in SmartAPI.get_all(1000):
    doc_id = smartapi._id
    raw_data = smartapi.raw.decode('utf-8')
    source_data = yaml.safe_load(raw_data)

    # Get non-TRAPI metakg data; documents without any are ignored.
    metakg = mkg_parser.get_non_TRAPI_metadatas(data=source_data)
    if not metakg:
        continue

    # Tolerate documents that have no (or an empty) `components` section.
    components = source_data.get('components') or {}

    if "x-bte-response-mapping" not in components:
        # Diagnostic output for documents without a raw response mapping.
        print(components.keys())
        print(doc_id)
        continue

    # setdefault: the document may not have an entry yet (nothing was stored
    # for it by an earlier cell), which previously raised a KeyError here.
    doc_entry = mkg_non_trapi_dict.setdefault(doc_id, {})
    doc_entry['response_mapping_raw'] = components['x-bte-response-mapping']


---

View the edge

In [13]:
# Key of the mkg_non_trapi_dict entry to inspect (that dict is keyed by smartapi._id).
good_key="edeb26858bd27d0322af93e7a9e08761"

In [15]:
# Pretty-print everything recorded in mkg_non_trapi_dict for the selected key.
pprint.pprint(mkg_non_trapi_dict[good_key])

{'response_mapping': {'data': {'occurs_together_in_literature_with': {'MESH': 'associatedWith.mentions.diseases.mesh',
                                                                      'ref_pmcid': 'associatedWith.pmc',
                                                                      'ref_url': 'associatedWith.figureUrl',
                                                                      'source_url': 'associatedWith.pfocrUrl'}},
                      'key_ct': 3,
                      'keys_found': ['ref_url', 'ref_pmcid', 'source_url']}}


---