# KGx Converter Draft  
Steps to build KGx file:  
1. Get Biolink Mapping  
2. Get SmartAPI edges  
3. Get Corresponding Biothings APIs  
4. Format Nodes and Edges  
5. Write to output files (.tsv, .json, etc.)

## Biolink mapping

Open question: can the Biolink category be inferred from the CURIE prefix using the Biolink Model Toolkit (`bmt`) package?

In [None]:
from bmt import Toolkit

toolkit = Toolkit()

def infer_category_from_curie(subject_id):
    """Pick a Biolink category for a CURIE based on its prefix.

    The text before the first ":" in ``subject_id`` is compared,
    case-insensitively, against every prefix list returned by
    ``toolkit.get_prefixes_by_category()``. The first category whose list
    contains the prefix is returned; ``"biolink:NamedThing"`` is the fallback
    when nothing matches.

    NOTE(review): confirm that the installed ``bmt`` Toolkit actually exposes
    ``get_prefixes_by_category`` -- this cell is part of the draft outline.
    """
    curie_prefix, _, _ = subject_id.partition(":")
    prefixes_by_category = toolkit.get_prefixes_by_category()

    matches = (
        category
        for category, known_prefixes in prefixes_by_category.items()
        if curie_prefix.lower() in [known.lower() for known in known_prefixes]
    )
    # First hit wins, mirroring a loop with an early return.
    return next(matches, "biolink:NamedThing")

In [None]:
def is_valid_biolink_category(category):
    """Return True when ``toolkit.get_class(category)`` yields something other than None."""
    looked_up = toolkit.get_class(category)
    if looked_up is None:
        return False
    return True


## SmartAPI Edges  
Using the `/metakg` endpoint with `consolidated=0`, we can get the starting data and prefixes.

In [None]:
# Draft outline only -- not runnable yet: the loop below contains no
# statements, just placeholder comments, so Python rejects this cell until the
# body is filled in.
# Planned flow, per the placeholders: iterate over data["hits"], record node
# and edge data for each hit, use the node to look up BioThings data, then
# return the full node and edge data.
# NOTE(review): the roles of api_name, biolink_mapping, node_set and edge_list
# are not shown in this draft -- confirm before implementing.
def get_smartapi_data(data, api_name, biolink_mapping, node_set, edge_list):
    for hit in data["hits"]:
        # TODO: set node and edge data
        # TODO: use node to get biothings data
    # TODO: return edge and node full data

## Biothings APIs

## 🧪 BioThings Data Fetcher
- Query BioThings APIs with GIVEN API names (*etc?*)

- Parse results to get:

    - Input NODE (e.g., gene)

    - Output NODE (e.g., drug)

    - Optional label fields

- Respect input/output semantic types

In [None]:
# Draft outline only -- not runnable yet: the loop has no statements and the
# indentation is inconsistent.
# NOTE(review): `tqdm`, `client` and `bt_query` are not defined or imported in
# the cells visible here, and the loop index `i` is never used -- confirm where
# these are meant to come from before implementing.
def get_biothings_api(subject_id, node_dict, node_set):
     # TODO: set query for specific node data (presumably assigns bt_query)

     for i, data in enumerate(tqdm(client.query(q=bt_query, fetch_all=True))): 
          # TODO: get node name and data
    # TODO: return node data

## Biolink Mapping (#bl-map) 

In [1]:
from bmt import Toolkit
BMT = Toolkit()

In [90]:
def create_biolink_mappings(data):
    """
    Create Biolink node and edge mappings from the given data.

    Args:
        data: Mapping with a "hits" list. Every hit must provide "subject",
            "object", "subject_prefix", "object_prefix", "predicate" and
            ``api.bte.query_operation.request_body.body.scopes``;
            ``api.bte.response_mapping`` is optional.

    Returns:
        A ``(node_mappings, edge_mappings)`` tuple.

        * ``node_mappings`` maps each subject/object name that resolves to a
          Biolink element to ``{"class_uri", "prefix", "identifier"}``. The
          first hit mentioning a name wins.
        * ``edge_mappings`` maps each predicate's ``slot_uri`` to
          ``{"to": [...], "from": [...]}`` lists of unique class URIs, kept in
          first-seen order.

        If a required key is missing, an error is printed and a pair of empty
        dicts is returned, so callers can always unpack two values.
    """
    node_mappings = {}
    edge_mappings = {}

    try:
        # Direct indexing is deliberate: a missing "hits" key is reported
        # through the KeyError handler below.
        for hit in data["hits"]:
            subject, object_ = hit["subject"], hit["object"]
            subject_prefix, object_prefix = hit["subject_prefix"], hit["object_prefix"]
            predicate = hit["predicate"]

            # Resolve each name once and reuse the result for both the edge
            # and the node mappings.
            subject_element = BMT.get_element(subject)
            object_element = BMT.get_element(object_)
            predicate_element = BMT.get_element(predicate)

            # An edge is recorded only when all three parts are known to
            # Biolink; an unknown class is skipped instead of raising
            # TypeError on a None element.
            if (
                predicate_element
                and subject_element is not None
                and object_element is not None
            ):
                entry = edge_mappings.setdefault(
                    predicate_element["slot_uri"], {"to": [], "from": []}
                )
                for direction, element in (("to", object_element), ("from", subject_element)):
                    uri = str(element["class_uri"])
                    # Membership check keeps the list unique with a stable order.
                    if uri not in entry[direction]:
                        entry[direction].append(uri)

            api_bte = hit["api"]["bte"]
            # Field(s) used to query by the subject identifier.
            s_ref = api_bte["query_operation"]["request_body"]["body"]["scopes"]
            response_mapping = api_bte.get("response_mapping") or {}

            # The object reference is taken from the first response-mapping
            # entry whose first key equals the object prefix.
            o_ref = None
            for value in response_mapping.values():
                if isinstance(value, dict) and (first_key := next(iter(value), None)) == object_prefix:
                    o_ref = value[first_key]
                    break

            for match, prefix, ref, element in (
                (subject, subject_prefix, s_ref, subject_element),
                (object_, object_prefix, o_ref, object_element),
            ):
                # Skip names already mapped or unknown to Biolink.
                if match in node_mappings or not element:
                    continue
                node_mappings[match] = {
                    "class_uri": element["class_uri"],
                    "prefix": prefix,
                    "identifier": ref,
                }

        print(f"Unique mappings: {len(node_mappings)}")
        return node_mappings, edge_mappings

    except KeyError as e:
        print(f"Error: Missing key in data - {e}")
        # Same shape as the success path so tuple-unpacking callers do not break.
        return {}, {}

In [71]:
# Inspect the Biolink element that the toolkit returns for the "affects" predicate.
BMT.get_element("affects")

SlotDefinition({
  'name': 'affects',
  'annotations': {'canonical_predicate': Annotation(tag='canonical_predicate',
                                      value=True,
                                      extensions={},
                                      annotations={})},
  'description': ('Describes an entity that has an effect on the state or quality of another '
     'existing entity.'),
  'notes': ["Use of the 'affects' predicate implies that the affected entity already "
    "exists, unlike predicates such as 'affects likelihood of' and 'prevents' "
    'where the effect concerns whether or when something may or may not come into '
    'existence.'],
  'in_subset': ['translator_minimal'],
  'from_schema': 'https://w3id.org/biolink/biolink-model',
  'exact_mappings': ['SEMMEDDB:AFFECTS', 'DGIdb:affects'],
  'related_mappings': ['DRUGBANK:pathway'],
  'narrow_mappings': ['CTD:prediction_hypothesis', 'GOREL:0001006', 'CTD:inferred', 'UPHENO:0000001',
    'RO:0002263', 'RO:0002264'

In [59]:
import requests
import pprint

In [86]:
# Fetch the SmartAPI MetaKG records (unconsolidated, with BTE annotations) for
# one API, restricted to SmallMolecule subjects.
api_name = "dgidb"
subject = "SmallMolecule"
# client = biothings_client.get_client("gene", url=f"https://biothings.ci.transltr.io/{api_name}")
url = "https://smart-api.info/api/metakg/?q=api.smartapi.id:e3edd325c76f2992a111b43a907a4870&bte=1&consolidated=0&subject=%22SmallMolecule%22&size=100"
# A timeout keeps the notebook from hanging on an unresponsive server, and the
# status check surfaces HTTP errors before the body is parsed as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

In [91]:
# Build the node/edge mappings from whatever `data` currently holds in the kernel.
# NOTE(review): the output recorded under this cell shows Gene/Disease mappings
# with "raresource" identifiers rather than anything for the SmallMolecule query
# defined in the cell above. The execution counts suggest `data` came from the
# rare_source cell (In [88]), which ran before this one (In [91]) -- re-run the
# cells in order to confirm.
create_biolink_mappings(data)

Unique mappings: 2


({'Gene': {'class_uri': 'biolink:Gene',
   'prefix': 'NCBIGene',
   'identifier': 'entrezgene'},
  'Disease': {'class_uri': 'biolink:Disease',
   'prefix': 'orphanet',
   'identifier': 'raresource.disease.orphanet'}},
 {'biolink:gene_associated_with_condition': {'to': ['biolink:Disease'],
   'from': ['biolink:Gene']},
  'biolink:condition_associated_with_gene': {'to': ['biolink:Gene'],
   'from': ['biolink:Disease']}})

In [88]:
# Fetch the SmartAPI MetaKG records (unconsolidated, with BTE annotations) for
# the API identified by api_id.
api_name = "rare_source"
api_id = "b772ebfbfa536bba37764d7fddb11d6f"
# client = biothings_client.get_client("gene", url=f"https://biothings.ci.transltr.io/{api_name}")
url = f"https://smart-api.info/api/metakg/?q=api.smartapi.id:{api_id}&bte=1&consolidated=0&size=100"
print(url)
# A timeout keeps the notebook from hanging on an unresponsive server, and the
# status check surfaces HTTP errors before the body is parsed as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()
# Fresh toolkit instance used by create_biolink_mappings.
BMT = Toolkit()

https://smart-api.info/api/metakg/?q=api.smartapi.id:b772ebfbfa536bba37764d7fddb11d6f&bte=1&consolidated=0&size=100


In [92]:
# Build node/edge mappings from the `data` currently in the kernel; the notebook
# displays the returned (node_mappings, edge_mappings) pair below.
create_biolink_mappings(data)

Unique mappings: 2


({'Gene': {'class_uri': 'biolink:Gene',
   'prefix': 'NCBIGene',
   'identifier': 'entrezgene'},
  'Disease': {'class_uri': 'biolink:Disease',
   'prefix': 'orphanet',
   'identifier': 'raresource.disease.orphanet'}},
 {'biolink:gene_associated_with_condition': {'to': ['biolink:Disease'],
   'from': ['biolink:Gene']},
  'biolink:condition_associated_with_gene': {'to': ['biolink:Gene'],
   'from': ['biolink:Disease']}})

---

## Biothings API traversal

bt_client = 

In [None]:
# Work in progress -- not runnable yet: the `for` loop below has no statements
# (everything after it is commented out), so Python rejects this cell.
# The commented-out draft sketches a per-node lookup: choose a query term from
# the CURIE prefix (NCBIGene -> entrezgene, orphanet ->
# raresource.disease.orphanet), query the client, derive a node name, and store
# {"id", "name", "category"} in node_data keyed by subject_id.
# NOTE(review): `tqdm` and `client` are not defined or imported in the cells
# visible before this one -- confirm where they come from.
def get_biothings_api(subject_id, node_dict, node_data):
    for data in tqdm(client.query(q="__all__", fields="object,subject,predicate", fetch_all=True)):

    # if "NCBIGene" in subject_id:
    #     query_term = f"entrezgene:{subject_id.split(':')[1]}"
    # elif "orphanet" in subject_id:
    #     query_term = f"raresource.disease.orphanet:{subject_id.split(':')[1]}"

    # bt_data = []

    # for data in tqdm(client.query(q=query_term, fetch_all=True)):
    #     if "NCBIGene" in subject_id:
    #         node_name = data["description"]
    #     elif "orphanet" in subject_id:
    #         for data_dict in data["raresource"]["disease"]:
    #             if "orphanet" in data_dict and data_dict["orphanet"] == subject_id.split(":")[1]:
    #                 node_name = f'ORPHA:{data_dict["orphanet"]}'
    #                 break

    #     node = {
    #         "id": subject_id,
    #         "name": node_name,
    #         "category": node_dict
    #     }

    #     node_data[subject_id] = node

    # return node_data

In [None]:
# Set up the BioThings client for one API and fetch that API's SmartAPI MetaKG
# records (unconsolidated, with BTE annotations).
api_name = "rare_source"
api_id = "b772ebfbfa536bba37764d7fddb11d6f"
# NOTE(review): `biothings_client` is not imported in any cell visible here --
# confirm it is imported before this cell runs.
client = biothings_client.get_client(url=f"https://biothings.ci.transltr.io/{api_name}")
url = f"https://smart-api.info/api/metakg/?q=api.smartapi.id:{api_id}&bte=1&consolidated=0&size=100"
print(url)
# A timeout keeps the notebook from hanging on an unresponsive server, and the
# status check surfaces HTTP errors before the body is parsed as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()
# Fresh toolkit instance used by create_biolink_mappings.
BMT = Toolkit()