# Validate the SmartAPI specification against all registered API metadata

The task involves validating the SmartAPI specification against the registered API metadata. Compare the JSON schema defined in the SmartAPI specification (at the provided link) with the metadata of all registered APIs.

In [1]:
import jsonschema
import json

from controller.smartapi import SmartAPI


In [5]:
raw_file="smartapi_schemav2.json"
# Load the schema from a JSON file
with open(raw_file, 'r') as schema_file:
    schema = json.load(schema_file)

In [6]:
raw_file="smartapi_schema.json"
# Load the schema from a JSON file
with open(raw_file, 'r') as schema_file:
    schema = json.load(schema_file)


Dojo testing 

In [19]:
for smartapi in SmartAPI.get_all(1):
    print(dir(smartapi._doc))
    #smartapi.url # confirm with the API
    # print(smartapi.url)
    data_doc = smartapi.raw.decode()
    print(smartapi.raw)

    break  # Print the directory of the first object to get all possible callable methods and attributes

['_meta', '_raw', '_status', 'info', 'openapi', 'paths', 'servers', 'tags']
b'servers:\n  - url: \'https://ramp-api-alpha.ncats.io\'\n # - url: \'http://127.0.0.1:5762/\'\nopenapi: 3.0.3\ninfo:\n  description: Relational Database of Metabolic Pathways (RaMP) API\n  title: RaMP API v1.0.1\n  version: 1.0.1\n  contact:\n    name: Timothy Sheils\n    x-role: responsible developer\n    email: timothy.sheils@nih.gov\n    x-id: \'https://github.com/tsheils\'\n  termsOfService: https://rampdb.nih.gov\ntags:\n- name: NCATS-API\npaths:\n  /api/source_versions:\n    get:\n      summary: Return source version information\n      responses:\n        \'200\':\n          description: OK\n          content:\n            application/json:\n              schema:\n                type: object\n        \'500\':\n          description: Internal Server Error\n          content:\n            application/json:\n              schema:\n                type: string\n        default:\n          description: Defau

Build Summary

In [6]:
import jsonschema
from jsonschema import validate
import yaml
import datetime
from collections import Counter

# Assuming 'schema' is already defined and loaded as shown in previous examples.

# Initialize counters and lists for reporting
pass_count = 0
fail_count = 0
fail_ids = []
fail_details = []
error_summary = Counter()
missing_properties = Counter()
error_categories = Counter()

# Open a file to write the report
with open('validation_report.txt', 'w') as report_file:
    # Traverse all SmartAPI entries
    for smartapi in SmartAPI.get_all(1000):  # Adjust the argument if needed to control the batch size
        try:
            # Decode the raw byte data to a string
            data_doc = smartapi.raw.decode('utf-8')

            # Load the YAML formatted string into a Python dictionary
            source_data = yaml.safe_load(data_doc)

            # Validate the entry against the schema
            validate(instance=source_data, schema=schema)
            pass_count += 1  # Increment pass counter

        except yaml.YAMLError as ye:
            fail_count += 1  # Increment fail counter
            fail_ids.append(smartapi._id)  # Append the failed entry's ID
            fail_details.append(f"YAML Error for ID {smartapi._id}: {ye}")
        except jsonschema.exceptions.ValidationError as ve:
            fail_count += 1  # Increment fail counter
            fail_ids.append(smartapi._id)  # Append the failed entry's ID
            error_message = f"Schema Validation Error for ID {smartapi._id}: {ve}"
            fail_details.append(error_message)
            # Check if error is about a missing required property
            if 'is a required property' in str(ve):
                missing_prop = str(ve).split("'")[1]  # Extract the missing property name
                missing_properties[missing_prop] += 1
            # Categorize and count other types of validation errors
            if 'enum' in str(ve):
                error_categories['Enum Constraint Violations'] += 1
            elif 'additionalProperties' in str(ve):
                error_categories['Unexpected Properties'] += 1
            else:
                error_categories['Other Errors'] += 1

    total_entries = pass_count + fail_count
    percent_passed = (pass_count / total_entries * 100) if total_entries else 0
    percent_failed = (fail_count / total_entries * 100) if total_entries else 0

    # Write the top summary statistics
    report_file.write(f"Validation Report Generated on {datetime.datetime.now()}\n")
    report_file.write("-------------------------------------------------\n")
    report_file.write(f"Total Entries Processed: {total_entries}\n")
    report_file.write(f"Total Passed: {pass_count} ({percent_passed:.2f}%)\n")
    report_file.write(f"Total Failed: {fail_count} ({percent_failed:.2f}%)\n\n")

    # Detailed Error Reports
    if fail_details:
        report_file.write("Detailed Error Reports:\n")
        for detail in fail_details:
            report_file.write(detail + "\n")
            report_file.write("-------------------------------------------------\n")
        report_file.write("\n")

    # Error Type Summary
    report_file.write("Error Type Summary:\n")
    for error_type, count in error_summary.items():
        report_file.write(f"{error_type}: {count}\n")

    # Summary of Missing Required Properties
    if missing_properties:
        report_file.write("Missing Required Property Summary:\n")
        for prop, count in missing_properties.items():
            report_file.write(f"Missing '{prop}': {count} times\n")

    # Summary of Validation Error Categories
    if error_categories:
        report_file.write("Validation Error Categories Summary:\n")
        for category, count in error_categories.items():
            report_file.write(f"{category}: {count} times\n")

# Optionally, you can also print the summary to the console
print(f"Validation Summary: {pass_count} passed, {fail_count} failed.")
if fail_ids:
    print(f"Failed Validation IDs: {', '.join(fail_ids)}")


Validation Summary: 190 passed, 77 failed.
Failed Validation IDs: 283042abacfe3c6bcdc924c4a226ff98, 03283cc2b21c077be6794e1704b1d230, dc91716f44207d2e1287c727f281d339, e3edd325c76f2992a111b43a907a4870, 67cc0e21b6238472f6f1f00e6b7c32aa, b99c6dd64abcefe87dcd0a51c249ee6d, 00fb85fc776279163199e6c50f6ddfc6, edeb26858bd27d0322af93e7a9e08761, a7f784626a426d054885a5f33f17d3f8, 78f324f4ac6b9fafe9085d68c332354c, a80b9c70e756453d1ce8971b59fe1778, fb71332c85d38539ba64ae148a6e3df7, 2575e053d0a631433b447995e1bc9602, 1d288b3a3caf75d541ffaae3aab386c8, 36f82f05705c317bac17ddae3a0ea2f0, af364143267ad5235bf78c1511223875, d161d0a15812dc0667b1d332043ef88c, cd9fc0ca8cc6d9f56bd56a34766de791, 34bad236d77bea0a0ee6c6cba5be54a6, 77ed27f111262d0289ed4f4071faa619, 1c71f68839a44b1b857e79ae7f7e3381, b772ebfbfa536bba37764d7fddb11d6f, aaff750928be8fb09485e953a78e93a2, cecdb8aed0bbd2334784c89c75e8f805, e481efd21f8e8c1deac05662439c2294, e9eb40ff7ad712e4e6f4f04b964b5966, 7f9a1e42306c6fbc2e0ee66429df4fe4, 23f770568b92b7a8

Validate 1 document

In [3]:
for smartapi in SmartAPI.get_all(1):
    print("Raw data:", smartapi.raw[:100])  # Print the first 100 bytes to check the content

    try:
        # Decode the raw data to a string
        data_doc = smartapi.raw.decode('utf-8')
        print("Decoded data:", data_doc[:500])  # Print the first 500 characters of the decoded data

        # Convert the decoded string to a Python dictionary
        data_dict = json.loads(data_doc)
        
        # Now validate the dictionary against the schema
        jsonschema.validate(instance=data_dict, schema=schema)
        print("Validation successful!")
        break  # Validate only one item for testing purposes
    except json.JSONDecodeError as jde:
        print("JSON decoding error:", jde)
    except jsonschema.exceptions.ValidationError as ve:
        print("Validation error:", ve)
    except jsonschema.exceptions.SchemaError as se:
        print("Schema error:", se)
    except Exception as e:
        print("An unexpected error occurred:", e)


  es.search(index=self._index, body=self.to_dict(), **self._params)
  doc = es.get(index=cls._default_index(index), id=id, **kwargs)


Raw data: b'openapi: 3.0.1\ninfo:\n  contact:\n    email: edeutsch@systemsbiology.org\n  description: TRAPI 1.4 endp'
Decoded data: openapi: 3.0.1
info:
  contact:
    email: edeutsch@systemsbiology.org
  description: TRAPI 1.4 endpoint for the NCATS Biomedical Translator Reasoner called
    ARAX
  license:
    name: Apache 2.0
    url: http://www.apache.org/licenses/LICENSE-2.0.html
  termsOfService: https://github.com/RTXteam/RTX/blob/master/LICENSE
  title: ARAX Translator Reasoner - TRAPI 1.4.0
  version: 1.3.1
  x-translator:
    component: ARA
    team:
    - Expander Agent
    infores: infores:arax
    biolink-version
JSON decoding error: Expecting value: line 1 column 1 (char 0)


In [71]:
smartapi.raw

b'servers:\n  - url: \'https://ramp-api-alpha.ncats.io\'\n # - url: \'http://127.0.0.1:5762/\'\nopenapi: 3.0.3\ninfo:\n  description: Relational Database of Metabolic Pathways (RaMP) API\n  title: RaMP API v1.0.1\n  version: 1.0.1\n  contact:\n    name: Timothy Sheils\n    x-role: responsible developer\n    email: timothy.sheils@nih.gov\n    x-id: \'https://github.com/tsheils\'\n  termsOfService: https://rampdb.nih.gov\ntags:\n- name: NCATS-API\npaths:\n  /api/source_versions:\n    get:\n      summary: Return source version information\n      responses:\n        \'200\':\n          description: OK\n          content:\n            application/json:\n              schema:\n                type: object\n        \'500\':\n          description: Internal Server Error\n          content:\n            application/json:\n              schema:\n                type: string\n        default:\n          description: Default response.\n          content:\n            application/json:\n           

In [72]:
import json

for smartapi in SmartAPI.get_all(1):
    try:
        # Decode the byte string explicitly if the encoding might not be UTF-8
        decoded_data = smartapi._raw.decode('utf-8')  # Or use the appropriate encoding
        data_dict = json.loads(decoded_data)
        print(data_dict)
    except (UnicodeDecodeError, json.JSONDecodeError) as e:
        print("Error processing the JSON data:", e)
    break




Error processing the JSON data: Expecting value: line 1 column 1 (char 0)


#### BUILD REPORT

In [11]:
from model import ConsolidatedMetaKGDoc, MetaKGDoc
from controller.metakg import MetaKG


In [12]:
index = MetaKGDoc.Index.name


In [13]:
import jsonschema

for edge in SmartAPI.get_all_via_scan(size=1000, index=index):
    print(edge.keys())
    print(edge['_source'].keys())
    try:
        # Validate the Python dictionary 'edge' against the loaded 'schema'
        jsonschema.validate(instance=edge, schema=schema)
        print("Validation successful!")
        break  # validate one item -- for testing 
    except jsonschema.exceptions.ValidationError as ve:
        print("Validation error:", ve)
    except jsonschema.exceptions.SchemaError as se:
        print("Schema error:", se)


dict_keys(['_index', '_type', '_id', '_score', '_source', 'sort'])
dict_keys(['subject', 'object', 'predicate', 'api'])
Validation successful!


  break  # validate one item -- for testing


#### Write Report

In [14]:
import jsonschema
from jsonschema import validate
import json

# Assuming 'schema' is already defined and loaded as shown in previous examples.

# Initialize counters and lists for reporting
pass_count = 0
fail_count = 0
fail_ids = []

# Open a file to write the report
with open('validation_report.txt', 'w') as report_file:
    # Traverse all edges
    for edge in MetaKG.get_all_via_scan(size=1000, index=index):
        try:
            # Extract the relevant part of the edge for validation
            # Assuming that the part of the edge to be validated is under '_source'
            source_data = edge['_source']
            
            # Validate the edge against the schema
            validate(instance=source_data, schema=schema)
            pass_count += 1  # Increment pass counter

        except jsonschema.exceptions.ValidationError as ve:
            fail_count += 1  # Increment fail counter
            fail_ids.append(edge['_id'])  # Append the failed edge's ID
            report_file.write(f"Failed ID {edge['_id']}: {ve}\n")  # Write detailed error to file

    # After all edges are processed, write summary statistics to the report
    report_file.write(f"\nTotal Passed: {pass_count}\n")
    report_file.write(f"Total Failed: {fail_count}\n")
    report_file.write("Failed IDs: " + ", ".join(fail_ids) + "\n")

# Optionally, you can also print the summary to the console
print(f"Validation Summary: {pass_count} passed, {fail_count} failed.")
if fail_ids:
    print(f"Failed Validation IDs: {', '.join(fail_ids)}")


Validation Summary: 106828 passed, 0 failed.


---

In [2]:
raw_file="smartapi_schemav2.json"
# Load the schema from a JSON file
with open(raw_file, 'r') as schema_file:
    schema2 = json.load(schema_file)