---  
# Build `x-bte` Schema Validation Report  
---

In [16]:
from controller.smartapi import SmartAPI
import json
from jsonschema import validate
from jsonschema.exceptions import ValidationError
import datetime
import yaml
from jsonschema import validate
import jsonschema.exceptions
from collections import defaultdict

--- 
## Validate with test files

### Load Schema

In [9]:
schema_bte = "/Users/nacosta/Documents/smartAPI/WORKING_BRANCH/add-validation/smartAPI/src/x-bte_schema_v3.json"
# Load the JSON file
with open(schema_bte, 'r') as f:
    schema = json.load(f)

# Print the JSON
#print(json.dumps(schema, indent=4))

### Load test documents 
- "Multiomics ClinicalTrials KP"

In [3]:
def load_json(test_doc):
    # Load the JSON document
    with open(test_doc, 'r') as file:
        document = json.load(file)
    # Print the JSON
    # print(json.dumps(document, indent=4))
    return document

In [4]:
multi_kp_doc = load_json("/Users/nacosta/Documents/smartAPI/WORKING_BRANCH/add-validation/smartAPI/src/x-bte_test_doc.json")
quickgo_doc = load_json("/Users/nacosta/Documents/smartAPI/WORKING_BRANCH/add-validation/smartAPI/src/quickgo_doc.json")

### Validate schema 

In [7]:
def validate_json(document, schema):
    try:
        validate(instance=document, schema=schema)
        print("The document is valid.")
    except ValidationError as e:
        print("The document is not valid. See below for more details.")
        print(str(e))


Validate with API Example

In [6]:
validate_json(quickgo_doc, schema)

The document is valid.


---

## Build Report  
**Gather x-bte related documents only**

`x-translator.team` = Service Provider -- has x-bte annotation 

In [2]:
query_data = {"type": "term", "body": {"tags.name": "translator"}}
doc_ct = 0

Write report for original schema

In [73]:
report_name="x-bte_schemav3_report_"+datetime.datetime.now().strftime("%Y%m%d%H%M%S")+".txt"

pass_count = 0
fail_count = 0
doc_ct = 0
fail_ids = []
fail_details = []
missing_properties = defaultdict(int)
error_categories = defaultdict(int)
error_summary = defaultdict(int)

# Function to validate JSON against a schema
def validate_json(data_doc, schema):
    global pass_count, fail_count
    try:
        # Load the YAML formatted string into a Python dictionary
        source_data = yaml.safe_load(data_doc)

        # Validate the entry against the schema
        validate(instance=source_data, schema=schema)
        pass_count += 1  # Increment pass counter

    except yaml.YAMLError as ye:
        fail_count += 1  # Increment fail counter
        fail_ids.append(smartapi._id)  # Append the failed entry's ID
        fail_details.append(f"YAML Error for ID {smartapi._id}: {ye}")
    except jsonschema.exceptions.ValidationError as ve:
        fail_count += 1  # Increment fail counter
        fail_ids.append(smartapi._id)  # Append the failed entry's ID
        error_message = f"Schema Validation Error for ID {smartapi._id}: {ve}"
        fail_details.append(error_message)
        # Check if error is about a missing required property
        if 'is a required property' in str(ve):
            missing_prop = str(ve).split("'")[1]  # Extract the missing property name
            missing_properties[missing_prop] += 1
        # Categorize and count other types of validation errors
        if 'enum' in str(ve):
            error_categories['Enum Constraint Violations'] += 1
        elif 'additionalProperties' in str(ve):
            error_categories['Unexpected Properties'] += 1
        else:
            error_categories['Other Errors'] += 1


In [10]:
# Open a file to write the report
with open(report_name, 'w') as report_file:
    # Traverse all SmartAPI entries with the given query
    for smartapi in SmartAPI.get_all(1000, query_data=query_data):
        doc_ct += 1
        # Decode the raw byte data to a string
        data_doc = smartapi.raw.decode('utf-8')
        validate_json(data_doc, schema)

    total_entries = pass_count + fail_count
    percent_passed = (pass_count / total_entries * 100) if total_entries else 0
    percent_failed = (fail_count / total_entries * 100) if total_entries else 0
    unique_error_ids = len(set(fail_ids))

    # Write the top summary statistics
    report_file.write(f"Validation Report Generated on {datetime.datetime.now()}\n")
    report_file.write("-------------------------------------------------\n")
    report_file.write(f"Total Entries Processed: {total_entries}\n")
    report_file.write(f"Total Passed: {pass_count} ({percent_passed:.2f}%)\n")
    report_file.write(f"Total Failed: {fail_count} ({percent_failed:.2f}%)\n\n")

    # Error Type Summary
    if error_summary:
        report_file.write("Error Type Summary:\n")
        for error_type, count in error_summary.items():
            report_file.write(f"{error_type}: {count}\n")

    report_file.write("\n")

    # Summary of Validation Error Categories
    if error_categories:
        report_file.write("Validation Error Categories Summary:\n")
        for category, count in error_categories.items():
            report_file.write(f"{category}: {count} times\n")

    report_file.write("\n")

    # Summary of Missing Required Properties
    if missing_properties:
        report_file.write("Missing Required Property Summary:\n")
        for prop, count in missing_properties.items():
            report_file.write(f"Missing '{prop}': {count} times\n")

    report_file.write("\n")

    # Summary of Most Common Errors
    if error_categories:
        most_common_error = max(error_categories, key=error_categories.get)
        report_file.write(f"Most Common Error Type: {most_common_error} ({error_categories[most_common_error]} occurrences)\n\n")

    # Summary of Most Common Missing Properties
    if missing_properties:
        most_common_missing_prop = max(missing_properties, key=missing_properties.get)
        report_file.write(f"Most Common Missing Property: '{most_common_missing_prop}' ({missing_properties[most_common_missing_prop]} times)\n\n")

    # Summary List of Error IDs
    if fail_ids:
        report_file.write("Summary List of Error IDs:\n")
        report_file.write(", ".join(fail_ids) + "\n\n")

    # Summary Count of Unique Error IDs
    if unique_error_ids > 0:
        report_file.write(f"Summary Count of Unique Error IDs: {unique_error_ids}\n\n")

    # Detailed Error Reports
    if fail_details:
        report_file.write("Detailed Error Reports:\n")
        report_file.write("-------------------------------------------------\n")
        for detail in fail_details:
            report_file.write(detail + "\n")
            report_file.write("-------------------------------------------------\n")
        report_file.write("\n")

# Optionally, you can also print the summary to the console
print(f"Validation Summary: {pass_count} passed, {fail_count} failed.")
if unique_error_ids > 0:
    print(f"Summary Count of Unique Error IDs: {unique_error_ids}")
if fail_ids:
    print(f"Failed Validation IDs: {', '.join(fail_ids)}")

Validation Summary: 127 passed, 2 failed.
Summary Count of Unique Error IDs: 2
Failed Validation IDs: 671b45c0301c8624abbd26ae78449ca2, cc857d5b7c8b7609b5bbb38ff990bfff


---

## Schema - Polished

In [3]:
schema_bte= "/Users/nacosta/Documents/smartAPI/WORKING_BRANCH/add-validation/smartAPI/src/x-bte_schema_final.json"
with open(schema_bte, 'r') as f:
    schema = json.load(f)


In [4]:
query_data = {"type": "term", "body": {"info.x-translator.team": "Service Provider"}}


In [17]:
# Function to validate JSON against a schema
def validate_json(data_doc, schema):
    global pass_count, fail_count
    try:
        # Load the YAML formatted string into a Python dictionary
        source_data = yaml.safe_load(data_doc)

        # Validate the entry against the schema
        validate(instance=source_data, schema=schema)
        pass_count += 1  # Increment pass counter

    except yaml.YAMLError as ye:
        fail_count += 1  # Increment fail counter
        fail_ids.append(smartapi._id)  # Append the failed entry's ID
        fail_details.append(f"YAML Error for ID {smartapi._id}: {ye}")
    except jsonschema.exceptions.ValidationError as ve:
        fail_count += 1  # Increment fail counter
        fail_ids.append(smartapi._id)  # Append the failed entry's ID
        error_message = f"Schema Validation Error for ID {smartapi._id}: {ve}"
        fail_details.append(error_message)
        # Check if error is about a missing required property
        if 'is a required property' in str(ve):
            missing_prop = str(ve).split("'")[1]  # Extract the missing property name
            missing_properties[missing_prop] += 1
        # Categorize and count other types of validation errors
        if 'enum' in str(ve):
            error_categories['Enum Constraint Violations'] += 1
        elif 'additionalProperties' in str(ve):
            error_categories['Unexpected Properties'] += 1
        else:
            error_categories['Other Errors'] += 1


In [19]:
# query_data = {
#     "type": "match",
#     "body": {
#         "info.x-translator.team": "Service Provider"
#     }
# }
report_name="report_x-bte_schema_val_CURRENT"+datetime.datetime.now().strftime("%Y%m%d%H%M%S")+".txt"

pass_count = 0
fail_count = 0
doc_ct = 0
fail_ids = []
fail_details = []
missing_properties = defaultdict(int)
error_categories = defaultdict(int)
error_summary = defaultdict(int)

schema_bte= "/Users/nacosta/Documents/smartAPI/WORKING_BRANCH/add-validation/smartAPI/src/smartapi_x-bte_schema.json"
with open(schema_bte, 'r') as f:
    schema = json.load(f)

# Open a file to write the report
with open(report_name, 'w') as report_file, open("x-bte_errors.txt", 'w') as error_file:
    # Traverse all SmartAPI entries with the given query
    for smartapi in SmartAPI.get_all(1000):
        doc_ct += 1
        try:
            # Decode the raw byte data to a string
            data_doc = smartapi.raw.decode('utf-8')
            # Check if "x-bte-kgs-operations" exists in the document
            # if "x-bte-kgs-operations:" in data_doc:
            validate_json(data_doc, schema)
            # else: 
            #     continue
        except Exception as e:
            error_file.write(f"\n----------\n{e}\n{data_doc}\n----------\n")
            
    total_entries = pass_count + fail_count
    percent_passed = (pass_count / total_entries * 100) if total_entries else 0
    percent_failed = (fail_count / total_entries * 100) if total_entries else 0
    unique_error_ids = len(set(fail_ids))

    # Write the top summary statistics
    report_file.write(f"Validation Report Generated on {datetime.datetime.now()}\n")
    report_file.write("-------------------------------------------------\n")
    report_file.write(f"Total Entries Processed: {total_entries}\n")
    report_file.write(f"Total Passed: {pass_count} ({percent_passed:.2f}%)\n")
    report_file.write(f"Total Failed: {fail_count} ({percent_failed:.2f}%)\n\n")

    # Error Type Summary
    if error_summary:
        report_file.write("Error Type Summary:\n")
        for error_type, count in error_summary.items():
            report_file.write(f"{error_type}: {count}\n")

    report_file.write("\n")

    # Summary of Validation Error Categories
    if error_categories:
        report_file.write("Validation Error Categories Summary:\n")
        for category, count in error_categories.items():
            report_file.write(f"{category}: {count} times\n")

    report_file.write("\n")

    # Summary of Missing Required Properties
    if missing_properties:
        report_file.write("Missing Required Property Summary:\n")
        for prop, count in missing_properties.items():
            report_file.write(f"Missing '{prop}': {count} times\n")

    report_file.write("\n")

    # Summary of Most Common Errors
    if error_categories:
        most_common_error = max(error_categories, key=error_categories.get)
        report_file.write(f"Most Common Error Type: {most_common_error} ({error_categories[most_common_error]} occurrences)\n\n")

    # Summary of Most Common Missing Properties
    if missing_properties:
        most_common_missing_prop = max(missing_properties, key=missing_properties.get)
        report_file.write(f"Most Common Missing Property: '{most_common_missing_prop}' ({missing_properties[most_common_missing_prop]} times)\n\n")

    # Summary List of Error IDs
    if fail_ids:
        report_file.write("Summary List of Error IDs:\n")
        report_file.write(", ".join(fail_ids) + "\n\n")

    # Summary Count of Unique Error IDs
    if unique_error_ids > 0:
        report_file.write(f"Summary Count of Unique Error IDs: {unique_error_ids}\n\n")

    # Detailed Error Reports
    if fail_details:
        report_file.write("Detailed Error Reports:\n")
        report_file.write("-------------------------------------------------\n")
        for detail in fail_details:
            report_file.write(detail + "\n")
            report_file.write("-------------------------------------------------\n")
        report_file.write("\n")

# Optionally, you can also print the summary to the console
print(f"Validation Summary: {pass_count} passed, {fail_count} failed.")
if unique_error_ids > 0:
    print(f"Summary Count of Unique Error IDs: {unique_error_ids}")
if fail_ids:
    print(f"Failed Validation IDs: {', '.join(fail_ids)}")

KeyboardInterrupt: 

In [20]:
# query_data = {
#     "type": "match",
#     "body": {
#         "info.x-translator.team": "Service Provider"
#     }
# }

schema_bte= "/Users/nacosta/Documents/smartAPI/WORKING_BRANCH/add-validation/smartAPI/src/smartapi_x-bte_schema.json"
with open(schema_bte, 'r') as f:
    schema = json.load(f)

report_name="report_x-bte_schema_val_CURRENT"+datetime.datetime.now().strftime("%Y%m%d%H%M%S")+".txt"

pass_count = 0
fail_count = 0
doc_ct = 0
fail_ids = []
fail_details = []
missing_properties = defaultdict(int)
error_categories = defaultdict(int)
error_summary = defaultdict(int)

# Open a file to write the report
with open(report_name, 'w') as report_file, open("x-bte_errors.txt", 'w') as error_file:
    # Traverse all SmartAPI entries with the given query
    for smartapi in SmartAPI.get_all(1000):
        doc_ct += 1
        try:
            # Decode the raw byte data to a string
            data_doc = smartapi.raw.decode('utf-8')
            # Check if "x-bte-kgs-operations" exists in the document
            # if "x-bte-kgs-operations:" in data_doc:
            validate_json(data_doc, schema)
            # else: 
            #     continue
        except Exception as e:
            error_file.write(f"\n----------\n{e}\n{data_doc}\n----------\n")
            
    total_entries = pass_count + fail_count
    percent_passed = (pass_count / total_entries * 100) if total_entries else 0
    percent_failed = (fail_count / total_entries * 100) if total_entries else 0
    unique_error_ids = len(set(fail_ids))

    # Write the top summary statistics
    report_file.write(f"Validation Report Generated on {datetime.datetime.now()}\n")
    report_file.write("-------------------------------------------------\n")
    report_file.write(f"Total Entries Processed: {total_entries}\n")
    report_file.write(f"Total Passed: {pass_count} ({percent_passed:.2f}%)\n")
    report_file.write(f"Total Failed: {fail_count} ({percent_failed:.2f}%)\n\n")

    # Error Type Summary
    if error_summary:
        report_file.write("Error Type Summary:\n")
        for error_type, count in error_summary.items():
            report_file.write(f"{error_type}: {count}\n")

    report_file.write("\n")

    # Summary of Validation Error Categories
    if error_categories:
        report_file.write("Validation Error Categories Summary:\n")
        for category, count in error_categories.items():
            report_file.write(f"{category}: {count} times\n")

    report_file.write("\n")

    # Summary of Missing Required Properties
    if missing_properties:
        report_file.write("Missing Required Property Summary:\n")
        for prop, count in missing_properties.items():
            report_file.write(f"Missing '{prop}': {count} times\n")

    report_file.write("\n")

    # Summary of Most Common Errors
    if error_categories:
        most_common_error = max(error_categories, key=error_categories.get)
        report_file.write(f"Most Common Error Type: {most_common_error} ({error_categories[most_common_error]} occurrences)\n\n")

    # Summary of Most Common Missing Properties
    if missing_properties:
        most_common_missing_prop = max(missing_properties, key=missing_properties.get)
        report_file.write(f"Most Common Missing Property: '{most_common_missing_prop}' ({missing_properties[most_common_missing_prop]} times)\n\n")

    # Summary List of Error IDs
    if fail_ids:
        report_file.write("Summary List of Error IDs:\n")
        report_file.write(", ".join(fail_ids) + "\n\n")

    # Summary Count of Unique Error IDs
    if unique_error_ids > 0:
        report_file.write(f"Summary Count of Unique Error IDs: {unique_error_ids}\n\n")

    # Detailed Error Reports
    if fail_details:
        report_file.write("Detailed Error Reports:\n")
        report_file.write("-------------------------------------------------\n")
        for detail in fail_details:
            report_file.write(detail + "\n")
            report_file.write("-------------------------------------------------\n")
        report_file.write("\n")

# Optionally, you can also print the summary to the console
print(f"Validation Summary: {pass_count} passed, {fail_count} failed.")
if unique_error_ids > 0:
    print(f"Summary Count of Unique Error IDs: {unique_error_ids}")
if fail_ids:
    print(f"Failed Validation IDs: {', '.join(fail_ids)}")

Validation Summary: 268 passed, 3 failed.
Summary Count of Unique Error IDs: 3
Failed Validation IDs: 671b45c0301c8624abbd26ae78449ca2, cc857d5b7c8b7609b5bbb38ff990bfff, 1f02d8b032f0732f41711fd5c637567f


In [90]:
for smartapi in SmartAPI.get_all(1000, query_data=query_data):
    print(smartapi._id)
    # print(smartapi.raw.de    # Decode the raw byte data to a string
    data_doc = smartapi.raw.decode('utf-8')
    # Check if "x-bte-kgs-operations" exists in the document
    if "x-bte-kgs-operations" in data_doc:
        print(f"'x-bte-kgs-operations' found in document {smartapi._id}")
    else:
        print(f"'x-bte-kgs-operations' not found in document {smartapi._id}")

    break

03283cc2b21c077be6794e1704b1d230
'x-bte-kgs-operations' found in document 03283cc2b21c077be6794e1704b1d230


---

### Handle Errors

---

View Errors

In [2]:
report='/Users/nacosta/Documents/smartAPI/WORKING_BRANCH/add-validation/smartAPI/src/x-bte_schemav3_report_20240726114935.txt'

In [4]:
import re

# Open the file
with open(report, 'r') as file:
    # Initialize a flag for the "Detailed Error Reports" section
    detailed_errors = False
    # Initialize a dictionary to store the error messages and IDs
    error_dict = {}
    # Initialize a dictionary to store the detailed error information
    error_info_dict = {}

    # Loop through each line in the file
    for line in file:
        # If the line contains "Detailed Error Reports", set the flag to True
        if "Detailed Error Reports" in line:
            detailed_errors = True

        # If the flag is True, process the line
        if detailed_errors:
            # If the line contains "Schema Validation Error for ID", extract the ID and error message
            if "Schema Validation Error for ID" in line:
                id = re.search(r"for ID (.*):", line).group(1)
                error = re.search(r": (.*)", line).group(1)
                # If the error message is already in the dictionary, append the ID to the list of IDs
                if error in error_dict:
                    error_dict[error].append(id)
                # If the error message is not in the dictionary, add it with the ID as the first item in the list of IDs
                else:
                    error_dict[error] = [id]
                    # Store the next 6 lines as the detailed error information
                    error_info_dict[error] = ''.join([next(file) for _ in range(6)])

# Print out the error messages, IDs, and detailed error information
for error, ids in error_dict.items():
    print(f"Error: {error}")
    print(f"IDs: {', '.join(ids)}")
    print(f"Details: {error_info_dict[error]}")
    print()

Error: [{'qInput': 'MONDO:0009346', 'oneOutput': 'MONDO:0021568'}] is not of type 'object'
IDs: 671b45c0301c8624abbd26ae78449ca2: [{'qInput': 'MONDO:0009346', 'oneOutput': 'MONDO, 671b45c0301c8624abbd26ae78449ca2: [{'qInput': 'MONDO:0009346', 'oneOutput': 'MONDO
Details: 
Failed validating 'type' in schema['properties']['components']['properties']['x-bte-kgs-operations']['patternProperties']['^[A-Za-z0-9_-]+$']['items']['properties']['response_mapping']['patternProperties']['^[A-Za-z0-9_:-]+$']:
    {'type': 'object'}

On instance['components']['x-bte-kgs-operations']['disease_arises_from_feature'][0]['response_mapping']['testExamples']:
    [{'oneOutput': 'MONDO:0021568', 'qInput': 'MONDO:0009346'}]


Error: 'supportBatch' is a required property
IDs: cc857d5b7c8b7609b5bbb38ff990bfff, cc857d5b7c8b7609b5bbb38ff990bfff
Details: 
Failed validating 'required' in schema['properties']['components']['properties']['x-bte-kgs-operations']['patternProperties']['^[A-Za-z0-9_-]+$']['items']:
    {

---