---  

# SmartAPI Schema Validation

---

In [26]:
import jsonschema
import json
from jsonschema import validate
import yaml
import datetime
from collections import Counter
from controller.smartapi import SmartAPI
import warnings
warnings.filterwarnings('ignore')

### Load and Set Schema 
SCHEMA: `smartapi_schemav2.json` - this schema file **does not require** a validation on `termsOfService` or `summary`.

In [27]:
raw_file="local_schema_folders/smartapi/smartapi_schemav2-path.json"
# Load the schema from a JSON file
with open(raw_file, 'r') as schema_file:
    schema = json.load(schema_file)

SCHEMA: `smartapi_schemav.json` - this schema file **does require** a validation on `termsOfService` or `summary`.

In [4]:
raw_file="smartapi_schema.json"
# Load the schema from a JSON file
with open(raw_file, 'r') as schema_file:
    schema = json.load(schema_file)


### Build Report

In [10]:
doc_ct=0
for smartapi in SmartAPI.get_all(1000):
    doc_ct+=1
doc_ct

271

Full Report

In [28]:
report_name="REPORT_smartapi_schema-FINAL_"+datetime.datetime.now().strftime("%Y%m%d%H%M%S")+".txt"

# Initialize counters and lists for reporting
pass_count = 0
fail_count = 0
fail_ids = []
fail_details = []
error_summary = Counter()
missing_properties = Counter()
error_categories = Counter()
from collections import defaultdict


In [29]:
with open(report_name, 'w') as report_file:

    # Initialize a dictionary to group errors by type
    grouped_errors = defaultdict(list)

    # Initialize counters and lists for skipped IDs
    skip_count = 0
    skip_ids = []
    skip_reasons = []

    # Traverse all SmartAPI entries
    for smartapi in SmartAPI.get_all(1000):  # Adjust the argument if needed to control the batch size
        try:
            # Decode the raw byte data to a string
            data_doc = smartapi.raw.decode('utf-8')

            # Load the YAML formatted string into a Python dictionary
            source_data = yaml.safe_load(data_doc)

            # Validate the entry against the schema
            validate(instance=source_data, schema=schema)
            pass_count += 1  # Increment pass counter

        except yaml.YAMLError as ye:
            fail_count += 1  # Increment fail counter
            fail_ids.append(smartapi._id)  # Append the failed entry's ID
            error_message = f"YAML Error: {ye}"
            grouped_errors[error_message].append(smartapi._id)
            
        except jsonschema.exceptions.ValidationError as ve:
            # Ignore errors related to "responsible developers"
            if 'responsible developers' in str(ve):
                skip_count += 1  # Increment skip counter
                skip_ids.append(smartapi._id)  # Append the skipped entry's ID
                skip_reasons.append("Skipped due to 'responsible developers' error")
                continue
            fail_count += 1  # Increment fail counter
            fail_ids.append(smartapi._id)  # Append the failed entry's ID
            error_message = f"Schema Validation Error: {ve}"
            grouped_errors[error_message].append(smartapi._id)

            # Additional processing for error categories
            if 'is a required property' in str(ve):
                missing_prop = str(ve).split("'")[1]  # Extract the missing property name
                missing_properties[missing_prop] += 1
            if 'enum' in str(ve):
                error_categories['Enum Constraint Violations'] += 1
            elif 'additionalProperties' in str(ve):
                error_categories['Unexpected Properties'] += 1
            else:
                error_categories['Other Errors'] += 1

    total_entries = pass_count + fail_count
    percent_passed = (pass_count / total_entries * 100) if total_entries else 0
    percent_failed = (fail_count / total_entries * 100) if total_entries else 0
    unique_error_ids = len(set(fail_ids))

    # Write the top summary statistics
    report_file.write(f"Validation Report Generated on {datetime.datetime.now()}\n")
    report_file.write("-------------------------------------------------\n")
    report_file.write(f"Total Entries Processed: {total_entries}\n")
    report_file.write(f"Total Passed: {pass_count} ({percent_passed:.2f}%)\n")
    report_file.write(f"Total Failed: {fail_count} ({percent_failed:.2f}%)\n")
    report_file.write(f"Total Skipped: {skip_count}\n\n")

    # Error Type Summary
    if error_summary:
        report_file.write("Error Type Summary:\n")
        for error_type, count in error_summary.items():
            report_file.write(f"{error_type}: {count}\n")

    report_file.write("\n")

    # Summary of Validation Error Categories
    if error_categories:
        report_file.write("Validation Error Categories Summary:\n")
        for category, count in error_categories.items():
            report_file.write(f"{category}: {count} times\n")

    report_file.write("\n")

    # Summary of Missing Required Properties
    if missing_properties:
        report_file.write("Missing Required Property Summary:\n")
        for prop, count in missing_properties.items():
            report_file.write(f"Missing '{prop}': {count} times\n")

    report_file.write("\n")

    # Summary of Most Common Errors
    if error_categories:
        most_common_error = max(error_categories, key=error_categories.get)
        report_file.write(f"Most Common Error Type: {most_common_error} ({error_categories[most_common_error]} occurrences)\n\n")

    # Summary of Most Common Missing Properties
    if missing_properties:
        most_common_missing_prop = max(missing_properties, key=missing_properties.get)
        report_file.write(f"Most Common Missing Property: '{most_common_missing_prop}' ({missing_properties[most_common_missing_prop]} times)\n\n")

    # Grouped Error Details
    if grouped_errors:
        report_file.write("Grouped Error Details:\n")
        report_file.write("-------------------------------------------------\n")
        for error, ids in grouped_errors.items():
            report_file.write(f"{error} occurred for the following IDs:\n")
            report_file.write(", ".join(ids) + "\n")
            report_file.write("-------------------------------------------------\n")
        report_file.write("\n")

    # Skipped IDs and Reasons
    if skip_count > 0:
        report_file.write("Skipped IDs and Reasons:\n")
        report_file.write("-------------------------------------------------\n")
        for i in range(len(skip_ids)):
            report_file.write(f"ID: {skip_ids[i]}, Reason: {skip_reasons[i]}\n")
            report_file.write("-------------------------------------------------\n")
        report_file.write("\n")

    # Optionally, you can also print the summary to the console
    print(f"Validation Summary: {pass_count} passed, {fail_count} failed.")
    if unique_error_ids > 0:
        print(f"Summary Count of Unique Error IDs: {unique_error_ids}")
    if fail_ids:
        print(f"Failed Validation IDs: {', '.join(fail_ids)}")
    if skip_count > 0:
        print(f"Skipped Validation IDs: {', '.join(skip_ids)}")
        print("Skipped Reasons:")
        for i in range(len(skip_ids)):
            print(f"ID: {skip_ids[i]}, Reason: {skip_reasons[i]}")


Validation Summary: 243 passed, 0 failed.
Skipped Validation IDs: edeb26858bd27d0322af93e7a9e08761, 1138c3297e8e403b6ac10cff5609b319, 1d288b3a3caf75d541ffaae3aab386c8, b772ebfbfa536bba37764d7fddb11d6f, 34bad236d77bea0a0ee6c6cba5be54a6, b99c6dd64abcefe87dcd0a51c249ee6d, f1b8f64c316a01d1722f0fb842499fe5, 02af7d098ab304e80d6f4806c3527027, e481efd21f8e8c1deac05662439c2294, 38e9e5169a72aee3659c9ddba956790d, f339b28426e7bf72028f60feefcd7465, 03283cc2b21c077be6794e1704b1d230, b48c34df08d16311e3bca06b135b828d, cc857d5b7c8b7609b5bbb38ff990bfff, 77ed27f111262d0289ed4f4071faa619, a5b0ec6bfde5008984d4b6cde402d61f, a7f784626a426d054885a5f33f17d3f8, 68f12100e74342ae0dd5013d5f453194, 32f36164fabed5d3abe6c2fd899c9418, 55a223c6c6e0291dbd05f2faf27d16f4, ec6d76016ef40f284359d17fbf78df20, 1f47552dabd67351d4c625adb0a10d00, 5a4c41bf2076b469a0e9cfcf2f2b8f29, e3edd325c76f2992a111b43a907a4870, 316eab811fd9ef1097df98bcaa9f7361, e9eb40ff7ad712e4e6f4f04b964b5966, 00fb85fc776279163199e6c50f6ddfc6, 895ec14a3650ec7a

Summary Report

In [36]:
raw_file="local_schema_folders/smartapi/smartapi_schemav2-path.json"
# Load the schema from a JSON file
with open(raw_file, 'r') as schema_file:
    schema = json.load(schema_file)
    
report_name="REPORT_SUMMARY_smartapi_schema-xpath_"+datetime.datetime.now().strftime("%Y%m%d%H%M%S")+".txt"

# Initialize counters and lists for reporting
pass_count = 0
fail_count = 0
fail_ids = []
fail_details = []
error_summary = Counter()
missing_properties = Counter()
error_categories = Counter()

Report and ignore `responsible developers`

In [37]:
with open(report_name, 'w') as report_file:
    # Traverse all SmartAPI entries
    for smartapi in SmartAPI.get_all(1000):  # Adjust the argument if needed to control the batch size
        try:
            # Decode the raw byte data to a string
            data_doc = smartapi.raw.decode('utf-8')

            # Load the YAML formatted string into a Python dictionary
            source_data = yaml.safe_load(data_doc)

            # Validate the entry against the schema
            validate(instance=source_data, schema=schema)
            pass_count += 1  # Increment pass counter

        except yaml.YAMLError as ye:
            fail_count += 1  # Increment fail counter
            fail_ids.append(smartapi._id)  # Append the failed entry's ID
            fail_details.append(f"YAML Error for ID {smartapi._id}: {ye}")
        except jsonschema.exceptions.ValidationError as ve:
            # Ignore errors related to "responsible developers"
            if 'responsible developers' in str(ve):
                continue
            fail_count += 1  # Increment fail counter
            fail_ids.append(smartapi._id)  # Append the failed entry's ID
            error_message = f"Schema Validation Error for ID {smartapi._id}: {ve}"
            fail_details.append(error_message)
            # Check if error is about a missing required property
            if 'is a required property' in str(ve):
                missing_prop = str(ve).split("'")[1]  # Extract the missing property name
                missing_properties[missing_prop] += 1
            # Categorize and count other types of validation errors
            if 'enum' in str(ve):
                error_categories['Enum Constraint Violations'] += 1
            elif 'additionalProperties' in str(ve):
                error_categories['Unexpected Properties'] += 1
            else:
                error_categories['Other Errors'] += 1

    total_entries = pass_count + fail_count
    percent_passed = (pass_count / total_entries * 100) if total_entries else 0
    percent_failed = (fail_count / total_entries * 100) if total_entries else 0
    unique_error_ids = len(set(fail_ids))

    # Write the top summary statistics
    report_file.write(f"Validation Report Generated on {datetime.datetime.now()}\n")
    report_file.write("-------------------------------------------------\n")
    report_file.write(f"Total Entries Processed: {total_entries}\n")
    report_file.write(f"Total Passed: {pass_count} ({percent_passed:.2f}%)\n")
    report_file.write(f"Total Failed: {fail_count} ({percent_failed:.2f}%)\n\n")

    # Error Type Summary
    if error_summary:
        report_file.write("Error Type Summary:\n")
        for error_type, count in error_summary.items():
            report_file.write(f"{error_type}: {count}\n")

    report_file.write("\n")

    # Summary of Validation Error Categories
    if error_categories:
        report_file.write("Validation Error Categories Summary:\n")
        for category, count in error_categories.items():
            report_file.write(f"{category}: {count} times\n")

    report_file.write("\n")

    # Summary of Missing Required Properties
    if missing_properties:
        report_file.write("Missing Required Property Summary:\n")
        for prop, count in missing_properties.items():
            report_file.write(f"Missing '{prop}': {count} times\n")

    report_file.write("\n")

    # Summary of Most Common Errors
    if error_categories:
        most_common_error = max(error_categories, key=error_categories.get)
        report_file.write(f"Most Common Error Type: {most_common_error} ({error_categories[most_common_error]} occurrences)\n\n")

    # Summary of Most Common Missing Properties
    if missing_properties:
        most_common_missing_prop = max(missing_properties, key=missing_properties.get)
        report_file.write(f"Most Common Missing Property: '{most_common_missing_prop}' ({missing_properties[most_common_missing_prop]} times)\n\n")

    # Summary List of Error IDs
    if fail_ids:
        report_file.write("Summary List of Error IDs:\n")
        report_file.write(", ".join(fail_ids) + "\n\n")

    # Summary Count of Unique Error IDs
    if unique_error_ids > 0:
        report_file.write(f"Summary Count of Unique Error IDs: {unique_error_ids}\n\n")

    # # Detailed Error Reports
    # if fail_details:
    #     report_file.write("Detailed Error Reports:\n")
    #     report_file.write("-------------------------------------------------\n")
    #     for detail in fail_details:
    #         report_file.write(detail + "\n")
    #         report_file.write("-------------------------------------------------\n")
    #     report_file.write("\n")

# Optionally, you can also print the summary to the console
print(f"Validation Summary: {pass_count} passed, {fail_count} failed.")
if unique_error_ids > 0:
    print(f"Summary Count of Unique Error IDs: {unique_error_ids}")
if fail_ids:
    print(f"Failed Validation IDs: {', '.join(fail_ids)}")

Validation Summary: 243 passed, 0 failed.



### Summary Schema Comparison report:

In [9]:

# Initialize counters and lists for reporting
pass_count = 0
fail_count = 0
fail_ids = []
fail_details = []
error_summary = Counter()
missing_properties = Counter()
error_categories = Counter()

In [8]:
# List of schema files
schema_files = ["smartapi_schemav1.json", "smartapi_schemav2-path.json", "smartapi_schemav2-xpath.json"]

# Create a report file
report_name = "report_smartapi_summary_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + ".txt"
with open(report_name, 'w') as report_file:

    # Initialize overall counters for the summary statistics
    overall_pass_count = 0
    overall_fail_count = 0
    overall_error_categories = {}
    overall_missing_properties = {}

    # Loop through each schema file
    for raw_file in schema_files:
        # Initialize counters for the summary statistics per schema
        schema_pass_count = 0
        schema_fail_count = 0
        schema_error_categories = {}
        schema_missing_properties = {}

        # Load the schema from a JSON file
        with open(raw_file, 'r') as schema_file:
            schema = json.load(schema_file)

        # Run validation twice, once with "responsible developers" error and once ignoring it
        for ignore_dev_errors in [False, True]:
            # Initialize counters for reporting
            pass_count = 0
            fail_count = 0

            # Traverse all SmartAPI entries
            for smartapi in SmartAPI.get_all(1000):  # Adjust the argument if needed to control the batch size
                try:
                    # Decode the raw byte data to a string
                    data_doc = smartapi.raw.decode('utf-8')

                    # Load the YAML formatted string into a Python dictionary
                    source_data = yaml.safe_load(data_doc)

                    # Validate the entry against the schema
                    validate(instance=source_data, schema=schema)
                    pass_count += 1  # Increment pass counter

                except yaml.YAMLError as ye:
                    fail_count += 1  # Increment fail counter
                except jsonschema.exceptions.ValidationError as ve:
                    # Ignore errors related to "responsible developers" if ignore_dev_errors is True
                    if ignore_dev_errors and 'responsible developers' in str(ve):
                        continue

                    # Update error categories and missing properties
                    error_msg = str(ve)
                    if 'enum' in error_msg:
                        category = "Enum Constraint Violations"
                    else:
                        category = "Other Errors"

                    if category not in schema_error_categories:
                        schema_error_categories[category] = 0
                    schema_error_categories[category] += 1

                    if 'is a required property' in error_msg:
                        missing_property = error_msg.split("'")[1]
                        if missing_property not in schema_missing_properties:
                            schema_missing_properties[missing_property] = 0
                        schema_missing_properties[missing_property] += 1

                    fail_count += 1  # Increment fail counter

            total_entries = pass_count + fail_count
            percent_passed = (pass_count / total_entries * 100) if total_entries else 0
            percent_failed = (fail_count / total_entries * 100) if total_entries else 0

            # Update the summary statistics for the schema
            schema_pass_count += pass_count
            schema_fail_count += fail_count

            # Write the summary statistics for the current validation run to the report file
            report_file.write(f"Validation Report for {raw_file} (Ignoring 'responsible developers' errors: {ignore_dev_errors}) Generated on {datetime.datetime.now()}\n")
            report_file.write("-------------------------------------------------\n")
            report_file.write(f"Total Entries Processed: {total_entries}\n")
            report_file.write(f"Total Passed: {pass_count} ({percent_passed:.2f}%)\n")
            report_file.write(f"Total Failed: {fail_count} ({percent_failed:.2f}%)\n\n")

        # Write the schema-level summary to the report file
        report_file.write(f"Summary for {raw_file}:\n")
        report_file.write("-------------------------------------------------\n")

        # Summary of Validation Error Categories
        if schema_error_categories:
            report_file.write("Validation Error Categories Summary:\n")
            for category, count in schema_error_categories.items():
                report_file.write(f"{category}: {count} times\n")
        report_file.write("\n")

        # Summary of Missing Required Properties
        if schema_missing_properties:
            report_file.write("Missing Required Property Summary:\n")
            for prop, count in schema_missing_properties.items():
                report_file.write(f"Missing '{prop}': {count} times\n")
        report_file.write("\n")

        # Summary of Most Common Errors
        if schema_error_categories:
            most_common_error = max(schema_error_categories, key=schema_error_categories.get)
            report_file.write(f"Most Common Error Type: {most_common_error} ({schema_error_categories[most_common_error]} occurrences)\n\n")

        # Summary of Most Common Missing Properties
        if schema_missing_properties:
            most_common_missing_prop = max(schema_missing_properties, key=schema_missing_properties.get)
            report_file.write(f"Most Common Missing Property: '{most_common_missing_prop}' ({schema_missing_properties[most_common_missing_prop]} times)\n\n")

        # Update overall summary statistics
        overall_pass_count += schema_pass_count
        overall_fail_count += schema_fail_count
        for category, count in schema_error_categories.items():
            if category not in overall_error_categories:
                overall_error_categories[category] = 0
            overall_error_categories[category] += count
        for prop, count in schema_missing_properties.items():
            if prop not in overall_missing_properties:
                overall_missing_properties[prop] = 0
            overall_missing_properties[prop] += count

        # Write the overall summary statistics to the report file
        report_file.write("Overall Summary:\n")
        report_file.write("-------------------------------------------------\n")
        report_file.write(f"Total Entries Processed: {overall_pass_count + overall_fail_count}\n")
        report_file.write(f"Total Passed: {overall_pass_count}\n")
        report_file.write(f"Total Failed: {overall_fail_count}\n\n")

            # Summary of Validation Error Categories
        if overall_error_categories:
            report_file.write("Overall Validation Error Categories Summary:\n")
            for category, count in overall_error_categories.items():
                report_file.write(f"{category}: {count} times\n")
        report_file.write("\n")

        # Summary of Missing Required Properties
        if overall_missing_properties:
            report_file.write("Overall Missing Required Property Summary:\n")
            for prop, count in overall_missing_properties.items():
                report_file.write(f"Missing '{prop}': {count} times\n")
        report_file.write("\n")

        # Summary of Most Common Errors
        if overall_error_categories:
            most_common_error = max(overall_error_categories, key=overall_error_categories.get)
            report_file.write(f"Most Common Error Type: {most_common_error} ({overall_error_categories[most_common_error]} occurrences)\n\n")

        # Summary of Most Common Missing Properties
        if overall_missing_properties:
            most_common_missing_prop = max(overall_missing_properties, key=overall_missing_properties.get)
            report_file.write(f"Most Common Missing Property: '{most_common_missing_prop}' ({overall_missing_properties[most_common_missing_prop]} times)\n\n")


In [7]:
schema_files = ["smartapi_schemav1.json","smartapi_schemav2-path.json", "smartapi_schemav2-xpath.json"]

# Create a report file
report_name = "report_smartapi_summary_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + ".txt"
error_file_name = "error_smartapi_summary_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + ".txt"

with open(report_name, 'w') as report_file, open(error_file_name, 'w') as error_file:

    # Write the title for the report
    report_file.write("SmartAPI Schema Validation Report\n")
    report_file.write("==================================\n\n")

    # Loop through each schema file
    for raw_file in schema_files:

        # Load the schema from a JSON file
        with open(raw_file, 'r') as schema_file:
            schema = json.load(schema_file)

        # Run validation twice, once with "responsible developers" error and once ignoring it
        for ignore_dev_errors in [False, True]:
            # Initialize counters for reporting
            pass_count = 0
            fail_count = 0
            # Initialize counters for the summary statistics per schema
            schema_pass_count = 0
            schema_fail_count = 0
            schema_error_categories = {}
            schema_missing_properties = {}

            try:    
                # Traverse all SmartAPI entries
                for smartapi in SmartAPI.get_all(1000):  # Adjust the argument if needed to control the batch size
                    try:
                        # Decode the raw byte data to a string
                        data_doc = smartapi.raw.decode('utf-8')

                        # Load the YAML formatted string into a Python dictionary
                        source_data = yaml.safe_load(data_doc)

                        # Validate the entry against the schema
                        validate(instance=source_data, schema=schema)
                        pass_count += 1  # Increment pass counter

                    except yaml.YAMLError as ye:
                        fail_count += 1  # Increment fail counter
                    except jsonschema.exceptions.ValidationError as ve:
                        # Ignore errors related to "responsible developers" if ignore_dev_errors is True
                        if ignore_dev_errors and 'responsible developers' in str(ve):
                            continue

                        # Update error categories and missing properties
                        error_msg = str(ve)
                        if 'enum' in error_msg:
                            category = "Enum Constraint Violations"
                        else:
                            category = "Other Errors"

                        if category not in schema_error_categories:
                            schema_error_categories[category] = 0
                        schema_error_categories[category] += 1

                        if 'is a required property' in error_msg:
                            missing_property = error_msg.split("'")[1]
                            if missing_property not in schema_missing_properties:
                                schema_missing_properties[missing_property] = 0
                            schema_missing_properties[missing_property] += 1

                        fail_count += 1  # Increment fail counter
            except Exception as e:  # Catch all exceptions
                error_file.write(f"Skipped entry ID: {smartapi.id}, Error: {str(e)}\n")
                fail_count += 1  # Increment fail counter

            total_entries = pass_count + fail_count
            percent_passed = (pass_count / total_entries * 100) if total_entries else 0
            percent_failed = (fail_count / total_entries * 100) if total_entries else 0

            # Update the summary statistics for the schema
            schema_pass_count += pass_count
            schema_fail_count += fail_count

            # Write the summary statistics for the current validation run to the report file
            report_file.write(f"Validation Report for {raw_file} (Ignoring 'responsible developers' errors: {ignore_dev_errors}) Generated on {datetime.datetime.now()}\n")
            report_file.write("-------------------------------------------------\n")
            report_file.write(f"Total Entries Processed: {total_entries}\n")
            report_file.write(f"Total Passed: {pass_count} ({percent_passed:.2f}%)\n")
            report_file.write(f"Total Failed: {fail_count} ({percent_failed:.2f}%)\n\n")

            # Write the overall summary statistics to the report file for the current schema
            report_file.write(f"Overall Summary for {raw_file}:\n")
            report_file.write(f"Total Entries Processed: {schema_pass_count + schema_fail_count}\n")
            report_file.write(f"Total Passed: {schema_pass_count}\n")
            report_file.write(f"Total Failed: {schema_fail_count}\n\n")

            # Summary of Missing Required Properties for the current schema
            if schema_missing_properties:
                report_file.write(f"Overall Missing Required Property Summary for {raw_file}:\n")
                for prop, count in schema_missing_properties.items():
                    report_file.write(f"Missing '{prop}': {count} times\n")
            report_file.write("\n")

---

### Inspection

In [None]:
from controller.smartapi import SmartAPI


### ERRORS

#### Missing Properties 


**Path**  
 e712b9eb07e637a00ae468f757ce2a1f,342e4cec92030d74efd84b61650fb0ea, 67cc0e21b6238472f6f1f00e6b7c32aa

In [None]:
error_list = ['e712b9eb07e637a00ae468f757ce2a1f', '342e4cec92030d74efd84b61650fb0ea', '67cc0e21b6238472f6f1f00e6b7c32aa']


In [None]:
for error_id in error_list:
    doc=SmartAPI.get(error_id)
    print(doc.raw.decode('utf-8'))
    print(type(doc.raw.decode('utf-8')))

openapi: 3.0.0
info:
  title: REST API for Gene ID Conversion
  version: "1.0.0"
  description: |
    Genomic and gene expression data is integral to biomolecular data analysis. The types of identifiers used for genes differ across different resources providing such data sets. The ability to use a single type of gene identifier is imperative for integrating data from two or more resources. This gene ID conversion tool facilitates the use of a common gene identifier. Gene SYMBOL is likely the most common gene ID type a user will search for. A general user does not know in advance whether a gene symbol is indeed a gene SYMBOL or ALIAS. This is often seen for a small percentage of genes with change in the genome/transcriptome version. This tool generates various types of gene IDs (Entrez gene ID, symbol, gene name, Ensembl ID, Uniprot ID, REFSEQ ID, etc.) for a given gene ID type (e.g., symbol or alias). This tool also generate URLs to various commonly used data resources (NCBI Entrez, En

---

In [None]:
report="/Users/nacosta/Documents/smartAPI/WORKING_BRANCH/add-validation/smartAPI/src/smartapi_schemav2_report_20240729165259.txt"

In [None]:
import re

# Open the file
with open(report, 'r') as file:
    # Initialize a flag for the "Detailed Error Reports" section
    detailed_errors = False
    # Initialize a dictionary to store the error messages and IDs
    error_dict = {}
    # Initialize a dictionary to store the detailed error information
    error_info_dict = {}

    # Loop through each line in the file
    for line in file:
        # If the line contains "Detailed Error Reports", set the flag to True
        if "Detailed Error Reports" in line:
            detailed_errors = True

        # If the flag is True, process the line
        if detailed_errors:
            # If the line contains "Schema Validation Error for ID", extract the ID and error message
            if "Schema Validation Error for ID" in line:
                id = re.search(r"for ID (.*):", line).group(1)
                error = re.search(r": (.*)", line).group(1)
                # If the error message is already in the dictionary, append the ID to the list of IDs
                if error in error_dict:
                    error_dict[error].append(id)
                # If the error message is not in the dictionary, add it with the ID as the first item in the list of IDs
                else:
                    error_dict[error] = [id]
                    # Store the next 6 lines as the detailed error information
                    error_info_dict[error] = ''.join([next(file) for _ in range(6)])

# Print out the error messages, IDs, and detailed error information
for error, ids in error_dict.items():
    print(f"Error: {error}")
    print(f"IDs: {', '.join(ids)}")
    print(f"Details: {error_info_dict[error]}")
    print()

Error: 'responsible developers' is not one of ['responsible organization', 'responsible developer', 'contributor', 'support']
IDs: 03283cc2b21c077be6794e1704b1d230, e3edd325c76f2992a111b43a907a4870, b99c6dd64abcefe87dcd0a51c249ee6d, 00fb85fc776279163199e6c50f6ddfc6, edeb26858bd27d0322af93e7a9e08761, a7f784626a426d054885a5f33f17d3f8, 1d288b3a3caf75d541ffaae3aab386c8, 34bad236d77bea0a0ee6c6cba5be54a6, 77ed27f111262d0289ed4f4071faa619, b772ebfbfa536bba37764d7fddb11d6f, e481efd21f8e8c1deac05662439c2294, e9eb40ff7ad712e4e6f4f04b964b5966, 55a223c6c6e0291dbd05f2faf27d16f4, b48c34df08d16311e3bca06b135b828d, f1b8f64c316a01d1722f0fb842499fe5, 32f36164fabed5d3abe6c2fd899c9418, 895ec14a3650ec7ad85959a2d1554e2f, 1138c3297e8e403b6ac10cff5609b319, 1f47552dabd67351d4c625adb0a10d00, f339b28426e7bf72028f60feefcd7465, 38e9e5169a72aee3659c9ddba956790d, 68f12100e74342ae0dd5013d5f453194, cc857d5b7c8b7609b5bbb38ff990bfff, 316eab811fd9ef1097df98bcaa9f7361, ec6d76016ef40f284359d17fbf78df20, 02af7d098ab304e80d6

---

### Update the translator repos 

In [None]:
import glob
import re
import os

base_dir = "/Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry"
pattern = re.compile(r'BioThings (\w+)')


Get the error API ids for the `responsible developers` typo error 

In [None]:
error_ids_res_devs = ['03283cc2b21c077be6794e1704b1d230', 'e3edd325c76f2992a111b43a907a4870', 'b99c6dd64abcefe87dcd0a51c249ee6d', '00fb85fc776279163199e6c50f6ddfc6', 'edeb26858bd27d0322af93e7a9e08761', 'a7f784626a426d054885a5f33f17d3f8', '1d288b3a3caf75d541ffaae3aab386c8', '34bad236d77bea0a0ee6c6cba5be54a6', '77ed27f111262d0289ed4f4071faa619', 'b772ebfbfa536bba37764d7fddb11d6f', 'e481efd21f8e8c1deac05662439c2294', 'e9eb40ff7ad712e4e6f4f04b964b5966', '55a223c6c6e0291dbd05f2faf27d16f4', 'b48c34df08d16311e3bca06b135b828d', 'f1b8f64c316a01d1722f0fb842499fe5', '32f36164fabed5d3abe6c2fd899c9418', '895ec14a3650ec7ad85959a2d1554e2f', '1138c3297e8e403b6ac10cff5609b319', '1f47552dabd67351d4c625adb0a10d00', 'f339b28426e7bf72028f60feefcd7465', '38e9e5169a72aee3659c9ddba956790d', '68f12100e74342ae0dd5013d5f453194', 'cc857d5b7c8b7609b5bbb38ff990bfff', '316eab811fd9ef1097df98bcaa9f7361', 'ec6d76016ef40f284359d17fbf78df20', '02af7d098ab304e80d6f4806c3527027', 'a5b0ec6bfde5008984d4b6cde402d61f', '5a4c41bf2076b469a0e9cfcf2f2b8f29']
api_dict={}
print(f"{len(error_ids_res_devs)} ids")

for id_ in error_ids_res_devs:
    doc=SmartAPI.get(id_)
    # print(doc.__dict__.keys())
    # print(doc.__dict__['_raw'])
    raw_output = doc.raw.decode('utf-8')
    yaml_doc = yaml.safe_load(raw_output)
    print(yaml_doc['info']['title'])
    api_dict[id_]=yaml_doc['info']['title']


28 ids
BioThings Rhea API
BioThings DGIdb API
BioThings BioPlanet Pathway-Gene API
BioThings DDInter API
BioThings PFOCR API
BioThings DISEASES API
BioThings SEMMEDDB API
BioThings GO Molecular Function API
BioThings MGIgene2phenotype API
BioThings RARe-SOURCE API
Biothings Therapeutic Target Database API
BioThings InnateDB API
BioThings BioPlanet Pathway-Disease API
BioThings SuppKG API
BioThings FooDB API
BioThings IDISK API
BioThings FoodData Central API
BioThings repoDB API
BioThings EBIgene2phenotype API
BioThings GO Cellular Component API
BioThings BindingDB API
BioThings AGR API
BioThings GO Biological Process API
BioThings GTRx API
BioThings UBERON API
Multiomics Wellness KP API
BioThings HPO API
Translator Annotation Service


In [None]:
len(api_dict)

28

Confirm the output of key: `['info']['contact']['x-role']` is the typo value - `responsible developers`

In [None]:
for id_ in error_ids_res_devs:
    doc=SmartAPI.get(id_)
    # print(doc.__dict__.keys())
    # print(doc.__dict__['_raw'])
    raw_output = doc.raw.decode('utf-8')
    yaml_doc = yaml.safe_load(raw_output)
    print(yaml_doc['info']['contact'])

{'email': 'help@biothings.io', 'name': 'BioThings Team', 'x-id': 'https://github.com/biothings', 'x-role': 'responsible developers'}
{'email': 'help@biothings.io', 'name': 'BioThings Team', 'x-id': 'https://github.com/biothings', 'x-role': 'responsible developers'}
{'email': 'help@biothings.io', 'name': 'BioThings Team', 'x-id': 'https://github.com/biothings', 'x-role': 'responsible developers'}
{'email': 'help@biothings.io', 'name': 'BioThings Team', 'x-id': 'https://github.com/biothings', 'x-role': 'responsible developers'}
{'email': 'help@biothings.io', 'name': 'BioThings Team', 'x-id': 'https://github.com/biothings', 'x-role': 'responsible developers'}
{'email': 'help@biothings.io', 'name': 'BioThings Team', 'x-id': 'https://github.com/biothings', 'x-role': 'responsible developers'}
{'email': 'help@biothings.io', 'name': 'BioThings Team', 'x-id': 'https://github.com/biothings', 'x-role': 'responsible developers'}
{'email': 'help@biothings.io', 'name': 'BioThings Team', 'x-id': 'htt

In [None]:
manual_edits=[]
for value in api_dict.values():
    match = pattern.search(value)
    if match:
        repo_name = match.group(1).lower()
        repo_path = os.path.join(base_dir, repo_name)
        if os.path.exists(repo_path):
            print(f"{value}: {repo_name}")
    else:
        manual_edits.append(value)
        print(f"****NOT FOUND:{value}****")

BioThings Rhea API: rhea
BioThings DGIdb API: dgidb
BioThings BioPlanet Pathway-Gene API: bioplanet
BioThings DDInter API: ddinter
BioThings PFOCR API: pfocr
BioThings DISEASES API: diseases
BioThings SEMMEDDB API: semmeddb
BioThings MGIgene2phenotype API: mgigene2phenotype
****NOT FOUND:Biothings Therapeutic Target Database API****
BioThings InnateDB API: innatedb
BioThings BioPlanet Pathway-Disease API: bioplanet
BioThings SuppKG API: suppkg
BioThings IDISK API: idisk
BioThings repoDB API: repodb
BioThings EBIgene2phenotype API: ebigene2phenotype
BioThings BindingDB API: bindingdb
BioThings AGR API: agr
BioThings GTRx API: gtrx
BioThings UBERON API: uberon
****NOT FOUND:Multiomics Wellness KP API****
BioThings HPO API: hpo
****NOT FOUND:Translator Annotation Service****


In [None]:
manual_edits

['Biothings Therapeutic Target Database API',
 'Multiomics Wellness KP API',
 'Translator Annotation Service']

Change the file

In [None]:

with open('error_log.txt', 'w') as error_file:
    for value in api_dict.values():
        match = pattern.search(value)
        if match:
            repo_name = match.group(1).lower()
            repo_path = os.path.join(base_dir, repo_name)
            if os.path.exists(repo_path):
                print(f"{value}: {repo_name}")
                for file in glob.glob(os.path.join(repo_path, "*.yml")) + glob.glob(os.path.join(repo_path, "*.yaml")):
                    try:
                        with open(file, 'r') as yaml_file:
                            yaml_doc = yaml.safe_load(yaml_file)
                            if 'x-role' in yaml_doc['info']['contact'] and yaml_doc['info']['contact']['x-role'] == 'responsible developers':
                                print(f"Before change 'x-role' in {file}: {yaml_doc['info']['contact']['x-role']}")
                                yaml_doc['info']['contact']['x-role'] = 'responsible developer'
                                print(f"After change 'x-role' in {file}: {yaml_doc['info']['contact']['x-role']}")
                        with open(file, 'w') as yaml_file:
                            yaml.safe_dump(yaml_doc, yaml_file)
                    except KeyError:
                        error_file.write(f"Error in {repo_name}: {value}\n")

BioThings Rhea API: rhea
Before change 'x-role' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/rhea/smartapi.yaml: responsible developers
After change 'x-role' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/rhea/smartapi.yaml: responsible developer
BioThings DGIdb API: dgidb
Before change 'x-role' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/dgidb/openapi.yml: responsible developers
After change 'x-role' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/dgidb/openapi.yml: responsible developer
Before change 'x-role' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/dgidb/smartapi.yml: responsible developers
After change 'x-role' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/dgidb/smartapi.yml: responsible developer
BioThings BioP

In [None]:
import os
import re
import tempfile

def replace_in_file(file_path, old_string, new_string):
    # Create temporary file read/write
    with tempfile.NamedTemporaryFile(mode='r+', delete=False) as temp_file:
        with open(file_path, 'r') as yaml_file:
            for line in yaml_file:
                # replace old_string with new_string
                temp_file.write(line.replace(old_string, new_string))
        # Copy the temp file to the original location
        os.replace(temp_file.name, file_path)

file_ct=0
# Use the function in your code
with open('error_log.txt', 'w') as error_file:
    for value in api_dict.values():
        match = pattern.search(value)
        if match:
            repo_name = match.group(1).lower()
            repo_path = os.path.join(base_dir, repo_name)
            if os.path.exists(repo_path):
                print(f"{value}: {repo_name}")
                for file in glob.glob(os.path.join(repo_path, "*.yml")) + glob.glob(os.path.join(repo_path, "*.yaml")):
                    try:
                        # Load the YAML file into a Python dictionary
                        with open(file, 'r') as yaml_file:
                            data = yaml.safe_load(yaml_file)

                        contact_name = data.get('info', {}).get('contact', {}).get('name')
                        # # Check if info.contact.name equals "Biothings Team"
                        if contact_name and "BioThings Team" in contact_name:
                            replace_in_file(file, 'responsible developers', 'responsible organization')
                            print(f"Replaced 'responsible developers' with 'responsible organization' in {file}")
                        else:
                            replace_in_file(file, 'responsible developers', 'responsible developer')
                            print(f"Replaced 'responsible developers' with 'responsible developer' in {file}")
                        file_ct += 1
                    except KeyError:
                        error_file.write(f"Error in {repo_name}: {value}\n")

BioThings Rhea API: rhea
Replaced 'responsible developers' with 'responsible organization' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/rhea/smartapi.yaml
BioThings DGIdb API: dgidb
Replaced 'responsible developers' with 'responsible organization' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/dgidb/openapi.yml
Replaced 'responsible developers' with 'responsible organization' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/dgidb/smartapi.yml
BioThings BioPlanet Pathway-Gene API: bioplanet
Replaced 'responsible developers' with 'responsible organization' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/bioplanet/bioplanet-pathway-gene.yaml
Replaced 'responsible developers' with 'responsible organization' in /Users/nacosta/Documents/smartAPI/WORKING_BRANCH/transl-api-reg/translator-api-registry/bioplanet/bioplanet-p

In [None]:
import os
import re
import tempfile

def replace_in_file(file_path, old_string, new_string):
    # Create temporary file read/write
    with tempfile.NamedTemporaryFile(mode='r+', delete=False) as temp_file:
        with open(file_path, 'r') as yaml_file:
            for line in yaml_file:
                # replace old_string with new_string
                temp_file.write(line.replace(old_string, new_string))
        # Copy the temp file to the original location
        os.replace(temp_file.name, file_path)

def search_and_replace(directory, old_string, new_string):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.yml') or file.endswith('.yaml'):
                file_path = os.path.join(root, file)
                replace_in_file(file_path, old_string, new_string)

# Use the function in your code
search_and_replace(base_dir, 'responsible developers', 'responsible developer')

In [None]:
file_ct

25

In [None]:

with open('error_log.txt', 'w') as error_file:
    for value in api_dict.values():
        match = pattern.search(value)
        if match:
            repo_name = match.group(1).lower()
            repo_path = os.path.join(base_dir, repo_name)
            if os.path.exists(repo_path):
                print(f"{value}: {repo_name}")
                for file in glob.glob(os.path.join(repo_path, "*.yml")) + glob.glob(os.path.join(repo_path, "*.yaml")):
                    try:
                        with open(file, 'r') as yaml_file:
                            yaml_doc = yaml.safe_load(yaml_file)
                            if 'x-role' in yaml_doc['info']['contact'] and yaml_doc['info']['contact']['x-role'] == 'responsible developers':
                                print(f"Before change 'x-role' in {file}: {yaml_doc['info']['contact']['x-role']}")
                                yaml_doc['info']['contact']['x-role'] = 'responsible developer'
                                print(f"After change 'x-role' in {file}: {yaml_doc['info']['contact']['x-role']}")
                        with open(file, 'w') as yaml_file:
                            yaml.safe_dump(yaml_doc, yaml_file)
                    except KeyError:
                        error_file.write(f"Error in {repo_name}: {value}\n")

---