From 3ccd8be733618baa2c469851662753c90d3ba1cf Mon Sep 17 00:00:00 2001 From: Michael Wood Date: Thu, 29 Feb 2024 10:16:28 +0000 Subject: [PATCH 01/10] setup: Add json-merge-patch dependency --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 99e2192..b57d850 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ 'rangedict', 'ijson', 'jsonschema<4', + 'json-merge-patch', ], extras_require={ 'perf': [ From 95f95b28637de6476ea3076de64d024ee6b048c1 Mon Sep 17 00:00:00 2001 From: Michael Wood Date: Thu, 29 Feb 2024 10:19:07 +0000 Subject: [PATCH 02/10] lib360dataquality/cove: Create new extensions mechanism This adds functionality to the Schema class to resolve a 360Giving JSON schema extension into the standard schema. The steps involved in this are: 1) Unflatten the input data (if needed) using the vanilla schema 2) Replace the external reference in the package schema with a downloaded version of the grants schema 3) If an extension is detected, merge in the extension 4) Re-unflatten the data with the new resolved schema file 5) Run common checks as normal The process codelists functionality has been extended so that codelists referenced by the extension can be loaded. 
--- cove/cove_360/views.py | 43 +++++-- lib360dataquality/cove/schema.py | 189 +++++++++++++++++++++++++++-- lib360dataquality/cove/settings.py | 7 +- 3 files changed, 224 insertions(+), 15 deletions(-) diff --git a/cove/cove_360/views.py b/cove/cove_360/views.py index cb8229e..25c13b3 100644 --- a/cove/cove_360/views.py +++ b/cove/cove_360/views.py @@ -4,6 +4,7 @@ import json import logging import re +import os from decimal import Decimal from cove.views import explore_data_context, cove_web_input_error @@ -20,7 +21,7 @@ from libcove.lib.converters import convert_spreadsheet, convert_json from libcove.lib.exceptions import CoveInputDataError -from lib360dataquality.cove.schema import Schema360 +from lib360dataquality.cove.schema import Schema360, ExtensionsError from lib360dataquality.cove.threesixtygiving import TEST_CLASSES from lib360dataquality.cove.threesixtygiving import common_checks_360 @@ -63,7 +64,6 @@ def explore_360(request, pk, template='cove_360/explore.html'): print("Cache hit") return render(request, template, cached_context) - schema_360 = Schema360() context, db_data, error = explore_data_context(request, pk) if error: return error @@ -91,6 +91,7 @@ def explore_360(request, pk, template='cove_360/explore.html'): upload_url = db_data.upload_url() file_name = db_data.original_file.file.name file_type = context['file_type'] + schema_360 = Schema360(upload_dir) if file_type == 'json': # open the data first so we can inspect for record package @@ -114,16 +115,44 @@ def explore_360(request, pk, template='cove_360/explore.html'): 'link_text': _('Try Again'), 'msg': _('360Giving JSON should have an object as the top level, the JSON you supplied does not.'), }) - context.update(convert_json(upload_dir, upload_url, file_name, schema_url=schema_360.schema_url, - request=request, flatten=request.POST.get('flatten'), - lib_cove_config=lib_cove_config)) + + extension_metadatas = schema_360.resolve_extension(json_data) + + 
context.update(convert_json(upload_dir, upload_url, file_name, schema_url=schema_360.schema_file, + request=request, flatten=request.POST.get('flatten'), + lib_cove_config=lib_cove_config)) else: - context.update(convert_spreadsheet(upload_dir, upload_url, file_name, file_type, lib_cove_config, schema_360.schema_url, - schema_360.pkg_schema_url)) + # Convert spreadsheet to json + context.update(convert_spreadsheet(upload_dir, upload_url, file_name, file_type, lib_cove_config, schema_360.schema_file, + schema_360.pkg_schema_file)) + with open(context['converted_path'], encoding='utf-8') as fp: json_data = json.load(fp, parse_float=Decimal) + try: + # Check data for presence of any schema extensions if exists re-convert using the newly patched schema + if extension_metadatas := schema_360.resolve_extension(json_data): + # Delete old converted data. If it is detected by libcove it will skip conversion (unflattening) + os.unlink(context["converted_path"]) + + context.update(convert_spreadsheet(upload_dir, upload_url, file_name, file_type, lib_cove_config, schema_360.schema_file, schema_360.pkg_schema_file)) + # Re-load the newly flattened data + with open(context['converted_path'], encoding='utf-8') as fp: + json_data = json.load(fp, parse_float=Decimal) + + context["extension_metadatas"] = extension_metadatas + except ExtensionsError as err: + raise CoveInputDataError(context={ + 'sub_title': _("Sorry, we can't process the data with the specified extension(s)"), + 'link': 'index', + 'link_text': _('Try Again'), + 'msg': _(format_html('We think you tried to upload data that uses an extension to the 360Giving standard. However there was a problem with the extension.' 
+ '\n\n Error message: {}', err)), + 'error': format(err) + }) + context = common_checks_360(context, upload_dir, json_data, schema_360) # Construct the 360Giving specific urls for codelists in the docs diff --git a/lib360dataquality/cove/schema.py b/lib360dataquality/cove/schema.py index 330b5d4..943671f 100644 --- a/lib360dataquality/cove/schema.py +++ b/lib360dataquality/cove/schema.py @@ -1,13 +1,188 @@ from urllib.parse import urljoin -from libcove.lib.common import SchemaJsonMixin +from libcove.lib.common import SchemaJsonMixin, get_schema_codelist_paths, load_core_codelists, load_codelist +from libcove.lib.tools import get_request from .settings import COVE_CONFIG as config +import requests +import json_merge_patch +import json +import os +from typing import Optional + +#FIXME - This will need changing when the extension goes live +EXTENSIONS_REGISTRY_BASE_URL = "https://raw.githubusercontent.com/ThreeSixtyGiving/extensions-registry/add-dei-extension/extensions/" + + +class ExtensionsError(Exception): + pass class Schema360(SchemaJsonMixin): - schema_host = config['schema_host'] - schema_name = config['schema_item_name'] - pkg_schema_name = config['schema_name'] - schema_url = urljoin(schema_host, schema_name) - pkg_schema_url = urljoin(schema_host, pkg_schema_name) - codelists = config['codelists_host'] + + codelists = config["codelists_host"] + schema_name = config["schema_item_name"] + pkg_schema_name = config["schema_name"] + pkg_schema_url = "" # required by libcove but not in use + schema_host = "" # required by libcove but not in use + extended = False # required by libcove but not in use + extension_codelist_urls = [] + + _pkg_schema_obj = {} + _schema_obj = {} + + def __init__(self, working_dir) -> None: + self.working_dir = working_dir + + self.schema_host = self.working_dir # required by lib-cove + + schema_url = urljoin(config["schema_host"], self.schema_name) + pkg_schema_url = urljoin(config["schema_host"], self.pkg_schema_name) + + 
self._pkg_schema_obj = get_request(pkg_schema_url).json() + self._schema_obj = get_request(schema_url).json() + + # Update the pkg schema to no longer point to an external reference for the + # grants schema. + # If an extension is applied this will be the local merged version of the grant + # schema. + self._pkg_schema_obj["properties"]["grants"]["items"]["$ref"] = self.schema_file + + self.write_pkg_schema_file() + self.write_schema_file() + + super().__init__() + + @property + def schema_file(self): + return os.path.join(self.working_dir, self.schema_name) + + @property + def pkg_schema_file(self): + return os.path.join(self.working_dir, self.pkg_schema_name) + + @property + def schema_str(self): + return json.dumps(self._schema_obj) + + @property + def pkg_schema_str(self): + return json.dumps(self._pkg_schema_obj) + + def write_schema_file(self): + with open(self.schema_file, "w") as fp: + fp.write(self.schema_str) + + def write_pkg_schema_file(self): + with open(self.pkg_schema_file, "w") as fp: + fp.write(self.pkg_schema_str) + + def process_codelists(self): + # From libcove common but with support for codelists from 360Giving extensions added. 
+ + self.core_codelist_schema_paths = get_schema_codelist_paths( + self, use_extensions=False + ) + + extension_unique_files = frozenset( + url.split("/")[-1] for url in self.extension_codelist_urls + ) + + core_unique_files = frozenset( + value[0] for value in self.core_codelist_schema_paths.values() if value[0] not in extension_unique_files + ) + + # This loader uses the codelist host from the config + filename that was taken out of the schema + self.core_codelists = load_core_codelists( + self.codelists, + core_unique_files, + config=self.config if hasattr(self, "config") else None, + ) + + extension_codelists = {} + + for extension_codelist_url in self.extension_codelist_urls: + codelist_file = extension_codelist_url.split("/")[-1] + + extension_codelists[codelist_file] = load_codelist( + extension_codelist_url, + config=self.config if hasattr(self, "config") else None) + + # Update the codelists with any specified by the extension + # This has the unfortunate side-effect of making cove think these are part of + # the main standard however we have no current way to differentiate the paths + self.core_codelists.update(extension_codelists) + + # Ignore. Properties required by libcove: + self.extended_codelist_schema_paths = self.core_codelist_schema_paths + self.extended_codelists = self.core_codelists + self.extended_codelist_urls = {} + # End ignore + + # we do not want to cache if the requests failed. + if not self.core_codelists: + load_core_codelists.cache_clear() + return + + def resolve_extension(self, json_data) -> Optional[list]: + """ + If json_data contains an extension id this patches the schemas if the extension is valid + the internal representation of the schema is replaced with the new patched version. + We write the new schema file(s) to disk for flattentool and caching purposes. 
+ + Returns an array of extension_infos or None + """ + + # FIXME NOTE this currently requires env SCHEMA_BRANCH=1.4-staging + + try: + extension_ids = json_data["extensions"] + except KeyError: + return None + + if len(extension_ids) == 0: + raise ExtensionsError("Extension key found but with no value(s)") + + extension_metadatas = [] + + for extension_id in extension_ids: + try: + r = requests.get(f"{EXTENSIONS_REGISTRY_BASE_URL}/{extension_id}.json") + r.raise_for_status() + extension_metadata = r.json() + extension_metadatas.append(extension_metadata) + except (json.JSONDecodeError, requests.HTTPError): + raise ExtensionsError("Couldn't not fetch or parse extension metadata") + + for extension_schemas in extension_metadata["schemas"]: + try: + r = requests.get(extension_schemas["uri"]) + r.raise_for_status() + extension = r.json() + except (json.JSONDecodeError, requests.HTTPError) as e: + raise ExtensionsError(f"Unable to fetch and decode supplied extension: {e}") + + if extension_schemas["target"] not in [ + self.schema_name, + self.pkg_schema_name, + ]: + raise ExtensionsError(f"Unknown target for extension {extension_schemas['target']} not in {[self.schema_name, self.pkg_schema_name]}") + + # Schema (grants) extension + if extension_schemas["target"] == self.schema_name: + self._schema_obj = json_merge_patch.merge( + self._schema_obj, extension + ) + + # Package schema extension + if extension_schemas["target"] == self.pkg_schema_name: + self._pkg_schema_obj = json_merge_patch.merge( + self._pkg_schema_obj, extension + ) + + self.extension_codelist_urls.extend(extension_metadata["codelists"]) + + # Write out the new schema objects + self.write_pkg_schema_file() + self.write_schema_file() + + return extension_metadatas diff --git a/lib360dataquality/cove/settings.py b/lib360dataquality/cove/settings.py index 81cc67d..1ef0e47 100644 --- a/lib360dataquality/cove/settings.py +++ b/lib360dataquality/cove/settings.py @@ -19,5 +19,10 @@ 'convert_titles': True, 
'input_methods': ['upload', 'url', 'text'], 'support_email': 'support@threesixtygiving.org', - 'hashcomments': True + 'hashcomments': True, + "flatten_tool": { + "disable_local_refs": False, + "remove_empty_schema_columns": True, + "xml_comment": None, + }, } From d5ab295dc371d5bd422fa4bd3c1e76c959dd74b8 Mon Sep 17 00:00:00 2001 From: Michael Wood Date: Thu, 29 Feb 2024 10:28:10 +0000 Subject: [PATCH 03/10] cove: templates: Add in copy and conditionality specific to extensions --- cove/cove_360/templates/cove_360/explore.html | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/cove/cove_360/templates/cove_360/explore.html b/cove/cove_360/templates/cove_360/explore.html index 8ee00c1..68c9492 100644 --- a/cove/cove_360/templates/cove_360/explore.html +++ b/cove/cove_360/templates/cove_360/explore.html @@ -60,24 +60,24 @@