diff --git a/pyproject.toml b/pyproject.toml
index 308ad30..05641ac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ classifiers = [

 requires-python = ">=3.8"
 dependencies = [
-    "hdx-python-api >= 6.1.1",
+    "hdx-python-api >= 6.1.2",
     "gspread",
     "regex",
 ]
diff --git a/requirements.txt b/requirements.txt
index 3a70b3c..88c4c67 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,7 +31,7 @@ colorama==0.4.6
     # via typer
 coverage[toml]==7.3.1
     # via pytest-cov
-cryptography==41.0.3
+cryptography==41.0.4
     # via pyopenssl
 defopt==6.4.0
     # via hdx-python-api
@@ -59,17 +59,17 @@ google-auth==2.23.0
     #   gspread
 google-auth-oauthlib==1.1.0
     # via gspread
-gspread==5.11.1
+gspread==5.11.2
     # via hdx-python-scraper (pyproject.toml)
-hdx-python-api==6.1.1
+hdx-python-api==6.1.2
     # via hdx-python-scraper (pyproject.toml)
-hdx-python-country==3.5.5
+hdx-python-country==3.5.6
     # via hdx-python-api
 hdx-python-utilities==3.6.2
     # via hdx-python-country
 humanize==4.8.0
     # via frictionless
-identify==2.5.28
+identify==2.5.29
     # via pre-commit
 idna==3.4
     # via
@@ -93,7 +93,7 @@ jsonschema==4.17.3
     # via
     #   frictionless
     #   tableschema-to-template
-libhxl==5.0.2
+libhxl==5.0.3
     # via hdx-python-country
 loguru==0.7.2
     # via hdx-python-utilities
@@ -113,7 +113,7 @@ nodeenv==1.8.0
     # via pre-commit
 num2words==0.5.12
     # via quantulum3
-numpy==1.25.2
+numpy==1.26.0
     # via pandas
 oauthlib==3.2.2
     # via requests-oauthlib
@@ -121,7 +121,7 @@ openpyxl==3.1.2
     # via hdx-python-utilities
 packaging==23.1
     # via pytest
-pandas==2.1.0
+pandas==2.1.1
     # via hdx-python-scraper (pyproject.toml)
 petl==1.7.14
     # via frictionless
@@ -166,7 +166,6 @@ pyrsistent==0.19.3
 pytest==7.4.2
     # via
     #   hdx-python-scraper (pyproject.toml)
-    #   libhxl
     #   pytest-cov
 pytest-cov==4.1.0
     # via hdx-python-scraper (pyproject.toml)
@@ -208,7 +207,7 @@ requests-oauthlib==1.3.1
     # via google-auth-oauthlib
 rfc3986==2.0.0
     # via frictionless
-rich==13.5.2
+rich==13.5.3
     # via typer
 rsa==4.9
     # via google-auth
@@ -242,7 +241,7 @@ text-unidecode==1.3
     # via python-slugify
 typer[all]==0.9.0
     # via frictionless
-typing-extensions==4.7.1
+typing-extensions==4.8.0
     # via
     #   frictionless
     #   inflect
@@ -270,7 +269,7 @@ xlrd==2.0.1
     # via hdx-python-utilities
 xlrd3==1.1.0
     # via libhxl
-xlsxwriter==3.1.3
+xlsxwriter==3.1.4
     # via tableschema-to-template
 xlwt==1.3.0
     # via hdx-python-utilities
diff --git a/src/hdx/scraper/base_scraper.py b/src/hdx/scraper/base_scraper.py
index c232ef0..1c851dc 100644
--- a/src/hdx/scraper/base_scraper.py
+++ b/src/hdx/scraper/base_scraper.py
@@ -346,14 +346,23 @@ def get_source_urls(self) -> Set[str]:
         """
         return self.source_urls

-    def get_hapi_metadata(self) -> Optional[Dict]:
+    def get_hapi_dataset_metadata(self) -> Optional[Dict]:
         """
-        Get HAPI metadata
+        Get HAPI dataset metadata

         Returns:
-            Optional[Dict]: HAPI metadata
+            Optional[Dict]: HAPI dataset metadata
         """
-        return self.datasetinfo.get("hapi_metadata")
+        return self.datasetinfo.get("hapi_dataset_metadata")
+
+    def get_hapi_resource_metadata(self) -> Optional[Dict]:
+        """
+        Get HAPI resource metadata
+
+        Returns:
+            Optional[Dict]: HAPI resource metadata
+        """
+        return self.datasetinfo.get("hapi_resource_metadata")

     def add_population(self) -> None:
         """
diff --git a/src/hdx/scraper/runner.py b/src/hdx/scraper/runner.py
index e2ab427..fde2ade 100644
--- a/src/hdx/scraper/runner.py
+++ b/src/hdx/scraper/runner.py
@@ -1146,51 +1146,61 @@ def get_source_urls(

     def get_hapi_metadata(
         self, names: Optional[ListTuple[str]] = None
-    ) -> List[Dict]:
+    ) -> Dict:
         """Get HAPI metadata for all datasets

         Args:
             names (Optional[ListTuple[str]]): Names of scrapers

         Returns:
-            List[Dict]: HAPI metadata for all datasets
+            Dict: HAPI metadata for all datasets
         """
         if not names:
             names = self.scrapers.keys()
-        hapi_metadata_list = []
+        results = {}
         for name in names:
             scraper = self.get_scraper(name)
             if not scraper.has_run:
                 continue
-            hapi_metadata = scraper.get_hapi_metadata()
-            if hapi_metadata:
-                hapi_metadata_list.append(hapi_metadata)
-        return hapi_metadata_list
+            hapi_dataset_metadata = scraper.get_hapi_dataset_metadata()
+            hapi_resource_metadata = scraper.get_hapi_resource_metadata()
+            dataset_id = hapi_dataset_metadata["hdx_id"]
+            resource_id = hapi_resource_metadata["hdx_id"]
+            hapi_metadata = results.get(
+                dataset_id, copy(hapi_dataset_metadata)
+            )
+            hapi_resources = hapi_metadata.get("resources", {})
+            hapi_resources[resource_id] = hapi_resource_metadata
+            hapi_metadata["resources"] = hapi_resources
+            results[dataset_id] = hapi_metadata
+        return results

     def get_hapi_results(
         self,
         names: Optional[ListTuple[str]] = None,
         has_run: bool = True,
-    ) -> List[Dict]:
-        """Get the results (headers, values and HAPi metadata) for scrapers
-        limiting to those in names if given and limiting further to those that
-        have been set in the constructor if previously given. By default only
-        scrapers marked as having run are returned unless has_run is set to
-        False. A list of dictionaries is returned where each dictionary has
-        keys headers, values, HAPI metadata and fallbacks. Headers is
-        a tuple of (column headers, hxl hashtags). Values, sources and
-        fallbacks are all lists.
+    ) -> Dict:
+        """Get the results (headers and values per admin level and HAPI
+        metadata) for scrapers limiting to those in names if given and limiting
+        further to those that have been set in the constructor if previously
+        given. By default, only scrapers marked as having run are returned
+        unless has_run is set to False. A dictionary is returned where key is
+        HDX dataset id and value is a dictionary that has HAPI dataset metadata
+        as well as a results key. The value associated with the results key is
+        a dictionary where each key is an admin level. Each admin level key has
+        a value dictionary with headers, values and HAPI resource metadata.
+        Headers is a tuple of (column headers, hxl hashtags). Values is a list.

         Args:
             names (Optional[ListTuple[str]]): Names of scrapers. Defaults to None (all scrapers).
             has_run (bool): Only get results for scrapers marked as having run. Defaults to True.

         Returns:
-            List[Dict]: Headers, values and HAPI metadata for all datasets
+            Dict: Headers and values per admin level and HAPI metadata for all datasets
         """
         if not names:
             names = self.scrapers.keys()
-        results = []
+        results = {}

         def add_results(scraper_level, scrap, levels_used):
             nonlocal results
@@ -1201,11 +1211,21 @@ def add_results(scraper_level, scrap, levels_used):
             if headers is None:
                 return
             values = scrap.get_values(scraper_level)
-            hapi_metadata = copy(scrap.get_hapi_metadata())
-            hapi_metadata["headers"] = headers
-            hapi_metadata["values"] = values
+            hapi_dataset_metadata = scrap.get_hapi_dataset_metadata()
+            hapi_resource_metadata = scrap.get_hapi_resource_metadata()
+            dataset_id = hapi_dataset_metadata["hdx_id"]
+            hapi_metadata = results.get(
+                dataset_id, copy(hapi_dataset_metadata)
+            )
+            level_results = hapi_metadata.get("results", {})
+            level_results[scraper_level] = {
+                "headers": headers,
+                "values": values,
+                "hapi_resource_metadata": hapi_resource_metadata,
+            }
+            hapi_metadata["results"] = level_results
             levels_used.add(scraper_level)
-            results.append(hapi_metadata)
+            results[dataset_id] = hapi_metadata

         for name in names:
             if self.scrapers_to_run and not any(
diff --git a/src/hdx/scraper/utilities/reader.py b/src/hdx/scraper/utilities/reader.py
index e604794..ce41dd4 100644
--- a/src/hdx/scraper/utilities/reader.py
+++ b/src/hdx/scraper/utilities/reader.py
@@ -354,7 +354,9 @@ def read_hdx_metadata(
         dataset_nameinfo = datasetinfo["dataset"]
         if isinstance(dataset_nameinfo, str):
             dataset = self.read_dataset(dataset_nameinfo)
-            hapi_metadata = self.get_hapi_dataset_metadata(dataset)
+            datasetinfo[
+                "hapi_dataset_metadata"
+            ] = self.get_hapi_dataset_metadata(dataset)
             resource = None
             url = datasetinfo.get("url")
             if do_resource_check and not url:
@@ -365,8 +367,8 @@ def read_hdx_metadata(
                     if resource_name and resource["name"] != resource_name:
                         continue
                     url = resource["url"]
-                    hapi_metadata[
-                        "resource"
+                    datasetinfo[
+                        "hapi_resource_metadata"
                     ] = self.get_hapi_resource_metadata(resource)
                     break
             if not url:
@@ -374,7 +376,6 @@ def read_hdx_metadata(
                 raise ValueError(
                     f"Cannot find {format} resource in {dataset_nameinfo}!"
                 )
             datasetinfo["url"] = url
-            datasetinfo["hapi_metadata"] = hapi_metadata
             if "source_date" not in datasetinfo:
                 datasetinfo[
                     "source_date"
diff --git a/tests/hdx/scraper/test_readers.py b/tests/hdx/scraper/test_readers.py
index 6a03e0d..261dacf 100755
--- a/tests/hdx/scraper/test_readers.py
+++ b/tests/hdx/scraper/test_readers.py
@@ -151,15 +151,15 @@ def test_read(self, configuration):
             "Targeted 2017": " _",
             "% targeted": "0",
         }
-        date = {
+        {
            "default_date": {
                "end": datetime(2016, 9, 1, 23, 59, 59, tzinfo=timezone.utc),
            }
        }
        assert datasetinfo == {
-            "name": "test",
             "dataset": "sahel-humanitarian-needs-overview",
-            "hapi_metadata": {
+            "format": "csv",
+            "hapi_dataset_metadata": {
                 "hdx_id": "47f6ef46-500f-421a-9fa2-fefd93facf95",
                 "hdx_stub": "sahel-humanitarian-needs-overview",
                 "provider_code": "ac91832d-2477-4e1f-8520-9a591a7c3d69",
@@ -175,21 +175,27 @@ def test_read(self, configuration):
                 ),
                 "startdate_str": "2016-09-01T00:00:00+00:00",
             },
-                "resource": {
-                    "download_url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3/download/hno-2017-sahel-nutrition.csv",
-                    "filename": "HNO -2017 -Sahel-nutrition.csv",
-                    "format": "csv",
-                    "hdx_id": "2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3",
-                    "update_date": datetime(
-                        2017, 3, 10, 10, 8, 37, tzinfo=timezone.utc
-                    ),
-                },
                 "title": "Sahel : Humanitarian Needs Overview",
             },
-            "format": "csv",
+            "hapi_resource_metadata": {
+                "download_url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3/download/hno-2017-sahel-nutrition.csv",
+                "filename": "HNO -2017 -Sahel-nutrition.csv",
+                "format": "csv",
+                "hdx_id": "2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3",
+                "update_date": datetime(
+                    2017, 3, 10, 10, 8, 37, tzinfo=timezone.utc
+                ),
+            },
             "headers": 1,
-            "source_date": date,
+            "name": "test",
             "source": "Multiple organisations",
+            "source_date": {
+                "default_date": {
+                    "end": datetime(
+                        2016, 9, 1, 23, 59, 59, tzinfo=timezone.utc
+                    )
+                }
+            },
             "source_url": "https://data.humdata.org/dataset/sahel-humanitarian-needs-overview",
             "url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3/download/hno-2017-sahel-nutrition.csv",
         }
@@ -220,9 +226,9 @@ def test_read(self, configuration):
             "Total population": 100000,
         }
         assert datasetinfo == {
-            "name": "test",
             "dataset": "sahel-humanitarian-needs-overview",
-            "hapi_metadata": {
+            "format": "xlsx",
+            "hapi_dataset_metadata": {
                 "hdx_id": "47f6ef46-500f-421a-9fa2-fefd93facf95",
                 "hdx_stub": "sahel-humanitarian-needs-overview",
                 "provider_code": "ac91832d-2477-4e1f-8520-9a591a7c3d69",
@@ -238,23 +244,29 @@ def test_read(self, configuration):
                 ),
                 "startdate_str": "2016-09-01T00:00:00+00:00",
             },
-                "resource": {
-                    "download_url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/d9248be4-7bfb-4a81-a7aa-c035dcb737a2/download/hno-2017-sahel-people-in-need.xlsx",
-                    "filename": "HNO-2017-Sahel- People in " "need.xlsx",
-                    "format": "xlsx",
-                    "hdx_id": "d9248be4-7bfb-4a81-a7aa-c035dcb737a2",
-                    "update_date": datetime(
-                        2017, 3, 10, 10, 8, 37, tzinfo=timezone.utc
-                    ),
-                },
                 "title": "Sahel : Humanitarian Needs Overview",
             },
+            "hapi_resource_metadata": {
+                "download_url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/d9248be4-7bfb-4a81-a7aa-c035dcb737a2/download/hno-2017-sahel-people-in-need.xlsx",
+                "filename": "HNO-2017-Sahel- People in need.xlsx",
+                "format": "xlsx",
"d9248be4-7bfb-4a81-a7aa-c035dcb737a2", + "update_date": datetime( + 2017, 3, 10, 10, 8, 37, tzinfo=timezone.utc + ), + }, + "headers": 1, + "name": "test", "resource": "HNO-2017-Sahel- People in need.xlsx", - "format": "xlsx", "sheet": 1, - "headers": 1, - "source_date": date, "source": "Multiple organisations", + "source_date": { + "default_date": { + "end": datetime( + 2016, 9, 1, 23, 59, 59, tzinfo=timezone.utc + ) + } + }, "source_url": "https://data.humdata.org/dataset/sahel-humanitarian-needs-overview", "url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/d9248be4-7bfb-4a81-a7aa-c035dcb737a2/download/hno-2017-sahel-people-in-need.xlsx", } diff --git a/tests/hdx/scraper/test_scrapers_custom.py b/tests/hdx/scraper/test_scrapers_custom.py index da643f4..b7135ac 100644 --- a/tests/hdx/scraper/test_scrapers_custom.py +++ b/tests/hdx/scraper/test_scrapers_custom.py @@ -30,15 +30,15 @@ class Region: ) runner.add_custom(education_closures) runner.run() + hapi_metadata = runner.get_hapi_metadata() + assert hapi_metadata == {} hapi_results = runner.get_hapi_results() - assert hapi_results == [] + assert hapi_results == {} runner = Runner(("AFG",), today) runner.add_custom(education_closures) - hapi_metadata = runner.get_hapi_metadata() - assert hapi_metadata == [] hapi_results = runner.get_hapi_results() - assert hapi_results == [] + assert hapi_results == {} runner.run() name = education_closures.name headers = (["School Closure"], ["#impact+type"]) @@ -52,34 +52,40 @@ class Region: ) ] check_scraper(name, runner, "national", headers, values, sources) - hapi_metadata = runner.get_hapi_metadata()[0] + hapi_metadata = runner.get_hapi_metadata() assert hapi_metadata == { - "hdx_id": "6a41be98-75b9-4365-9ea3-e33d0dd2668b", - "hdx_stub": "global-school-closures-covid19", - "provider_code": "18f2d467-dcf8-4b7e-bffa-b3c338ba3a7c", - "provider_name": "unesco", - "reference_period": { - "enddate": datetime( - 2022, 4, 30, 23, 59, 59, tzinfo=timezone.utc - ), - "enddate_str": "2022-04-30T23:59:59+00:00", - "ongoing": False, - "startdate": datetime(2020, 2, 16, 0, 0, tzinfo=timezone.utc), - "startdate_str": "2020-02-16T00:00:00+00:00", - }, - "resource": { - "download_url": "https://data.humdata.org/dataset/6a41be98-75b9-4365-9ea3-e33d0dd2668b/resource/3b5baa74-c928-4cbc-adba-bf543c5d3050/download/covid_impact_education.csv", - "filename": "School Closures", - "format": "csv", - "hdx_id": "3b5baa74-c928-4cbc-adba-bf543c5d3050", - "update_date": datetime( - 2022, 4, 4, 9, 56, 5, tzinfo=timezone.utc - ), - }, - "title": "Global School Closures COVID-19", + "6a41be98-75b9-4365-9ea3-e33d0dd2668b": { + "hdx_id": "6a41be98-75b9-4365-9ea3-e33d0dd2668b", + "hdx_stub": "global-school-closures-covid19", + "provider_code": "18f2d467-dcf8-4b7e-bffa-b3c338ba3a7c", + "provider_name": "unesco", + "reference_period": { + "enddate": datetime( + 2022, 4, 30, 23, 59, 59, tzinfo=timezone.utc + ), + "enddate_str": "2022-04-30T23:59:59+00:00", + "ongoing": False, + "startdate": datetime( + 2020, 2, 16, 0, 0, tzinfo=timezone.utc + ), + "startdate_str": "2020-02-16T00:00:00+00:00", + }, + "resources": { + "3b5baa74-c928-4cbc-adba-bf543c5d3050": { + "download_url": "https://data.humdata.org/dataset/6a41be98-75b9-4365-9ea3-e33d0dd2668b/resource/3b5baa74-c928-4cbc-adba-bf543c5d3050/download/covid_impact_education.csv", + "filename": "School " "Closures", + "format": "csv", + "hdx_id": "3b5baa74-c928-4cbc-adba-bf543c5d3050", + "update_date": datetime( + 2022, 4, 4, 9, 56, 5, tzinfo=timezone.utc + 
+                        ),
+                    }
+                },
+                "title": "Global School Closures " "COVID-19",
+            }
         }
         hapi_results = runner.get_hapi_results()
-        assert hapi_results[0] == {
+        assert next(iter(hapi_results.values())) == {
             "hdx_id": "6a41be98-75b9-4365-9ea3-e33d0dd2668b",
             "hdx_stub": "global-school-closures-covid19",
             "provider_code": "18f2d467-dcf8-4b7e-bffa-b3c338ba3a7c",
@@ -93,18 +99,38 @@ class Region:
                 "startdate": datetime(2020, 2, 16, 0, 0, tzinfo=timezone.utc),
                 "startdate_str": "2020-02-16T00:00:00+00:00",
             },
-            "resource": {
-                "download_url": "https://data.humdata.org/dataset/6a41be98-75b9-4365-9ea3-e33d0dd2668b/resource/3b5baa74-c928-4cbc-adba-bf543c5d3050/download/covid_impact_education.csv",
-                "filename": "School Closures",
-                "format": "csv",
-                "hdx_id": "3b5baa74-c928-4cbc-adba-bf543c5d3050",
-                "update_date": datetime(
-                    2022, 4, 4, 9, 56, 5, tzinfo=timezone.utc
-                ),
+            "results": {
+                "national": {
+                    "hapi_resource_metadata": {
+                        "download_url": "https://data.humdata.org/dataset/6a41be98-75b9-4365-9ea3-e33d0dd2668b/resource/3b5baa74-c928-4cbc-adba-bf543c5d3050/download/covid_impact_education.csv",
+                        "filename": "School " "Closures",
+                        "format": "csv",
+                        "hdx_id": "3b5baa74-c928-4cbc-adba-bf543c5d3050",
+                        "update_date": datetime(
+                            2022, 4, 4, 9, 56, 5, tzinfo=timezone.utc
+                        ),
+                    },
+                    "headers": (("School Closure",), ("#impact+type",)),
+                    "values": ({"AFG": "Closed due to COVID-19"},),
+                },
+                "regional": {
+                    "hapi_resource_metadata": {
+                        "download_url": "https://data.humdata.org/dataset/6a41be98-75b9-4365-9ea3-e33d0dd2668b/resource/3b5baa74-c928-4cbc-adba-bf543c5d3050/download/covid_impact_education.csv",
+                        "filename": "School " "Closures",
+                        "format": "csv",
+                        "hdx_id": "3b5baa74-c928-4cbc-adba-bf543c5d3050",
+                        "update_date": datetime(
+                            2022, 4, 4, 9, 56, 5, tzinfo=timezone.utc
+                        ),
+                    },
+                    "headers": (
+                        ("No. closed countries",),
+                        ("#status+country+closed",),
+                    ),
+                    "values": ({"ROAP": 1},),
+                },
             },
             "title": "Global School Closures COVID-19",
-            "headers": (("School Closure",), ("#impact+type",)),
-            "values": ({"AFG": "Closed due to COVID-19"},),
         }

         headers = (["No. closed countries"], ["#status+country+closed"])
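
Note (not part of the patch): Runner.get_hapi_metadata() now returns a dictionary keyed by HDX dataset id, with resource metadata nested under a "resources" key, as described in the updated docstring above. A minimal consumption sketch, assuming a Runner whose scrapers have already been added and run as in the tests:

    # Sketch only: walking the dictionary returned by Runner.get_hapi_metadata().
    # `runner` is assumed to be an already populated and run Runner instance.
    hapi_metadata = runner.get_hapi_metadata()
    for dataset_id, dataset_metadata in hapi_metadata.items():
        # dataset-level HAPI metadata, e.g. hdx_stub, title, reference_period
        print(dataset_id, dataset_metadata["hdx_stub"], dataset_metadata["title"])
        # resource-level HAPI metadata is nested under "resources", keyed by resource id
        for resource_id, resource_metadata in dataset_metadata["resources"].items():
            print("  ", resource_id, resource_metadata["format"])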
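
Similarly, a sketch of walking Runner.get_hapi_results(), whose value is keyed by HDX dataset id and carries a "results" dictionary per admin level, each entry holding headers, values and hapi_resource_metadata. This is an assumed usage example based on the docstring, not code from the patch:

    # Sketch only: walking the dictionary returned by Runner.get_hapi_results().
    hapi_results = runner.get_hapi_results()
    for dataset_id, dataset_metadata in hapi_results.items():
        for admin_level, level_results in dataset_metadata["results"].items():
            column_headers, hxl_tags = level_results["headers"]  # (column headers, hxl hashtags)
            values = level_results["values"]
            resource_metadata = level_results["hapi_resource_metadata"]
            print(dataset_id, admin_level, column_headers, hxl_tags, len(values))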