2 changes: 1 addition & 1 deletion pyproject.toml
@@ -34,7 +34,7 @@ classifiers = [
requires-python = ">=3.8"

dependencies = [
"hdx-python-api >= 6.1.1",
"hdx-python-api >= 6.1.2",
"gspread",
"regex",
]
23 changes: 11 additions & 12 deletions requirements.txt
@@ -31,7 +31,7 @@ colorama==0.4.6
# via typer
coverage[toml]==7.3.1
# via pytest-cov
cryptography==41.0.3
cryptography==41.0.4
# via pyopenssl
defopt==6.4.0
# via hdx-python-api
@@ -59,17 +59,17 @@ google-auth==2.23.0
# gspread
google-auth-oauthlib==1.1.0
# via gspread
gspread==5.11.1
gspread==5.11.2
# via hdx-python-scraper (pyproject.toml)
hdx-python-api==6.1.1
hdx-python-api==6.1.2
# via hdx-python-scraper (pyproject.toml)
hdx-python-country==3.5.5
hdx-python-country==3.5.6
# via hdx-python-api
hdx-python-utilities==3.6.2
# via hdx-python-country
humanize==4.8.0
# via frictionless
identify==2.5.28
identify==2.5.29
# via pre-commit
idna==3.4
# via
@@ -93,7 +93,7 @@ jsonschema==4.17.3
# via
# frictionless
# tableschema-to-template
libhxl==5.0.2
libhxl==5.0.3
# via hdx-python-country
loguru==0.7.2
# via hdx-python-utilities
@@ -113,15 +113,15 @@ nodeenv==1.8.0
# via pre-commit
num2words==0.5.12
# via quantulum3
numpy==1.25.2
numpy==1.26.0
# via pandas
oauthlib==3.2.2
# via requests-oauthlib
openpyxl==3.1.2
# via hdx-python-utilities
packaging==23.1
# via pytest
pandas==2.1.0
pandas==2.1.1
# via hdx-python-scraper (pyproject.toml)
petl==1.7.14
# via frictionless
@@ -166,7 +166,6 @@ pyrsistent==0.19.3
pytest==7.4.2
# via
# hdx-python-scraper (pyproject.toml)
# libhxl
# pytest-cov
pytest-cov==4.1.0
# via hdx-python-scraper (pyproject.toml)
@@ -208,7 +207,7 @@ requests-oauthlib==1.3.1
# via google-auth-oauthlib
rfc3986==2.0.0
# via frictionless
rich==13.5.2
rich==13.5.3
# via typer
rsa==4.9
# via google-auth
@@ -242,7 +241,7 @@ text-unidecode==1.3
# via python-slugify
typer[all]==0.9.0
# via frictionless
typing-extensions==4.7.1
typing-extensions==4.8.0
# via
# frictionless
# inflect
@@ -270,7 +269,7 @@ xlrd==2.0.1
# via hdx-python-utilities
xlrd3==1.1.0
# via libhxl
xlsxwriter==3.1.3
xlsxwriter==3.1.4
# via tableschema-to-template
xlwt==1.3.0
# via hdx-python-utilities
17 changes: 13 additions & 4 deletions src/hdx/scraper/base_scraper.py
@@ -346,14 +346,23 @@ def get_source_urls(self) -> Set[str]:
"""
return self.source_urls

def get_hapi_metadata(self) -> Optional[Dict]:
def get_hapi_dataset_metadata(self) -> Optional[Dict]:
"""
Get HAPI metadata
Get HAPI dataset metadata

Returns:
Optional[Dict]: HAPI metadata
Optional[Dict]: HAPI dataset metadata
"""
return self.datasetinfo.get("hapi_metadata")
return self.datasetinfo.get("hapi_dataset_metadata")

def get_hapi_resource_metadata(self) -> Optional[Dict]:
"""
Get HAPI resource metadata

Returns:
Optional[Dict]: HAPI resource metadata
"""
return self.datasetinfo.get("hapi_resource_metadata")

def add_population(self) -> None:
"""
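For orientation, here is a minimal sketch of how the renamed getters above might be used. The class below is a stand-in written for illustration only (it is not BaseScraper itself), and the ids in datasetinfo are invented rather than taken from this diff.

from typing import Dict, Optional


class ScraperSketch:
    """Stand-in mimicking the two getters added to the scraper base class."""

    def __init__(self, datasetinfo: Dict):
        self.datasetinfo = datasetinfo

    def get_hapi_dataset_metadata(self) -> Optional[Dict]:
        # Reads the dataset-level metadata stored under the new key
        return self.datasetinfo.get("hapi_dataset_metadata")

    def get_hapi_resource_metadata(self) -> Optional[Dict]:
        # Reads the resource-level metadata stored under the new key
        return self.datasetinfo.get("hapi_resource_metadata")


scraper = ScraperSketch(
    {
        "hapi_dataset_metadata": {"hdx_id": "dataset-id-123", "title": "Example dataset"},
        "hapi_resource_metadata": {"hdx_id": "resource-id-456", "format": "csv"},
    }
)
print(scraper.get_hapi_dataset_metadata())  # dataset-level metadata only
print(scraper.get_hapi_resource_metadata())  # resource-level metadata only
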
64 changes: 42 additions & 22 deletions src/hdx/scraper/runner.py
@@ -1146,51 +1146,61 @@ def get_source_urls(

def get_hapi_metadata(
self, names: Optional[ListTuple[str]] = None
) -> List[Dict]:
) -> Dict:
"""Get HAPI metadata for all datasets

Args:
names (Optional[ListTuple[str]]): Names of scrapers

Returns:
List[Dict]: HAPI metadata for all datasets
Dict: HAPI metadata for all datasets
"""
if not names:
names = self.scrapers.keys()
hapi_metadata_list = []
results = {}
for name in names:
scraper = self.get_scraper(name)
if not scraper.has_run:
continue
hapi_metadata = scraper.get_hapi_metadata()
if hapi_metadata:
hapi_metadata_list.append(hapi_metadata)
return hapi_metadata_list
hapi_dataset_metadata = scraper.get_hapi_dataset_metadata()
hapi_resource_metadata = scraper.get_hapi_resource_metadata()
dataset_id = hapi_dataset_metadata["hdx_id"]
resource_id = hapi_resource_metadata["hdx_id"]
hapi_metadata = results.get(
dataset_id, copy(hapi_dataset_metadata)
)
hapi_resources = hapi_metadata.get("resources", {})
hapi_resources[resource_id] = hapi_resource_metadata
hapi_metadata["resources"] = hapi_resources
results[dataset_id] = hapi_metadata
return results

def get_hapi_results(
self,
names: Optional[ListTuple[str]] = None,
has_run: bool = True,
) -> List[Dict]:
"""Get the results (headers, values and HAPi metadata) for scrapers
limiting to those in names if given and limiting further to those that
have been set in the constructor if previously given. By default only
scrapers marked as having run are returned unless has_run is set to
False. A list of dictionaries is returned where each dictionary has
keys headers, values, HAPI metadata and fallbacks. Headers is
a tuple of (column headers, hxl hashtags). Values, sources and
fallbacks are all lists.
) -> Dict:
"""Get the results (headers and values per admin level and HAPI
metadata) for scrapers limiting to those in names if given and limiting
further to those that have been set in the constructor if previously
given. By default, only scrapers marked as having run are returned
unless has_run is set to False. A dictionary is returned where key is
HDX dataset id and value is a dictionary that has HAPI dataset metadata
as well as a results key. The value associated with the results key is
a dictionary where each key is an admin level. Each admin level key has
a value dictionary with headers, values and HAPI resource metadata.
Headers is a tuple of (column headers, hxl hashtags). Values is a list.

Args:
names (Optional[ListTuple[str]]): Names of scrapers. Defaults to None (all scrapers).
has_run (bool): Only get results for scrapers marked as having run. Defaults to True.

Returns:
List[Dict]: Headers, values and HAPI metadata for all datasets
Dict: Headers and values per admin level and HAPI metadata for all datasets
"""
if not names:
names = self.scrapers.keys()
results = []
results = {}

def add_results(scraper_level, scrap, levels_used):
nonlocal results
@@ -1201,11 +1211,21 @@ def add_results(scraper_level, scrap, levels_used):
if headers is None:
return
values = scrap.get_values(scraper_level)
hapi_metadata = copy(scrap.get_hapi_metadata())
hapi_metadata["headers"] = headers
hapi_metadata["values"] = values
hapi_dataset_metadata = scrap.get_hapi_dataset_metadata()
hapi_resource_metadata = scrap.get_hapi_resource_metadata()
dataset_id = hapi_dataset_metadata["hdx_id"]
hapi_metadata = results.get(
dataset_id, copy(hapi_dataset_metadata)
)
level_results = hapi_metadata.get("results", {})
level_results[scraper_level] = {
"headers": headers,
"values": values,
"hapi_resource_metadata": hapi_resource_metadata,
}
hapi_metadata["results"] = level_results
levels_used.add(scraper_level)
results.append(hapi_metadata)
results[dataset_id] = hapi_metadata

for name in names:
if self.scrapers_to_run and not any(
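To make the new return shapes concrete, below is a standalone sketch of the aggregation that get_hapi_metadata now performs, followed by a comment showing the rough shape that get_hapi_results produces. The dataset and resource ids are invented for illustration, and the code is a stand-in rather than the Runner class itself.

from copy import copy
from typing import Dict, List, Tuple

# Pretend outputs of two scrapers that read different resources of the same dataset
scraper_outputs: List[Tuple[Dict, Dict]] = [
    (
        {"hdx_id": "dataset-1", "title": "Example dataset"},
        {"hdx_id": "resource-a", "format": "csv"},
    ),
    (
        {"hdx_id": "dataset-1", "title": "Example dataset"},
        {"hdx_id": "resource-b", "format": "xlsx"},
    ),
]

results: Dict = {}
for hapi_dataset_metadata, hapi_resource_metadata in scraper_outputs:
    dataset_id = hapi_dataset_metadata["hdx_id"]
    resource_id = hapi_resource_metadata["hdx_id"]
    # The first time a dataset is seen, start from a copy of its dataset metadata
    hapi_metadata = results.get(dataset_id, copy(hapi_dataset_metadata))
    # Accumulate resource metadata keyed by resource id
    hapi_resources = hapi_metadata.get("resources", {})
    hapi_resources[resource_id] = hapi_resource_metadata
    hapi_metadata["resources"] = hapi_resources
    results[dataset_id] = hapi_metadata

print(results)
# {'dataset-1': {'hdx_id': 'dataset-1', 'title': 'Example dataset',
#                'resources': {'resource-a': {...}, 'resource-b': {...}}}}

# get_hapi_results returns a similar dataset-keyed dictionary, but with a
# "results" entry keyed by admin level instead of "resources", e.g.:
# {'dataset-1': {'hdx_id': 'dataset-1', ...,
#                'results': {'national': {'headers': (['Col'], ['#tag']),
#                                         'values': [...],
#                                         'hapi_resource_metadata': {...}}}}}
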
9 changes: 5 additions & 4 deletions src/hdx/scraper/utilities/reader.py
@@ -354,7 +354,9 @@ def read_hdx_metadata(
dataset_nameinfo = datasetinfo["dataset"]
if isinstance(dataset_nameinfo, str):
dataset = self.read_dataset(dataset_nameinfo)
hapi_metadata = self.get_hapi_dataset_metadata(dataset)
datasetinfo[
"hapi_dataset_metadata"
] = self.get_hapi_dataset_metadata(dataset)
resource = None
url = datasetinfo.get("url")
if do_resource_check and not url:
@@ -365,16 +367,15 @@
if resource_name and resource["name"] != resource_name:
continue
url = resource["url"]
hapi_metadata[
"resource"
datasetinfo[
"hapi_resource_metadata"
] = self.get_hapi_resource_metadata(resource)
break
if not url:
raise ValueError(
f"Cannot find {format} resource in {dataset_nameinfo}!"
)
datasetinfo["url"] = url
datasetinfo["hapi_metadata"] = hapi_metadata
if "source_date" not in datasetinfo:
datasetinfo[
"source_date"
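For reference, here is a sketch of the datasetinfo keys after read_hdx_metadata with this change: the dataset and resource metadata now sit under two separate top-level keys instead of a single nested hapi_metadata entry. All values below are illustrative placeholders, not real HDX identifiers.

# Illustrative shape only; the ids and URL are placeholders
datasetinfo = {
    "name": "example",
    "dataset": "example-dataset",
    "format": "csv",
    "url": "https://example.org/download/example.csv",
    # previously nested as datasetinfo["hapi_metadata"] and datasetinfo["hapi_metadata"]["resource"]
    "hapi_dataset_metadata": {
        "hdx_id": "dataset-id-123",
        "hdx_stub": "example-dataset",
    },
    "hapi_resource_metadata": {
        "hdx_id": "resource-id-456",
        "format": "csv",
    },
}
assert "hapi_metadata" not in datasetinfo  # the old combined key is gone
assert datasetinfo["hapi_resource_metadata"]["format"] == datasetinfo["format"]
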
68 changes: 40 additions & 28 deletions tests/hdx/scraper/test_readers.py
@@ -151,15 +151,15 @@ def test_read(self, configuration):
"Targeted 2017": " _",
"% targeted": "0",
}
date = {
{
"default_date": {
"end": datetime(2016, 9, 1, 23, 59, 59, tzinfo=timezone.utc),
}
}
assert datasetinfo == {
"name": "test",
"dataset": "sahel-humanitarian-needs-overview",
"hapi_metadata": {
"format": "csv",
"hapi_dataset_metadata": {
"hdx_id": "47f6ef46-500f-421a-9fa2-fefd93facf95",
"hdx_stub": "sahel-humanitarian-needs-overview",
"provider_code": "ac91832d-2477-4e1f-8520-9a591a7c3d69",
Expand All @@ -175,21 +175,27 @@ def test_read(self, configuration):
),
"startdate_str": "2016-09-01T00:00:00+00:00",
},
"resource": {
"download_url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3/download/hno-2017-sahel-nutrition.csv",
"filename": "HNO -2017 -Sahel-nutrition.csv",
"format": "csv",
"hdx_id": "2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3",
"update_date": datetime(
2017, 3, 10, 10, 8, 37, tzinfo=timezone.utc
),
},
"title": "Sahel : Humanitarian Needs Overview",
},
"format": "csv",
"hapi_resource_metadata": {
"download_url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3/download/hno-2017-sahel-nutrition.csv",
"filename": "HNO -2017 -Sahel-nutrition.csv",
"format": "csv",
"hdx_id": "2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3",
"update_date": datetime(
2017, 3, 10, 10, 8, 37, tzinfo=timezone.utc
),
},
"headers": 1,
"source_date": date,
"name": "test",
"source": "Multiple organisations",
"source_date": {
"default_date": {
"end": datetime(
2016, 9, 1, 23, 59, 59, tzinfo=timezone.utc
)
}
},
"source_url": "https://data.humdata.org/dataset/sahel-humanitarian-needs-overview",
"url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3/download/hno-2017-sahel-nutrition.csv",
}
@@ -220,9 +226,9 @@ def test_read(self, configuration):
"Total population": 100000,
}
assert datasetinfo == {
"name": "test",
"dataset": "sahel-humanitarian-needs-overview",
"hapi_metadata": {
"format": "xlsx",
"hapi_dataset_metadata": {
"hdx_id": "47f6ef46-500f-421a-9fa2-fefd93facf95",
"hdx_stub": "sahel-humanitarian-needs-overview",
"provider_code": "ac91832d-2477-4e1f-8520-9a591a7c3d69",
Expand All @@ -238,23 +244,29 @@ def test_read(self, configuration):
),
"startdate_str": "2016-09-01T00:00:00+00:00",
},
"resource": {
"download_url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/d9248be4-7bfb-4a81-a7aa-c035dcb737a2/download/hno-2017-sahel-people-in-need.xlsx",
"filename": "HNO-2017-Sahel- People in " "need.xlsx",
"format": "xlsx",
"hdx_id": "d9248be4-7bfb-4a81-a7aa-c035dcb737a2",
"update_date": datetime(
2017, 3, 10, 10, 8, 37, tzinfo=timezone.utc
),
},
"title": "Sahel : Humanitarian Needs Overview",
},
"hapi_resource_metadata": {
"download_url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/d9248be4-7bfb-4a81-a7aa-c035dcb737a2/download/hno-2017-sahel-people-in-need.xlsx",
"filename": "HNO-2017-Sahel- People in need.xlsx",
"format": "xlsx",
"hdx_id": "d9248be4-7bfb-4a81-a7aa-c035dcb737a2",
"update_date": datetime(
2017, 3, 10, 10, 8, 37, tzinfo=timezone.utc
),
},
"headers": 1,
"name": "test",
"resource": "HNO-2017-Sahel- People in need.xlsx",
"format": "xlsx",
"sheet": 1,
"headers": 1,
"source_date": date,
"source": "Multiple organisations",
"source_date": {
"default_date": {
"end": datetime(
2016, 9, 1, 23, 59, 59, tzinfo=timezone.utc
)
}
},
"source_url": "https://data.humdata.org/dataset/sahel-humanitarian-needs-overview",
"url": "https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/d9248be4-7bfb-4a81-a7aa-c035dcb737a2/download/hno-2017-sahel-people-in-need.xlsx",
}