25 changes: 18 additions & 7 deletions README.md
@@ -344,8 +344,8 @@ input_cols. That new column is given a header and a HXL tag (in output_columns a

The needs mini scraper takes data for the latest available date for each country. subsets allows the definition of
multiple indicators by way of filters. A filter is defined for each indicator (in this case there is one) which
contains one or more filters of the form column=value. The pipe (|) is used as a separator - it means “and” not
“or”.
contains one or more filters in Python syntax. Column names can be used directly and, if not already specified in
input_cols or date_col, should be included in filter_cols.
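
Internally, each column name used in a filter is replaced with a lookup into the current row and the resulting
Python expression is evaluated for every row. As an illustrative sketch (sample values, not library code), the
filter in the example below amounts to:

    row = {"Metric": "People in need", "PiN Value for Dataviz": "yes"}
    keep = row["Metric"] == "People in need" and row["PiN Value for Dataviz"] == "yes"  # True, row is used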

needs:
format: "xlsx"
@@ -356,8 +356,11 @@ contains one or more filters of the form column=value. The pipe (|) is used as a
- "Country Code"
date_col: "Year"
date_type: "year"
filter_cols:
- "Metric"
- "PiN Value for Dataviz"
subsets:
- filter: "Metric=People in need|PiN Value for Dataviz=yes"
- filter: "Metric == 'People in need' and PiN Value for Dataviz == 'yes'"
input_cols:
- "Value"
output_columns:
@@ -391,7 +394,8 @@ fuzzy match if the input has more than 3 characters.
- "#population"

The covid tests mini scraper applies a prefilter to the data that only processes rows where the value in the column
"new_tests" is not None and is greater than zero.
"new_tests" is not None and is greater than zero. If "new_tests" was not specified in input_cols or date_col, then
it would need to be under a key filter_cols.
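
The prefilter expression is likewise evaluated against every row, and rows for which it is False are dropped
before any values are extracted. As an illustrative sketch (sample values only):

    row = {"iso_code": "PHL", "date": "2020-10-01", "new_tests": 39611}
    keep = row["new_tests"] is not None and row["new_tests"] > 0  # True, so the row is processed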

covidtests:
source: "Our World in Data"
@@ -452,8 +456,10 @@ mini scraper).
- "reference_year"
- "reference_code"
date_type: "int"
filter_cols:
- "chtype"
subsets:
- filter: "chtype=current"
- filter: "chtype == 'current'"
input_cols:
- "phase3"
- "phase4"
@@ -504,8 +510,10 @@ either “-2222” or “-4444” is the value included in the sum of any column
- "Admin1"
date_col: "Year"
date_type: "year"
filter_cols:
- "Vaccine"
subsets:
- filter: "Vaccine=HepB1"
- filter: "Vaccine == 'HepB1'"
input_cols:
- "Numerator"
- "Denominator"
@@ -646,9 +654,12 @@ single_maxdate as shown below:
date_col: "AllocationYear"
date_type: "year"
single_maxdate: True
filter_cols:
- "FundType"
- "GenderMarker"
subsets:
...
- filter: "FundType=CBPF|GenderMarker=0"
- filter: "FundType == 'CBPF' and GenderMarker == '0'"
input_cols:
- "Budget"
input_transforms:
30 changes: 18 additions & 12 deletions src/hdx/scraper/rowparser.py
@@ -46,12 +46,6 @@ def get_level(lvl):
self.level = get_level(level)
self.today = today
self.sort = datasetinfo.get('sort')
prefilter = datasetinfo.get('prefilter')
if prefilter is not None:
for subset in subsets:
for col in subset['input_cols']:
prefilter = prefilter.replace(col, f"row['{col}']")
self.prefilter = prefilter
self.datecol = datasetinfo.get('date_col')
self.datetype = datasetinfo.get('date_type')
if self.datetype:
@@ -73,6 +67,11 @@ def get_level(lvl):
self.admcols = datasetinfo.get('adm_cols', list())
self.admexact = datasetinfo.get('adm_exact', False)
self.subsets = subsets
self.filter_cols = datasetinfo.get('filter_cols', list())
prefilter = datasetinfo.get('prefilter')
if prefilter is not None:
prefilter = self.get_filter_str_for_eval(prefilter)
self.prefilter = prefilter
adms = datasetinfo.get('adm_vals')
if adms is None:
self.adms = [countryiso3s, self.adminone.pcodes]
@@ -94,6 +93,16 @@ def get_level(lvl):
self.filters = dict()
self.read_external_filter(datasetinfo)

def get_filter_str_for_eval(self, filter):
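"""Return the filter string with column names replaced by row lookups so that it can be
evaluated against each row, eg. "Metric == 'People in need'" becomes
"row['Metric'] == 'People in need'". Column names are taken from filter_cols, date_col
and the input_cols of each subset."""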
for col in self.filter_cols:
filter = filter.replace(col, f"row['{col}']")
if self.datecol:
filter = filter.replace(self.datecol, f"row['{self.datecol}']")
for subset in self.subsets:
for col in subset['input_cols']:
filter = filter.replace(col, f"row['{col}']")
return filter

def filter_sort_rows(self, iterator, hxlrow):
# type: (Iterator[Dict], Dict) -> Iterator[Dict]
"""Apply prefilter and sort the input data before processing. If date_col is specified along with any of
@@ -267,12 +276,9 @@ def get_adm(admcol, i):
filter = subset['filter']
process = True
if filter:
filters = filter.split('|')
for filterstr in filters:
filter = filterstr.split('=')
if row[filter[0]] != filter[1]:
process = False
break
filter = self.get_filter_str_for_eval(filter)
if not eval(filter):
process = False
should_process_subset.append(process)

if self.datecol:
2 changes: 1 addition & 1 deletion src/hdx/scraper/version.txt
@@ -1 +1 @@
1.2.8
1.2.9
34 changes: 19 additions & 15 deletions tests/config/project_configuration.yml
@@ -98,7 +98,7 @@ scraper_national:
date_type: "date"
use_date_from_date_col: True
subsets:
- filter: "Date_reported=2020-08-06"
- filter: "Date_reported == '2020-08-06'"
input_cols:
- "Cumulative_cases"
- "Cumulative_deaths"
@@ -209,6 +209,7 @@ scraper_national:
prefilter: "new_tests is not None and new_tests > 0"
date_col: "date"
date_type: "date"
use_date_from_date_col: True
adm_cols:
- "iso_code"
input_cols:
@@ -319,8 +320,11 @@ scraper_global:
date_col: "AllocationYear"
date_type: "year"
single_maxdate: True
filter_cols:
- "FundType"
- "GenderMarker"
subsets:
- filter: "FundType=CBPF"
- filter: "FundType == 'CBPF'"
input_cols:
- "Budget"
input_transforms:
@@ -332,7 +336,7 @@ scraper_global:
- "CBPFFunding"
output_hxltags:
- "#value+cbpf+funding+total+usd"
- filter: "FundType=CBPF|GenderMarker="
- filter: "FundType == 'CBPF' and GenderMarker == ''"
input_cols:
- "Budget"
input_transforms:
@@ -344,7 +348,7 @@ scraper_global:
- "CBPFFundingGMEmpty"
output_hxltags:
- "#value+cbpf+funding+gmempty+total+usd"
- filter: "FundType=CBPF|GenderMarker=0"
- filter: "FundType == 'CBPF' and GenderMarker == '0'"
input_cols:
- "Budget"
input_transforms:
@@ -356,7 +360,7 @@ scraper_global:
- "CBPFFundingGM0"
output_hxltags:
- "#value+cbpf+funding+gm0+total+usd"
- filter: "FundType=CBPF|GenderMarker=1"
- filter: "FundType == 'CBPF' and GenderMarker == '1'"
input_cols:
- "Budget"
input_transforms:
@@ -368,7 +372,7 @@ scraper_global:
- "CBPFFundingGM1"
output_hxltags:
- "#value+cbpf+funding+gm1+total+usd"
- filter: "FundType=CBPF|GenderMarker=2"
- filter: "FundType == 'CBPF' and GenderMarker == '2'"
input_cols:
- "Budget"
input_transforms:
@@ -380,7 +384,7 @@ scraper_global:
- "CBPFFundingGM2"
output_hxltags:
- "#value+cbpf+funding+gm2+total+usd"
- filter: "FundType=CBPF|GenderMarker=3"
- filter: "FundType == 'CBPF' and GenderMarker == '3'"
input_cols:
- "Budget"
input_transforms:
@@ -392,7 +396,7 @@ scraper_global:
- "CBPFFundingGM3"
output_hxltags:
- "#value+cbpf+funding+gm3+total+usd"
- filter: "FundType=CBPF|GenderMarker=4"
- filter: "FundType == 'CBPF' and GenderMarker == '4'"
input_cols:
- "Budget"
input_transforms:
@@ -404,7 +408,7 @@ scraper_global:
- "CBPFFundingGM4"
output_hxltags:
- "#value+cbpf+funding+gm4+total+usd"
- filter: "FundType=CERF"
- filter: "FundType == 'CERF'"
input_cols:
- "Budget"
input_transforms:
@@ -416,7 +420,7 @@ scraper_global:
- "CERFFunding"
output_hxltags:
- "#value+cerf+funding+total+usd"
- filter: "FundType=CERF|GenderMarker="
- filter: "FundType == 'CERF' and GenderMarker == ''"
input_cols:
- "Budget"
input_transforms:
@@ -428,7 +432,7 @@ scraper_global:
- "CERFFundingGMEmpty"
output_hxltags:
- "#value+cerf+funding+gmempty+total+usd"
- filter: "FundType=CERF|GenderMarker=0"
- filter: "FundType == 'CERF' and GenderMarker == '0'"
input_cols:
- "Budget"
input_transforms:
@@ -440,7 +444,7 @@ scraper_global:
- "CERFFundingGM0"
output_hxltags:
- "#value+cerf+funding+gm0+total+usd"
- filter: "FundType=CERF|GenderMarker=1"
- filter: "FundType == 'CERF' and GenderMarker == '1'"
input_cols:
- "Budget"
input_transforms:
@@ -452,7 +456,7 @@ scraper_global:
- "CERFFundingGM1"
output_hxltags:
- "#value+cerf+funding+gm1+total+usd"
- filter: "FundType=CERF|GenderMarker=2"
- filter: "FundType == 'CERF' and GenderMarker == '2'"
input_cols:
- "Budget"
input_transforms:
@@ -464,7 +468,7 @@ scraper_global:
- "CERFFundingGM2"
output_hxltags:
- "#value+cerf+funding+gm2+total+usd"
- filter: "FundType=CERF|GenderMarker=3"
- filter: "FundType == 'CERF' and GenderMarker == '3'"
input_cols:
- "Budget"
input_transforms:
@@ -476,7 +480,7 @@ scraper_global:
- "CERFFundingGM3"
output_hxltags:
- "#value+cerf+funding+gm3+total+usd"
- filter: "FundType=CERF|GenderMarker=4"
- filter: "FundType == 'CERF' and GenderMarker == '4'"
input_cols:
- "Budget"
input_transforms:
2 changes: 1 addition & 1 deletion tests/hdx/scraper/test_scraper.py
@@ -35,7 +35,7 @@ def test_get_tabular(self, configuration):
results = run_scrapers(scraper_configuration, ['AFG', 'PHL'], adminone, level, downloader, today=today, scrapers=['covidtests'], population_lookup=population_lookup)
assert results['headers'] == [['New Tests', 'New Tests Per Thousand', 'New Tests Per Thousand (7-day)', 'Positive Test Rate'], ['#affected+tested', '#affected+tested+per1000', '#affected+tested+avg+per1000', '#affected+tested+positive+pct']]
assert results['values'] == [{'PHL': 39611}, {'PHL': 0.361}, {'PHL': 0.312}, {'PHL': 0.072}]
assert results['sources'] == [('#affected+tested', '2021-08-25', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+per1000', '2021-08-25', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+avg+per1000', '2021-08-25', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+positive+pct', '2021-08-25', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country')]
assert results['sources'] == [('#affected+tested', '2020-10-01', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+per1000', '2020-10-01', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+avg+per1000', '2020-10-01', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+positive+pct', '2020-10-01', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country')]
today = parse_date('2021-05-03')
results = run_scrapers(scraper_configuration, ['AFG', 'PHL'], adminone, level, downloader, today=today, scrapers=['ourworldindata'], population_lookup=population_lookup)
assert results['headers'] == [['TotalDosesAdministered'], ['#capacity+doses+administered+total']]