From 1597be70b151211966c54ddd69ba9467748f89bf Mon Sep 17 00:00:00 2001 From: Mike Date: Wed, 1 Sep 2021 12:45:52 +1200 Subject: [PATCH 1/2] Somehow broken --- src/hdx/scraper/rowparser.py | 30 +++++++++++++++---------- tests/config/project_configuration.yml | 31 +++++++++++++------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/src/hdx/scraper/rowparser.py b/src/hdx/scraper/rowparser.py index 10bd0c7..7749f04 100755 --- a/src/hdx/scraper/rowparser.py +++ b/src/hdx/scraper/rowparser.py @@ -46,12 +46,6 @@ def get_level(lvl): self.level = get_level(level) self.today = today self.sort = datasetinfo.get('sort') - prefilter = datasetinfo.get('prefilter') - if prefilter is not None: - for subset in subsets: - for col in subset['input_cols']: - prefilter = prefilter.replace(col, f"row['{col}']") - self.prefilter = prefilter self.datecol = datasetinfo.get('date_col') self.datetype = datasetinfo.get('date_type') if self.datetype: @@ -73,6 +67,11 @@ def get_level(lvl): self.admcols = datasetinfo.get('adm_cols', list()) self.admexact = datasetinfo.get('adm_exact', False) self.subsets = subsets + self.filter_cols = datasetinfo.get('filter_cols', list()) + prefilter = datasetinfo.get('prefilter') + if prefilter is not None: + prefilter = self.get_filter_str_for_eval(prefilter) + self.prefilter = prefilter adms = datasetinfo.get('adm_vals') if adms is None: self.adms = [countryiso3s, self.adminone.pcodes] @@ -94,6 +93,16 @@ def get_level(lvl): self.filters = dict() self.read_external_filter(datasetinfo) + def get_filter_str_for_eval(self, filter): + for col in self.filter_cols: + filter = filter.replace(col, f"row['{col}']") + if self.datecol: + filter = filter.replace(self.datecol, f"row['{self.datecol}']") + for subset in self.subsets: + for col in subset['input_cols']: + filter = filter.replace(col, f"row['{col}']") + return filter + def filter_sort_rows(self, iterator, hxlrow): # type: (Iterator[Dict], Dict) -> Iterator[Dict] """Apply prefilter and 
sort the input data before processing. If date_col is specified along with any of @@ -267,12 +276,9 @@ def get_adm(admcol, i): filter = subset['filter'] process = True if filter: - filters = filter.split('|') - for filterstr in filters: - filter = filterstr.split('=') - if row[filter[0]] != filter[1]: - process = False - break + filter = self.get_filter_str_for_eval(filter) + if not eval(filter): + process = False should_process_subset.append(process) if self.datecol: diff --git a/tests/config/project_configuration.yml b/tests/config/project_configuration.yml index 512da5c..c5c2ba1 100755 --- a/tests/config/project_configuration.yml +++ b/tests/config/project_configuration.yml @@ -98,7 +98,7 @@ scraper_national: date_type: "date" use_date_from_date_col: True subsets: - - filter: "Date_reported=2020-08-06" + - filter: "Date_reported == '2020-08-06'" input_cols: - "Cumulative_cases" - "Cumulative_deaths" @@ -209,6 +209,7 @@ scraper_national: prefilter: "new_tests is not None and new_tests > 0" date_col: "date" date_type: "date" + use_date_from_date_col: True adm_cols: - "iso_code" input_cols: @@ -320,7 +321,7 @@ scraper_global: date_type: "year" single_maxdate: True subsets: - - filter: "FundType=CBPF" + - filter: "FundType == CBPF" input_cols: - "Budget" input_transforms: @@ -332,7 +333,7 @@ scraper_global: - "CBPFFunding" output_hxltags: - "#value+cbpf+funding+total+usd" - - filter: "FundType=CBPF|GenderMarker=" + - filter: "FundType == CBPF and GenderMarker == " input_cols: - "Budget" input_transforms: @@ -344,7 +345,7 @@ scraper_global: - "CBPFFundingGMEmpty" output_hxltags: - "#value+cbpf+funding+gmempty+total+usd" - - filter: "FundType=CBPF|GenderMarker=0" + - filter: "FundType == CBPF and GenderMarker == 0" input_cols: - "Budget" input_transforms: @@ -356,7 +357,7 @@ scraper_global: - "CBPFFundingGM0" output_hxltags: - "#value+cbpf+funding+gm0+total+usd" - - filter: "FundType=CBPF|GenderMarker=1" + - filter: "FundType == CBPF and GenderMarker == 1" input_cols: 
- "Budget" input_transforms: @@ -368,7 +369,7 @@ scraper_global: - "CBPFFundingGM1" output_hxltags: - "#value+cbpf+funding+gm1+total+usd" - - filter: "FundType=CBPF|GenderMarker=2" + - filter: "FundType == CBPF and GenderMarker == 2" input_cols: - "Budget" input_transforms: @@ -380,7 +381,7 @@ scraper_global: - "CBPFFundingGM2" output_hxltags: - "#value+cbpf+funding+gm2+total+usd" - - filter: "FundType=CBPF|GenderMarker=3" + - filter: "FundType == CBPF and GenderMarker == 3" input_cols: - "Budget" input_transforms: @@ -392,7 +393,7 @@ scraper_global: - "CBPFFundingGM3" output_hxltags: - "#value+cbpf+funding+gm3+total+usd" - - filter: "FundType=CBPF|GenderMarker=4" + - filter: "FundType == CBPF and GenderMarker == 4" input_cols: - "Budget" input_transforms: @@ -404,7 +405,7 @@ scraper_global: - "CBPFFundingGM4" output_hxltags: - "#value+cbpf+funding+gm4+total+usd" - - filter: "FundType=CERF" + - filter: "FundType == 'CERF'" input_cols: - "Budget" input_transforms: @@ -416,7 +417,7 @@ scraper_global: - "CERFFunding" output_hxltags: - "#value+cerf+funding+total+usd" - - filter: "FundType=CERF|GenderMarker=" + - filter: "FundType == 'CERF' and GenderMarker == " input_cols: - "Budget" input_transforms: @@ -428,7 +429,7 @@ scraper_global: - "CERFFundingGMEmpty" output_hxltags: - "#value+cerf+funding+gmempty+total+usd" - - filter: "FundType=CERF|GenderMarker=0" + - filter: "FundType == 'CERF' and GenderMarker == 0" input_cols: - "Budget" input_transforms: @@ -440,7 +441,7 @@ scraper_global: - "CERFFundingGM0" output_hxltags: - "#value+cerf+funding+gm0+total+usd" - - filter: "FundType=CERF|GenderMarker=1" + - filter: "FundType == 'CERF' and GenderMarker == 1" input_cols: - "Budget" input_transforms: @@ -452,7 +453,7 @@ scraper_global: - "CERFFundingGM1" output_hxltags: - "#value+cerf+funding+gm1+total+usd" - - filter: "FundType=CERF|GenderMarker=2" + - filter: "FundType == 'CERF' and GenderMarker == 2" input_cols: - "Budget" input_transforms: @@ -464,7 +465,7 @@ 
scraper_global: - "CERFFundingGM2" output_hxltags: - "#value+cerf+funding+gm2+total+usd" - - filter: "FundType=CERF|GenderMarker=3" + - filter: "FundType == 'CERF' and GenderMarker == 3" input_cols: - "Budget" input_transforms: @@ -476,7 +477,7 @@ scraper_global: - "CERFFundingGM3" output_hxltags: - "#value+cerf+funding+gm3+total+usd" - - filter: "FundType=CERF|GenderMarker=4" + - filter: "FundType == 'CERF' and GenderMarker == 4" input_cols: - "Budget" input_transforms: From 56782602510557a7026e419b9a2421cf87634803 Mon Sep 17 00:00:00 2001 From: Mike Date: Wed, 1 Sep 2021 14:19:28 +1200 Subject: [PATCH 2/2] Make filter handling consistent and use Python syntax --- README.md | 25 +++++++++++++++------- src/hdx/scraper/version.txt | 2 +- tests/config/project_configuration.yml | 29 ++++++++++++++------------ tests/hdx/scraper/test_scraper.py | 2 +- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 2654bb5..ac8401f 100755 --- a/README.md +++ b/README.md @@ -344,8 +344,8 @@ input_cols. That new column is given a header and a HXL tag (in output_columns a The needs mini scraper takes data for the latest available date for each country. subsets allows the definition of multiple indicators by way of filters. A filter is defined for each indicator (in this case there is one) which -contains one or more filters of the form column=value. The pipe (|) is used as a separator - it means “and” not -“or”. +contains one or more filters in Python syntax. Column names can be used directly and if not already specified in +input_cols or date_col, should be included in filter_cols. needs: format: "xlsx" @@ -356,8 +356,11 @@ contains one or more filters of the form column=value. 
The pipe (|) is used as a - "Country Code" date_col: "Year" date_type: "year" + filter_cols: + - "Metric" + - "PiN Value for Dataviz" subsets: - - filter: "Metric=People in need|PiN Value for Dataviz=yes" + - filter: "Metric == 'People in need' and PiN Value for Dataviz == 'yes'" input_cols: - "Value" output_columns: @@ -391,7 +394,8 @@ fuzzy match if the input has more than 3 characters. - "#population" The covid tests mini scraper applies a prefilter to the data that only processes rows where the value in the column -"new_tests" is not None and is greater than zero. +"new_tests" is not None and is greater than zero. If "new_tests" were not already listed in input_cols or date_col, +it would need to be listed under the filter_cols key. covidtests: source: "Our World in Data" @@ -452,8 +456,10 @@ mini scraper). - "reference_year" - "reference_code" date_type: "int" + filter_cols: + - "chtype" subsets: - - filter: "chtype=current" + - filter: "chtype == 'current'" input_cols: - "phase3" - "phase4" @@ -504,8 +510,10 @@ either “-2222” or “-4444” is the value included in the sum of any column - "Admin1" date_col: "Year" date_type: "year" + filter_cols: + - "Vaccine" subsets: - - filter: "Vaccine=HepB1" + - filter: "Vaccine == 'HepB1'" input_cols: - "Numerator" - "Denominator" @@ -646,9 +654,12 @@ single_maxdate as shown below: date_col: "AllocationYear" date_type: "year" single_maxdate: True + filter_cols: + - "FundType" + - "GenderMarker" subsets: ... 
- - filter: "FundType=CBPF|GenderMarker=0" + - filter: "FundType == 'CBPF' and GenderMarker == '0'" input_cols: - "Budget" input_transforms: diff --git a/src/hdx/scraper/version.txt b/src/hdx/scraper/version.txt index 5975b14..434dcac 100755 --- a/src/hdx/scraper/version.txt +++ b/src/hdx/scraper/version.txt @@ -1 +1 @@ -1.2.8 \ No newline at end of file +1.2.9 \ No newline at end of file diff --git a/tests/config/project_configuration.yml b/tests/config/project_configuration.yml index c5c2ba1..b96e6c9 100755 --- a/tests/config/project_configuration.yml +++ b/tests/config/project_configuration.yml @@ -320,8 +320,11 @@ scraper_global: date_col: "AllocationYear" date_type: "year" single_maxdate: True + filter_cols: + - "FundType" + - "GenderMarker" subsets: - - filter: "FundType == CBPF" + - filter: "FundType == 'CBPF'" input_cols: - "Budget" input_transforms: @@ -333,7 +336,7 @@ scraper_global: - "CBPFFunding" output_hxltags: - "#value+cbpf+funding+total+usd" - - filter: "FundType == CBPF and GenderMarker == " + - filter: "FundType == 'CBPF' and GenderMarker == ''" input_cols: - "Budget" input_transforms: @@ -345,7 +348,7 @@ scraper_global: - "CBPFFundingGMEmpty" output_hxltags: - "#value+cbpf+funding+gmempty+total+usd" - - filter: "FundType == CBPF and GenderMarker == 0" + - filter: "FundType == 'CBPF' and GenderMarker == '0'" input_cols: - "Budget" input_transforms: @@ -357,7 +360,7 @@ scraper_global: - "CBPFFundingGM0" output_hxltags: - "#value+cbpf+funding+gm0+total+usd" - - filter: "FundType == CBPF and GenderMarker == 1" + - filter: "FundType == 'CBPF' and GenderMarker == '1'" input_cols: - "Budget" input_transforms: @@ -369,7 +372,7 @@ scraper_global: - "CBPFFundingGM1" output_hxltags: - "#value+cbpf+funding+gm1+total+usd" - - filter: "FundType == CBPF and GenderMarker == 2" + - filter: "FundType == 'CBPF' and GenderMarker == '2'" input_cols: - "Budget" input_transforms: @@ -381,7 +384,7 @@ scraper_global: - "CBPFFundingGM2" output_hxltags: - 
"#value+cbpf+funding+gm2+total+usd" - - filter: "FundType == CBPF and GenderMarker == 3" + - filter: "FundType == 'CBPF' and GenderMarker == '3'" input_cols: - "Budget" input_transforms: @@ -393,7 +396,7 @@ scraper_global: - "CBPFFundingGM3" output_hxltags: - "#value+cbpf+funding+gm3+total+usd" - - filter: "FundType == CBPF and GenderMarker == 4" + - filter: "FundType == 'CBPF' and GenderMarker == '4'" input_cols: - "Budget" input_transforms: @@ -417,7 +420,7 @@ scraper_global: - "CERFFunding" output_hxltags: - "#value+cerf+funding+total+usd" - - filter: "FundType == 'CERF' and GenderMarker == " + - filter: "FundType == 'CERF' and GenderMarker == ''" input_cols: - "Budget" input_transforms: @@ -429,7 +432,7 @@ scraper_global: - "CERFFundingGMEmpty" output_hxltags: - "#value+cerf+funding+gmempty+total+usd" - - filter: "FundType == 'CERF' and GenderMarker == 0" + - filter: "FundType == 'CERF' and GenderMarker == '0'" input_cols: - "Budget" input_transforms: @@ -441,7 +444,7 @@ scraper_global: - "CERFFundingGM0" output_hxltags: - "#value+cerf+funding+gm0+total+usd" - - filter: "FundType == 'CERF' and GenderMarker == 1" + - filter: "FundType == 'CERF' and GenderMarker == '1'" input_cols: - "Budget" input_transforms: @@ -453,7 +456,7 @@ scraper_global: - "CERFFundingGM1" output_hxltags: - "#value+cerf+funding+gm1+total+usd" - - filter: "FundType == 'CERF' and GenderMarker == 2" + - filter: "FundType == 'CERF' and GenderMarker == '2'" input_cols: - "Budget" input_transforms: @@ -465,7 +468,7 @@ scraper_global: - "CERFFundingGM2" output_hxltags: - "#value+cerf+funding+gm2+total+usd" - - filter: "FundType == 'CERF' and GenderMarker == 3" + - filter: "FundType == 'CERF' and GenderMarker == '3'" input_cols: - "Budget" input_transforms: @@ -477,7 +480,7 @@ scraper_global: - "CERFFundingGM3" output_hxltags: - "#value+cerf+funding+gm3+total+usd" - - filter: "FundType == 'CERF' and GenderMarker == 4" + - filter: "FundType == 'CERF' and GenderMarker == '4'" input_cols: - "Budget" 
input_transforms: diff --git a/tests/hdx/scraper/test_scraper.py b/tests/hdx/scraper/test_scraper.py index 061fbf7..5776220 100755 --- a/tests/hdx/scraper/test_scraper.py +++ b/tests/hdx/scraper/test_scraper.py @@ -35,7 +35,7 @@ def test_get_tabular(self, configuration): results = run_scrapers(scraper_configuration, ['AFG', 'PHL'], adminone, level, downloader, today=today, scrapers=['covidtests'], population_lookup=population_lookup) assert results['headers'] == [['New Tests', 'New Tests Per Thousand', 'New Tests Per Thousand (7-day)', 'Positive Test Rate'], ['#affected+tested', '#affected+tested+per1000', '#affected+tested+avg+per1000', '#affected+tested+positive+pct']] assert results['values'] == [{'PHL': 39611}, {'PHL': 0.361}, {'PHL': 0.312}, {'PHL': 0.072}] - assert results['sources'] == [('#affected+tested', '2021-08-25', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+per1000', '2021-08-25', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+avg+per1000', '2021-08-25', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+positive+pct', '2021-08-25', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country')] + assert results['sources'] == [('#affected+tested', '2020-10-01', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+per1000', '2020-10-01', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+avg+per1000', '2020-10-01', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country'), ('#affected+tested+positive+pct', '2020-10-01', 'Our World in Data', 'https://data.humdata.org/dataset/total-covid-19-tests-performed-by-country')] today = 
parse_date('2021-05-03') results = run_scrapers(scraper_configuration, ['AFG', 'PHL'], adminone, level, downloader, today=today, scrapers=['ourworldindata'], population_lookup=population_lookup) assert results['headers'] == [['TotalDosesAdministered'], ['#capacity+doses+administered+total']]