-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add function to remove dates from dataset titles and optionally use them as date of dataset
- Loading branch information
Showing
7 changed files
with
283 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
ckanapi==4.3 | ||
hdx-python-country==2.2.5 | ||
hdx-python-country==2.3.1 | ||
ndg-httpsclient==0.5.1 | ||
pyasn1==0.4.8 | ||
pyOpenSSL==19.1.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
# -*- coding: utf-8 -*- | ||
"""Helper to the Dataset class for handling processing dataset titles. | ||
""" | ||
import logging | ||
import re | ||
from datetime import datetime, timedelta | ||
from parser import ParserError | ||
from string import punctuation, whitespace | ||
from typing import List, Tuple, Optional | ||
|
||
from hdx.utilities.dateparse import parse_date_range, parse_date | ||
from hdx.utilities.text import remove_end_characters, remove_from_end, PUNCTUATION_MINUS_BRACKETS, remove_string | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class DatasetTitleHelper(object):
    """Find dates in dataset titles and strip them out of the title text.

    Everything is exposed through class methods; the class keeps no instance state.
    """
    # Two four-digit years joined by "-", " & ", " % " or " and ", eg. "2002-2014".
    # " & " was presumably what " % " was meant to be; " % " is kept so that anything
    # that matched before still matches.
    YEAR_RANGE_PATTERN = re.compile(r'([12]\d\d\d)(-| % | & | and )([12]\d\d\d)')
    # A four-digit year followed by a two-digit year, eg. "2013/14" or "2013-14".
    YEAR_RANGE_PATTERN2 = re.compile(r'([12]\d\d\d)(/|-)(\d\d)')
    # Any four-digit number starting with 1 or 2.
    YEAR_PATTERN = re.compile(r'([12]\d\d\d)')
    # Any single punctuation character; escaped so that characters such as "]", "^" and "-"
    # can never be read as character class syntax.
    PUNCTUATION_PATTERN = re.compile('[%s]' % re.escape(punctuation))

    @classmethod
    def _parse_date_fragment(cls, fragment):
        # type: (str) -> Tuple[Optional[datetime], Optional[datetime], timedelta, dict]
        """
        Try to parse a date range out of a fragment of a title

        Args:
            fragment (str): Piece of title surrounding a year

        Returns:
            Tuple[Optional[datetime],Optional[datetime],timedelta,dict]: Start date, end date, length of the
            range and the fuzzy dictionary filled in by parse_date_range. The dates are None and the length is
            1000 days if the fragment could not be parsed.
        """
        fuzzy = dict()
        startdate = None
        enddate = None
        # Sentinel larger than any year/month/day range so that an unparseable fragment never wins
        # the "narrowest range" comparison made by the caller
        delta = timedelta(days=1000)
        try:
            startdate, enddate = parse_date_range(fragment, fuzzy=fuzzy)
            if startdate and enddate:
                delta = enddate - startdate
        except (ParserError, OverflowError):  # same exceptions as the whole-title parse
            pass
        return startdate, enddate, delta, fuzzy

    @classmethod
    def fuzzy_match_dates_in_title(cls, title, ranges):
        # type: (str, List[Tuple[datetime,datetime]]) -> str
        """
        Fuzzy match dates in title appending to ranges

        For every year found in the title, the text up to 13 characters before the year and the text up to
        13 characters after it are parsed separately (each cut at the nearest punctuation). Whichever parse
        gives the narrower date range is used, the text before the year winning ties. If neither can be parsed,
        the bare year is used. Finally the whole remaining title is parsed and accepted only if it gives a
        single day coming from one contiguous piece of text.

        Args:
            title (str): Title to parse
            ranges (List[Tuple[datetime,datetime]]): List of date ranges found so far (appended to in place)

        Returns:
            str: Title with dates removed
        """
        match = cls.YEAR_PATTERN.search(title)
        while match:
            start = match.start()
            end = match.end()
            # Clamp at 0: a negative slice start would count from the end of the title and so
            # produce the wrong (usually empty) text for a year close to the start of the title
            stringlr = title[max(start - 13, 0):end]
            stringlr = cls.PUNCTUATION_PATTERN.split(stringlr)[-1]
            startdatelr, enddatelr, deltalr, fuzzylr = cls._parse_date_fragment(stringlr)
            stringrl = title[start:end + 13]
            stringrl = cls.PUNCTUATION_PATTERN.split(stringrl)[0]
            startdaterl, enddaterl, deltarl, fuzzyrl = cls._parse_date_fragment(stringrl)
            if startdatelr and deltalr <= deltarl:
                date_components = fuzzylr['date']
                ranges.append((startdatelr, enddatelr))
            elif startdaterl:
                date_components = fuzzyrl['date']
                ranges.append((startdaterl, enddaterl))
            else:
                year = match.group(0)
                # One-element tuple: iterating over the bare string would remove the year from
                # the title one character at a time
                date_components = (year,)
                ranges.append(parse_date_range(year))
            newtitle = title
            for date_component in date_components:
                newtitle = remove_string(newtitle, date_component, PUNCTUATION_MINUS_BRACKETS)
            logger.info('Removing date from title: %s -> %s' % (title, newtitle))
            # "end" is an offset into the title before the removal. Text that followed the year has
            # moved left by at most the number of characters removed, so step back by that amount
            # to avoid skipping a year that comes straight after the one just handled.
            removed = max(len(title) - len(newtitle), 0)
            title = newtitle
            match = cls.YEAR_PATTERN.search(title, max(end - removed, 0))
        try:
            fuzzy = dict()
            startdate, enddate = parse_date_range(title, fuzzy=fuzzy)
            if startdate == enddate and len(fuzzy['date']) == 1:  # only accept dates where day, month and year are
                # all together not split throughout the string and where the date is a precise day not a range
                ranges.append((startdate, enddate))
                date_component = fuzzy['date'][0]
                newtitle = remove_string(title, date_component, PUNCTUATION_MINUS_BRACKETS)
                logger.info('Removing date from title: %s -> %s' % (title, newtitle))
                title = newtitle
        except (ParserError, OverflowError):
            pass

        return title

    @classmethod
    def get_date_from_title(cls, title):
        # type: (str) -> Tuple[str,Optional[datetime],Optional[datetime]]
        """
        Get dataset date from title and clean title of dates

        Year ranges ("2002-2014", "2013/14") are looked for first, then other dates are fuzzy matched. When
        more than one date range is found, the one that sorts first (earliest start date) is returned.

        Args:
            title (str): Title to get date from and clean

        Returns:
            Tuple[str,Optional[datetime],Optional[datetime]]: Cleaned title, start and end dates. Both dates are
            None if no date was found.
        """
        ranges = list()
        # finditer scans the title as it was when each loop began; title is rebound inside the loops
        for match in cls.YEAR_RANGE_PATTERN.finditer(title):
            startdate = parse_date('%s-01-01' % match.group(1), '%Y-%m-%d')
            enddate = parse_date('%s-12-31' % match.group(3), '%Y-%m-%d')
            ranges.append((startdate, enddate))
            newtitle = remove_string(title, match.group(0))
            logger.info('Removing date range from title: %s -> %s' % (title, newtitle))
            title = newtitle

        for match in cls.YEAR_RANGE_PATTERN2.finditer(title):
            first_year = match.group(1)
            startdate = parse_date('%s-01-01' % first_year, '%Y-%m-%d')
            # The two-digit end year takes its century from the first year
            enddate = parse_date('%s%s-12-31' % (first_year[:2], match.group(3)), '%Y-%m-%d')
            ranges.append((startdate, enddate))
            newtitle = remove_string(title, match.group(0))
            logger.info('Removing date range from title: %s -> %s' % (title, newtitle))
            title = newtitle

        title = cls.fuzzy_match_dates_in_title(title, ranges)

        # Tidy up what removing the dates left behind: empty brackets, trailing punctuation
        # and whitespace, and a dangling "as of"
        title = title.replace('()', '')
        title = remove_end_characters(title, '%s%s' % (PUNCTUATION_MINUS_BRACKETS, whitespace))
        title = remove_from_end(title, ['as of'], 'Removing - from title: %s -> %s')
        if not ranges:
            return title, None, None
        startdate, enddate = sorted(ranges)[0]
        return title, startdate, enddate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
4.0.0 | ||
4.0.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# -*- coding: UTF-8 -*- | ||
"""Dataset Title Helper Tests""" | ||
from datetime import datetime | ||
|
||
from hdx.data.dataset_title_helper import DatasetTitleHelper | ||
|
||
|
||
class TestDatasetTitleHelper:
    """Checks that DatasetTitleHelper extracts dates from titles and cleans the titles."""

    def test_fuzzy_match_dates_in_title(self):
        """A month and year are recognised whichever way round they are written."""
        july_2019 = [(datetime(2019, 7, 1), datetime(2019, 7, 31))]
        for title in ('Myanmar Town July 2019', 'Myanmar Town 2019 July'):
            found = []  # fresh list per title as the helper appends to it in place
            cleaned = DatasetTitleHelper.fuzzy_match_dates_in_title(title, found)
            assert cleaned == 'Myanmar Town'
            assert found == july_2019

    def test_get_date_from_title(self):
        """Each title maps to the expected (cleaned title, start date, end date)."""
        cases = [
            ('Myanmar Town 2019 July',
             ('Myanmar Town', datetime(2019, 7, 1), datetime(2019, 7, 31))),
            ('Formal Sector School Location Upper Myanmar (2019)',
             ('Formal Sector School Location Upper Myanmar', datetime(2019, 1, 1), datetime(2019, 12, 31))),
            ('ICA Armenia, 2017 - Drought Risk, 1981-2015',
             ('ICA Armenia - Drought Risk', datetime(1981, 1, 1), datetime(2015, 12, 31))),
            ('Central African Republic, Bridges, January 2019',
             ('Central African Republic, Bridges', datetime(2019, 1, 1), datetime(2019, 1, 31))),
            ('Afghanistan:District Accessibility for WFP and Partners Staff as of 05 May 2019',
             ('Afghanistan:District Accessibility for WFP and Partners Staff', datetime(2019, 5, 5),
              datetime(2019, 5, 5))),
            ('Tanintharyi Region Land Cover - March 2016 (Original)',
             ('Tanintharyi Region Land Cover (Original)', datetime(2016, 3, 1), datetime(2016, 3, 31))),
            ('Kachin State and Sagaing Region 2002-2014 Forest Cover Change',
             ('Kachin State and Sagaing Region Forest Cover Change', datetime(2002, 1, 1),
              datetime(2014, 12, 31))),
            ('Ward boundaries Yangon City_mimu_v8_1',
             ('Ward boundaries Yangon City_mimu_v8_1', None, None)),
            ('Mon_State_Village_Tract_Boundaries',
             ('Mon_State_Village_Tract_Boundaries', None, None)),
            ('ICA Afghanistan, 2019 - Landslide hazard, 2013',
             ('ICA Afghanistan - Landslide hazard', datetime(2013, 1, 1), datetime(2013, 12, 31))),
            ('Afghanistan Percentage of Food Insecure Population Based on Combined Food Consumption Score and Coping Strategy Index by Province - ALCS 2013/14',
             ('Afghanistan Percentage of Food Insecure Population Based on Combined Food Consumption Score and Coping Strategy Index by Province - ALCS',
              datetime(2013, 1, 1), datetime(2014, 12, 31))),
            ('Mon_State_Village_Tract_Boundaries 9999',
             ('Mon_State_Village_Tract_Boundaries 9999', None, None)),
            # It's the Mon that makes an extra date component that causes it to ignore the date (correctly)
            ('Mon_State_Village_Tract_Boundaries 10/12/01 lala',
             ('Mon_State_Village_Tract_Boundaries 10/12/01 lala', None, None)),
            ('State_Village_Tract_Boundaries 10/12/01 lala',
             ('State_Village_Tract_Boundaries lala', datetime(2001, 12, 10), datetime(2001, 12, 10))),
        ]
        for title, expected in cases:
            assert DatasetTitleHelper.get_date_from_title(title) == expected