Skip to content

Commit

Permalink
Add date to updated_by_script
Browse files Browse the repository at this point in the history
Add function to remove dates from dataset titles and optionally use them as date of dataset
  • Loading branch information
mcarans committed Jan 20, 2020
1 parent a1262c5 commit ba4552b
Show file tree
Hide file tree
Showing 7 changed files with 283 additions and 15 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ckanapi==4.3
hdx-python-country==2.2.5
hdx-python-country==2.3.1
ndg-httpsclient==0.5.1
pyasn1==0.4.8
pyOpenSSL==19.1.0
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from setuptools import setup, find_packages

requirements = ['ckanapi>=4.3',
'hdx-python-country>=2.2.5',
'hdx-python-country>=2.3.1',
'ndg-httpsclient',
'pyasn1',
'pyOpenSSL'
Expand Down
38 changes: 36 additions & 2 deletions src/hdx/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import hdx.data.showcase
import hdx.data.user
import hdx.data.vocabulary
from hdx.data.dataset_title_helper import DatasetTitleHelper
from hdx.data.hdxobject import HDXObject, HDXError
from hdx.hdx_configuration import Configuration
from hdx.hdx_locations import Locations
Expand Down Expand Up @@ -354,6 +355,16 @@ def check_required_fields(self, ignore_fields=list(), allow_no_resources=False):
ignore_fields = ['package_id']
resource.check_required_fields(ignore_fields=ignore_fields)

def set_updated_by_script(self, **kwargs):
    # type: (Any) -> None
    """Set metadata field updated_by_script to the script information followed by the
    current UTC time (ISO 8601 formatted) in brackets.

    Args:
        **kwargs: See below
        updated_by_script (str): Script information. Defaults to the user agent from the configuration.

    Returns:
        None
    """
    # The configuration's user agent is the fallback when the caller supplies nothing
    default_info = self.configuration.get_user_agent()
    script_info = kwargs.get('updated_by_script', default_info)
    timestamp = datetime.utcnow().isoformat()
    self.data['updated_by_script'] = '%s (%s)' % (script_info, timestamp)

def _dataset_merge_hdx_update(self, update_resources, update_resources_by_name,
remove_additional_resources, create_default_views, hxl_update, **kwargs):
# type: (bool, bool, bool, bool, bool, Any) -> None
Expand Down Expand Up @@ -436,7 +447,7 @@ def _dataset_merge_hdx_update(self, update_resources, update_resources_by_name,
if 'ignore_check' not in kwargs: # allow ignoring of field checks
ignore_field = self.configuration['dataset'].get('ignore_on_update')
self.check_required_fields(ignore_fields=[ignore_field])
self.data['updated_by_script'] = kwargs.get('updated_by_script', self.configuration.get_user_agent())
self.set_updated_by_script(**kwargs)
self._save_to_hdx('update', 'id', force_active=True)
hdx.data.filestore_helper.FilestoreHelper.add_filestore_resources(self.data['resources'], filestore_resources)
self.init_resources()
Expand Down Expand Up @@ -523,7 +534,7 @@ def create_in_hdx(self, allow_no_resources=False, update_resources=True, update_
hdx.data.filestore_helper.FilestoreHelper.check_filestore_resource(resource, ignore_fields, filestore_resources)
self.data['resources'] = self._convert_hdxobjects(self.resources)
self.clean_tags()
self.data['updated_by_script'] = kwargs.get('updated_by_script', self.configuration.get_user_agent())
self.set_updated_by_script(**kwargs)
self._save_to_hdx('create', 'name', force_active=True)
hdx.data.filestore_helper.FilestoreHelper.add_filestore_resources(self.data['resources'], filestore_resources)
self.init_resources()
Expand Down Expand Up @@ -1562,3 +1573,26 @@ def get_hdx_url(self):
if not name:
return None
return '%s/dataset/%s' % (self.configuration.get_hdx_site_url(), name)

def remove_dates_from_title(self, change_title=True, set_dataset_date=False):
    # type: (bool, bool) -> bool
    """Strip dates out of the dataset title and report whether any were found.

    By default the cleaned title is written back to the dataset metadata, while the
    dataset date metadata field is left alone.

    Args:
        change_title (bool): Whether to change the dataset title. Defaults to True.
        set_dataset_date (bool): Whether to set the dataset date from date(s) in the title. Defaults to False.

    Returns:
        bool: True if dates were found in title, False if not

    Raises:
        HDXError: If the dataset has no title
    """
    if 'title' not in self.data:
        raise HDXError('Dataset has no title!')
    original_title = self.data['title']
    cleaned_title, date_from, date_to = DatasetTitleHelper.get_date_from_title(original_title)
    # Dates were present exactly when cleaning altered the title
    dates_found = cleaned_title != original_title
    if change_title:
        self.data['title'] = cleaned_title
    if set_dataset_date and date_from:
        self.set_dataset_date_from_datetime(date_from, date_to)
    return dates_found
136 changes: 136 additions & 0 deletions src/hdx/data/dataset_title_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
"""Helper to the Dataset class for handling processing dataset titles.
"""
import logging
import re
from datetime import datetime, timedelta
from parser import ParserError
from string import punctuation, whitespace
from typing import List, Tuple, Optional

from hdx.utilities.dateparse import parse_date_range, parse_date
from hdx.utilities.text import remove_end_characters, remove_from_end, PUNCTUATION_MINUS_BRACKETS, remove_string

logger = logging.getLogger(__name__)


class DatasetTitleHelper(object):
    """Helper for the Dataset class that locates dates in dataset titles, collects them as
    date ranges and returns the title with the date text removed.
    """
    # Two four digit years joined by a separator eg. 1981-2015
    # NOTE(review): the ' % ' alternative may have been meant to be ' & ' - confirm before changing
    YEAR_RANGE_PATTERN = re.compile(r'([12]\d\d\d)(-| % | and )([12]\d\d\d)')
    # Four digit year followed by a two digit year eg. 2013/14
    YEAR_RANGE_PATTERN2 = re.compile(r'([12]\d\d\d)(/|-)(\d\d)')
    # A single four digit year beginning with 1 or 2
    YEAR_PATTERN = re.compile(r'([12]\d\d\d)')
    # Any single punctuation character. The characters are escaped so that the backslash in
    # string.punctuation is itself part of the class rather than escaping the ']' after it.
    PUNCTUATION_PATTERN = re.compile('[%s]' % re.escape(punctuation))

    @staticmethod
    def _parse_fragment(fragment):
        # type: (str) -> Tuple[Optional[datetime], Optional[datetime], timedelta, dict]
        """
        Try to parse a fragment of a title as a date range.

        Args:
            fragment (str): Piece of title to parse

        Returns:
            Tuple[Optional[datetime],Optional[datetime],timedelta,dict]: Start date, end date (both None if
            parsing failed), length of the range and the fuzzy dictionary handed to parse_date_range
        """
        fuzzy = dict()
        startdate = None
        enddate = None
        # Large default span so that a failed parse loses the "narrowest range" comparison
        delta = timedelta(days=1000)
        try:
            startdate, enddate = parse_date_range(fragment, fuzzy=fuzzy)
            if startdate and enddate:
                delta = enddate - startdate
        except (ParserError, OverflowError):
            pass
        return startdate, enddate, delta, fuzzy

    @classmethod
    def fuzzy_match_dates_in_title(cls, title, ranges):
        # type: (str, List[Tuple[datetime,datetime]]) -> str
        """
        Fuzzy match dates in title appending to ranges

        For each four digit year found, the text just before the year and the text just after it
        are each parsed as a date range. Whichever parse gives the narrower range is used (the
        text before the year wins ties). If neither parses, the bare year is used.

        Args:
            title (str): Title to parse
            ranges (List[Tuple[datetime,datetime]]): List of date ranges found so far (appended to in place)

        Returns:
            str: Title with dates removed
        """
        match = cls.YEAR_PATTERN.search(title)
        while match:
            start = match.start()
            end = match.end()
            # Up to 13 characters before the year plus the year itself. The slice start is clamped
            # at 0 because a negative start would count from the end of the title.
            stringlr = title[max(start - 13, 0):end]
            # Keep only what follows the last punctuation character
            stringlr = cls.PUNCTUATION_PATTERN.split(stringlr)[-1]
            startdatelr, enddatelr, deltalr, fuzzylr = cls._parse_fragment(stringlr)
            # The year plus up to 13 characters after it, cut at the first punctuation character
            stringrl = title[start:end + 13]
            stringrl = cls.PUNCTUATION_PATTERN.split(stringrl)[0]
            startdaterl, enddaterl, deltarl, fuzzyrl = cls._parse_fragment(stringrl)
            if startdatelr and deltalr <= deltarl:
                date_components = fuzzylr['date']
                ranges.append((startdatelr, enddatelr))
            elif startdaterl:
                date_components = fuzzyrl['date']
                ranges.append((startdaterl, enddaterl))
            else:
                year = match.group(0)
                # One element tuple: a bare string would be iterated character by character below
                date_components = (year,)
                ranges.append(parse_date_range(year))
            newtitle = title
            for date_component in date_components:
                newtitle = remove_string(newtitle, date_component, PUNCTUATION_MINUS_BRACKETS)
            logger.info('Removing date from title: %s -> %s', title, newtitle)
            title = newtitle
            # NOTE(review): end was measured on the title before the removal above, so a year
            # directly following the removed text could be skipped - confirm this is acceptable
            match = cls.YEAR_PATTERN.search(title, end)
        try:
            fuzzy = dict()
            startdate, enddate = parse_date_range(title, fuzzy=fuzzy)
            if startdate == enddate and len(fuzzy['date']) == 1:  # only accept dates where day, month and year are
                # all together not split throughout the string and where the date is a precise day not a range
                ranges.append((startdate, enddate))
                date_component = fuzzy['date'][0]
                newtitle = remove_string(title, date_component, PUNCTUATION_MINUS_BRACKETS)
                logger.info('Removing date from title: %s -> %s', title, newtitle)
                title = newtitle
        except (ParserError, OverflowError):
            pass

        return title

    @classmethod
    def get_date_from_title(cls, title):
        # type: (str) -> Tuple[str,Optional[datetime],Optional[datetime]]
        """
        Get dataset date from title and clean title of dates

        Explicit year ranges are handled first, then any remaining dates are fuzzy matched.
        When several ranges are found, the one that sorts first (earliest start date) is returned.

        Args:
            title (str): Title to get date from and clean

        Returns:
            Tuple[str,Optional[datetime],Optional[datetime]]: Cleaned title, start and end dates
        """
        ranges = list()
        # finditer works on the title as it was when the loop began; rebinding title inside
        # the loop does not disturb the iteration
        for match in cls.YEAR_RANGE_PATTERN.finditer(title):
            startdate = parse_date('%s-01-01' % match.group(1), '%Y-%m-%d')
            enddate = parse_date('%s-12-31' % match.group(3), '%Y-%m-%d')
            ranges.append((startdate, enddate))
            newtitle = remove_string(title, match.group(0))
            logger.info('Removing date range from title: %s -> %s', title, newtitle)
            title = newtitle

        for match in cls.YEAR_RANGE_PATTERN2.finditer(title):
            first_year = match.group(1)
            startdate = parse_date('%s-01-01' % first_year, '%Y-%m-%d')
            # The two digit end year borrows its century from the first year
            enddate = parse_date('%s%s-12-31' % (first_year[:2], match.group(3)), '%Y-%m-%d')
            ranges.append((startdate, enddate))
            newtitle = remove_string(title, match.group(0))
            logger.info('Removing date range from title: %s -> %s', title, newtitle)
            title = newtitle

        title = cls.fuzzy_match_dates_in_title(title, ranges)

        # Tidy up what removing the dates left behind
        title = title.replace('()', '')
        title = remove_end_characters(title, '%s%s' % (PUNCTUATION_MINUS_BRACKETS, whitespace))
        title = remove_from_end(title, ['as of'], 'Removing - from title: %s -> %s')
        if not ranges:
            return title, None, None
        startdate, enddate = min(ranges)
        return title, startdate, enddate
2 changes: 1 addition & 1 deletion src/hdx/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
4.0.0
4.0.1
64 changes: 54 additions & 10 deletions tests/hdx/data/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@

# -*- coding: UTF-8 -*-
"""Dataset Tests"""
import copy
import datetime
import json
import re
import tempfile
from os import remove
from os.path import join
from parser import ParserError

import pytest
from hdx.location.country import Country
Expand Down Expand Up @@ -638,9 +639,13 @@ def test_update_in_hdx(self, configuration, post_update):
assert dataset['id'] == 'TEST1'
assert dataset['dataset_date'] == '02/26/2016'
assert dataset['state'] == 'active'
assert dataset['updated_by_script'] == 'HDXPythonLibrary/%s-test' % get_api_version()
pattern = r'HDXPythonLibrary/%s-test \([12]\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d.\d\d\d\d\d\d\)' % get_api_version()
match = re.search(pattern, dataset['updated_by_script'])
assert match
dataset.update_in_hdx(updated_by_script='hi')
assert dataset['updated_by_script'] == 'hi'
pattern = r'hi \([12]\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d.\d\d\d\d\d\d\)'
match = re.search(pattern, dataset['updated_by_script'])
assert match

dataset['id'] = 'NOTEXIST'
with pytest.raises(HDXError):
Expand Down Expand Up @@ -956,11 +961,11 @@ def test_get_set_dataset_date(self, configuration, read):
dataset.set_dataset_year_range('2013')
assert dataset.get_dataset_date_as_datetime() == datetime.datetime(2013, 1, 1, 0, 0)
assert dataset.get_dataset_end_date_as_datetime() == datetime.datetime(2013, 12, 31, 0, 0)
with pytest.raises(ValueError):
with pytest.raises(ParserError):
dataset.set_dataset_date('lalala')
with pytest.raises(ValueError):
with pytest.raises(ParserError):
dataset.set_dataset_date('lalala', 'lalala')
with pytest.raises(ValueError):
with pytest.raises(ParserError):
dataset.set_dataset_date('lalala', 'lalala', date_format='%Y/%m/%d')
with pytest.raises(HDXError):
dataset.set_dataset_year_range(23.5)
Expand All @@ -986,13 +991,13 @@ def test_get_set_dataset_date(self, configuration, read):
assert dataset['dataset_date'] == '01/01/2013-12/31/2014'
dataset.set_dataset_date('2013', dataset_end_date='2014', date_format='%Y')
assert dataset['dataset_date'] == '01/01/2013-12/31/2014'
with pytest.raises(ValueError):
with pytest.raises(ParserError):
dataset.set_dataset_date('2013-09', allow_range=False)
with pytest.raises(ValueError):
with pytest.raises(ParserError):
dataset.set_dataset_date('2013-09', date_format='%Y-%m', allow_range=False)
with pytest.raises(ValueError):
with pytest.raises(ParserError):
dataset.set_dataset_date('2013-09', dataset_end_date='2014-02', allow_range=False)
with pytest.raises(ValueError):
with pytest.raises(ParserError):
dataset.set_dataset_date('2013-09', dataset_end_date='2014-02', date_format='%Y-%m', allow_range=False)

def test_transform_update_frequency(self):
Expand Down Expand Up @@ -1357,3 +1362,42 @@ def test_get_hdx_url(self, configuration, hdx_config_yaml, project_config_yaml):
project_config_yaml=project_config_yaml)
dataset = Dataset(dataset_data)
assert dataset.get_hdx_url() == 'https://feature-data.humdata.org/dataset/MyDataset1'

def test_remove_dates_from_title(self):
    """Check remove_dates_from_title for a missing title, a title without dates, a title with
    years and a year range, and titles mixing a real year with other numbers."""
    dataset = Dataset()
    # A dataset without a title cannot be processed
    with pytest.raises(HDXError):
        dataset.remove_dates_from_title()
    assert 'title' not in dataset

    # Title without dates: nothing changes and no dataset date is set
    plain_title = 'Title with no dates'
    dataset['title'] = plain_title
    assert dataset.remove_dates_from_title() is False
    assert dataset['title'] == plain_title
    assert 'dataset_date' not in dataset
    assert dataset.remove_dates_from_title(set_dataset_date=True) is False

    # Title with dates: first leave the title alone, then change it, then also set the date
    dated_title = 'ICA Armenia, 2017 - Drought Risk, 1981-2015'
    cleaned_title = 'ICA Armenia - Drought Risk'
    dataset['title'] = dated_title
    assert dataset.remove_dates_from_title(change_title=False) is True
    assert dataset['title'] == dated_title
    assert 'dataset_date' not in dataset
    assert dataset.remove_dates_from_title() is True
    assert dataset['title'] == cleaned_title
    assert 'dataset_date' not in dataset
    dataset['title'] = dated_title
    assert dataset.remove_dates_from_title(set_dataset_date=True) is True
    assert dataset['title'] == cleaned_title
    assert dataset['dataset_date'] == '01/01/1981-12/31/2015'
    assert dataset.remove_dates_from_title() is False

    # Only the real year is removed; the other numbers stay in the title
    numeric_cases = (
        ('Mon_State_Village_Tract_Boundaries 9999 2001', 'Mon_State_Village_Tract_Boundaries 9999'),
        ('Mon_State_Village_Tract_Boundaries 2001 99', 'Mon_State_Village_Tract_Boundaries 99'),
        ('Mon_State_Village_Tract_Boundaries 9999 2001 99', 'Mon_State_Village_Tract_Boundaries 9999 99'),
    )
    for given_title, expected_title in numeric_cases:
        dataset['title'] = given_title
        assert dataset.remove_dates_from_title(set_dataset_date=True) is True
        assert dataset['title'] == expected_title
        assert dataset['dataset_date'] == '01/01/2001-12/31/2001'
54 changes: 54 additions & 0 deletions tests/hdx/data/test_dataset_title_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# -*- coding: UTF-8 -*-
"""Dataset Title Helper Tests"""
from datetime import datetime

from hdx.data.dataset_title_helper import DatasetTitleHelper


class TestDatasetTitleHelper:
    """Tests for DatasetTitleHelper's date extraction and title cleaning."""

    def test_fuzzy_match_dates_in_title(self):
        """Month and year are matched in either order and removed from the title."""
        july_2019 = [(datetime(2019, 7, 1, 0, 0), datetime(2019, 7, 31, 0, 0))]
        for given_title in ('Myanmar Town July 2019', 'Myanmar Town 2019 July'):
            found = list()
            assert DatasetTitleHelper.fuzzy_match_dates_in_title(given_title, found) == 'Myanmar Town'
            assert found == july_2019

    def test_get_date_from_title(self):
        """Each title maps to the expected cleaned title, start date and end date."""
        long_prefix = 'Afghanistan Percentage of Food Insecure Population Based on Combined Food ' \
                      'Consumption Score and Coping Strategy Index by Province - ALCS'
        cases = [
            ('Myanmar Town 2019 July',
             ('Myanmar Town', datetime(2019, 7, 1, 0, 0), datetime(2019, 7, 31, 0, 0))),
            ('Formal Sector School Location Upper Myanmar (2019)',
             ('Formal Sector School Location Upper Myanmar', datetime(2019, 1, 1, 0, 0),
              datetime(2019, 12, 31, 0, 0))),
            ('ICA Armenia, 2017 - Drought Risk, 1981-2015',
             ('ICA Armenia - Drought Risk', datetime(1981, 1, 1, 0, 0), datetime(2015, 12, 31, 0, 0))),
            ('Central African Republic, Bridges, January 2019',
             ('Central African Republic, Bridges', datetime(2019, 1, 1, 0, 0), datetime(2019, 1, 31, 0, 0))),
            ('Afghanistan:District Accessibility for WFP and Partners Staff as of 05 May 2019',
             ('Afghanistan:District Accessibility for WFP and Partners Staff', datetime(2019, 5, 5, 0, 0),
              datetime(2019, 5, 5, 0, 0))),
            ('Tanintharyi Region Land Cover - March 2016 (Original)',
             ('Tanintharyi Region Land Cover (Original)', datetime(2016, 3, 1, 0, 0),
              datetime(2016, 3, 31, 0, 0))),
            ('Kachin State and Sagaing Region 2002-2014 Forest Cover Change',
             ('Kachin State and Sagaing Region Forest Cover Change', datetime(2002, 1, 1, 0, 0),
              datetime(2014, 12, 31, 0, 0))),
            ('Ward boundaries Yangon City_mimu_v8_1',
             ('Ward boundaries Yangon City_mimu_v8_1', None, None)),
            ('Mon_State_Village_Tract_Boundaries',
             ('Mon_State_Village_Tract_Boundaries', None, None)),
            ('ICA Afghanistan, 2019 - Landslide hazard, 2013',
             ('ICA Afghanistan - Landslide hazard', datetime(2013, 1, 1, 0, 0), datetime(2013, 12, 31, 0, 0))),
            ('%s 2013/14' % long_prefix,
             (long_prefix, datetime(2013, 1, 1, 0, 0), datetime(2014, 12, 31, 0, 0))),
            ('Mon_State_Village_Tract_Boundaries 9999',
             ('Mon_State_Village_Tract_Boundaries 9999', None, None)),
            # It's the Mon that makes an extra date component that causes it to ignore the
            # date (correctly)
            ('Mon_State_Village_Tract_Boundaries 10/12/01 lala',
             ('Mon_State_Village_Tract_Boundaries 10/12/01 lala', None, None)),
            ('State_Village_Tract_Boundaries 10/12/01 lala',
             ('State_Village_Tract_Boundaries lala', datetime(2001, 12, 10, 0, 0),
              datetime(2001, 12, 10, 0, 0))),
        ]
        for given_title, expected in cases:
            assert DatasetTitleHelper.get_date_from_title(given_title) == expected

0 comments on commit ba4552b

Please sign in to comment.