Skip to content

Commit

Permalink
Unicode support (#135)
Browse files Browse the repository at this point in the history
* Always treat units as unicode. Closes #133.

Also, allow a file encoding declaration in the coding standards test, so that test files can contain literal unicode characters.

* Fix date2num test which is incorrectly using repr when str was intended.

* Tidy up the Unit constructor for the py2 case, so that it is easier to see that the code can be deleted when the codebase becomes py3 only.

* Handle unicode objects in py2 specially, and always ensure that __str__ returns a non-unicode str in py2 (unless sys.getdefaultencoding() reports something other than 'ascii').

* Ensure that the error raised in Unit constructor handles unicode too.
  • Loading branch information
pelson authored and bjlittle committed Jan 22, 2019
1 parent 18b72bc commit 1e9af2a
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 11 deletions.
29 changes: 24 additions & 5 deletions cf_units/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,10 +804,21 @@ def __init__(self, unit, calendar=None):
ut_unit = _ud.NULL_UNIT
calendar_ = None

encoding = UT_UTF8

if unit is None:
unit = ''

if six.PY2:
if not isinstance(unit, six.text_type):
# Cast everything that isn't a unicode object to a str.
unit = str(unit)
if isinstance(unit, str):
# All str in py2 should be treated as ASCII.
encoding = UT_ASCII
else:
unit = str(unit).strip()
unit = str(unit)
unit = unit.strip()

if unit.lower().endswith(' utc'):
unit = unit[:unit.lower().rfind(' utc')]
Expand All @@ -830,10 +841,15 @@ def __init__(self, unit, calendar=None):
unit = _NO_UNIT_STRING
else:
category = _CATEGORY_UDUNIT
if six.PY2:
str_unit = unit.encode(sys.getdefaultencoding(), 'replace')
else:
str_unit = unit
try:
ut_unit = _ud.parse(_ud_system, unit.encode('ascii'), UT_ASCII)
ut_unit = _ud.parse(_ud_system, unit.encode('utf8'), encoding)
except _ud.UdunitsError as e:
self._propogate_error('Failed to parse unit "%s"' % unit, e)
self._propogate_error(
'Failed to parse unit "%s"' % str_unit, e)
if _OP_SINCE in unit.lower():
if calendar is None:
calendar_ = CALENDAR_GREGORIAN
Expand Down Expand Up @@ -1446,7 +1462,10 @@ def __str__(self):
'miles/hour'
"""
return self.origin or self.symbol
r = self.origin or self.symbol
if six.PY2 and sys.getdefaultencoding() == 'ascii':
r = r.encode('ascii', 'replace')
return r

def __repr__(self):
"""
Expand Down Expand Up @@ -1893,7 +1912,7 @@ def utime(self):
if self.is_long_time_interval():
interval = self.origin.split(' ')[0]
emsg = ('Time units with interval of "months", "years" '
'(or singular of these) cannot be processed, got {!r}.')
'(or singular of these) cannot be processed, got "{!s}".')
raise ValueError(emsg.format(interval))

#
Expand Down
4 changes: 2 additions & 2 deletions cf_units/tests/integration/test_date2num.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# (C) British Crown Copyright 2016 - 2018, Met Office
# (C) British Crown Copyright 2016 - 2019, Met Office
#
# This file is part of cf-units.
#
Expand Down Expand Up @@ -73,7 +73,7 @@ def test_long_time_interval(self):
# This test should fail with an error that we need to catch properly.
unit = 'years since 1970-01-01'
date = datetime.datetime(1970, 1, 1, 0, 0, 5)
exp_emsg = 'interval of "months", "years" .* got \'years\'.'
exp_emsg = 'interval of "months", "years" .* got "years".'
with six.assertRaisesRegex(self, ValueError, exp_emsg):
date2num(date, unit, self.calendar)

Expand Down
8 changes: 4 additions & 4 deletions cf_units/tests/test_coding_standards.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# (C) British Crown Copyright 2013 - 2018, Met Office
# (C) British Crown Copyright 2013 - 2019, Met Office
#
# This file is part of cf-units.
#
Expand Down Expand Up @@ -52,9 +52,9 @@


LICENSE_RE_PATTERN = re.escape(LICENSE_TEMPLATE).replace(r'\{YEARS\}', '(.*?)')
# Add shebang possibility to the LICENSE_RE_PATTERN
LICENSE_RE_PATTERN = r'(\#\!.*\n)?' + LICENSE_RE_PATTERN
LICENSE_RE = re.compile(LICENSE_RE_PATTERN, re.MULTILINE)
SHEBANG = r'(\#\!.*\n)?'
ENCODING = r'(\# \-\*\- coding\: .* \-\*\-\n)?'
LICENSE_RE = re.compile(SHEBANG + ENCODING + LICENSE_RE_PATTERN, re.MULTILINE)


# Guess cf_units repo directory of cf_units - realpath is used to mitigate
Expand Down
29 changes: 29 additions & 0 deletions cf_units/tests/test_unit.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# (C) British Crown Copyright 2010 - 2019, Met Office
#
# This file is part of cf-units.
Expand Down Expand Up @@ -74,6 +75,34 @@ def test_unsupported_calendar(self):
with six.assertRaisesRegex(self, ValueError, 'unsupported calendar'):
Unit('hours since 1970-01-01 00:00:00', calendar='wibble')

def test_calendar_w_unicode(self):
    """Build a Unit from a unicode string plus a calendar and check both.

    The calendar passed in must be reported back unchanged, and ``str()``
    of the unit must give the expected text for the running Python major
    version.
    """
    cal = unit.CALENDAR_365_DAY
    result = Unit(u'hours\xb2 hours-1 since epoch', calendar=cal)
    self.assertEqual(result.calendar, cal)

    # Python 2 str MUST return an ascii string, yet the input was a
    # unicode. There we therefore expect the ASCII encoded form, in which
    # the superscript character is replaced by '?'.
    expected = (
        'hours? hours-1 since 1970-01-01 00:00:00'
        if six.PY2
        else 'hours\xb2 hours-1 since 1970-01-01 00:00:00'
    )
    self.assertEqual(str(result), expected)

@unittest.skipIf(six.PY2, "Unicode literals in str aren't a thing")
def test_unicode_valid(self):
    """Check that a unit written with a superscript two is accepted.

    Skipped on Python 2 (see the ``skipIf`` reason above).
    """
    # Some unicode characters are allowed.
    u = Unit('m²')
    # Use the TestCase assertion rather than a bare ``assert``: it is not
    # stripped when running under ``python -O`` and reports both values
    # on failure.
    self.assertEqual(u.symbol, 'm2')

def test_py2k_unicode(self):
    """Check that an explicit unicode literal is accepted as a unit."""
    u = Unit(u'm\xb2')
    # Use the TestCase assertion rather than a bare ``assert``: it is not
    # stripped when running under ``python -O`` and reports both values
    # on failure.
    self.assertEqual(u.symbol, 'm2')

def test_unicode_invalid(self):
    """Check that a unit made of an unsupported character is rejected."""
    # Not all unicode characters are allowed.
    #
    # ``assertRaises(..., msg=...)`` does not compare ``msg`` with the text
    # of the raised exception, so capture the exception and check its text
    # explicitly instead.
    with self.assertRaises(ValueError) as caught:
        Unit('ø')
    if not six.PY2:
        # NOTE(review): the message is only checked on Python 3. On
        # Python 2 the literal above is a byte string and the resulting
        # message text has not been confirmed -- TODO confirm and extend.
        self.assertIn('Failed to parse unit "ø"', str(caught.exception))


class Test_modulus(unittest.TestCase):

Expand Down

0 comments on commit 1e9af2a

Please sign in to comment.