Update supported Python versions and zope.interface and zope.schema.

This exposed a bug in unicode normalization. Fixes #23, fixes #24, fixes #25 and fixes #26.
OpenNTI · Mar 27, 2020 · 7f6abb0 · 7f6abb0
1 parent e5570af
commit 7f6abb0
Show file tree

Hide file tree

Showing 10 changed files with 210 additions and 121 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -2,16 +2,31 @@
  Changes
 =========
 
-1.2.2 (unreleased)
+1.3.0 (unreleased)
 ==================
 
-- Nothing changed yet.
+- Add support for Python 3.8.
 
+- Depend on zope.interface 5.0.
+
+- Update the datrie dependency. See https://github.com/NextThought/nti.contentfragments/issues/24
+
+- Make ``IUnicodeContentFragment`` extend
+  ``zope.interface.common.collections.ISequence`` instead of the
+  semi-deprecated ``zope.interface.common.sequence.IReadSequence``.
+
+- Replace custom interfaces ``IString``, ``IUnicode`` and ``IBytes``
+  with aliases for ``INativeString``, ``ITextString`` and
+  ``IByteString`` from ``zope.interface.common.builtins``. These
+  custom aliases are now deprecated. See https://github.com/NextThought/nti.contentfragments/issues/23.
+
+- Fix unicode normalization breaking schema fields with zope.schema
+  6.0. See https://github.com/NextThought/nti.contentfragments/issues/26
 
 1.2.1 (2019-11-07)
 ==================
 
-- Remove a word from the censored word list. See issue #22.
+- Remove a word from the censored word list. See issue https://github.com/NextThought/nti.contentfragments/issues/22.
 
 
 1.2.0 (2018-10-15)

diff --git a/docs/conf.py b/docs/conf.py
@@ -19,7 +19,8 @@
 # import os
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
-
+import pkg_resources
+rqmt = pkg_resources.require('nti.contentfragments')[0]
 # -- General configuration ------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
@@ -65,11 +66,9 @@
 # built documents.
 #
 # The short X.Y version.
-with open('../version.txt') as f:
-    release = f.read().strip()
-version = u'1.0'
+version = '%s.%s' % tuple(map(int, rqmt.version.split('.')[:2]))
 # The full version, including alpha/beta/rc tags.
-release = u'1.0'
+release = rqmt.version
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@ def _read(fname):
     with codecs.open(fname, encoding='utf-8') as f:
         return f.read()
 
-version = _read('version.txt').strip()
+version = '1.3.0.dev0'
 
 setup(
     name='nti.contentfragments',
@@ -28,7 +28,7 @@ def _read(fname):
     author_email='jason@nextthought.com',
     description="NTI ContentFragments",
     url="https://github.com/NextThought/nti.contentfragments",
-    long_description=_read('README.rst'),
+    long_description=_read('README.rst') + '\n\n' + _read('CHANGES.rst'),
     license='Apache',
     keywords='Content fragments semantic typing interfaces classes sanitize censor',
     classifiers=[
@@ -40,6 +40,7 @@ def _read(fname):
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: Implementation :: CPython',
         'Programming Language :: Python :: Implementation :: PyPy',
     ],
@@ -58,20 +59,13 @@ def _read(fname):
         # xml.etree.ElementTree, even on PyPy.
         'lxml >= 4.2.5',
         'repoze.lru >= 0.6',
-        'zope.component >= 4.5.0',
+        'zope.component >= 4.6.1',
         'zope.event >= 4.4.0',
-        'zope.interface >= 4.5.0',
-        'zope.mimetype >= 2.3.2',
-        'zope.security >= 4.3.0',
+        'zope.interface >= 5.0.0',
+        'zope.mimetype >= 2.4.0',
+        'zope.security >= 5.1.1',
         'zope.cachedescriptors >= 4.3.1',
-        'nti.schema >= 1.12.0',
-    ],
-    extras_require={
-        'test': TESTS_REQUIRE,
-        'docs': [
-            'repoze.sphinx.autointerface',
-            'sphinx_rtd_theme',
-        ],
+        'nti.schema >= 1.14.0',
         # html5lib > 0.99999999 installs datrie if appropriate for the platform
         # with its own [datrie] extra. But we do not explicitly depend
         # on that version to help avoid conflicts, and older versions of
@@ -80,8 +74,14 @@ def _read(fname):
 
         # datrie 0.7.1 does not build on CPython 3.7. See
         # https://github.com/pytries/datrie/issues/52
-        ":platform_python_implementation == 'CPython' and python_version < '3.7'": [
-            "datrie"
+
+        "datrie >= 0.8.2 ; platform_python_implementation == 'CPython'",
+    ],
+    extras_require={
+        'test': TESTS_REQUIRE,
+        'docs': [
+            'repoze.sphinx.autointerface',
+            'sphinx_rtd_theme',
         ],
 
     },

diff --git a/src/nti/contentfragments/interfaces.py b/src/nti/contentfragments/interfaces.py
@@ -6,11 +6,13 @@
 
 from __future__ import print_function, absolute_import, division
 __docformat__ = "restructuredtext en"
-
+import sys
 logger = __import__('logging').getLogger(__name__)
 # pylint:disable=inherit-non-class,too-many-ancestors,no-self-argument,abstract-method
 # pylint:disable=useless-object-inheritance
 PY2 = str is bytes
+PYPY = hasattr(sys, 'pypy_version_info')
+PYPY2 = PY2 and PYPY
 if PY2: # pragma: no cover
     import copy_reg # pylint:disable=import-error
     text_type = unicode # pylint:disable=undefined-variable
@@ -21,15 +23,14 @@
 from zope import component
 from zope import interface
 
-from zope.interface.common import sequence
+from zope.interface.common.collections import ISequence
+from zope.interface.common.builtins import INativeString
+from zope.interface.common.builtins import IByteString
+from zope.interface.common.builtins import ITextString
 
 from zope.contenttype import add_files as zc_add_files
 
-try:
-    from zope.mimetype import types as mime_types
-except ImportError:  # pragma: no cover
-    # They moved this in zope.mimetype 2.0 (python 3 compat?)
-    from zope.mimetype import mtypes as mime_types
+from zope.mimetype import mtypes as mime_types
 mime_types.setup()  # register interface classes and utilities if not already
 
 resource_filename = __import__('pkg_resources').resource_filename
@@ -49,22 +50,10 @@ def _setup():
     zc_add_files([mime_map_file])
 _setup()
 
-
-class IString(interface.Interface):
-    """Marker interface for native strings."""
-
-
-class IUnicode(interface.Interface):
-    """Marker interface for unicode strings."""
-
-
-class IBytes(interface.Interface):
-    """Marker interface for byte strings."""
-
-
-interface.classImplements(str, IString)
-interface.classImplements(bytes, IBytes)
-interface.classImplements(text_type, IUnicode)
+# BWC aliases. These will be removed in the future.
+IString = INativeString
+IUnicode = ITextString
+IBytes = IByteString
 
 
 class IContentFragment(interface.Interface):
@@ -73,17 +62,28 @@ class IContentFragment(interface.Interface):
     be in.
     """
 
-
-class IUnicodeContentFragment(IContentFragment, sequence.IReadSequence):
+class IUnicodeContentFragment(IContentFragment, ISequence):
     """
     Content represented as a unicode string.
 
     Although it is simplest to subclass :class:`unicode`, that is not required.
     At a minimum, what is required is the `__getitem__` method (and the others
     declared by ``ISequence``), plus the `encode` method.
+
+    .. versionchanged:: 1.3.0
+       Extend ``zope.interface.common.collections.ISequence`` instead of the semi-deprecated
+       ``zope.interface.common.sequence.IReadSequence``. The exception is PyPy2, where
+       ``ISequence`` is removed from the bases because it cannot validate against unicode objects.
     """
     # TODO: extend IUnicode?
 
+if PYPY2: # pragma: no cover
+    IUnicodeContentFragment.__bases__ = tuple(
+        x
+        for x in IUnicodeContentFragment.__bases__
+        if x is not ISequence
+    )
+
 
 @interface.implementer(IUnicodeContentFragment)
 class UnicodeContentFragment(text_type):

diff --git a/src/nti/contentfragments/schema.py b/src/nti/contentfragments/schema.py
@@ -15,9 +15,10 @@
 # pylint: disable=too-many-ancestors
 # pylint:disable=useless-object-inheritance
 
+import unicodedata
+
 from zope.interface import implementer
 
-from .interfaces import IContentFragment
 from .interfaces import HTMLContentFragment as HTMLContentFragmentType
 from .interfaces import IHTMLContentFragment
 from .interfaces import LatexContentFragment
@@ -42,33 +43,57 @@
 from nti.schema.field import ValidText as Text
 from nti.schema.field import ValidTextLine as TextLine
 
-def _massage_kwargs(self, kwargs):
-
-    assert self._iface.isOrExtends(IUnicodeContentFragment), self._iface
-    assert self._iface.implementedBy(self._impl), self._impl
-
-    # We're imported too early for ZCA to be configured and we can't automatically
-    # adapt.
-    if 'default' in kwargs and not self._iface.providedBy(kwargs['default']):
-        kwargs['default'] = self._impl(kwargs['default'])
-    if 'default' not in kwargs and 'defaultFactory' not in kwargs and not kwargs.get('min_length'):  # 0/None
-        kwargs['defaultFactory'] = self._impl
-    return kwargs
-
 class _FromUnicodeMixin(object):
 
+    # Set the interface to use as self.schema. This will be implemented by
+    # objects returned from ``fromUnicode``. However...
+    _iface = None
+    # If the adapter registered to produce _iface may instead produce
+    # a less restrictive interface (e.g., _iface is HTML, but the
+    # adapter can return plain text), set this to that broader
+    # interface; it then becomes self.schema in place of _iface.
+    _iface_upper_bound = None
+    # This is the class used to copy defaults.
+    _impl = lambda *args: None
 
     def __init__(self, *args, **kwargs):
-        super(_FromUnicodeMixin, self).__init__(self._iface,
-                                                *args,
-                                                **_massage_kwargs(self, kwargs))
+        super(_FromUnicodeMixin, self).__init__(
+            self._iface_upper_bound or self._iface, # Becomes self.schema.
+            *args,
+            **self.__massage_kwargs(kwargs))
+
+    def __massage_kwargs(self, kwargs):
+
+        assert self._iface.isOrExtends(IUnicodeContentFragment), self._iface
+        assert self._iface.implementedBy(self._impl), self._impl
+
+        # We're imported too early for ZCA to be configured and we can't automatically
+        # adapt.
+        if 'default' in kwargs and not self._iface.providedBy(kwargs['default']):
+            kwargs['default'] = self._impl(kwargs['default'])
+        if 'default' not in kwargs and 'defaultFactory' not in kwargs and not kwargs.get('min_length'):  # 0/None
+            kwargs['defaultFactory'] = self._impl
+        # Disable unicode normalization at this level; we need to handle it
+        # to properly deal with our content fragment subclasses.
+        assert 'unicode_normalization' not in kwargs
+        kwargs['unicode_normalization'] = None
+        return kwargs
 
     def fromUnicode(self, value):
         """
         We implement :class:`.IFromUnicode` by adapting the given object
         to our text schema.
+
+        This happens *after* unicode normalization.
         """
-        return super(_FromUnicodeMixin, self).fromUnicode(self.schema(value))
+        # unicodedata.normalize does not preserve the class of the
+        # object it's given (it goes back to text_type; always under PyPy, only if
+        # changes are needed under CPython). So we must handle normalization ourselves
+        # before converting to the schema.
+        value = unicodedata.normalize(self.__class__.unicode_normalization, value)
+        value = self.schema(value)
+        result = super(_FromUnicodeMixin, self).fromUnicode(value)
+        return result
 
 
 @implementer(ITextUnicodeContentFragmentField)
@@ -156,6 +181,8 @@ class SanitizedHTMLContentFragment(HTMLContentFragment):
     """
     A :class:`Text` type that also requires the object implement
     an interface descending from :class:`.ISanitizedHTMLContentFragment`.
+    Note that the default adapter for this can actually produce
+    ``IPlainTextContentFragment`` if there is no HTML present in the input.
 
     Pass the keyword arguments for :class:`zope.schema.Text` to the constructor; the ``schema``
     argument for :class:`~zope.schema.Object` is already handled.
@@ -168,7 +195,6 @@ class SanitizedHTMLContentFragment(HTMLContentFragment):
     _iface = ISanitizedHTMLContentFragment
     _impl = SanitizedHTMLContentFragmentType
 
-
 @implementer(IPlainTextField)
 class PlainText(TextUnicodeContentFragment):
     """

diff --git a/src/nti/contentfragments/tests/__init__.py b/src/nti/contentfragments/tests/__init__.py
@@ -7,6 +7,7 @@
 # pylint:disable=useless-object-inheritance
 
 from hamcrest import assert_that
+from hamcrest import is_
 
 from nti.testing.layers import ZopeComponentLayer
 from nti.testing.layers import ConfiguringLayerMixin
@@ -52,6 +53,48 @@ def _getTargetClass(self):
     def _getTargetInterface(self):
         raise NotImplementedError()
 
+    def _transform_normalized_for_comparison(self, val):
+        return val
+
+    def _transform_raw_for_fromUnicode(self, raw):
+        return raw
+
     def test_implements_interface(self):
         inst = self._makeOne()
         assert_that(inst, verifiably_provides(self._getTargetInterface()))
+
+    def test_fromUnicode_implements_schema(self):
+        inst = self._makeOne()
+        assert_that(
+            inst.fromUnicode(
+                self._transform_raw_for_fromUnicode(u'abc')),
+            verifiably_provides(inst.schema))
+
+    def test_fromUnicode_normalizes(self):
+        import unicodedata
+        inst = self._makeOne()
+        raw = b'A\xcc\x88O\xcc\x88U\xcc\x88'.decode('utf-8')
+        normalized = unicodedata.normalize('NFC', raw)
+        self.assertEqual(
+            [unicodedata.name(c) for c in raw],
+            [
+                'LATIN CAPITAL LETTER A',
+                'COMBINING DIAERESIS',
+                'LATIN CAPITAL LETTER O',
+                'COMBINING DIAERESIS',
+                'LATIN CAPITAL LETTER U',
+                'COMBINING DIAERESIS',
+            ]
+        )
+        assert_that(
+            [unicodedata.name(c) for c in normalized],
+            is_([
+                'LATIN CAPITAL LETTER A WITH DIAERESIS',
+                'LATIN CAPITAL LETTER O WITH DIAERESIS',
+                'LATIN CAPITAL LETTER U WITH DIAERESIS',
+            ])
+        )
+
+        fromUnicode = inst.fromUnicode(self._transform_raw_for_fromUnicode(raw))
+        self.assertEqual(fromUnicode, self._transform_normalized_for_comparison(normalized))
+        assert_that(fromUnicode, verifiably_provides(inst.schema))