Skip to content

Commit

Permalink
support typographic apostrophes pyenchant#93
Browse files: browse the repository at this point in the history
  • Loading branch information
TimKam committed Nov 27, 2016
1 parent 3c44469 commit 68253a3
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 0 deletions.
3 changes: 3 additions & 0 deletions enchant/tokenize/en.py
Expand Up @@ -144,6 +144,9 @@ def _consume_alpha_u(self,text,offset):
def next(self):
text = self._text
offset = self._offset
# Ensure typographic apostrophes are treated exactly as typewriter apostrophes
if isinstance(text, unicode):
text = text.replace(u"\u2019", u"\u0027")
while offset < len(text):
# Find start of next word (must be alpha)
while offset < len(text):
Expand Down
8 changes: 8 additions & 0 deletions enchant/tokenize/tests.py
Expand Up @@ -313,3 +313,11 @@ def test_finnish_text(self):
for (itmO,itmV) in zip(outputT,tokenize_en(inputT)):
self.assertEqual(itmO,itmV)

def test_typographic_apostrophe_en(self):
    """Typographic apostrophes shouldn't be word separators in English.

    Runs a word containing U+2019 (the typographic apostrophe) through the
    English tokenizer wrapped around ``basic_tokenize`` and expects exactly
    one token, at offset 0, spelled with the typewriter apostrophe U+0027.
    """
    from enchant.tokenize import en
    tknzr = wrap_tokenizer(basic_tokenize, en.tokenize)
    # Named ``text`` rather than ``input`` so the builtin is not shadowed.
    text = u"They\u2019ve"
    expected = [(u"They\u0027ve", 0)]
    self.assertEqual(expected, list(tknzr(text)))

0 comments on commit 68253a3

Please sign in to comment.