Skip to content

Commit

Permalink
🐛 Fix accessing the 'alphabets' property when the payload contains sur…
Browse files Browse the repository at this point in the history
…rogate like characters (#68)
  • Loading branch information
Ousret committed Jul 23, 2021
1 parent bdb91cd commit 87a5a98
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 3 deletions.
8 changes: 5 additions & 3 deletions charset_normalizer/models.py
Expand Up @@ -203,9 +203,11 @@ def alphabets(self) -> List[str]:
return self._unicode_ranges
detected_ranges = set() # type: Set[str]
for character in str(self):
detected_ranges.add(
unicode_range(character)
)
detected_range = unicode_range(character) # type: Optional[str]
if detected_range:
detected_ranges.add(
unicode_range(character)
)
self._unicode_ranges = sorted(list(detected_ranges))
return self._unicode_ranges

Expand Down
15 changes: 15 additions & 0 deletions tests/test_on_byte.py
Expand Up @@ -58,6 +58,21 @@ def test_ensure_u8_fallback(self):
msg="Fallback UTF-8 miss-detection. You clearly have tempered with it. Testing with {}".format(payload)
)

def test_alphabets_property_undefined_range(self):
    """Check that ``alphabets`` is usable when no Unicode range is detected.

    The payload is a UTF-8 BOM (EF BB BF) followed by the four-byte
    sequence F0 9F A9 B3, which encodes U+1FA73. The best match must be
    reported as ``utf_8`` and its ``alphabets`` property must be an
    empty list.
    """
    payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3'

    matches = from_bytes(payload)

    # The encoding is still expected to be detected for this payload.
    self.assertEqual(matches.best().encoding, "utf_8")

    # Presumably no range name is known for this character, so nothing
    # should be collected -- the expected result is an empty list.
    self.assertEqual(matches.best().alphabets, [])

def test_ensure_ascii(self):

for payload in [
Expand Down

0 comments on commit 87a5a98

Please sign in to comment.