🐛 Fix accessing the 'alphabets' property when the payload contains sur…

…rogate-like characters (#68)
Ousret · Jul 23, 2021 · 87a5a98 · 87a5a98
1 parent bdb91cd
commit 87a5a98
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 3 deletions.
diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
@@ -203,9 +203,11 @@ def alphabets(self) -> List[str]:
             return self._unicode_ranges
         detected_ranges = set()  # type: Set[str]
         for character in str(self):
-            detected_ranges.add(
-                unicode_range(character)
-            )
+            detected_range = unicode_range(character)  # type: Optional[str]
+            if detected_range:
+                detected_ranges.add(
+                    unicode_range(character)
+                )
         self._unicode_ranges = sorted(list(detected_ranges))
         return self._unicode_ranges
 

diff --git a/tests/test_on_byte.py b/tests/test_on_byte.py
@@ -58,6 +58,21 @@ def test_ensure_u8_fallback(self):
             msg="Fallback UTF-8 miss-detection. You clearly have tempered with it. Testing with {}".format(payload)
         )
 
+    def test_alphabets_property_undefined_range(self):  # regression check: reading .alphabets must work when a character has no reported Unicode range
+        payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3'  # UTF-8 BOM (EF BB BF) followed by one 4-byte UTF-8 sequence (U+1FA73)
+
+        r = from_bytes(payload)
+
+        self.assertEqual(
+            r.best().encoding,
+            "utf_8"
+        )
+
+        self.assertEqual(
+            r.best().alphabets,
+            []  # no range name is expected for this payload; presumably unicode_range() yields nothing for U+1FA73 - confirm against the range table
+        )
+
     def test_ensure_ascii(self):
 
         for payload in [