Merge pull request #23 from Ousret/develop

Release 2.0.5
jawah · Apr 19, 2020 · 51c3168 · 51c3168
2 parents a966162 + 9959d43
commit 51c3168
Show file tree

Hide file tree

Showing 7 changed files with 97 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@ charset = headers['Content-Type'].split(';')[-1].split('=')[-1].replace('"', '')
 * A backwards-compatible syntax using bracket style.
 * Capability to alter headers using simple, human-readable operator notation `+` and `-`.
 * Flexibility if headers are from IMAP4 or HTTP, use as you need with one library.
-* Ability to parse any object and extract recognized headers from it.
+* Ability to parse any object and extract recognized headers from it; it also supports UTF-8 encoded headers.
 * Fully type-annotated.
 * Provide great auto-completion in Python interpreter or any capable IDE.
 * Absolutely no dependencies.
@@ -60,7 +60,7 @@ Plus all the features that you would expect from handling headers...
 * Properties syntax for headers and attribute in header.
 * Supports headers and attributes OneToOne, OneToMany and ManySquashedIntoOne.
 * Capable of parsing `bytes`, `fp`, `str`, `dict`, `email.Message`, `requests.Response` and `httpx._models.Response`.
-* Automatically unquote value of an attribute when retrieving it.
+* Automatically unquote and unfold value of an attribute when retrieving it.
 * Case insensitive with header name and attribute key.
 * Character `-` equals `_`, in addition to the above feature.
 * Any syntax you like, we like.

diff --git a/kiss_headers/api.py b/kiss_headers/api.py
@@ -1,13 +1,14 @@
 from email.message import Message
-from email.parser import BytesHeaderParser, HeaderParser
-from io import BytesIO, IOBase
+from email.parser import HeaderParser
+from io import RawIOBase
 from typing import Any, Iterable, List, Mapping, Optional, Tuple
 
 from kiss_headers.models import Header, Headers
 from kiss_headers.structures import CaseInsensitiveDict
 from kiss_headers.utils import (
     decode_partials,
     extract_class_name,
+    extract_encoded_headers,
     header_content_split,
     header_name_to_class,
     is_legal_header_name,
@@ -26,15 +27,11 @@ def parse_it(raw_headers: Any) -> Headers:
 
     if isinstance(raw_headers, str):
         headers = HeaderParser().parsestr(raw_headers, headersonly=True).items()
-    elif isinstance(raw_headers, bytes) or isinstance(raw_headers, IOBase):
-        headers = (
-            BytesHeaderParser()
-            .parse(
-                BytesIO(raw_headers) if isinstance(raw_headers, bytes) else raw_headers,  # type: ignore
-                headersonly=True,
-            )
-            .items()
+    elif isinstance(raw_headers, bytes) or isinstance(raw_headers, RawIOBase):
+        decoded, not_decoded = extract_encoded_headers(
+            raw_headers if isinstance(raw_headers, bytes) else raw_headers.read() or b""
         )
+        return parse_it(decoded)
     elif isinstance(raw_headers, Mapping) or isinstance(raw_headers, Message):
         headers = raw_headers.items()
     else:

diff --git a/kiss_headers/models.py b/kiss_headers/models.py
@@ -15,12 +15,14 @@
 
 from kiss_headers.structures import CaseInsensitiveDict
 from kiss_headers.utils import (
+    extract_comments,
     header_content_split,
     header_name_to_class,
     header_strip,
     is_legal_header_name,
     normalize_str,
     prettify_header_name,
+    unfold,
     unpack_protected_keyword,
     unquote,
 )
@@ -66,10 +68,9 @@ def __init__(self, name: str, content: str):
         ]
 
         self._not_valued_attrs: List[str] = list()
-        self._valued_attrs: MutableMapping[str, Union[str, List[str]]] = dict()
-        self._valued_attrs_normalized: MutableMapping[
+        self._valued_attrs: MutableMapping[
             str, Union[str, List[str]]
-        ] = dict()
+        ] = CaseInsensitiveDict()
 
         for member in self._members:
             if member == "":
@@ -91,9 +92,6 @@ def __init__(self, name: str, content: str):
                     else:
                         self._valued_attrs[key].append(value)  # type: ignore
 
-                self._valued_attrs_normalized[normalize_str(key)] = self._valued_attrs[
-                    key
-                ]
                 continue
 
             self._not_valued_attrs.append(unquote(member))
@@ -133,6 +131,11 @@ def content(self) -> str:
 
         return self._content
 
+    @property
+    def comments(self) -> List[str]:
+        """Retrieve comments in header content."""
+        return extract_comments(self.content)
+
     def __lt__(self, other: object) -> bool:
         """
         This method is only implemented to make sorted work with Header.
@@ -245,7 +248,7 @@ def __isub__(self, other: str) -> "Header":
 
         other = normalize_str(other)
 
-        if other in self._valued_attrs_normalized:
+        if other in self._valued_attrs:
             del self[other]
 
         if other in self._not_valued_attrs:
@@ -286,7 +289,6 @@ def __setattr__(self, key: str, value: str) -> None:
             "_content",
             "_members",
             "_not_valued_attrs",
-            "_valued_attrs_normalized",
             "_valued_attrs",
         }:
             return super().__setattr__(key, value)
@@ -300,15 +302,13 @@ def __setitem__(self, key: str, value: str) -> None:
         Set an attribute bracket syntax like. This will erase previously set attribute named after the key.
         Any value that are not a str are casted to str.
         """
-        key_normalized = normalize_str(key)
 
         if key in self:
             del self[key]
         if not isinstance(value, str):
             value = str(value)
 
         self._valued_attrs[key] = value
-        self._valued_attrs_normalized[key_normalized] = self._valued_attrs[key]
 
         self._content += '{semi_colon_r}{key}="{value}"'.format(
             key=key,
@@ -326,22 +326,14 @@ def __delitem__(self, key: str) -> None:
         >>> str(headers.content_type)
         'text/html'
         """
-        key_normalized = normalize_str(key)
-
-        if key_normalized not in self._valued_attrs_normalized:
+        if key not in self._valued_attrs:
             raise KeyError(
                 "'{item}' attribute is not defined within '{header}' header.".format(
                     item=key, header=self.name
                 )
             )
 
-        del self._valued_attrs_normalized[key]
-        not_normalized_keys = self._valued_attrs.keys()
-
-        for key_ in not_normalized_keys:
-            if normalize_str(key_) == key_normalized:
-                del self._valued_attrs[key_]
-                break
+        del self._valued_attrs[key]
 
         for elem in findall(
             r"{key_name}=.*?(?=[;\n])".format(key_name=escape(key)),
@@ -362,7 +354,7 @@ def __delattr__(self, item: str) -> None:
         """
         item = normalize_str(item)
 
-        if item not in self._valued_attrs_normalized:
+        if item not in self._valued_attrs:
             raise AttributeError(
                 "'{item}' attribute is not defined within '{header}' header.".format(
                     item=item, header=self.name
@@ -430,7 +422,7 @@ def __dir__(self) -> Iterable[str]:
         Provide a better auto-completion when using python interpreter. We are feeding __dir__ so Python can be aware
         of what properties are callable. In other word, more precise auto-completion when not using IDE.
         """
-        return list(super().__dir__()) + list(self._valued_attrs_normalized.keys())
+        return list(super().__dir__()) + list(self._valued_attrs.keys())
 
     @property
     def attrs(self) -> List[str]:
@@ -450,6 +442,15 @@ def has(self, attr: str) -> bool:
     def get(self, attr: str) -> Optional[Union[str, List[str]]]:
         """
         Retrieve associated value of an attribute.
+        >>> header = Header("Content-Type", "application/json; charset=UTF-8; format=flowed")
+        >>> header.charset
+        'UTF-8'
+        >>> header.ChArSeT
+        'UTF-8'
+        >>> header.FORMAT
+        'flowed'
+        >>> header.format
+        'flowed'
         """
         if attr not in self._valued_attrs:
             return None
@@ -471,16 +472,17 @@ def has_many(self, name: str) -> bool:
 
         return isinstance(r, list) and len(r) > 1
 
-    def __getitem__(self, item: Union[str]) -> Union[str, List[str]]:
+    def __getitem__(self, item: Union[str, int]) -> Union[str, List[str]]:
         """
-        This method will allow you to retrieve attribute value using the bracket syntax, list-like.
+        This method will allow you to retrieve attribute value using the bracket syntax, list-like or dict-like.
         """
-        normalized_item = normalize_str(item)
+        if isinstance(item, int):
+            return (
+                self._members[item] if not OUTPUT_LOCK_TYPE else [self._members[item]]
+            )
 
         if item in self._valued_attrs:
             value = self._valued_attrs[item]
-        elif normalized_item in self._valued_attrs_normalized:
-            value = self._valued_attrs_normalized[normalized_item]
         else:
             raise KeyError(
                 "'{item}' attribute is not defined within '{header}' header.".format(
@@ -492,9 +494,9 @@ def __getitem__(self, item: Union[str]) -> Union[str, List[str]]:
             value = [value]
 
         return (
-            unquote(value)
+            unfold(unquote(value))
             if not isinstance(value, list)
-            else [unquote(v) for v in value]
+            else [unfold(unquote(v)) for v in value]
         )
 
     def __getattr__(self, item: str) -> Union[str, List[str]]:
@@ -504,10 +506,7 @@ def __getattr__(self, item: str) -> Union[str, List[str]]:
         """
         item = unpack_protected_keyword(item)
 
-        if (
-            item not in self._valued_attrs
-            and normalize_str(item) not in self._valued_attrs_normalized
-        ):
+        if item not in self._valued_attrs:
             raise AttributeError(
                 "'{item}' attribute is not defined within '{header}' header.".format(
                     item=item, header=self.name
@@ -525,7 +524,7 @@ def __contains__(self, item: str) -> bool:
         item = normalize_str(item)
         for attr in self.attrs:
             target = normalize_str(attr)
-            if item == target or item in target.split(" "):
+            if item == target or item in header_content_split(target, " "):
                 return True
         return False
 

diff --git a/kiss_headers/structures.py b/kiss_headers/structures.py
@@ -2,6 +2,8 @@
 from collections.abc import Mapping, MutableMapping
 from typing import Any, Iterator, Optional, Tuple
 
+from kiss_headers.utils import normalize_str
+
 
 """
 Disclaimer : CaseInsensitiveDict has been borrowed from `psf/requests`.
@@ -45,13 +47,13 @@ def __init__(self, data: Optional[Mapping] = None, **kwargs: Any):
     def __setitem__(self, key: str, value: Any) -> None:
         # Use the lowercased key for lookups, but store the actual
         # key alongside the value.
-        self._store[key.lower().replace("-", "_")] = (key, value)
+        self._store[normalize_str(key)] = (key, value)
 
     def __getitem__(self, key: str) -> Any:
-        return self._store[key.lower().replace("-", "_")][1]
+        return self._store[normalize_str(key)][1]
 
     def __delitem__(self, key: str) -> None:
-        del self._store[key.lower().replace("-", "_")]
+        del self._store[normalize_str(key)]
 
     def __iter__(self) -> Iterator[Tuple[str, Any]]:
         return (casedkey for casedkey, mappedvalue in self._store.values())

diff --git a/kiss_headers/utils.py b/kiss_headers/utils.py
@@ -85,8 +85,8 @@ def header_content_split(string: str, delimiter: str) -> List[str]:
     >>> header_content_split("text/html; charset=UTF-8", ";")
     ['text/html', 'charset=UTF-8']
     """
-    if len(delimiter) != 1 or delimiter not in {";", ","}:
-        raise ValueError("Delimiter should be either semi-colon or a coma.")
+    if len(delimiter) != 1 or delimiter not in {";", ",", " "}:
+        raise ValueError("Delimiter should be either semi-colon, a coma or a space.")
 
     in_double_quote: bool = False
     in_parenthesis: bool = False
@@ -372,3 +372,44 @@ def is_legal_header_name(name: str) -> bool:
         name != ""
         and search(r"[^\x00-\x7F]|[:;(),<>=@?\[\]\r\n\t &{}\\]", name) is None
     )
+
+
+def extract_comments(content: str) -> List[str]:
+    """
+    Extract parts of content that are considered as comments. Between parenthesis.
+    >>> extract_comments("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0 (hello) llll (abc)")
+    ['Macintosh; Intel Mac OS X 10.9; rv:50.0', 'hello', 'abc']
+    """
+    return findall(r"\(([^)]+)\)", content)
+
+
+def unfold(content: str) -> str:
+    """Some header content may have folded content (LF + 9 spaces or LF + 7 spaces) in it, making your job at reading them a little more difficult.
+    This function undoes the folding in the given content.
+    >>> unfold("eqHS2AQD+hfNNlTiLej73CiBUGVQifX4watAaxUkdjGeH578i7n3Wwcdw2nLz+U0bH\\n         ehSe/2QytZGWM5CewwNdumT1IVGzjFs+cRgfK0V6JlEIOoV3bRXxnjenWFfWdVNXtw8s")
+    'eqHS2AQD+hfNNlTiLej73CiBUGVQifX4watAaxUkdjGeH578i7n3Wwcdw2nLz+U0bHehSe/2QytZGWM5CewwNdumT1IVGzjFs+cRgfK0V6JlEIOoV3bRXxnjenWFfWdVNXtw8s'
+    """
+    return content.replace("\n" + (9 * " "), "").replace("\n" + (7 * " "), " ")
+
+
+def extract_encoded_headers(payload: bytes) -> Tuple[str, bytes]:
+    """This function purpose is to extract lines that can be decoded using utf-8.
+    >>> extract_encoded_headers("Host: developer.mozilla.org\\r\\nX-Hello-World: 死の漢字\\r\\n\\r\\n".encode("utf-8"))
+    ('Host: developer.mozilla.org\\r\\nX-Hello-World: 死の漢字\\r\\n', b'')
+    >>> extract_encoded_headers("Host: developer.mozilla.org\\r\\nX-Hello-World: 死の漢字\\r\\n\\r\\nThat IS totally random.".encode("utf-8"))
+    ('Host: developer.mozilla.org\\r\\nX-Hello-World: 死の漢字\\r\\n', b'\\r\\nThat IS totally random.')
+    """
+    result: str = ""
+    lines: List[bytes] = payload.splitlines()
+    index: int = 0
+
+    for line, index in zip(lines, range(0, len(lines))):
+        if line == b"":
+            return result, b"\r\n".join(lines[index:])
+
+        try:
+            result += line.decode("utf-8") + "\r\n"
+        except UnicodeDecodeError:
+            break
+
+    return result, b"\r\n".join(lines[index:])
diff --git a/kiss_headers/version.py b/kiss_headers/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.0.4"
+__version__ = "2.0.5"
 VERSION = __version__.split(".")
diff --git a/tests/test_headers_from_string.py b/tests/test_headers_from_string.py
@@ -54,6 +54,12 @@ def test_decode_partials(self):
             decode_partials([("Subject", "=?iso-8859-1?q?p=F6stal?=")]),
         )
 
+    def test_bytes_headers(self):
+
+        self.assertEqual(
+            MyKissHeadersFromStringTest.headers, parse_it(RAW_HEADERS.encode("utf-8"))
+        )
+
     def test_two_headers_eq(self):
 
         self.assertEqual(MyKissHeadersFromStringTest.headers, parse_it(RAW_HEADERS))