TeamMsgExtractor · TheElementalOfDestruction · Mar 14, 2025 · Mar 13, 2025 · Mar 13, 2025 · Mar 13, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,10 @@
+**v0.53.2**
+* [[TeamMsgExtractor #452](https://github.com/TeamMsgExtractor/msg-extractor/issues/452)] Adjusted code to allow html encoding to be cached to try to speed up `bs4` operations.
+* [[TeamMsgExtractor #453](https://github.com/TeamMsgExtractor/msg-extractor/issues/453)] Fixed the handler for overly large filetimes so that a filetime too large to represent no longer breaks the handler.
+* Fixed a bug that would cause an error in task objects due to a lack of `enumerate`.
+* Fix `TOCEntry` not initializing `DVTargetDevice` correctly.
+* Add temporary properties for `ContentID` to `SignedAttachment`. AFAIK these can't ever be set, but this prevents errors in some places.
+
 **v0.53.1**
 * Expanded allowable range for `red-black-tree-mod`.
 * Fix issue with `MessageBase.asEmailMessage()` that prevented embedded MSG files from being attached.

diff --git a/README.rst b/README.rst
@@ -260,8 +260,8 @@ your access to the newest major version of extract-msg.
 .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg
    :target: LICENSE.txt
 
-.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.1-blue.svg
-   :target: https://pypi.org/project/extract-msg/0.53.1/
+.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.2-blue.svg
+   :target: https://pypi.org/project/extract-msg/0.53.2/
 
 .. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg
    :target: https://www.python.org/downloads/release/python-3810/

diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py
@@ -27,8 +27,8 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 __author__ = 'Destiny Peterson & Matthew Walker'
-__date__ = '2025-02-05'
-__version__ = '0.53.1'
+__date__ = '2025-03-14'
+__version__ = '0.53.2'
 
 __all__ = [
     # Modules:

diff --git a/extract_msg/attachments/signed_att.py b/extract_msg/attachments/signed_att.py
@@ -45,7 +45,7 @@ def __init__(self, msg, data: bytes, name: str, mimetype: str, node: email.messa
         self.__node = node
         self.__treePath = msg.treePath + [makeWeakRef(self)]
 
-        self.__data = None
+        self.__data = b''
         # To add support for embedded MSG files, we are going to completely
         # ignore the mimetype and just do a few simple checks to see if we can
         # use the bytes as am embedded file.
@@ -59,7 +59,7 @@ def __init__(self, msg, data: bytes, name: str, mimetype: str, node: email.messa
             except Exception:
                 logger.exception('Signed message was an OLE file, but could not be read as an MSG file due to an exception.')
 
-        if self.__data is None:
+        if not self.__data:
             self.__data = data
 
     def _handleFnc(self, _zip, filename, customPath: pathlib.Path, kwargs) -> pathlib.Path:
@@ -205,6 +205,12 @@ def saveEmbededMessage(self, **kwargs) -> constants.SAVE_TYPE:
     def asBytes(self) -> bytes:
         return self.__asBytes
 
+    @property
+    def contentID(self) -> None:
+        return None
+
+    cid = contentID
+
     @property
     def data(self) -> Union[bytes, MSGFile]:
         """

diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py
@@ -96,6 +96,7 @@ def __init__(self, path, **kwargs):
             except Exception as e:
                 # Prevent an error in the body from preventing opening.
                 logger.exception('Critical error accessing the body. File opened but accessing the body will throw an exception.')
+            self._htmlEncoding = None
         except:
             try:
                 self.close()
@@ -142,6 +143,16 @@ def _genRecipient(self, recipientStr: str, recipientType: RecipientType) -> Opti
 
         return value
 
+    def _getHtmlEncoding(self, soup: bs4.BeautifulSoup) -> None:
+        """
+        Helper function to set the html encoding.
+        """
+        if not self._htmlEncoding:
+            try:
+                self._htmlEncoding = cast(Optional[str], soup.original_encoding or soup.declared_html_encoding)
+            except AttributeError:
+                pass
+
     def asEmailMessage(self) -> EmailMessage:
         """
         Returns an instance of EmailMessage used to represent the contents of
@@ -380,7 +391,8 @@ def getSaveHtmlBody(self, preparedHtml: bool = False, charset: str = 'utf-8', **
 
             # If we are preparing the HTML, then we should
             if preparedHtml and charset:
-                bs = bs4.BeautifulSoup(data, features = 'html.parser')
+                bs = bs4.BeautifulSoup(data, features = 'html.parser', from_encoding = self._htmlEncoding)
+                self._getHtmlEncoding(bs)
                 if not bs.find('meta', {'http-equiv': 'Content-Type'}):
                     # Setup the attributes for the tag.
                     tagAttrs = {
@@ -405,7 +417,7 @@ def getSaveHtmlBody(self, preparedHtml: bool = False, charset: str = 'utf-8', **
 
             return data
         else:
-            return self.htmlBody
+            return self.htmlBody or b''
 
     def getSavePdfBody(self, wkPath = None, wkOptions = None, **kwargs) -> bytes:
         """
@@ -501,7 +513,7 @@ def injectHtmlHeader(self, prepared: bool = False) -> bytes:
             body = self.htmlBody
 
         # Validate the HTML.
-        if not validateHtml(body):
+        if not validateHtml(body, self._htmlEncoding):
             logger.warning('HTML body failed to validate. Code will attempt to correct it.')
 
             # If we are here, then we need to do what we can to fix the HTML
@@ -511,7 +523,8 @@ def injectHtmlHeader(self, prepared: bool = False) -> bytes:
             # the <html> and <body> tag are missing, we determine where to put
             # the body tag (around everything if there is no <head> tag,
             # otherwise at the end) and then wrap it all in the <html> tag.
-            parser = bs4.BeautifulSoup(body, features = 'html.parser')
+            parser = bs4.BeautifulSoup(body, features = 'html.parser', from_encoding = self._htmlEncoding)
+            self._getHtmlEncoding(parser)
             if not parser.find('html') and not parser.find('body'):
                 if parser.find('head') or parser.find('footer'):
                     # Create the parser we will be using for the corrections.
@@ -1186,7 +1199,8 @@ def htmlBodyPrepared(self) -> Optional[bytes]:
             return self.htmlBody
 
         # Create the BeautifulSoup instance to use.
-        soup = bs4.BeautifulSoup(self.htmlBody, 'html.parser')
+        soup = bs4.BeautifulSoup(self.htmlBody, 'html.parser', from_encoding = self._htmlEncoding)
+        self._getHtmlEncoding(soup)
 
         # Get a list of image tags to see if we can inject into. If the source
         # of an image starts with "cid:" that means it is one of the attachments

diff --git a/extract_msg/msg_classes/task_request.py b/extract_msg/msg_classes/task_request.py
@@ -63,7 +63,7 @@ def taskObject(self) -> Optional[Task]:
         # The task object MUST be the first attachment, but we will be
         # lenient and allow it to be in any position. It not existing,
         # however, will not be tolerated.
-        task = next(((index, att) for index, att in self.attachments if isinstance(att.data, Task)), None)
+        task = next(((index, att) for index, att in enumerate(self.attachments) if isinstance(att.data, Task)), None)
 
         if task is None:
             if ErrorBehavior.STANDARDS_VIOLATION in self.errorBehavior:

diff --git a/extract_msg/structures/toc_entry.py b/extract_msg/structures/toc_entry.py
@@ -20,7 +20,7 @@ def __init__(self, reader: Optional[Union[bytes, BytesReader]] = None):
             self.__lindex = 0
             self.__tymed = 0
             self.__advf = 0
-            self.__targetDevice = DVTargetDevice()
+            self.__targetDevice = DVTargetDevice(None)
             return
 
         if isinstance(reader, bytes):

diff --git a/extract_msg/utils.py b/extract_msg/utils.py
@@ -295,7 +295,14 @@ def filetimeToDatetime(rawTime: int) -> datetime.datetime:
             # Just make null dates from all of these time stamps.
             from .null_date import NullDate
             date = NullDate(1970, 1, 1, 1)
-            date += datetime.timedelta(seconds = filetimeToUtc(rawTime))
+            try:
+                date += datetime.timedelta(seconds = filetimeToUtc(rawTime))
+            except OverflowError:
+                # Time value is so large we physically can't represent it, so
+                # let's just modify the date to its highest possible value and
+                # call it a day.
+                m = date.max
+                date = NullDate(m.year, m.month, m.day, m.hour, m.minute, m.second, m.microsecond)
             date.filetime = rawTime
 
             return date
@@ -1241,14 +1248,14 @@ def unwrapMultipart(mp: Union[bytes, str, email.message.Message]) -> Dict:
     }
 
 
-def validateHtml(html: bytes) -> bool:
+def validateHtml(html: bytes, encoding: Optional[str]) -> bool:
     """
     Checks whether the HTML is considered valid.
 
     To be valid, the HTML must, at minimum, contain an ``<html>`` tag, a
     ``<body>`` tag, and closing tags for each.
     """
-    bs = bs4.BeautifulSoup(html, 'html.parser')
+    bs = bs4.BeautifulSoup(html, 'html.parser', from_encoding = encoding)
     if not bs.find('html') or not bs.find('body'):
         return False
     return True

diff --git a/extract_msg_tests/prop_tests.py b/extract_msg_tests/prop_tests.py
@@ -207,6 +207,16 @@
         PropertyFlags.MANDATORY,
         NULL_DATE
     ),
+    (
+        'Null Time 4',
+        b'\x40\x00\x1C\x30\x06\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f',
+        b'\x40\x00\x1C\x30\x06\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f',
+        FixedLengthProp,
+        '301C0040',
+        0x0040,
+        PropertyFlags.READABLE | PropertyFlags.WRITABLE,
+        NULL_DATE
+    ),
     # Variable Length Props.
     (
         'Object',