diff --git a/CHANGELOG.md b/CHANGELOG.md index 18fcbdd5..6efe895a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +**v0.53.2** +* [[TeamMsgExtractor #452](https://github.com/TeamMsgExtractor/msg-extractor/issues/452)] Adjusted code to allow html encoding to be cached to try to speed up `bs4` operations. +* [[TeamMsgExtractor #453](https://github.com/TeamMsgExtractor/msg-extractor/issues/453)] Fixed handler for too large filetimes so that some filetimes being too large doesn't break the handler. +* Fixed a bug that would cause an error in task objects due to a lack of `enumerate`. +* Fix `TOCEntry` not initializing `DVTargetDevice` correctly. +* Add temporary properties for `ContentID` to `SignedAttachment`. AFAIK these can't ever be set, but this prevents errors in some places. + **v0.53.1** * Expanded allowable range for `red-black-tree-mod`. * Fix issue with `MessageBase.asEmailMessage()` that prevented embedded MSG files from being attached. diff --git a/README.rst b/README.rst index 2ff4b8a2..b7a1e368 100644 --- a/README.rst +++ b/README.rst @@ -260,8 +260,8 @@ your access to the newest major version of extract-msg. .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.1-blue.svg - :target: https://pypi.org/project/extract-msg/0.53.1/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.2-blue.svg + :target: https://pypi.org/project/extract-msg/0.53.2/ .. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg :target: https://www.python.org/downloads/release/python-3810/ diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 3367f63c..9f12e8ef 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see . __author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2025-02-05' -__version__ = '0.53.1' +__date__ = '2025-03-14' +__version__ = '0.53.2' __all__ = [ # Modules: diff --git a/extract_msg/attachments/signed_att.py b/extract_msg/attachments/signed_att.py index f7c530c6..cf116368 100644 --- a/extract_msg/attachments/signed_att.py +++ b/extract_msg/attachments/signed_att.py @@ -45,7 +45,7 @@ def __init__(self, msg, data: bytes, name: str, mimetype: str, node: email.messa self.__node = node self.__treePath = msg.treePath + [makeWeakRef(self)] - self.__data = None + self.__data = b'' # To add support for embedded MSG files, we are going to completely # ignore the mimetype and just do a few simple checks to see if we can # use the bytes as am embedded file. @@ -59,7 +59,7 @@ def __init__(self, msg, data: bytes, name: str, mimetype: str, node: email.messa except Exception: logger.exception('Signed message was an OLE file, but could not be read as an MSG file due to an exception.') - if self.__data is None: + if not self.__data: self.__data = data def _handleFnc(self, _zip, filename, customPath: pathlib.Path, kwargs) -> pathlib.Path: @@ -205,6 +205,12 @@ def saveEmbededMessage(self, **kwargs) -> constants.SAVE_TYPE: def asBytes(self) -> bytes: return self.__asBytes + @property + def contentID(self) -> None: + return None + + cid = contentID + @property def data(self) -> Union[bytes, MSGFile]: """ diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py index 62a39271..8fab3279 100644 --- a/extract_msg/msg_classes/message_base.py +++ b/extract_msg/msg_classes/message_base.py @@ -96,6 +96,7 @@ def __init__(self, path, **kwargs): except Exception as e: # Prevent an error in the body from preventing opening. logger.exception('Critical error accessing the body. File opened but accessing the body will throw an exception.') + self._htmlEncoding = None except: try: self.close() @@ -142,6 +143,16 @@ def _genRecipient(self, recipientStr: str, recipientType: RecipientType) -> Opti return value + def _getHtmlEncoding(self, soup: bs4.BeautifulSoup) -> None: + """ + Helper function to set the html encoding. + """ + if not self._htmlEncoding: + try: + self._htmlEncoding = cast(Optional[str], soup.original_encoding or soup.declared_html_encoding) + except AttributeError: + pass + def asEmailMessage(self) -> EmailMessage: """ Returns an instance of EmailMessage used to represent the contents of @@ -380,7 +391,8 @@ def getSaveHtmlBody(self, preparedHtml: bool = False, charset: str = 'utf-8', ** # If we are preparing the HTML, then we should if preparedHtml and charset: - bs = bs4.BeautifulSoup(data, features = 'html.parser') + bs = bs4.BeautifulSoup(data, features = 'html.parser', from_encoding = self._htmlEncoding) + self._getHtmlEncoding(bs) if not bs.find('meta', {'http-equiv': 'Content-Type'}): # Setup the attributes for the tag. tagAttrs = { @@ -405,7 +417,7 @@ def getSaveHtmlBody(self, preparedHtml: bool = False, charset: str = 'utf-8', ** return data else: - return self.htmlBody + return self.htmlBody or b'' def getSavePdfBody(self, wkPath = None, wkOptions = None, **kwargs) -> bytes: """ @@ -501,7 +513,7 @@ def injectHtmlHeader(self, prepared: bool = False) -> bytes: body = self.htmlBody # Validate the HTML. - if not validateHtml(body): + if not validateHtml(body, self._htmlEncoding): logger.warning('HTML body failed to validate. Code will attempt to correct it.') # If we are here, then we need to do what we can to fix the HTML @@ -511,7 +523,8 @@ def injectHtmlHeader(self, prepared: bool = False) -> bytes: # the and tag are missing, we determine where to put # the body tag (around everything if there is no tag, # otherwise at the end) and then wrap it all in the tag. - parser = bs4.BeautifulSoup(body, features = 'html.parser') + parser = bs4.BeautifulSoup(body, features = 'html.parser', from_encoding = self._htmlEncoding) + self._getHtmlEncoding(parser) if not parser.find('html') and not parser.find('body'): if parser.find('head') or parser.find('footer'): # Create the parser we will be using for the corrections. @@ -1186,7 +1199,8 @@ def htmlBodyPrepared(self) -> Optional[bytes]: return self.htmlBody # Create the BeautifulSoup instance to use. - soup = bs4.BeautifulSoup(self.htmlBody, 'html.parser') + soup = bs4.BeautifulSoup(self.htmlBody, 'html.parser', from_encoding = self._htmlEncoding) + self._getHtmlEncoding(soup) # Get a list of image tags to see if we can inject into. If the source # of an image starts with "cid:" that means it is one of the attachments diff --git a/extract_msg/msg_classes/task_request.py b/extract_msg/msg_classes/task_request.py index f445d5c8..435deda5 100644 --- a/extract_msg/msg_classes/task_request.py +++ b/extract_msg/msg_classes/task_request.py @@ -63,7 +63,7 @@ def taskObject(self) -> Optional[Task]: # The task object MUST be the first attachment, but we will be # lenient and allow it to be in any position. It not existing, # however, will not be tolerated. - task = next(((index, att) for index, att in self.attachments if isinstance(att.data, Task)), None) + task = next(((index, att) for index, att in enumerate(self.attachments) if isinstance(att.data, Task)), None) if task is None: if ErrorBehavior.STANDARDS_VIOLATION in self.errorBehavior: diff --git a/extract_msg/structures/toc_entry.py b/extract_msg/structures/toc_entry.py index ae1a80ca..319b8465 100644 --- a/extract_msg/structures/toc_entry.py +++ b/extract_msg/structures/toc_entry.py @@ -20,7 +20,7 @@ def __init__(self, reader: Optional[Union[bytes, BytesReader]] = None): self.__lindex = 0 self.__tymed = 0 self.__advf = 0 - self.__targetDevice = DVTargetDevice() + self.__targetDevice = DVTargetDevice(None) return if isinstance(reader, bytes): diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 6741737f..887d7eee 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -295,7 +295,14 @@ def filetimeToDatetime(rawTime: int) -> datetime.datetime: # Just make null dates from all of these time stamps. from .null_date import NullDate date = NullDate(1970, 1, 1, 1) - date += datetime.timedelta(seconds = filetimeToUtc(rawTime)) + try: + date += datetime.timedelta(seconds = filetimeToUtc(rawTime)) + except OverflowError: + # Time value is so large we physically can't represent it, so + # let's just modify the date to it's highest possible value and + # call it a day. + m = date.max + date = NullDate(m.year, m.month, m.day, m.hour, m.minute, m.second, m.microsecond) date.filetime = rawTime return date @@ -1241,14 +1248,14 @@ def unwrapMultipart(mp: Union[bytes, str, email.message.Message]) -> Dict: } -def validateHtml(html: bytes) -> bool: +def validateHtml(html: bytes, encoding: Optional[str]) -> bool: """ Checks whether the HTML is considered valid. To be valid, the HTML must, at minimum, contain an ```` tag, a ```` tag, and closing tags for each. """ - bs = bs4.BeautifulSoup(html, 'html.parser') + bs = bs4.BeautifulSoup(html, 'html.parser', from_encoding = encoding) if not bs.find('html') or not bs.find('body'): return False return True diff --git a/extract_msg_tests/prop_tests.py b/extract_msg_tests/prop_tests.py index e9af7406..035ab7a1 100644 --- a/extract_msg_tests/prop_tests.py +++ b/extract_msg_tests/prop_tests.py @@ -207,6 +207,16 @@ PropertyFlags.MANDATORY, NULL_DATE ), + ( + 'Null Time 4', + b'\x40\x00\x1C\x30\x06\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f', + b'\x40\x00\x1C\x30\x06\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f', + FixedLengthProp, + '301C0040', + 0x0040, + PropertyFlags.READABLE | PropertyFlags.WRITABLE, + NULL_DATE + ), # Variable Length Props. ( 'Object',