Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
**v0.53.2**
* [[TeamMsgExtractor #452](https://github.com/TeamMsgExtractor/msg-extractor/issues/452)] Adjusted code to allow html encoding to be cached to try to speed up `bs4` operations.
* [[TeamMsgExtractor #453](https://github.com/TeamMsgExtractor/msg-extractor/issues/453)] Fixed the handler for filetimes that are too large so that a value too large to represent no longer breaks the handler.
* Fixed a bug that would cause an error in task objects due to a lack of `enumerate`.
* Fixed `TOCEntry` not initializing `DVTargetDevice` correctly.
* Added temporary properties for `ContentID` to `SignedAttachment`. As far as I know, these can never be set, but having them prevents errors in some places.

**v0.53.1**
* Expanded allowable range for `red-black-tree-mod`.
* Fix issue with `MessageBase.asEmailMessage()` that prevented embedded MSG files from being attached.
Expand Down
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,8 @@ your access to the newest major version of extract-msg.
.. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg
:target: LICENSE.txt

.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.1-blue.svg
:target: https://pypi.org/project/extract-msg/0.53.1/
.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.2-blue.svg
:target: https://pypi.org/project/extract-msg/0.53.2/

.. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg
:target: https://www.python.org/downloads/release/python-3810/
Expand Down
4 changes: 2 additions & 2 deletions extract_msg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__author__ = 'Destiny Peterson & Matthew Walker'
__date__ = '2025-02-05'
__version__ = '0.53.1'
__date__ = '2025-03-14'
__version__ = '0.53.2'

__all__ = [
# Modules:
Expand Down
10 changes: 8 additions & 2 deletions extract_msg/attachments/signed_att.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, msg, data: bytes, name: str, mimetype: str, node: email.messa
self.__node = node
self.__treePath = msg.treePath + [makeWeakRef(self)]

self.__data = None
self.__data = b''
# To add support for embedded MSG files, we are going to completely
# ignore the mimetype and just do a few simple checks to see if we can
# use the bytes as an embedded file.
Expand All @@ -59,7 +59,7 @@ def __init__(self, msg, data: bytes, name: str, mimetype: str, node: email.messa
except Exception:
logger.exception('Signed message was an OLE file, but could not be read as an MSG file due to an exception.')

if self.__data is None:
if not self.__data:
self.__data = data

def _handleFnc(self, _zip, filename, customPath: pathlib.Path, kwargs) -> pathlib.Path:
Expand Down Expand Up @@ -205,6 +205,12 @@ def saveEmbededMessage(self, **kwargs) -> constants.SAVE_TYPE:
def asBytes(self) -> bytes:
return self.__asBytes

@property
def contentID(self) -> None:
    """
    Placeholder content ID for a signed attachment.

    Always evaluates to None; the property exists so that code reading the
    attribute does not raise an error.
    """
    return None

# Short alias that refers to the very same property object.
cid = contentID

@property
def data(self) -> Union[bytes, MSGFile]:
"""
Expand Down
24 changes: 19 additions & 5 deletions extract_msg/msg_classes/message_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def __init__(self, path, **kwargs):
except Exception as e:
# Prevent an error in the body from preventing opening.
logger.exception('Critical error accessing the body. File opened but accessing the body will throw an exception.')
self._htmlEncoding = None
except:
try:
self.close()
Expand Down Expand Up @@ -142,6 +143,16 @@ def _genRecipient(self, recipientStr: str, recipientType: RecipientType) -> Opti

return value

def _getHtmlEncoding(self, soup: bs4.BeautifulSoup) -> None:
    """
    Stores the encoding reported by a parsed document on this instance.

    If an encoding has already been stored, nothing is changed. Otherwise
    the value is taken from ``soup.original_encoding``, falling back to
    ``soup.declared_html_encoding`` when that is empty.

    :param soup: The parsed document to read the encoding from.
    """
    # A previously stored encoding always wins.
    if self._htmlEncoding:
        return

    try:
        detected = soup.original_encoding or soup.declared_html_encoding
        self._htmlEncoding = cast(Optional[str], detected)
    except AttributeError:
        # Best effort only: keep whatever value was there before.
        pass

def asEmailMessage(self) -> EmailMessage:
"""
Returns an instance of EmailMessage used to represent the contents of
Expand Down Expand Up @@ -380,7 +391,8 @@ def getSaveHtmlBody(self, preparedHtml: bool = False, charset: str = 'utf-8', **

# If we are preparing the HTML, then we should make sure that it has a
# Content-Type meta tag.
if preparedHtml and charset:
bs = bs4.BeautifulSoup(data, features = 'html.parser')
bs = bs4.BeautifulSoup(data, features = 'html.parser', from_encoding = self._htmlEncoding)
self._getHtmlEncoding(bs)
if not bs.find('meta', {'http-equiv': 'Content-Type'}):
# Setup the attributes for the tag.
tagAttrs = {
Expand All @@ -405,7 +417,7 @@ def getSaveHtmlBody(self, preparedHtml: bool = False, charset: str = 'utf-8', **

return data
else:
return self.htmlBody
return self.htmlBody or b''

def getSavePdfBody(self, wkPath = None, wkOptions = None, **kwargs) -> bytes:
"""
Expand Down Expand Up @@ -501,7 +513,7 @@ def injectHtmlHeader(self, prepared: bool = False) -> bytes:
body = self.htmlBody

# Validate the HTML.
if not validateHtml(body):
if not validateHtml(body, self._htmlEncoding):
logger.warning('HTML body failed to validate. Code will attempt to correct it.')

# If we are here, then we need to do what we can to fix the HTML
Expand All @@ -511,7 +523,8 @@ def injectHtmlHeader(self, prepared: bool = False) -> bytes:
# the <html> and <body> tag are missing, we determine where to put
# the body tag (around everything if there is no <head> tag,
# otherwise at the end) and then wrap it all in the <html> tag.
parser = bs4.BeautifulSoup(body, features = 'html.parser')
parser = bs4.BeautifulSoup(body, features = 'html.parser', from_encoding = self._htmlEncoding)
self._getHtmlEncoding(parser)
if not parser.find('html') and not parser.find('body'):
if parser.find('head') or parser.find('footer'):
# Create the parser we will be using for the corrections.
Expand Down Expand Up @@ -1186,7 +1199,8 @@ def htmlBodyPrepared(self) -> Optional[bytes]:
return self.htmlBody

# Create the BeautifulSoup instance to use.
soup = bs4.BeautifulSoup(self.htmlBody, 'html.parser')
soup = bs4.BeautifulSoup(self.htmlBody, 'html.parser', from_encoding = self._htmlEncoding)
self._getHtmlEncoding(soup)

# Get a list of image tags to see if we can inject into. If the source
# of an image starts with "cid:" that means it is one of the attachments
Expand Down
2 changes: 1 addition & 1 deletion extract_msg/msg_classes/task_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def taskObject(self) -> Optional[Task]:
# The task object MUST be the first attachment, but we will be
# lenient and allow it to be in any position. It not existing,
# however, will not be tolerated.
task = next(((index, att) for index, att in self.attachments if isinstance(att.data, Task)), None)
task = next(((index, att) for index, att in enumerate(self.attachments) if isinstance(att.data, Task)), None)

if task is None:
if ErrorBehavior.STANDARDS_VIOLATION in self.errorBehavior:
Expand Down
2 changes: 1 addition & 1 deletion extract_msg/structures/toc_entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def __init__(self, reader: Optional[Union[bytes, BytesReader]] = None):
self.__lindex = 0
self.__tymed = 0
self.__advf = 0
self.__targetDevice = DVTargetDevice()
self.__targetDevice = DVTargetDevice(None)
return

if isinstance(reader, bytes):
Expand Down
13 changes: 10 additions & 3 deletions extract_msg/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,14 @@ def filetimeToDatetime(rawTime: int) -> datetime.datetime:
# Just make null dates from all of these time stamps.
from .null_date import NullDate
date = NullDate(1970, 1, 1, 1)
date += datetime.timedelta(seconds = filetimeToUtc(rawTime))
try:
date += datetime.timedelta(seconds = filetimeToUtc(rawTime))
except OverflowError:
# Time value is so large we physically can't represent it, so
# let's just modify the date to its highest possible value and
# call it a day.
m = date.max
date = NullDate(m.year, m.month, m.day, m.hour, m.minute, m.second, m.microsecond)
date.filetime = rawTime

return date
Expand Down Expand Up @@ -1241,14 +1248,14 @@ def unwrapMultipart(mp: Union[bytes, str, email.message.Message]) -> Dict:
}


def validateHtml(html: bytes, encoding: Optional[str] = None) -> bool:
    """
    Checks whether the HTML is considered valid.

    To be considered valid, the HTML must, at minimum, contain an ``<html>``
    tag and a ``<body>`` tag that the parser is able to find. Only the
    presence of those two tags is checked here.

    :param html: The HTML data to check.
    :param encoding: The encoding to hand to ``bs4`` as ``from_encoding``.
        Defaults to None so that callers written against the previous
        single-argument signature keep working.

    :returns: True if both tags were found, False otherwise.
    """
    bs = bs4.BeautifulSoup(html, 'html.parser', from_encoding = encoding)
    # Same truthiness test as before, expressed as a single boolean.
    return bool(bs.find('html')) and bool(bs.find('body'))
Expand Down
10 changes: 10 additions & 0 deletions extract_msg_tests/prop_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,16 @@
PropertyFlags.MANDATORY,
NULL_DATE
),
(
'Null Time 4',
b'\x40\x00\x1C\x30\x06\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f',
b'\x40\x00\x1C\x30\x06\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f',
FixedLengthProp,
'301C0040',
0x0040,
PropertyFlags.READABLE | PropertyFlags.WRITABLE,
NULL_DATE
),
# Variable Length Props.
(
'Object',
Expand Down