From b3e7e808798264fa7d5877b2991072519e8e692e Mon Sep 17 00:00:00 2001 From: Coniferish Date: Tue, 9 May 2023 14:48:10 -0500 Subject: [PATCH 1/2] fix: include all metadata fields when converting to dataframe or CSV (#555) --- CHANGELOG.md | 6 +++--- unstructured/staging/base.py | 13 +++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2672e193d..2defead210 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,13 @@ -## 0.6.5-dev0 +## 0.6.5-dev1 ### Enhancements -* PLACEHOLDER - delete this line when there is an actual changelog item to report for 0.6.5 - ### Features ### Fixes +* Include all metadata fields when converting to dataframe or CSV + ## 0.6.4 ### Enhancements diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 6ad1a51b7c..4566a6b380 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -22,10 +22,14 @@ "filename", "page_number", "url", + "sent_from", + "sent_to", + "subject", + "sender", ] -def convert_to_isd(elements: List[Element]) -> List[Dict[str, str]]: +def convert_to_isd(elements: List[Element]) -> List[Dict[str, Any]]: """Represents the document elements as an Initial Structured Document (ISD).""" isd: List[Dict[str, str]] = [] for element in elements: @@ -34,7 +38,7 @@ def convert_to_isd(elements: List[Element]) -> List[Dict[str, str]]: return isd -def convert_to_dict(elements: List[Element]) -> List[Dict[str, str]]: +def convert_to_dict(elements: List[Element]) -> List[Dict[str, Any]]: """Converts a list of elements into a dictionary.""" return convert_to_isd(elements) @@ -127,6 +131,11 @@ def convert_to_isd_csv(elements: List[Element]) -> str: if key in TABLE_FIELDNAMES: row[key] = value + if row.get("sent_from"): + row["sender"] = row.get("sent_from") + if type(row["sender"]) == list: + row["sender"] = row["sender"][0] + with io.StringIO() as buffer: csv_writer = csv.DictWriter(buffer, fieldnames=TABLE_FIELDNAMES) csv_writer.writeheader() From e9281e7dcb03fd1d13b455e084bb57e85d44b08f Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 10 May 2023 09:26:31 -0400 Subject: [PATCH 2/2] bump version after merge from main --- unstructured/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 36225a4ca4..d7d6a9bd35 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.5" # pragma: no cover +__version__ = "0.6.6-dev0" # pragma: no cover