Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config parameters #3014

Merged
merged 9 commits into from
May 17, 2024
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.14.0-dev14
## 0.14.0-dev15

### BREAKING CHANGES

Expand All @@ -9,6 +9,7 @@
* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
* **Faster evaluation** Support for concurrent processing of documents during evaluation
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
* **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameters to control temporary storage.

### Features
* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.
Expand Down
11 changes: 9 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ def test_save_elements(
assert not el.metadata.image_mime_type


def test_save_elements_with_output_dir_path_none():
@pytest.mark.parametrize("storage_enabled", [False, True])
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", storage_enabled)
with (
patch("PIL.Image.open"),
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
Expand All @@ -161,7 +163,12 @@ def test_save_elements_with_output_dir_path_none():
)

# Verify that the images are saved in the expected directory
expected_output_dir = os.path.join(tmpdir, "figures")
if storage_enabled:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, I'd prefer to see more usages of pathlib, but I believe it's not the main goal of this PR; just a side note.

from unstructured.partition.utils.config import env_config

expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
else:
expected_output_dir = os.path.join(tmpdir, "figures")
assert os.path.exists(expected_output_dir)
assert os.path.isdir(expected_output_dir)
os.chdir(original_cwd)
Expand Down
47 changes: 47 additions & 0 deletions test_unstructured/partition/utils/test_config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
import shutil
import tempfile
from pathlib import Path

import pytest


def test_default_config():
from unstructured.partition.utils.config import env_config

Expand All @@ -9,3 +16,43 @@ def test_env_override(monkeypatch):
from unstructured.partition.utils.config import env_config

assert env_config.IMAGE_CROP_PAD == 1


@pytest.fixture()
def _setup_tmpdir():
    """Isolate a test from any pre-existing working process directory.

    Setup: remember the current `tempfile.tempdir`, move an already existing
    `GLOBAL_WORKING_PROCESS_DIR` aside to a sibling `<dir>_bak`, and reset
    `tempfile.tempdir` to `None`.

    Teardown: if a backup was made, discard whatever directory the test left in
    place and move the backup back; finally restore the saved `tempfile.tempdir`.
    """
    from unstructured.partition.utils.config import env_config

    saved_tempdir = tempfile.tempdir
    process_dir = env_config.GLOBAL_WORKING_PROCESS_DIR
    backup_dir = f"{env_config.GLOBAL_WORKING_PROCESS_DIR}_bak"

    process_dir_exists = Path(process_dir).is_dir()
    if process_dir_exists:
        shutil.move(process_dir, backup_dir)
    tempfile.tempdir = None

    yield

    backup_exists = Path(backup_dir).is_dir()
    if backup_exists:
        # The destination must be free before the backup can be moved back.
        if Path(process_dir).is_dir():
            shutil.rmtree(process_dir)
        shutil.move(backup_dir, process_dir)
    tempfile.tempdir = saved_tempdir


@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_disabled(monkeypatch):
# With the feature flag set to "false", the working-dir settings still resolve to
# their defaults, but nothing is created on disk and tempfile is not redirected.
# The environment variable is set before env_config is imported.
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "false")
from unstructured.partition.utils.config import env_config

assert not env_config.GLOBAL_WORKING_DIR_ENABLED
# Default cache location lives under the user's home directory.
assert str(Path.home() / ".cache/unstructured") == env_config.GLOBAL_WORKING_DIR
# Reading the process dir while disabled must not create it ...
assert not Path(env_config.GLOBAL_WORKING_PROCESS_DIR).is_dir()
# ... and must leave tempfile's directory pointing somewhere else.
assert tempfile.gettempdir() != env_config.GLOBAL_WORKING_PROCESS_DIR


@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_enabled(monkeypatch):
# With the feature flag set to "true", reading GLOBAL_WORKING_PROCESS_DIR creates
# the directory and makes it tempfile's directory.
# The environment variable is set before env_config is imported.
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "true")
from unstructured.partition.utils.config import env_config

assert env_config.GLOBAL_WORKING_DIR_ENABLED
# Default cache location lives under the user's home directory.
assert str(Path.home() / ".cache/unstructured") == env_config.GLOBAL_WORKING_DIR
# The property access itself is expected to have created the directory ...
assert Path(env_config.GLOBAL_WORKING_PROCESS_DIR).is_dir()
# ... and to have pointed tempfile at it.
assert tempfile.gettempdir() == env_config.GLOBAL_WORKING_PROCESS_DIR
4 changes: 4 additions & 0 deletions unstructured/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .partition.utils.config import env_config

# Importing env_config above builds the module-level ENVConfig instance when the
# package is imported; its __post_init__ sets up the working tempdir when
# GLOBAL_WORKING_DIR_ENABLED is true.
# The bare reference below has no runtime effect of its own; presumably it only
# keeps the import from being reported as unused.
env_config
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.0-dev14" # pragma: no cover
__version__ = "0.14.0-dev15" # pragma: no cover
1 change: 0 additions & 1 deletion unstructured/metrics/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@ def _try_process_document(self, doc: Path) -> Optional[list]:
@abstractmethod
def _process_document(self, doc: Path) -> list:
"""Should return all metadata and metrics for a single document."""
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was removed by the linter

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a docstring is added, the pass keyword is optional for functions.



@dataclass
Expand Down
9 changes: 9 additions & 0 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import re
import warnings
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast

import numpy as np
Expand Down Expand Up @@ -438,6 +439,14 @@ def _partition_pdf_or_image_local(
)

if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
Expand Down
7 changes: 5 additions & 2 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import tempfile
from copy import deepcopy
from io import BytesIO
from pathlib import PurePath
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast

import cv2
Expand Down Expand Up @@ -131,7 +131,10 @@ def save_elements(
"""

if not output_dir_path:
output_dir_path = os.path.join(os.getcwd(), "figures")
if env_config.GLOBAL_WORKING_DIR_ENABLED:
output_dir_path = str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures")
else:
output_dir_path = str(Path.cwd() / "figures")
os.makedirs(output_dir_path, exist_ok=True)

with tempfile.TemporaryDirectory() as temp_dir:
Expand Down
45 changes: 45 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,28 @@
"""

import os
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path

from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT


@lru_cache(maxsize=1)
def get_tempdir(dir: str) -> str:
    """Return the default temp directory path located under `dir`.

    The result is `{dir}/tmp/{id}`, where `id` is the process-group id of the
    current process. `os.getpgid` is only available on Unix; on platforms that
    lack it the current process id is used instead, so that computing the
    default does not raise `AttributeError`.

    Only the path is computed here; nothing is created on disk. The most recent
    result is memoized by `lru_cache(maxsize=1)`.

    Args:
        dir: Base directory under which the `tmp/{id}` sub-path is placed.

    Returns:
        The temp directory path as a string.
    """
    # Look the function up defensively instead of assuming a Unix platform.
    getpgid = getattr(os, "getpgid", None)
    group_id = getpgid(0) if getpgid is not None else os.getpid()
    tempdir = Path(dir) / f"tmp/{group_id}"
    return str(tempdir)


@dataclass
class ENVConfig:
"""class for configuring enviorment parameters"""

def __post_init__(self):
if self.GLOBAL_WORKING_DIR_ENABLED:
self._setup_tmpdir(self.GLOBAL_WORKING_PROCESS_DIR)

def _get_string(self, var: str, default_value: str = "") -> str:
"""attempt to get the value of var from the os environment; if not present return the
default_value"""
Expand All @@ -31,6 +44,15 @@ def _get_float(self, var: str, default_value: float) -> float:
return float(value)
return default_value

def _get_bool(self, var: str, default_value: bool) -> bool:
if value := self._get_string(var):
return value.lower() in ("true", "1", "t")
return default_value

def _setup_tmpdir(self, tmpdir: str) -> None:
Path(tmpdir).mkdir(parents=True, exist_ok=True)
tempfile.tempdir = tmpdir

@property
def IMAGE_CROP_PAD(self) -> int:
"""extra image content to add around an identified element region; measured in pixels"""
Expand Down Expand Up @@ -117,5 +139,28 @@ def PDF_ANNOTATION_THRESHOLD(self) -> float:

return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)

@property
def GLOBAL_WORKING_DIR_ENABLED(self) -> bool:
    """Whether GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR are used.

    Read via `_get_bool` from `GLOBAL_WORKING_DIR_ENABLED`; defaults to False.
    """
    enabled = self._get_bool("GLOBAL_WORKING_DIR_ENABLED", False)
    return enabled

@property
def GLOBAL_WORKING_DIR(self) -> str:
    """Path to the Unstructured cache directory.

    Read via `_get_string` from `GLOBAL_WORKING_DIR`; defaults to
    `.cache/unstructured` under the current user's home directory.
    """
    default_dir = Path.home() / ".cache/unstructured"
    return self._get_string("GLOBAL_WORKING_DIR", str(default_dir))

@property
def GLOBAL_WORKING_PROCESS_DIR(self) -> str:
    """Path to Unstructured cache tempdir. Overrides TMPDIR, TEMP and TMP.
    Defaults to '{GLOBAL_WORKING_DIR}/tmp/{os.getpgid(0)}'.

    The value comes from `GLOBAL_WORKING_PROCESS_DIR` via `_get_string`; an
    empty value falls back to the default. Reading this property has a side
    effect: when GLOBAL_WORKING_DIR_ENABLED is true, the resolved path is
    passed to `_setup_tmpdir` on every access.
    """
    fallback = get_tempdir(dir=self.GLOBAL_WORKING_DIR)
    # `or` covers the case where the lookup yields an empty string.
    process_dir = self._get_string("GLOBAL_WORKING_PROCESS_DIR", fallback) or fallback
    if self.GLOBAL_WORKING_DIR_ENABLED:
        self._setup_tmpdir(process_dir)
    return process_dir


# Module-level ENVConfig instance. Constructing it runs __post_init__, which sets
# up the working tempdir when GLOBAL_WORKING_DIR_ENABLED is true.
env_config = ENVConfig()
Loading