Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config parameteres #3014

Merged
merged 9 commits into from
May 17, 2024
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
## 0.13.8-dev5
## 0.13.8-dev6

### Enhancements

**Faster evaluation** Support for concurrent processing of documents during evaluation
* **Faster evaluation.** Support for concurrent processing of documents during evaluation.
* **Add STORAGE_DIR and STORAGE_TMPDIR** configuration parameteres to control temporary storage.

### Features

Expand Down
11 changes: 9 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ def test_save_elements(
assert not el.metadata.image_mime_type


def test_save_elements_with_output_dir_path_none():
@pytest.mark.parametrize("storage_enabled", [False, True])
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
monkeypatch.setenv("STORAGE_ENABLED", storage_enabled)
with (
patch("PIL.Image.open"),
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
Expand All @@ -161,7 +163,12 @@ def test_save_elements_with_output_dir_path_none():
)

# Verify that the images are saved in the expected directory
expected_output_dir = os.path.join(tmpdir, "figures")
if storage_enabled:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

personally I'd prefer to see more usages of pathlib, but I believe it's not the mail goal of this PR, just a side note

from unstructured.partition.utils.config import env_config

expected_output_dir = os.path.join(env_config.STORAGE_TMPDIR, "figures")
else:
expected_output_dir = os.path.join(tmpdir, "figures")
assert os.path.exists(expected_output_dir)
assert os.path.isdir(expected_output_dir)
os.chdir(original_cwd)
Expand Down
45 changes: 45 additions & 0 deletions test_unstructured/partition/utils/test_config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
import shutil
import tempfile
from pathlib import Path

import pytest


def test_default_config():
from unstructured.partition.utils.config import env_config

Expand All @@ -9,3 +16,41 @@ def test_env_override(monkeypatch):
from unstructured.partition.utils.config import env_config

assert env_config.IMAGE_CROP_PAD == 1


@pytest.fixture()
def setup_tmpdir():
from unstructured.partition.utils.config import env_config

_tmpdir = tempfile.tempdir
_storage_tmpdir = env_config.STORAGE_TMPDIR
_storage_tmpdir_bak = f"{env_config.STORAGE_TMPDIR}_bak"
if Path(_storage_tmpdir).is_dir():
shutil.move(_storage_tmpdir, _storage_tmpdir_bak)
tempfile.tempdir = None
yield
if Path(_storage_tmpdir_bak).is_dir():
if Path(_storage_tmpdir).is_dir():
shutil.rmtree(_storage_tmpdir)
shutil.move(_storage_tmpdir_bak, _storage_tmpdir)
tempfile.tempdir = _tmpdir


def test_env_storage_disabled(monkeypatch, setup_tmpdir):
monkeypatch.setenv("STORAGE_ENABLED", "false")
from unstructured.partition.utils.config import env_config

assert env_config.STORAGE_ENABLED == False
assert str(Path.home() / ".cache/unstructured") == env_config.STORAGE_DIR
assert Path(env_config.STORAGE_TMPDIR).is_dir() == False
assert tempfile.gettempdir() != env_config.STORAGE_TMPDIR


def test_env_storage_enabled(monkeypatch, setup_tmpdir):
monkeypatch.setenv("STORAGE_ENABLED", "true")
from unstructured.partition.utils.config import env_config

assert env_config.STORAGE_ENABLED == True
assert str(Path.home() / ".cache/unstructured") == env_config.STORAGE_DIR
assert Path(env_config.STORAGE_TMPDIR).is_dir() == True
assert tempfile.gettempdir() == env_config.STORAGE_TMPDIR
4 changes: 4 additions & 0 deletions unstructured/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .partition.utils.config import env_config

# init env_config
env_config
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev5" # pragma: no cover
__version__ = "0.13.8-dev6" # pragma: no cover
1 change: 0 additions & 1 deletion unstructured/metrics/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@ def _try_process_document(self, doc: Path) -> Optional[list]:
@abstractmethod
def _process_document(self, doc: Path) -> list:
"""Should return all metadata and metrics for a single document."""
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was removed by the linter

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a docstring is added, the pass keyword is optional for functions.



@dataclass
Expand Down
9 changes: 9 additions & 0 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import re
import warnings
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast

import numpy as np
Expand Down Expand Up @@ -437,6 +438,14 @@ def _partition_pdf_or_image_local(
)

if analysis:
if not analyzed_image_output_dir_path:
if env_config.STORAGE_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.STORAGE_TMPDIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
Expand Down
7 changes: 5 additions & 2 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import tempfile
from copy import deepcopy
from io import BytesIO
from pathlib import PurePath
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast

import cv2
Expand Down Expand Up @@ -131,7 +131,10 @@ def save_elements(
"""

if not output_dir_path:
output_dir_path = os.path.join(os.getcwd(), "figures")
if env_config.STORAGE_ENABLED:
output_dir_path = str(Path(env_config.STORAGE_TMPDIR) / "figures")
else:
output_dir_path = str(Path.cwd() / "figures")
os.makedirs(output_dir_path, exist_ok=True)

with tempfile.TemporaryDirectory() as temp_dir:
Expand Down
46 changes: 46 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,29 @@
"""

import os
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path

from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT


@lru_cache(maxsize=1)
def get_tempdir(dir: str) -> str:
tempdir = Path(dir) / f"tmp/{os.getpgid(0)}"
return str(tempdir)


@dataclass
class ENVConfig:
"""class for configuring enviorment parameters"""

def __post_init__(self):
print(f"=================POST INIT==================")
if self.STORAGE_ENABLED:
self._setup_tmpdir(self.STORAGE_TMPDIR)

def _get_string(self, var: str, default_value: str = "") -> str:
"""attempt to get the value of var from the os environment; if not present return the
default_value"""
Expand All @@ -31,6 +45,15 @@ def _get_float(self, var: str, default_value: float) -> float:
return float(value)
return default_value

def _get_bool(self, var: str, default_value: bool) -> bool:
if value := self._get_string(var):
return value.lower() in ("true", "1", "t")
return default_value

def _setup_tmpdir(self, tmpdir: str) -> None:
Path(tmpdir).mkdir(parents=True, exist_ok=True)
tempfile.tempdir = tmpdir

@property
def IMAGE_CROP_PAD(self) -> int:
"""extra image content to add around an identified element region; measured in pixels"""
Expand Down Expand Up @@ -117,5 +140,28 @@ def PDF_ANNOTATION_THRESHOLD(self) -> float:

return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)

@property
def STORAGE_ENABLED(self) -> bool:
"""Enable usage of STORAGE_DIR and STORAGE_TMPDIR."""
return self._get_bool("STORAGE_ENABLED", False)

@property
def STORAGE_DIR(self) -> str:
"""Path to Unstructured storage directory."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

STORAGE_DIR is a misleading name, which has permanent or at least caching connotations. could this instead be TMP_STORAGE_DIR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are parameters STORAGE_DIR and STORAGE_TMPDIR

Copy link
Contributor Author

@amadeusz-ds amadeusz-ds May 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe UNSTRUCTURED_DIR and UNSTRUCTURED_TMPDIR as those by default point to ~/.cache/unstructured and ~/.cache/unstructured/tmp/{gid} respectively

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or UNSTRUCTURED_CACHE_DIR and UNSTRUCTURED_TMP_DIR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

made the changes, waiting for greenlight @cragwolfe

return self._get_string("STORAGE_DIR", str(Path.home() / ".cache/unstructured"))

@property
def STORAGE_TMPDIR(self) -> str:
"""Path to Unstructured storage tempdir. Overrides TMPDIR, TEMP and TMP.
Defaults to '{STORAGE_DIR}/tmp/{os.getpgid(0)}'.
"""
default_tmpdir = get_tempdir(dir=self.STORAGE_DIR)
tmpdir = self._get_string("STORAGE_TMPDIR", default_tmpdir)
if tmpdir == "":
tmpdir = default_tmpdir
if self.STORAGE_ENABLED:
self._setup_tmpdir(tmpdir)
return tmpdir


env_config = ENVConfig()
Loading