-
Notifications
You must be signed in to change notification settings - Fork 595
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config parameters #3014
Changes from 2 commits
0af943c
0d48fdc
0461e59
df9e3f0
b11f41d
b66dd3b
6937bee
f86a216
49dd4fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from .partition.utils.config import env_config | ||
|
||
# init env_config | ||
env_config |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = "0.13.8-dev5" # pragma: no cover | ||
__version__ = "0.13.8-dev6" # pragma: no cover |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -160,7 +160,6 @@ def _try_process_document(self, doc: Path) -> Optional[list]: | |
@abstractmethod | ||
def _process_document(self, doc: Path) -> list: | ||
"""Should return all metadata and metrics for a single document.""" | ||
pass | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why removed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It was removed by the linter There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If a docstring is added, the |
||
|
||
|
||
@dataclass | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,15 +7,29 @@ | |
""" | ||
|
||
import os | ||
import tempfile | ||
from dataclasses import dataclass | ||
from functools import lru_cache | ||
from pathlib import Path | ||
|
||
from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT | ||
|
||
|
||
@lru_cache(maxsize=1) | ||
def get_tempdir(dir: str) -> str: | ||
tempdir = Path(dir) / f"tmp/{os.getpgid(0)}" | ||
return str(tempdir) | ||
|
||
|
||
@dataclass | ||
class ENVConfig: | ||
"""class for configuring environment parameters""" | ||
|
||
def __post_init__(self): | ||
print(f"=================POST INIT==================") | ||
if self.STORAGE_ENABLED: | ||
self._setup_tmpdir(self.STORAGE_TMPDIR) | ||
|
||
def _get_string(self, var: str, default_value: str = "") -> str: | ||
"""attempt to get the value of var from the os environment; if not present return the | ||
default_value""" | ||
|
@@ -31,6 +45,15 @@ def _get_float(self, var: str, default_value: float) -> float: | |
return float(value) | ||
return default_value | ||
|
||
def _get_bool(self, var: str, default_value: bool) -> bool: | ||
if value := self._get_string(var): | ||
return value.lower() in ("true", "1", "t") | ||
return default_value | ||
|
||
def _setup_tmpdir(self, tmpdir: str) -> None: | ||
Path(tmpdir).mkdir(parents=True, exist_ok=True) | ||
tempfile.tempdir = tmpdir | ||
|
||
@property | ||
def IMAGE_CROP_PAD(self) -> int: | ||
"""extra image content to add around an identified element region; measured in pixels""" | ||
|
@@ -117,5 +140,28 @@ def PDF_ANNOTATION_THRESHOLD(self) -> float: | |
|
||
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9) | ||
|
||
@property | ||
def STORAGE_ENABLED(self) -> bool: | ||
"""Enable usage of STORAGE_DIR and STORAGE_TMPDIR.""" | ||
return self._get_bool("STORAGE_ENABLED", False) | ||
|
||
@property | ||
def STORAGE_DIR(self) -> str: | ||
"""Path to Unstructured storage directory.""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. STORAGE_DIR is a misleading name, which has permanent or at least caching connotations. could this instead be TMP_STORAGE_DIR? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are parameters There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. made the changes, waiting for greenlight @cragwolfe |
||
return self._get_string("STORAGE_DIR", str(Path.home() / ".cache/unstructured")) | ||
|
||
@property | ||
def STORAGE_TMPDIR(self) -> str: | ||
"""Path to Unstructured storage tempdir. Overrides TMPDIR, TEMP and TMP. | ||
Defaults to '{STORAGE_DIR}/tmp/{os.getpgid(0)}'. | ||
""" | ||
default_tmpdir = get_tempdir(dir=self.STORAGE_DIR) | ||
tmpdir = self._get_string("STORAGE_TMPDIR", default_tmpdir) | ||
if tmpdir == "": | ||
tmpdir = default_tmpdir | ||
if self.STORAGE_ENABLED: | ||
self._setup_tmpdir(tmpdir) | ||
return tmpdir | ||
|
||
|
||
env_config = ENVConfig() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Personally, I'd prefer to see more usages of pathlib, but I believe it's not the main goal of this PR; just a side note.