diff --git a/pyproject.toml b/pyproject.toml
index 887ada47..e94a7da4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,7 +42,6 @@ dependencies = [
     "scikit-learn >= 1.5",
     "xgboost-cpu >= 2.1.4", # regular xgboost includes nvidia libraries which bloat the package size on linux. For PyProphet, we likely would not need GPU support.
     "matplotlib",
-    "pyarrow",
     "pypdf",
     "psutil",
     "pyopenms",
@@ -55,6 +54,7 @@ dependencies = [
 testing = ["pytest", "pytest-regtest", "pytest-xdist"]
 docs = ["sphinx", "sphinx-copybutton", "sphinx_rtd_theme", "pydata_sphinx_theme", "sphinx-click"]
 dev = ["pyprophet[testing]", "pyprophet[docs]", "black", "ruff", "mypy"]
+parquet = ["pyarrow"]
 
 # Define console entry points
 [project.scripts]
diff --git a/pyprophet/io/__init__.py b/pyprophet/io/__init__.py
index 6bbbfa4b..372e3726 100644
--- a/pyprophet/io/__init__.py
+++ b/pyprophet/io/__init__.py
@@ -15,7 +15,7 @@ Dependencies:
 -------------
 - `pandas`
-- `pyarrow`
+- `pyarrow` (optional, for Parquet support)
 - `duckdb`
 - `sqlite3`
 - `loguru`
 
diff --git a/pyprophet/io/dispatcher.py b/pyprophet/io/dispatcher.py
index 00c4eb56..4aeeeaff 100644
--- a/pyprophet/io/dispatcher.py
+++ b/pyprophet/io/dispatcher.py
@@ -16,52 +16,28 @@
 from loguru import logger
 
+from .util import (
+    _get_parquet_reader_class_for_config,
+    _get_parquet_writer_class_for_config,
+)
 from .._config import ExportIOConfig, IPFIOConfig, LevelContextIOConfig, RunnerIOConfig
 
 # Export I/O
 from .export.osw import OSWReader as ExportOSWReader
 from .export.osw import OSWWriter as ExportOSWWriter
 from .export.sqmass import SqMassWriter as ExportSqMassWriter
-from .export.parquet import (
-    ParquetReader as ExportParquetReader,
-)
-from .export.parquet import (
-    ParquetWriter as ExportParquetWriter,
-)
-from .export.split_parquet import (
-    SplitParquetReader as ExportSplitParquetReader,
-)
-from .export.split_parquet import (
-    SplitParquetWriter as ExportSplitParquetWriter,
-)
 
 # IPF I/O
 from .ipf.osw import OSWReader as IPFOSWReader
 from .ipf.osw import OSWWriter as IPFOSWWriter
-from .ipf.parquet import ParquetReader as IPFParquetReader
-from .ipf.parquet import ParquetWriter as IPFParquetWriter
-from .ipf.split_parquet import SplitParquetReader as IPFSplitParquetReader
-from .ipf.split_parquet import SplitParquetWriter as IPFSplitParquetWriter
 
 # Levels Context I/O
 from .levels_context.osw import OSWReader as LevelContextOSWReader
 from .levels_context.osw import OSWWriter as LevelContextOSWWriter
-from .levels_context.parquet import ParquetReader as LevelContextParquetReader
-from .levels_context.parquet import ParquetWriter as LevelContextParquetWriter
-from .levels_context.split_parquet import (
-    SplitParquetReader as LevelContextSplitParquetReader,
-)
-from .levels_context.split_parquet import (
-    SplitParquetWriter as LevelContextSplitParquetWriter,
-)
 
 # Scoring I/O
 from .scoring.osw import OSWReader as ScoringOSWReader
 from .scoring.osw import OSWWriter as ScoringOSWWriter
-from .scoring.parquet import ParquetReader as ParquetScoringReader
-from .scoring.parquet import ParquetWriter as ParquetScoringWriter
-from .scoring.split_parquet import SplitParquetReader as SplitParquetScoringReader
-from .scoring.split_parquet import SplitParquetWriter as SplitParquetScoringWriter
 from .scoring.tsv import TSVReader as ScoringTSVReader
 from .scoring.tsv import TSVWriter as ScoringTSVWriter
 
@@ -123,29 +99,13 @@ def _get_osw_reader(config):
 
     @staticmethod
     def _get_parquet_reader(config):
-        if isinstance(config, RunnerIOConfig):
-            return ParquetScoringReader(config)
-        elif isinstance(config, IPFIOConfig):
-            return IPFParquetReader(config)
-        elif isinstance(config, LevelContextIOConfig):
-            return LevelContextParquetReader(config)
-        elif isinstance(config, ExportIOConfig):
-            return ExportParquetReader(config)
-        else:
-            raise ValueError(f"Unsupported config context: {type(config).__name__}")
+        cls = _get_parquet_reader_class_for_config(config, split=False)
+        return cls(config)
 
     @staticmethod
     def _get_split_parquet_reader(config):
-        if isinstance(config, RunnerIOConfig):
-            return SplitParquetScoringReader(config)
-        elif isinstance(config, IPFIOConfig):
-            return IPFSplitParquetReader(config)
-        elif isinstance(config, LevelContextIOConfig):
-            return LevelContextSplitParquetReader(config)
-        elif isinstance(config, ExportIOConfig):
-            return ExportSplitParquetReader(config)
-        else:
-            raise ValueError(f"Unsupported config context: {type(config).__name__}")
+        cls = _get_parquet_reader_class_for_config(config, split=True)
+        return cls(config)
 
     @staticmethod
     def _get_tsv_reader(config):
@@ -223,29 +183,13 @@ def _get_sqmass_writer(config):
 
     @staticmethod
     def _get_parquet_writer(config):
-        if isinstance(config, RunnerIOConfig):
-            return ParquetScoringWriter(config)
-        elif isinstance(config, IPFIOConfig):
-            return IPFParquetWriter(config)
-        elif isinstance(config, LevelContextIOConfig):
-            return LevelContextParquetWriter(config)
-        elif isinstance(config, ExportIOConfig):
-            return ExportParquetWriter(config)
-        else:
-            raise ValueError(f"Unsupported config context: {type(config).__name__}")
+        cls = _get_parquet_writer_class_for_config(config, split=False)
+        return cls(config)
 
     @staticmethod
     def _get_split_parquet_writer(config):
-        if isinstance(config, RunnerIOConfig):
-            return SplitParquetScoringWriter(config)
-        elif isinstance(config, IPFIOConfig):
-            return IPFSplitParquetWriter(config)
-        elif isinstance(config, LevelContextIOConfig):
-            return LevelContextSplitParquetWriter(config)
-        elif isinstance(config, ExportIOConfig):
-            return ExportSplitParquetWriter(config)
-        else:
-            raise ValueError(f"Unsupported config context: {type(config).__name__}")
+        cls = _get_parquet_writer_class_for_config(config, split=True)
+        return cls(config)
 
     @staticmethod
     def _get_tsv_writer(config):
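Reviewer note: the four isinstance chains above collapse into helpers that resolve the concrete class lazily. A minimal sketch of the resulting call path, assuming pyarrow is installed (the wrapper function and its name are hypothetical; only the helper comes from this diff):

    from pyprophet.io.util import _get_parquet_reader_class_for_config

    def open_parquet_reader(config, split=False):
        # The helper maps the config type to an area (e.g. "scoring" for
        # RunnerIOConfig), imports .scoring.parquet or .scoring.split_parquet
        # on demand, and returns the class; pyarrow is only imported here.
        cls = _get_parquet_reader_class_for_config(config, split=split)
        return cls(config)
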
diff --git a/pyprophet/io/ipf/parquet.py b/pyprophet/io/ipf/parquet.py
index f9c7ff93..f97e6bde 100644
--- a/pyprophet/io/ipf/parquet.py
+++ b/pyprophet/io/ipf/parquet.py
@@ -1,14 +1,17 @@
 import os
-from typing import Literal
 from shutil import copyfile
-import pandas as pd
-import pyarrow as pa
-import duckdb
+from typing import Literal
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger
-from ..util import get_parquet_column_names
-from .._base import BaseParquetReader, BaseParquetWriter
+
 from ..._config import IPFIOConfig
+from .._base import BaseParquetReader, BaseParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()
 
 
 class ParquetReader(BaseParquetReader):
diff --git a/pyprophet/io/ipf/split_parquet.py b/pyprophet/io/ipf/split_parquet.py
index acb733c3..085cdea0 100644
--- a/pyprophet/io/ipf/split_parquet.py
+++ b/pyprophet/io/ipf/split_parquet.py
@@ -1,16 +1,17 @@
-import os
 import glob
-from shutil import copyfile
+import os
 from typing import Literal
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger
 
-import pandas as pd
-import pyarrow as pa
-import duckdb
-from ..util import get_parquet_column_names
-from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
 from ..._config import IPFIOConfig
+from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()
 
 
 class SplitParquetReader(BaseSplitParquetReader):
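The module-level `pa, _, _ = _ensure_pyarrow()` pattern means these modules still fail to import without pyarrow, but with an actionable CLI message instead of a bare ImportError. A rough illustration of that failure mode (a sketch; the message text is the one defined in `_ensure_pyarrow`):

    import click

    try:
        from pyprophet.io.ipf.parquet import ParquetReader
    except click.ClickException as exc:
        # Without pyarrow installed this prints something like:
        # "Parquet support requires 'pyarrow'. Install with 'pip install pyarrow'
        #  or 'pip install pyprophet[parquet]'."
        print(exc.format_message())
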
diff --git a/pyprophet/io/levels_context/parquet.py b/pyprophet/io/levels_context/parquet.py
index 49e33f3c..24f2c3e8 100644
--- a/pyprophet/io/levels_context/parquet.py
+++ b/pyprophet/io/levels_context/parquet.py
@@ -1,14 +1,15 @@
-import os
-from typing import Literal
 from shutil import copyfile
-import pandas as pd
-import pyarrow as pa
-import duckdb
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger
-from ..util import get_parquet_column_names
-from .._base import BaseParquetReader, BaseParquetWriter
+
 from ..._config import LevelContextIOConfig
+from .._base import BaseParquetReader, BaseParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()
 
 
 class ParquetReader(BaseParquetReader):
diff --git a/pyprophet/io/levels_context/split_parquet.py b/pyprophet/io/levels_context/split_parquet.py
index 671ff88f..3f0812d3 100644
--- a/pyprophet/io/levels_context/split_parquet.py
+++ b/pyprophet/io/levels_context/split_parquet.py
@@ -1,18 +1,16 @@
-import os
 import glob
-from shutil import copyfile
-from typing import Literal
-import pandas as pd
-import pyarrow as pa
-import duckdb
+import os
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger
 
-from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
 from ..._config import LevelContextIOConfig
-from ..util import (
-    get_parquet_column_names,
-)
+from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()
 
 
 class SplitParquetReader(BaseSplitParquetReader):
diff --git a/pyprophet/io/scoring/parquet.py b/pyprophet/io/scoring/parquet.py
index 538832fa..b1c3626c 100644
--- a/pyprophet/io/scoring/parquet.py
+++ b/pyprophet/io/scoring/parquet.py
@@ -1,14 +1,17 @@
 import sys
 from shutil import copyfile
+
+import click
+import duckdb
 import pandas as pd
 import polars as pl
-import pyarrow as pa
-import duckdb
-import click
 from loguru import logger
-from ..util import get_parquet_column_names
-from .._base import BaseParquetReader, BaseParquetWriter, RowCountMismatchError
+
 from ..._config import RunnerIOConfig
+from .._base import BaseParquetReader, BaseParquetWriter, RowCountMismatchError
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()
 
 
 class ParquetReader(BaseParquetReader):
diff --git a/pyprophet/io/scoring/split_parquet.py b/pyprophet/io/scoring/split_parquet.py
index 2e5eb464..de09e720 100644
--- a/pyprophet/io/scoring/split_parquet.py
+++ b/pyprophet/io/scoring/split_parquet.py
@@ -1,16 +1,15 @@
 import os
-import sys
-import glob
-from shutil import copyfile
-import pandas as pd
-import pyarrow as pa
-import duckdb
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger
 
-from ..util import get_parquet_column_names
-from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
 from ..._config import RunnerIOConfig
+from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()
 
 
 class SplitParquetReader(BaseSplitParquetReader):
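Since the point of the change is that non-Parquet workflows no longer need pyarrow, a regression test along these lines may be worth adding (a sketch only; the test name and module selection are assumptions):

    import sys

    def test_io_imports_without_pyarrow(monkeypatch):
        # A None entry in sys.modules makes 'import pyarrow' raise ImportError,
        # simulating an installation without the 'parquet' extra.
        monkeypatch.setitem(sys.modules, "pyarrow", None)
        for name in list(sys.modules):
            if name.startswith("pyprophet.io"):
                monkeypatch.delitem(sys.modules, name)
        import pyprophet.io.dispatcher  # noqa: F401 -- must import cleanly
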
diff --git a/pyprophet/io/util.py b/pyprophet/io/util.py
index 4d07617d..9cce1d95 100644
--- a/pyprophet/io/util.py
+++ b/pyprophet/io/util.py
@@ -44,13 +44,40 @@
 import os
 from collections import defaultdict
 import sqlite3
+import importlib
+from typing import Type
 import duckdb
 import click
 import pandas as pd
-import pyarrow.parquet as pq
 import pyopenms as poms
 from loguru import logger
-from pyarrow.lib import ArrowInvalid, ArrowIOError
+
+
+def _ensure_pyarrow():
+    """
+    Import pyarrow lazily and return (pyarrow, ArrowInvalid, ArrowIOError); raise a friendly ClickException if pyarrow is not installed.
+    """
+    try:
+        import pyarrow as pa  # pylint: disable=C0415
+
+        # Ensure the parquet submodule is loaded and available as pa.parquet
+        try:
+            import pyarrow.parquet as pq  # pylint: disable=C0415
+        except Exception:
+            pq = None
+        from pyarrow.lib import ArrowInvalid, ArrowIOError  # pylint: disable=C0415
+
+        # pyarrow does not expose the parquet submodule as an attribute on the
+        # top-level module until it has been imported; attach it explicitly so
+        # callers can rely on pa.parquet.*
+        if pq is not None and not hasattr(pa, "parquet"):
+            setattr(pa, "parquet", pq)
+
+        return pa, ArrowInvalid, ArrowIOError
+    except ImportError as exc:
+        raise click.ClickException(
+            "Parquet support requires 'pyarrow'. Install with 'pip install pyarrow' or 'pip install pyprophet[parquet]'."
+        ) from exc
 
 
 def is_tsv_file(file_path):
@@ -248,6 +275,60 @@ def create_index_if_not_exists(con, index_name, table_name, column_name):
         con.execute(f"CREATE INDEX {index_name} ON {table_name} ({column_name})")
 
 
+def _lazy_parquet_class(module_path: str, class_name: str) -> Type:
+    """
+    Import the given module (relative to this package) and return the class.
+    Raises a ClickException with a friendly message if the import fails (e.g. missing pyarrow).
+    """
+    try:
+        mod = importlib.import_module(module_path, package=__package__)
+        return getattr(mod, class_name)
+    except ModuleNotFoundError as exc:
+        # Likely pyarrow or the module itself is missing; user should install the parquet extra.
+        raise click.ClickException(
+            "Parquet support requires the 'pyarrow' package. "
+            "Install it with 'pip install pyarrow' or 'pip install pyprophet[parquet]'."
+        ) from exc
+    except Exception:
+        # Propagate other exceptions (syntax errors, attribute errors) to surface the real problem.
+        raise
+
+
+def _area_from_config(config) -> str:
+    """
+    Map a config instance to its package area name used in the io package.
+    """
+    # Avoid importing config classes here to prevent circular imports.
+    cname = type(config).__name__
+    if cname == "RunnerIOConfig":
+        return "scoring"
+    if cname == "IPFIOConfig":
+        return "ipf"
+    if cname == "LevelContextIOConfig":
+        return "levels_context"
+    if cname == "ExportIOConfig":
+        return "export"
+    raise ValueError(f"Unsupported config context: {cname}")
+
+
+def _get_parquet_reader_class_for_config(config, split: bool = False) -> Type:
+    _ensure_pyarrow()  # fail fast with a friendly error if pyarrow is missing
+    area = _area_from_config(config)
+    module = f".{area}.split_parquet" if split else f".{area}.parquet"
+    return _lazy_parquet_class(
+        module, "SplitParquetReader" if split else "ParquetReader"
+    )
+
+
+def _get_parquet_writer_class_for_config(config, split: bool = False) -> Type:
+    _ensure_pyarrow()  # fail fast with a friendly error if pyarrow is missing
+    area = _area_from_config(config)
+    module = f".{area}.split_parquet" if split else f".{area}.parquet"
+    return _lazy_parquet_class(
+        module, "SplitParquetWriter" if split else "ParquetWriter"
+    )
+
+
 def is_parquet_file(file_path):
     """
     Check if the file is a valid Parquet file.
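For reviewers, a quick equivalence check of the new resolution against the classes the old isinstance chains returned, assuming pyarrow is present (a sketch; the check function is hypothetical, the imports are real names from this diff):

    from pyprophet._config import IPFIOConfig
    from pyprophet.io.ipf.split_parquet import SplitParquetReader
    from pyprophet.io.util import _get_parquet_reader_class_for_config

    def check(config: IPFIOConfig):
        # IPFIOConfig -> area "ipf" -> module ".ipf.split_parquet", so the
        # helper resolves the very same class the old dispatcher imported.
        assert _get_parquet_reader_class_for_config(config, split=True) is SplitParquetReader
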
@@ -259,7 +340,8 @@ def is_parquet_file(file_path):
 
     # Then verify it's actually a parquet file
     try:
-        pq.read_schema(file_path)
+        pa, ArrowInvalid, ArrowIOError = _ensure_pyarrow()  # pylint: disable=C0103
+        pa.parquet.read_schema(file_path)
         return True
     except (ArrowInvalid, ArrowIOError, OSError):
         return False
@@ -327,7 +409,8 @@ def get_parquet_column_names(file_path):
     Retrieves column names from a Parquet file without reading the entire file.
     """
     try:
-        table_schema = pq.read_schema(file_path)
+        pa, _, _ = _ensure_pyarrow()
+        table_schema = pa.parquet.read_schema(file_path)
         return table_schema.names
     except Exception as e:
         print(f"An error occurred while reading schema from '{file_path}': {e}")
diff --git a/scripts/build/build_linux.sh b/scripts/build/build_linux.sh
index 29fc5e5c..06d99551 100644
--- a/scripts/build/build_linux.sh
+++ b/scripts/build/build_linux.sh
@@ -104,6 +104,21 @@ $PYTHON -m PyInstaller \
     --noconfirm \
     --log-level INFO \
     --additional-hooks-dir packaging/pyinstaller/hooks \
+    --exclude-module pyarrow \
+    --exclude-module sphinx \
+    --exclude-module sphinx_rtd_theme \
+    --exclude-module pydata_sphinx_theme \
+    --exclude-module sphinx_copybutton \
+    --exclude-module sphinx.ext \
+    --exclude-module alabaster \
+    --exclude-module babel \
+    --exclude-module docutils \
+    --exclude-module mypy \
+    --exclude-module pytest \
+    --exclude-module pytest_regtest \
+    --exclude-module xdist \
+    --exclude-module black \
+    --exclude-module ruff \
     --hidden-import=pyprophet \
     --hidden-import=pyprophet.main \
     --collect-submodules pyprophet \
diff --git a/scripts/build/build_macos.sh b/scripts/build/build_macos.sh
index 2578abea..96a74643 100755
--- a/scripts/build/build_macos.sh
+++ b/scripts/build/build_macos.sh
@@ -21,6 +21,40 @@ if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$PYTHON_VERSION" | sort -V | head -n1
     exit 1
 fi
 
+# Get version: prefer GITHUB_REF_NAME (CI), then pyproject.toml, then git tag, then default
+if [ -n "${GITHUB_REF_NAME:-}" ]; then
+    VERSION="${GITHUB_REF_NAME#v}"
+else
+    # Use tomllib (Python 3.11+) to read pyproject.toml
+    VERSION=$(
+        python3 - <<'PY'
+import tomllib, sys, subprocess
+try:
+    with open("pyproject.toml", "rb") as f:
+        cfg = tomllib.load(f)
+    v = cfg.get("project", {}).get("version")
+    if v:
+        print(v)
+        sys.exit(0)
+except Exception:
+    pass
+# fallback to git tag
+try:
+    tag = subprocess.check_output(["git", "describe", "--tags", "--abbrev=0"], stderr=subprocess.DEVNULL).decode().strip()
+    print(tag.lstrip("v"))
+    sys.exit(0)
+except Exception:
+    print("0.0.0")
+PY
+    )
+fi
+
+# sanitize for filenames
+VERSION_SAFE="${VERSION//\//-}"
+VERSION_SAFE="${VERSION_SAFE// /-}"
+VERSION_SAFE="$(echo "$VERSION_SAFE" | tr -cd 'A-Za-z0-9._-')"
+
+
 # Install/upgrade build dependencies
 python3 -m pip install --upgrade pip setuptools wheel cython numpy pyinstaller
 
@@ -78,6 +112,7 @@ python3 -m PyInstaller \
     --strip \
     --log-level INFO \
     --additional-hooks-dir packaging/pyinstaller/hooks \
+    --exclude-module pyarrow \
     --exclude-module sphinx \
     --exclude-module sphinx_rtd_theme \
     --exclude-module pydata_sphinx_theme \
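The same tomllib/git fallback appears in both build_macos.sh and create_macos_dmg.sh (below); if it grows again it could live in one shared helper. For clarity, the logic as a standalone script (the file name and defaults are the scripts' own; the function name is made up):

    import subprocess
    import tomllib  # Python 3.11+

    def resolve_version(default="0.0.0"):
        """pyproject.toml [project].version first, then the latest git tag."""
        try:
            with open("pyproject.toml", "rb") as f:
                version = tomllib.load(f).get("project", {}).get("version")
            if version:
                return version
        except OSError:
            pass
        try:
            out = subprocess.check_output(
                ["git", "describe", "--tags", "--abbrev=0"],
                stderr=subprocess.DEVNULL,
            )
            return out.decode().strip().lstrip("v")
        except (subprocess.CalledProcessError, OSError):
            return default

    if __name__ == "__main__":
        print(resolve_version())
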
diff --git a/scripts/build/build_windows.bat b/scripts/build/build_windows.bat
index 0c7f9b1c..ba9c205c 100644
--- a/scripts/build/build_windows.bat
+++ b/scripts/build/build_windows.bat
@@ -61,6 +61,21 @@ python -m PyInstaller ^
     --name pyprophet ^
     --log-level INFO ^
     --additional-hooks-dir packaging/pyinstaller/hooks ^
+    --exclude-module pyarrow ^
+    --exclude-module sphinx ^
+    --exclude-module sphinx_rtd_theme ^
+    --exclude-module pydata_sphinx_theme ^
+    --exclude-module sphinx_copybutton ^
+    --exclude-module sphinx.ext ^
+    --exclude-module alabaster ^
+    --exclude-module babel ^
+    --exclude-module docutils ^
+    --exclude-module mypy ^
+    --exclude-module pytest ^
+    --exclude-module pytest_regtest ^
+    --exclude-module xdist ^
+    --exclude-module black ^
+    --exclude-module ruff ^
     --hidden-import=pyprophet ^
     --hidden-import=pyprophet.main ^
     --collect-submodules pyprophet ^
diff --git a/scripts/build/create_macos_dmg.sh b/scripts/build/create_macos_dmg.sh
index b1b8a45b..e97cb625 100644
--- a/scripts/build/create_macos_dmg.sh
+++ b/scripts/build/create_macos_dmg.sh
@@ -8,15 +8,38 @@ echo "============================================"
 echo "Creating macOS DMG Installer"
 echo "============================================"
 
-# Get version and architecture
+# Get version (prefer GITHUB_REF_NAME, then pyproject.toml, then git tag, then default)
 if [ -n "${GITHUB_REF_NAME:-}" ]; then
     VERSION="${GITHUB_REF_NAME#v}"
 else
-    VERSION=$(git describe --tags --abbrev=0 2>/dev/null | sed 's/^v//' || echo "3.0.4")
+    VERSION=$(
+        python3 - <<'PY'
+import sys, subprocess
+try:
+    import tomllib
+    with open("pyproject.toml", "rb") as f:
+        cfg = tomllib.load(f)
+    v = cfg.get("project", {}).get("version")
+    if v:
+        print(v); sys.exit(0)
+except Exception:
+    pass
+try:
+    tag = subprocess.check_output(["git", "describe", "--tags", "--abbrev=0"], stderr=subprocess.DEVNULL).decode().strip()
+    print(tag.lstrip("v")); sys.exit(0)
+except Exception:
+    print("3.0.4")
+PY
+    )
 fi
 
+# sanitize VERSION for filenames
+VERSION_SAFE="${VERSION//\//-}"
+VERSION_SAFE="${VERSION_SAFE// /-}"
+VERSION_SAFE="$(echo "$VERSION_SAFE" | tr -cd 'A-Za-z0-9._-')"
+
 ARCH=$(uname -m)
-echo "Version: ${VERSION}"
+echo "Version: ${VERSION} (safe: ${VERSION_SAFE})"
 echo "Architecture: ${ARCH}"
 
 # Verify single-file executable exists
@@ -82,7 +105,7 @@ if [ -f "LICENSE" ]; then
 fi
 
 # Create DMG
-DMG_NAME="pyprophet-${VERSION}-macos-${ARCH}.dmg"
+DMG_NAME="pyprophet-${VERSION_SAFE}-macos-${ARCH}.dmg"
 echo "Creating DMG: ${DMG_NAME}"
 
 # Remove any existing DMG
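For completeness, the sanitization above in Python form with a worked example (a sketch for reference; the shell version is authoritative):

    import re

    def sanitize_version(version):
        # Mirror the shell pipeline: '/' and ' ' become '-', then drop
        # anything outside A-Za-z0-9._-
        safe = version.replace("/", "-").replace(" ", "-")
        return re.sub(r"[^A-Za-z0-9._-]", "", safe)

    assert sanitize_version("feature/1.2 rc!") == "feature-1.2-rc"
    assert sanitize_version("3.0.4") == "3.0.4"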