add initial extract_udf_dependencies() to extract UDF dependencies …
soxofaan committed May 23, 2024
1 parent a1d0b63 commit 4b9ca22
Showing 5 changed files with 298 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add `openeo.udf.run_code.extract_udf_dependencies()` to extract UDF dependency declarations from UDF code
  (related to [Open-EO/openeo-geopyspark-driver#237](https://github.com/Open-EO/openeo-geopyspark-driver/issues/237))

### Changed

### Removed
64 changes: 64 additions & 0 deletions openeo/udf/_compat.py
@@ -0,0 +1,64 @@
import json
import re
from typing import Union

try:
    import tomllib
except ImportError:
    try:
        import tomli as tomllib
    except ImportError:
        # Will be assigned with fallback implementation below
        tomllib = None


class FlimsyTomlParser:
    """
    This is a rudimentary, low-tech, incomplete implementation of TOML parsing functionality
    for simple TOML use cases where the dependency on a full-fledged TOML library is not justified.
    It is intended to be used as a best-effort drop-in replacement
    for the `loads()` functionality from full-fledged TOML libraries
    like `tomllib` (part of standard library since Python 3.11)
    or `tomli` (`tomllib` backport for earlier Python versions).
    """

    class TomlParseError(ValueError):
        pass

    KEY_PAIR_REGEX = re.compile(
        r"(?P<key>^[a-z0-9_-]+)\s*=\s*(?P<value>.*(\s+^\s+.*)*(\s+^])?)",
        flags=re.MULTILINE | re.VERBOSE | re.IGNORECASE,
    )

    @classmethod
    def loads(cls, data: str) -> dict:
        if re.search(r"^\[", data, flags=re.MULTILINE):
            raise cls.TomlParseError("Tables are not supported")
        if re.search(r"^[a-z0-9_-]+\.[a-z0-9_.-]+\s*=", data, flags=re.MULTILINE | re.IGNORECASE):
            raise cls.TomlParseError("Dotted keys are not supported")
        return {
            match.group("key"): cls._parse_toml_value_like_json(match.group("value"))
            for match in cls.KEY_PAIR_REGEX.finditer(data)
        }

    @classmethod
    def _parse_toml_value_like_json(cls, value: str) -> Union[int, float, list]:
        """
        Try to parse a TOML value by pretending it's (almost) JSON,
        which covers the basics (simple strings, numbers, arrays, a bit of nesting, ...)
        """
        # A bit of preprocessing to make it more JSON-like (strip comments, strip trailing commas)
        value = re.sub(r"#.*$", "", value, flags=re.MULTILINE)
        value = re.sub(r",\s*\]", "]", value)
        # Rudimentarily convert single quote strings to double quotes.
        value = re.sub("'([^'\"]*)'", r'"\1"', value)
        try:
            data = json.loads(value)
        except json.JSONDecodeError as e:
            raise cls.TomlParseError(f"Failed to parse TOML value {value!r}") from e
        return data


if tomllib is None:
    tomllib = FlimsyTomlParser
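
Because the module-level `tomllib` name ends up pointing at either a real TOML parser or `FlimsyTomlParser`, downstream code can call `tomllib.loads()` without caring which one it got. A minimal usage sketch (the TOML snippet below is illustrative input, not part of this changeset), which should behave the same with the real parsers and the fallback as long as the input stays flat:

```python
from openeo.udf._compat import tomllib

# Flat key/value pairs, simple arrays, comments and trailing commas:
# within what FlimsyTomlParser claims to support.
data = tomllib.loads(
    'name = "my-udf"\n'
    "dependencies = [\n"
    '    "numpy>=1.2.3",  # comments and trailing commas are fine\n'
    "]\n"
)
print(data)  # expected: {'name': 'my-udf', 'dependencies': ['numpy>=1.2.3']}
```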
57 changes: 53 additions & 4 deletions openeo/udf/run_code.py
@@ -1,16 +1,15 @@
"""
Note: this module was initially developed under the ``openeo-udf`` project (https://github.com/Open-EO/openeo-udf)
"""

# Note: this module was initially developed under the ``openeo-udf`` project (https://github.com/Open-EO/openeo-udf)

import functools
import importlib
import inspect
import logging
import math
import pathlib
from typing import Callable, Union
import re
from typing import Callable, List, Union

import numpy
import pandas
@@ -20,6 +19,7 @@

import openeo
from openeo.udf import OpenEoUdfException
from openeo.udf._compat import tomllib
from openeo.udf.feature_collection import FeatureCollection
from openeo.udf.structured_data import StructuredData
from openeo.udf.udf_data import UdfData
@@ -242,3 +242,52 @@ def execute_local_udf(udf: Union[str, openeo.UDF], datacube: Union[str, xarray.D
    # run the udf through the same routine as it would have been parsed in the backend
    result = run_udf_code(udf, udf_data)
    return result


def extract_udf_dependencies(code: str) -> Union[List[str], None]:
    """
    Extract dependencies from UDF code declared in a top-level comment block
    following the `inline script metadata specification (PEP 723) <https://packaging.python.org/en/latest/specifications/inline-script-metadata>`_.

    Example comment block with dependencies declaration:

    .. code-block:: python

        # /// script
        # dependencies = [
        #     "xarray>=2024.1.1",
        #     "eotools @ https://example.com/eotools-0.1.0.whl",
        # ]
        # ///

        import xarray
        import eotools

        def apply_datacube(cube: xarray.DataArray, context: dict) -> xarray.DataArray:
            ...

    :param code: UDF code
    :return: list of extracted dependencies

    .. versionadded:: 0.30.0
    """

    # Extract "script" blocks
    script_type = "script"
    block_regex = re.compile(
        r"^# /// (?P<type>[a-zA-Z0-9-]+)\s*$\s(?P<content>(^#(| .*)$\s)+)^# ///$", flags=re.MULTILINE
    )
    script_blocks = [
        match.group("content") for match in block_regex.finditer(code) if match.group("type") == script_type
    ]

    if len(script_blocks) > 1:
        raise ValueError(f"Multiple {script_type!r} blocks found in top-level comment")
    elif len(script_blocks) == 0:
        return None

    # Extract dependencies from "script" block
    content = "".join(
        line[2:] if line.startswith("# ") else line[1:] for line in script_blocks[0].splitlines(keepends=True)
    )

    return tomllib.loads(content).get("dependencies")
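
A quick sketch of how the new helper behaves on the two main cases, a UDF with a `# /// script` block and one without; the expected results in the comments follow from the regex and TOML parsing above:

```python
import textwrap

from openeo.udf.run_code import extract_udf_dependencies

udf_code = textwrap.dedent(
    """
    # /// script
    # dependencies = [
    #     "xarray>=2024.1.1",
    # ]
    # ///
    def apply_datacube(cube, context):
        return cube
    """
)

print(extract_udf_dependencies(udf_code))  # expected: ['xarray>=2024.1.1']
print(extract_udf_dependencies("def foo(x):\n    return x"))  # expected: None
```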
121 changes: 121 additions & 0 deletions tests/udf/test_compat.py
@@ -0,0 +1,121 @@
import textwrap

import pytest

from openeo.udf._compat import FlimsyTomlParser


class TestFlimsyTomlLib:
    @pytest.mark.parametrize(
        ["value", "expected"],
        [
            # Numbers
            ("123", 123),
            ("12.5", 12.5),
            # Strings
            ('"Basic string"', "Basic string"),
            ("'Literal string'", "Literal string"),
            ('''"I'm a string"''', "I'm a string"),
            (r'''"You can \"quote\" me"''', 'You can "quote" me'),
            # Arrays (aka lists)
            ("[]", []),
            ("[1, 2, 3]", [1, 2, 3]),
            ("[1.5, 2.5, 3.5]", [1.5, 2.5, 3.5]),
            ("[1, 2, 3,]", [1, 2, 3]),
            ("[\n 1,\n 2,\n 3,\n]", [1, 2, 3]),
            ('["blue", "yellow"]', ["blue", "yellow"]),
            ("['blue', 'yellow']", ["blue", "yellow"]),
            (
                """
                [
                    "blue",
                    "yellow",
                ]
                """,
                ["blue", "yellow"],
            ),
            ("[1, 'two', 3.0, \"four\"]", [1, "two", 3.0, "four"]),
            (
                """
                [
                    'one',
                    [2, 3],
                ]
                """,
                ["one", [2, 3]],
            ),
        ],
    )
    def test_parse_toml_value_like_json(self, value, expected):
        assert FlimsyTomlParser._parse_toml_value_like_json(value) == expected

    def test_loads_basic(self):
        data = textwrap.dedent(
            """
            title = "TOML Example"
            colors = ["blue", "yellow"]
            size = 132
            """
        )
        assert FlimsyTomlParser.loads(data) == {
            "title": "TOML Example",
            "colors": ["blue", "yellow"],
            "size": 132,
        }

    def test_loads_multiline_values(self):
        data = textwrap.dedent(
            """
            # Some colors
            colors = [
                "blue",
                "yellow",
            ]
            sizes = [
                12,
                34,
                # This closing bracket is intentionally indented too
                ]
            shape = "round"
            """
        )
        assert FlimsyTomlParser.loads(data) == {
            "colors": ["blue", "yellow"],
            "sizes": [12, 34],
            "shape": "round",
        }

    def test_loads_special_keys(self):
        data = textwrap.dedent(
            """
            1234 = "one two three four"
            bare_key = "underscore"
            another-key = "dash"
            """
        )
        assert FlimsyTomlParser.loads(data) == {
            "1234": "one two three four",
            "another-key": "dash",
            "bare_key": "underscore",
        }

    def test_loads_tables(self):
        data = textwrap.dedent(
            """
            title = "Vroom"
            [car]
            brand = "HobbleBlob"
            """
        )
        with pytest.raises(FlimsyTomlParser.TomlParseError, match="Tables are not supported"):
            _ = FlimsyTomlParser.loads(data)

    def test_loads_dotted_keys(self):
        data = textwrap.dedent(
            """
            title = "Vroom"
            car.brand = "HobbleBlob"
            """
        )
        with pytest.raises(FlimsyTomlParser.TomlParseError, match="Dotted keys are not supported"):
            _ = FlimsyTomlParser.loads(data)
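
The tests above stay within the subset the fallback supports. One divergence worth keeping in mind (a sketch of the fallback's limits, not part of this changeset): inline tables are valid TOML and parse fine with `tomllib`/`tomli`, but the fallback rejects them because their value is not JSON-like:

```python
from openeo.udf._compat import FlimsyTomlParser

try:
    FlimsyTomlParser.loads('point = { x = 1, y = 2 }')
except FlimsyTomlParser.TomlParseError as e:
    # A real TOML parser would return {"point": {"x": 1, "y": 2}} here.
    print(f"fallback rejected inline table: {e}")
```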
57 changes: 57 additions & 0 deletions tests/udf/test_run_code.py
@@ -13,6 +13,7 @@
    _annotation_is_udf_datacube,
    _get_annotation_str,
    execute_local_udf,
    extract_udf_dependencies,
    run_udf_code,
)

@@ -300,3 +301,59 @@ def test_run_local_udf_from_file_netcdf(tmp_path):
    xarray.testing.assert_equal(result[0, 0, 0:2, 0:2], expected)

    assert result[2, 0, 4, 3] == _ndvi(2034, 2134)


@pytest.mark.parametrize(
    ["udf_code", "expected"],
    [
        (
            """
            def foo(x):
                return x+1
            """,
            None,
        ),
        (
            """
            # /// script
            # dependencies = ["numpy", "pandas"]
            # ///
            def foo(x):
                return x+1
            """,
            ["numpy", "pandas"],
        ),
        (
            """
            # /// script
            # dependencies = [
            #     "numpy>=1.2.3",
            #     "pandas!=2.3.4",
            # ]
            # ///
            def foo(x):
                return x+1
            """,
            ["numpy>=1.2.3", "pandas!=2.3.4"],
        ),
        (
            """
            # /// script
            # dependencies = [
            #     'requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7"',
            #     "pip @ https://github.com/pypa/pip/archive/1.3.1.zip#sha1=da9234ee9982d4bbb3c72346a6de940a148ea686",
            # ]
            # ///
            def foo(x):
                return x+1
            """,
            [
                'requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7"',
                "pip @ https://github.com/pypa/pip/archive/1.3.1.zip#sha1=da9234ee9982d4bbb3c72346a6de940a148ea686",
            ],
        ),
    ],
)
def test_extract_udf_dependencies(udf_code, expected):
    udf_code = textwrap.dedent(udf_code)
    assert extract_udf_dependencies(udf_code) == expected
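
The changelog ties this to Open-EO/openeo-geopyspark-driver#237, where a backend would actually act on the extracted list; that side is not part of this commit. Purely as a hypothetical illustration of a consumer (the `install_udf_dependencies` helper, `target_dir`, and the pip invocation are assumptions, not openeo API):

```python
import subprocess
import sys

from openeo.udf.run_code import extract_udf_dependencies


def install_udf_dependencies(udf_code: str, target_dir: str) -> None:
    """Hypothetical consumer: pip-install the dependencies a UDF declares."""
    dependencies = extract_udf_dependencies(udf_code)
    if not dependencies:
        # No "script" block (None) or an empty dependency list: nothing to install.
        return
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--target", target_dir, *dependencies]
    )
```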
