add initial extract_udf_dependencies() to extract UDF dependencies …
soxofaan committed May 23, 2024
1 parent a1d0b63 commit 4b9ca22
Showing 5 changed files with 298 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add `openeo.udf.run_code.extract_udf_dependencies()` to extract UDF dependency declarations from UDF code
  (related to [Open-EO/openeo-geopyspark-driver#237](https://github.com/Open-EO/openeo-geopyspark-driver/issues/237))

### Changed

### Removed
64 changes: 64 additions & 0 deletions openeo/udf/_compat.py
@@ -0,0 +1,64 @@
import json
import re
from typing import Union

try:
    import tomllib
except ImportError:
    try:
        import tomli as tomllib
    except ImportError:
        # Will be assigned with fallback implementation below
        tomllib = None


class FlimsyTomlParser:
    """
    This is a rudimentary, low-tech, incomplete implementation of TOML parsing functionality
    for simple TOML use cases where the dependency on a full-fledged TOML library is not justified.
    It is intended to be used as a best-effort drop-in replacement
    for the `loads()` functionality from full-fledged TOML libraries
    like `tomllib` (part of standard library since Python 3.11)
    or `tomli` (`tomllib` backport for earlier Python versions).
    """

    class TomlParseError(ValueError):
        pass

    KEY_PAIR_REGEX = re.compile(
        r"(?P<key>^[a-z0-9_-]+)\s*=\s*(?P<value>.*(\s+^\s+.*)*(\s+^])?)",
        flags=re.MULTILINE | re.VERBOSE | re.IGNORECASE,
    )

    @classmethod
    def loads(cls, data: str) -> dict:
        if re.search(r"^\[", data, flags=re.MULTILINE):
            raise cls.TomlParseError("Tables are not supported")
        if re.search(r"^[a-z0-9_-]+\.[a-z0-9_.-]+\s*=", data, flags=re.MULTILINE | re.IGNORECASE):
            raise cls.TomlParseError("Dotted keys are not supported")
        return {
            match.group("key"): cls._parse_toml_value_like_json(match.group("value"))
            for match in cls.KEY_PAIR_REGEX.finditer(data)
        }

    @classmethod
    def _parse_toml_value_like_json(cls, value: str) -> Union[int, float, list]:
        """
        Try to parse a TOML value by pretending it's (almost) JSON,
        which covers the basics (simple strings, numbers, arrays, a bit of nesting, ...)
        """
        # A bit of preprocessing to make it more JSON-like (strip comments, strip trailing commas)
        value = re.sub(r"#.*$", "", value, flags=re.MULTILINE)
        value = re.sub(r",\s*\]", "]", value)
        # Rudimentarily convert single quote strings to double quotes.
        value = re.sub("'([^'\"]*)'", r'"\1"', value)
        try:
            data = json.loads(value)
        except json.JSONDecodeError as e:
            raise cls.TomlParseError(f"Failed to parse TOML value {value!r}") from e
        return data


if tomllib is None:
    tomllib = FlimsyTomlParser
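
Because the module-level `tomllib` name ends up pointing at either a real TOML parser or `FlimsyTomlParser`, downstream code can call `tomllib.loads()` without caring which one it got. A minimal usage sketch (the TOML snippet below is illustrative input, not part of this changeset), which should behave the same with the real parsers and the fallback as long as the input stays flat:

```python
from openeo.udf._compat import tomllib

# Flat key/value pairs, simple arrays, comments and trailing commas:
# within what FlimsyTomlParser claims to support.
data = tomllib.loads(
    'name = "my-udf"\n'
    "dependencies = [\n"
    '    "numpy>=1.2.3",  # comments and trailing commas are fine\n'
    "]\n"
)
print(data)  # expected: {'name': 'my-udf', 'dependencies': ['numpy>=1.2.3']}
```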
57 changes: 53 additions & 4 deletions openeo/udf/run_code.py
@@ -1,16 +1,15 @@
"""
Note: this module was initially developed under the ``openeo-udf`` project (https://github.com/Open-EO/openeo-udf)
"""

# Note: this module was initially developed under the ``openeo-udf`` project (https://github.com/Open-EO/openeo-udf)

import functools
import importlib
import inspect
import logging
import math
import pathlib
from typing import Callable, Union
import re
from typing import Callable, List, Union

import numpy
import pandas
@@ -20,6 +19,7 @@

import openeo
from openeo.udf import OpenEoUdfException
from openeo.udf._compat import tomllib
from openeo.udf.feature_collection import FeatureCollection
from openeo.udf.structured_data import StructuredData
from openeo.udf.udf_data import UdfData
@@ -242,3 +242,52 @@ def execute_local_udf(udf: Union[str, openeo.UDF], datacube: Union[str, xarray.D
    # run the udf through the same routine as it would have been parsed in the backend
    result = run_udf_code(udf, udf_data)
    return result


def extract_udf_dependencies(code: str) -> Union[List[str], None]:
    """
    Extract dependencies from UDF code declared in a top-level comment block
    following the `inline script metadata specification (PEP 723) <https://packaging.python.org/en/latest/specifications/inline-script-metadata>`_.

    Example comment block with dependencies declaration:

    .. code-block:: python

        # /// script
        # dependencies = [
        #     "xarray>=2024.1.1",
        #     "eotools @ https://example.com/eotools-0.1.0.whl",
        # ]
        # ///

        import xarray
        import eotools

        def apply_datacube(cube: xarray.DataArray, context: dict) -> xarray.DataArray:
            ...

    :param code: UDF code
    :return: list of extracted dependencies

    .. versionadded:: 0.30.0
    """

    # Extract "script" blocks
    script_type = "script"
    block_regex = re.compile(
        r"^# /// (?P<type>[a-zA-Z0-9-]+)\s*$\s(?P<content>(^#(| .*)$\s)+)^# ///$", flags=re.MULTILINE
    )
    script_blocks = [
        match.group("content") for match in block_regex.finditer(code) if match.group("type") == script_type
    ]

    if len(script_blocks) > 1:
        raise ValueError(f"Multiple {script_type!r} blocks found in top-level comment")
    elif len(script_blocks) == 0:
        return None

    # Extract dependencies from "script" block
    content = "".join(
        line[2:] if line.startswith("# ") else line[1:] for line in script_blocks[0].splitlines(keepends=True)
    )

    return tomllib.loads(content).get("dependencies")
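
A quick sketch of how the new helper behaves on the two main cases, a UDF with a `# /// script` block and one without; the expected results in the comments follow from the regex and TOML parsing above:

```python
import textwrap

from openeo.udf.run_code import extract_udf_dependencies

udf_code = textwrap.dedent(
    """
    # /// script
    # dependencies = [
    #     "xarray>=2024.1.1",
    # ]
    # ///
    def apply_datacube(cube, context):
        return cube
    """
)

print(extract_udf_dependencies(udf_code))  # expected: ['xarray>=2024.1.1']
print(extract_udf_dependencies("def foo(x):\n    return x"))  # expected: None
```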
121 changes: 121 additions & 0 deletions tests/udf/test_compat.py
@@ -0,0 +1,121 @@
import textwrap

import pytest

from openeo.udf._compat import FlimsyTomlParser


class TestFlimsyTomlLib:
    @pytest.mark.parametrize(
        ["value", "expected"],
        [
            # Numbers
            ("123", 123),
            ("12.5", 12.5),
            # Strings
            ('"Basic string"', "Basic string"),
            ("'Literal string'", "Literal string"),
            ('''"I'm a string"''', "I'm a string"),
            (r'''"You can \"quote\" me"''', 'You can "quote" me'),
            # Arrays (aka lists)
            ("[]", []),
            ("[1, 2, 3]", [1, 2, 3]),
            ("[1.5, 2.5, 3.5]", [1.5, 2.5, 3.5]),
            ("[1, 2, 3,]", [1, 2, 3]),
            ("[\n 1,\n 2,\n 3,\n]", [1, 2, 3]),
            ('["blue", "yellow"]', ["blue", "yellow"]),
            ("['blue', 'yellow']", ["blue", "yellow"]),
            (
                """
                [
                    "blue",
                    "yellow",
                ]
                """,
                ["blue", "yellow"],
            ),
            ("[1, 'two', 3.0, \"four\"]", [1, "two", 3.0, "four"]),
            (
                """
                [
                    'one',
                    [2, 3],
                ]
                """,
                ["one", [2, 3]],
            ),
        ],
    )
    def test_parse_toml_value_like_json(self, value, expected):
        assert FlimsyTomlParser._parse_toml_value_like_json(value) == expected

    def test_loads_basic(self):
        data = textwrap.dedent(
            """
            title = "TOML Example"
            colors = ["blue", "yellow"]
            size = 132
            """
        )
        assert FlimsyTomlParser.loads(data) == {
            "title": "TOML Example",
            "colors": ["blue", "yellow"],
            "size": 132,
        }

    def test_loads_multiline_values(self):
        data = textwrap.dedent(
            """
            # Some colors
            colors = [
                "blue",
                "yellow",
            ]
            sizes = [
                12,
                34,
                # This closing bracket is intentionally indented too
                ]
            shape = "round"
            """
        )
        assert FlimsyTomlParser.loads(data) == {
            "colors": ["blue", "yellow"],
            "sizes": [12, 34],
            "shape": "round",
        }

    def test_loads_special_keys(self):
        data = textwrap.dedent(
            """
            1234 = "one two three four"
            bare_key = "underscore"
            another-key = "dash"
            """
        )
        assert FlimsyTomlParser.loads(data) == {
            "1234": "one two three four",
            "another-key": "dash",
            "bare_key": "underscore",
        }

    def test_loads_tables(self):
        data = textwrap.dedent(
            """
            title = "Vroom"
            [car]
            brand = "HobbleBlob"
            """
        )
        with pytest.raises(FlimsyTomlParser.TomlParseError, match="Tables are not supported"):
            _ = FlimsyTomlParser.loads(data)

    def test_loads_dotted_keys(self):
        data = textwrap.dedent(
            """
            title = "Vroom"
            car.brand = "HobbleBlob"
            """
        )
        with pytest.raises(FlimsyTomlParser.TomlParseError, match="Dotted keys are not supported"):
            _ = FlimsyTomlParser.loads(data)
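
The tests above stay within the subset the fallback supports. One divergence worth keeping in mind (a sketch of the fallback's limits, not part of this changeset): inline tables are valid TOML and parse fine with `tomllib`/`tomli`, but the fallback rejects them because their value is not JSON-like:

```python
from openeo.udf._compat import FlimsyTomlParser

try:
    FlimsyTomlParser.loads('point = { x = 1, y = 2 }')
except FlimsyTomlParser.TomlParseError as e:
    # A real TOML parser would return {"point": {"x": 1, "y": 2}} here.
    print(f"fallback rejected inline table: {e}")
```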
57 changes: 57 additions & 0 deletions tests/udf/test_run_code.py
@@ -13,6 +13,7 @@
    _annotation_is_udf_datacube,
    _get_annotation_str,
    execute_local_udf,
    extract_udf_dependencies,
    run_udf_code,
)

@@ -300,3 +301,59 @@ def test_run_local_udf_from_file_netcdf(tmp_path):
    xarray.testing.assert_equal(result[0, 0, 0:2, 0:2], expected)

    assert result[2, 0, 4, 3] == _ndvi(2034, 2134)


@pytest.mark.parametrize(
    ["udf_code", "expected"],
    [
        (
            """
            def foo(x):
                return x+1
            """,
            None,
        ),
        (
            """
            # /// script
            # dependencies = ["numpy", "pandas"]
            # ///
            def foo(x):
                return x+1
            """,
            ["numpy", "pandas"],
        ),
        (
            """
            # /// script
            # dependencies = [
            #     "numpy>=1.2.3",
            #     "pandas!=2.3.4",
            # ]
            # ///
            def foo(x):
                return x+1
            """,
            ["numpy>=1.2.3", "pandas!=2.3.4"],
        ),
        (
            """
            # /// script
            # dependencies = [
            #     'requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7"',
            #     "pip @ https://github.com/pypa/pip/archive/1.3.1.zip#sha1=da9234ee9982d4bbb3c72346a6de940a148ea686",
            # ]
            # ///
            def foo(x):
                return x+1
            """,
            [
                'requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7"',
                "pip @ https://github.com/pypa/pip/archive/1.3.1.zip#sha1=da9234ee9982d4bbb3c72346a6de940a148ea686",
            ],
        ),
    ],
)
def test_extract_udf_dependencies(udf_code, expected):
    udf_code = textwrap.dedent(udf_code)
    assert extract_udf_dependencies(udf_code) == expected
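
The changelog ties this to Open-EO/openeo-geopyspark-driver#237, where a backend would actually act on the extracted list; that side is not part of this commit. Purely as a hypothetical illustration of a consumer (the `install_udf_dependencies` helper, `target_dir`, and the pip invocation are assumptions, not openeo API):

```python
import subprocess
import sys

from openeo.udf.run_code import extract_udf_dependencies


def install_udf_dependencies(udf_code: str, target_dir: str) -> None:
    """Hypothetical consumer: pip-install the dependencies a UDF declares."""
    dependencies = extract_udf_dependencies(udf_code)
    if not dependencies:
        # No "script" block (None) or an empty dependency list: nothing to install.
        return
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--target", target_dir, *dependencies]
    )
```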
