Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions ingestify/domain/services/identifier_key_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,13 @@ def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str:
path_parts.append(f"{key}_{suffix}={transformed_value}")

# Append the original value (either standalone for identity or alongside transformed).
# URL-encode the value so special characters, spaces, etc. are safe in paths.
path_parts.append(f"{key}={quote(str(value), safe='')}")
# Truncate long values before encoding to keep paths under
# filesystem/GCS limits. Append a short hash to preserve uniqueness.
str_value = str(value)
if len(str_value) > 40:
short_hash = hashlib.md5(str_value.encode()).hexdigest()[:8]
str_value = f"{str_value[:40]}_{short_hash}"
path_parts.append(f"{key}={quote(str_value, safe='')}")

# Join the parts with `/` to form the full path
return "/".join(path_parts)
37 changes: 37 additions & 0 deletions ingestify/tests/test_identifier_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Tests for IdentifierTransformer.to_path."""
from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer


def test_to_path_short_value_unchanged():
    """A short, plain identifier value is emitted verbatim as ``key=value``."""
    transformer = IdentifierTransformer()
    assert transformer.to_path("p", "d", {"key": "short"}) == "key=short"


def test_to_path_special_chars_url_encoded():
    """Characters such as ``$`` and spaces are percent-encoded in the path."""
    transformer = IdentifierTransformer()
    identifier = {"key": "$99 mattress"}
    encoded_path = transformer.to_path("p", "d", identifier)
    assert encoded_path == "key=%2499%20mattress"


def test_to_path_long_value_truncated_with_hash():
    """Values longer than 40 characters are truncated and suffixed with a hash.

    The expected path is the first 40 characters of the value, an underscore,
    and the first 8 hex digits of the MD5 digest of the full value.
    """
    import hashlib

    long_value = "a" * 50
    expected_hash = hashlib.md5(long_value.encode()).hexdigest()[:8]

    transformer = IdentifierTransformer()
    path = transformer.to_path("p", "d", {"key": long_value})

    # Pin the exact output instead of only prefix and length, so a wrong
    # digest, slice, or separator cannot slip through unnoticed.
    assert path == f"key={'a' * 40}_{expected_hash}"


def test_to_path_long_value_hash_is_stable():
    """Two calls with the same over-long value produce the same path."""
    transformer = IdentifierTransformer()
    repeated_keyword = "keyword " * 10
    first = transformer.to_path("p", "d", {"key": repeated_keyword})
    second = transformer.to_path("p", "d", {"key": repeated_keyword})
    assert first == second


def test_to_path_integer_value_unchanged():
    """A short integer value appears in the path as its decimal string."""
    transformer = IdentifierTransformer()
    assert transformer.to_path("p", "d", {"id": 12345}) == "id=12345"
Loading