Skip to content

Commit

Permalink
Add s3:// prefix if not present (#172)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexmassen-hane authored Jul 26, 2023
1 parent 5019bbc commit 67be3be
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 26 deletions.
11 changes: 9 additions & 2 deletions academic_observatory_workflows/workflows/openalex_telescope.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,7 +789,7 @@ def s3_uri_parts(s3_uri: str) -> Tuple[str, str]:
"""

if not s3_uri.startswith("s3://"):
raise ValueError("Invalid S3 URI. URI should start with 's3://'")
raise ValueError(f"Invalid S3 URI. URI should start with 's3://' - {s3_uri}")

parts = s3_uri[5:].split("/", 1) # Remove 's3://' and split the remaining string
bucket_name = parts[0]
Expand Down Expand Up @@ -820,7 +820,11 @@ def to_dict(self) -> Dict:

class ManifestEntry:
def __init__(self, url: str, meta: Meta):
self.url = url
# URLs supplied by OpenAlex may omit the 's3://' prefix, so add it when it is missing.
if not url.startswith("s3://"):
self.url = f"s3://{url}"
else:
self.url = url
self.meta = meta

def __eq__(self, other):
Expand Down Expand Up @@ -927,6 +931,9 @@ def fetch_manifest(
).client("s3")
obj = client.get_object(Bucket=bucket, Key=f"data/{entity_name}/manifest")
data = json.loads(obj["Body"].read().decode())

# Missing 's3://' prefixes are added in ManifestEntry.__init__ (presumably reached via Manifest.from_dict - TODO confirm).

return Manifest.from_dict(data)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,10 @@ def test_manifest_entry(self):
self.assertEqual(pendulum.datetime(2022, 12, 20), entry.updated_date)
self.assertEqual("part_000.gz", entry.file_name)

# Assert that a manifest entry whose URL lacks the 's3://' prefix gets the prefix added.
manifest_entry_no_s3 = ManifestEntry("openalex/data/works/updated_date=2022-12-20/part_000.gz", Meta(7073, 4))
self.assertEqual(manifest_entry_no_s3.url, "s3://openalex/data/works/updated_date=2022-12-20/part_000.gz")

# object_key
manifest_entry = ManifestEntry("s3://openalex/data/works/updated_date=2022-12-20/part_000.gz", Meta(7073, 4))
self.assertEqual("data/works/updated_date=2022-12-20/part_000.gz", manifest_entry.object_key)
Expand Down Expand Up @@ -373,50 +377,34 @@ def test_transform_object(self):
"""Test the transform_object function."""

# Null
obj = {
"corresponding_institution_ids": None
}
obj = {"corresponding_institution_ids": None}
transform_object(obj)
self.assertDictEqual(
{
"corresponding_institution_ids": []
},
{"corresponding_institution_ids": []},
obj,
)

# Null
obj = {
"corresponding_author_ids": None
}
obj = {"corresponding_author_ids": None}
transform_object(obj)
self.assertDictEqual(
{
"corresponding_author_ids": []
},
{"corresponding_author_ids": []},
obj,
)

# Null in array
obj = {
"corresponding_institution_ids": [None]
}
obj = {"corresponding_institution_ids": [None]}
transform_object(obj)
self.assertDictEqual(
{
"corresponding_institution_ids": []
},
{"corresponding_institution_ids": []},
obj,
)

# Null in array
obj = {
"corresponding_author_ids": [None]
}
obj = {"corresponding_author_ids": [None]}
transform_object(obj)
self.assertDictEqual(
{
"corresponding_author_ids": []
},
{"corresponding_author_ids": []},
obj,
)

Expand Down

0 comments on commit 67be3be

Please sign in to comment.