diff --git a/academic_observatory_workflows/workflows/openalex_telescope.py b/academic_observatory_workflows/workflows/openalex_telescope.py index aea3ed27..50323273 100644 --- a/academic_observatory_workflows/workflows/openalex_telescope.py +++ b/academic_observatory_workflows/workflows/openalex_telescope.py @@ -789,7 +789,7 @@ def s3_uri_parts(s3_uri: str) -> Tuple[str, str]: """ if not s3_uri.startswith("s3://"): - raise ValueError("Invalid S3 URI. URI should start with 's3://'") + raise ValueError(f"Invalid S3 URI. URI should start with 's3://' - {s3_uri}") parts = s3_uri[5:].split("/", 1) # Remove 's3://' and split the remaining string bucket_name = parts[0] @@ -820,7 +820,11 @@ def to_dict(self) -> Dict: class ManifestEntry: def __init__(self, url: str, meta: Meta): - self.url = url + # URLs given from OpenAlex may not be given with the 's3://' prefix. + if not url.startswith("s3://"): + self.url = f"s3://{url}" + else: + self.url = url self.meta = meta def __eq__(self, other): @@ -927,6 +931,9 @@ def fetch_manifest( ).client("s3") obj = client.get_object(Bucket=bucket, Key=f"data/{entity_name}/manifest") data = json.loads(obj["Body"].read().decode()) + + # Add s3:// as necessary + return Manifest.from_dict(data) diff --git a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py index 2ad95c33..3a76818e 100644 --- a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py +++ b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py @@ -142,6 +142,10 @@ def test_manifest_entry(self): self.assertEqual(pendulum.datetime(2022, 12, 20), entry.updated_date) self.assertEqual("part_000.gz", entry.file_name) + # Assert that manifest entry without a s3:// url prefix is still valid. + manifest_entry_no_s3 = ManifestEntry("openalex/data/works/updated_date=2022-12-20/part_000.gz", Meta(7073, 4)) + self.assertEqual(manifest_entry_no_s3.url, "s3://openalex/data/works/updated_date=2022-12-20/part_000.gz") + # object_key manifest_entry = ManifestEntry("s3://openalex/data/works/updated_date=2022-12-20/part_000.gz", Meta(7073, 4)) self.assertEqual("data/works/updated_date=2022-12-20/part_000.gz", manifest_entry.object_key) @@ -373,50 +377,34 @@ def test_transform_object(self): """Test the transform_object function.""" # Null - obj = { - "corresponding_institution_ids": None - } + obj = {"corresponding_institution_ids": None} transform_object(obj) self.assertDictEqual( - { - "corresponding_institution_ids": [] - }, + {"corresponding_institution_ids": []}, obj, ) # Null - obj = { - "corresponding_author_ids": None - } + obj = {"corresponding_author_ids": None} transform_object(obj) self.assertDictEqual( - { - "corresponding_author_ids": [] - }, + {"corresponding_author_ids": []}, obj, ) # Null in array - obj = { - "corresponding_institution_ids": [None] - } + obj = {"corresponding_institution_ids": [None]} transform_object(obj) self.assertDictEqual( - { - "corresponding_institution_ids": [] - }, + {"corresponding_institution_ids": []}, obj, ) # Null in array - obj = { - "corresponding_author_ids": [None] - } + obj = {"corresponding_author_ids": [None]} transform_object(obj) self.assertDictEqual( - { - "corresponding_author_ids": [] - }, + {"corresponding_author_ids": []}, obj, )