From 40180d000443264bb9d114d81bd943db4502c74d Mon Sep 17 00:00:00 2001 From: Alex Massen-Hane Date: Wed, 26 Jul 2023 16:21:18 +0800 Subject: [PATCH] Fix for if abstract_inverted_index is a string --- .../updated_date=2023-04-16/part_000.json | 4 +-- .../workflows/openalex_telescope.py | 26 ++++++++++++++++--- .../tests/test_openalex_telescope.py | 21 ++++++++++++--- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json index 2777d7f8..ccef3488 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2af6dd859abdd351c261e7bb3e43ab90bce6531bf6c5fda0a26302bea59be6c7 -size 49088 +oid sha256:ab7daeed6c0cf57da591b17d05cd1f89bb0dfe2b5f57f622802198af081a2143 +size 43681 diff --git a/academic_observatory_workflows/workflows/openalex_telescope.py b/academic_observatory_workflows/workflows/openalex_telescope.py index 50323273..2267ecff 100644 --- a/academic_observatory_workflows/workflows/openalex_telescope.py +++ b/academic_observatory_workflows/workflows/openalex_telescope.py @@ -1016,12 +1016,30 @@ def transform_object(obj: dict): field = "abstract_inverted_index" if field in obj: - if not isinstance(obj.get(field), dict): + if not isinstance(obj.get(field), (dict, str)): return - keys = list(obj[field].keys()) - values = [str(value)[1:-1] for value in obj[field].values()] + else: + # If data is held in a string dump, load json string again. + if isinstance(obj.get(field), str): + obj_part = json.loads(obj[field]) + field2 = "InvertedIndex" + if isinstance(obj_part.get(field2), dict): + keys = list(obj_part[field2].keys()) + values = [str(value)[1:-1] for value in obj_part[field2].values()] + + index_sum = sum(len(value.split(", ")) for value in values) + assert ( + index_sum == obj_part["IndexLength"] + ), f"Calculated IndexLength {index_sum} does not match value from file {obj_part['IndexLength']}." + + obj[field] = {"keys": keys, "values": values} + else: + raise TypeError(f"obj_part['InvertedIndex'] is not a dictionary: {obj_part}") + else: + keys = list(obj[field].keys()) + values = [str(value)[1:-1] for value in obj[field].values()] - obj[field] = {"keys": keys, "values": values} + obj[field] = {"keys": keys, "values": values} field = "international" if field in obj: diff --git a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py index 3a76818e..c20ff8b8 100644 --- a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py +++ b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py @@ -462,10 +462,25 @@ def test_transform_object(self): obj3, ) - # Test object with nested "abstract_inverted_index" none - obj4 = {"abstract_inverted_index": None} + # Test object when "abstract_inverted_index" is a json string dump + obj4 = { + "abstract_inverted_index": '{"IndexLength": 7, "InvertedIndex": { "Malignant": [0], "hyperthermia": [1], "susceptibility": [2],"(MHS)": [3], "is": [4, 6], "primarily": [5]}}' + } transform_object(obj4) - self.assertDictEqual({"abstract_inverted_index": None}, obj4) + self.assertDictEqual( + { + "abstract_inverted_index": { + "keys": ["Malignant", "hyperthermia", "susceptibility", "(MHS)", "is", "primarily"], + "values": ["0", "1", "2", "3", "4, 6", "5"], + } + }, + obj4, + ) + + # Test object with nested "abstract_inverted_index" none + obj5 = {"abstract_inverted_index": None} + transform_object(obj5) + self.assertDictEqual({"abstract_inverted_index": None}, obj5) def upload_folder_to_s3(bucket_name: str, folder_path: str, s3_prefix=None):