Skip to content

Commit

Permalink
Fix for if abstract_inverted_index is a string
Browse files (browse the repository at this point in the history)
  • Loading branch information
alexmassen-hane committed Jul 26, 2023
1 parent 67be3be commit 40180d0
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 9 deletions.
Git LFS file not shown
26 changes: 22 additions & 4 deletions academic_observatory_workflows/workflows/openalex_telescope.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,12 +1016,30 @@ def transform_object(obj: dict):

field = "abstract_inverted_index"
if field in obj:
if not isinstance(obj.get(field), dict):
if not isinstance(obj.get(field), (dict, str)):
return
keys = list(obj[field].keys())
values = [str(value)[1:-1] for value in obj[field].values()]
else:
# If data is held in a string dump, load json string again.
if isinstance(obj.get(field), str):
obj_part = json.loads(obj[field])
field2 = "InvertedIndex"
if isinstance(obj_part.get(field2), dict):
keys = list(obj_part[field2].keys())
values = [str(value)[1:-1] for value in obj_part[field2].values()]

index_sum = sum(len(value.split(", ")) for value in values)
assert (
index_sum == obj_part["IndexLength"]
), f"Calculated IndexLength {index_sum} does not match value from file {obj_part['IndexLength']}."

obj[field] = {"keys": keys, "values": values}
else:
raise TypeError(f"obj_part['InvertedIndex'] is not a dictionary: {obj_part}")

Check warning on line 1037 in academic_observatory_workflows/workflows/openalex_telescope.py

View check run for this annotation

Codecov / codecov/patch

academic_observatory_workflows/workflows/openalex_telescope.py#L1037

Added line #L1037 was not covered by tests
else:
keys = list(obj[field].keys())
values = [str(value)[1:-1] for value in obj[field].values()]

obj[field] = {"keys": keys, "values": values}
obj[field] = {"keys": keys, "values": values}

field = "international"
if field in obj:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -462,10 +462,25 @@ def test_transform_object(self):
obj3,
)

# Test object with nested "abstract_inverted_index" none
obj4 = {"abstract_inverted_index": None}
# Test object when "abstract_inverted_index" is a json string dump
obj4 = {
"abstract_inverted_index": '{"IndexLength": 7, "InvertedIndex": { "Malignant": [0], "hyperthermia": [1], "susceptibility": [2],"(MHS)": [3], "is": [4, 6], "primarily": [5]}}'
}
transform_object(obj4)
self.assertDictEqual({"abstract_inverted_index": None}, obj4)
self.assertDictEqual(
{
"abstract_inverted_index": {
"keys": ["Malignant", "hyperthermia", "susceptibility", "(MHS)", "is", "primarily"],
"values": ["0", "1", "2", "3", "4, 6", "5"],
}
},
obj4,
)

# Test object with nested "abstract_inverted_index" none
obj5 = {"abstract_inverted_index": None}
transform_object(obj5)
self.assertDictEqual({"abstract_inverted_index": None}, obj5)


def upload_folder_to_s3(bucket_name: str, folder_path: str, s3_prefix=None):
Expand Down

0 comments on commit 40180d0

Please sign in to comment.