Skip to content

Commit

Permalink
Fix/OpenAlex "abstract_inverted_index" field is sometimes a string, not a dictionary (#173)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexmassen-hane authored Jul 27, 2023
1 parent 67be3be commit afd3453
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 10 deletions.
Git LFS file not shown
17 changes: 12 additions & 5 deletions academic_observatory_workflows/workflows/openalex_telescope.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,12 +1016,19 @@ def transform_object(obj: dict):

field = "abstract_inverted_index"
if field in obj:
if not isinstance(obj.get(field), dict):
return
keys = list(obj[field].keys())
values = [str(value)[1:-1] for value in obj[field].values()]

obj[field] = {"keys": keys, "values": values}
def parse_abstract(dict_: dict):
    """Flatten an inverted index into parallel "keys" and "values" lists.

    :param dict_: mapping of each word to the positions at which it occurs.
    :return: a dict with "keys" (the words, in their original order) and "values"
        (each word's positions rendered as one comma-separated string, e.g.
        [4, 6] -> "4, 6").
    """

    def format_positions(positions) -> str:
        # Render the elements explicitly rather than slicing the brackets off
        # str(list). The result is the same for lists, but a scalar value no
        # longer loses its first and last characters.
        if isinstance(positions, (list, tuple)):
            return ", ".join(repr(position) for position in positions)
        return str(positions)

    return {
        "keys": list(dict_),
        "values": [format_positions(positions) for positions in dict_.values()],
    }

if isinstance(obj.get(field), str):
data = json.loads(obj[field])
obj[field] = parse_abstract(data["InvertedIndex"])
elif isinstance(obj.get(field), dict):
obj[field] = parse_abstract(obj[field])
else:
return

field = "international"
if field in obj:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -462,10 +462,25 @@ def test_transform_object(self):
obj3,
)

# Test object with nested "abstract_inverted_index" none
obj4 = {"abstract_inverted_index": None}
# Test object when "abstract_inverted_index" is a json string dump
obj4 = {
"abstract_inverted_index": '{"IndexLength": 7, "InvertedIndex": { "Malignant": [0], "hyperthermia": [1], "susceptibility": [2],"(MHS)": [3], "is": [4, 6], "primarily": [5]}}'
}
transform_object(obj4)
self.assertDictEqual({"abstract_inverted_index": None}, obj4)
self.assertDictEqual(
{
"abstract_inverted_index": {
"keys": ["Malignant", "hyperthermia", "susceptibility", "(MHS)", "is", "primarily"],
"values": ["0", "1", "2", "3", "4, 6", "5"],
}
},
obj4,
)

# Test object with nested "abstract_inverted_index" none
obj5 = {"abstract_inverted_index": None}
transform_object(obj5)
self.assertDictEqual({"abstract_inverted_index": None}, obj5)


def upload_folder_to_s3(bucket_name: str, folder_path: str, s3_prefix=None):
Expand Down

0 comments on commit afd3453

Please sign in to comment.