diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-16/part_000.json index 1daecbb6a..59150eb5e 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-16/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfac4ac0f8d7a83950ccc962d1fc6ad9c570e3a49214cad4980d5c3a8e6b47c6 -size 2312 +oid sha256:376de45fb18b57954990b08575aaec1a00f6948a1bb38d4fe2dbec4f36f785b8 +size 2296 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json index 9cecf53f3..5b4c0bd18 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc51e11fa79a49ea33c9e1385d28b99171064ad9374652af39392d972f43b6f7 +oid sha256:a248682c707948ca70306a87164f914445a07c7b2e9716b850e7a9420daf2145 size 4476 diff --git a/academic_observatory_workflows/workflows/openalex_telescope.py b/academic_observatory_workflows/workflows/openalex_telescope.py index 92ee080f2..7baf900d9 100644 --- a/academic_observatory_workflows/workflows/openalex_telescope.py +++ b/academic_observatory_workflows/workflows/openalex_telescope.py @@ -331,10 +331,10 @@ def __init__( ("concepts", True), ("institutions", True), ("works", True), - ("authors", False), - ("publishers", False), - ("sources", False), - ("funders", False), + ("authors", True), + ("publishers", True), + ("sources", True), + ("funders", True), ] super().__init__( @@ -1014,9 +1014,9 @@ def transform_object(obj: dict): value = [] obj[field] = [x for x in value if x is not None] + # TODO: when re-ingesting entire dataset: change schema to new version field = "abstract_inverted_index" if field in obj: - def parse_abstract(dict_: dict): keys_ = list(dict_.keys()) values_ = [str(value_)[1:-1] for value_ in dict_.values()] @@ -1039,3 +1039,10 @@ def parse_abstract(dict_: dict): values = list(obj[field][nested_field].values()) obj[field][nested_field] = {"keys": keys, "values": values} + + # Transform updated_date from a date into a datetime + # TODO: when re-ingesting entire dataset: change to date + field = "updated_date" + if field in obj: + obj[field] = pendulum.parse(obj[field]).to_iso8601_string() + diff --git a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py index c20ff8b8c..0fc66e0c2 100644 --- a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py +++ b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py @@ -592,7 +592,11 @@ def test_dag_structure(self): "aws_to_gcs_transfer": ["download_concepts"], "download_concepts": ["download_institutions"], "download_institutions": ["download_works"], - "download_works": ["transform"], + "download_works": ["download_authors"], + "download_authors": ["download_publishers"], + "download_publishers": ["download_sources"], + "download_sources": ["download_funders"], + "download_funders": ["transform"], "transform": ["upload_upsert_files"], "upload_upsert_files": ["bq_load_upsert_tables"], "bq_load_upsert_tables": ["bq_upsert_records"],