diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 707c44495..afcbe8937 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -55,6 +55,9 @@ jobs: TEST_GCP_PROJECT_ID: ${{ secrets.TEST_GCP_PROJECT_ID }} TEST_GCP_SERVICE_KEY: ${{ secrets.TEST_GCP_SERVICE_KEY }} GOOGLE_APPLICATION_CREDENTIALS: /tmp/google_application_credentials.json + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} AIRFLOW__CORE__LOAD_EXAMPLES: false run: | echo "${TEST_GCP_SERVICE_KEY}" | base64 --decode > /tmp/google_application_credentials.json diff --git a/academic_observatory_workflows/database/schema/openalex/works.json b/academic_observatory_workflows/database/schema/openalex/works.json index 39823ec0b..0d263338a 100644 --- a/academic_observatory_workflows/database/schema/openalex/works.json +++ b/academic_observatory_workflows/database/schema/openalex/works.json @@ -382,22 +382,16 @@ "name": "cited_by_count", "type": "INTEGER", "mode": "NULLABLE", - "description": "todo" - }, - { - "name": "works_count", - "type": "INTEGER", - "mode": "NULLABLE", - "description": "todo" + "description": "The number of times this work is cited in this year." }, { "name": "year", "type": "INTEGER", "mode": "NULLABLE", - "description": "" + "description": "The year." } ], - "description": "todo" + "description": "Works.cited_by_count for each of the last ten years, binned by year. To put it another way: each year, you can see how many times this work was cited." }, { "name": "created_date", @@ -814,7 +808,7 @@ }, { "name": "publication_date", - "type": "STRING", + "type": "DATE", "mode": "NULLABLE", "description": "The day when this work was published, formatted as an ISO 8601 date.\nWhere different publication dates exist, we select the earliest available date of electronic publication. \nThis date applies to the version found at Work.url. The other versions, found in Work.alternate_host_venues, may have been published at different (earlier) dates." }, diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/authors/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/authors/updated_date=2023-04-02/part_000.json index 4b66e9531..723874c7c 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/authors/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/authors/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55f1a3c1c477d85c8c3ce436bf920cbe95096f436e8b9bda6d27723ff879f4f6 -size 2202 +oid sha256:afc8a5ccf0622898e380acfc448e92c5c58b1dabbc24e89765c34b6509cb8afe +size 3109 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/concepts/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/concepts/updated_date=2023-04-02/part_000.json index 1984564fd..c6867652e 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/concepts/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/concepts/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c9b7159be4346d81e395217a27b2504a417b9a418611c3981275d8085dc8ec0 -size 35353 +oid sha256:988200d47a2735a709a2e3f7ecd2a2c23fe67ea2176cf7baea9f48fbd5e84362 +size 56079 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/institutions/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/institutions/updated_date=2023-04-02/part_000.json index 859773f8d..e4c86bd04 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/institutions/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/institutions/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a64be8655d07ba06f9ec69862cee479dd669c8ba12054cfe9de8648a5417f0d4 -size 29477 +oid sha256:feca1645be49678dd98e32e5ec0b4ffd7f6246ea311d12f01f50235851f5cce9 +size 38198 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json index f0ea2fd58..375e75a25 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cfdb85d22bd94adbdaeea211435901f126359e8dc0407e480ab6d60a642bf37f -size 5993 +oid sha256:0b49ae1aedf81fc4a8dd35379fa8a48dbf2ce13d02319c4691c5ed5a8e981da7 +size 7823 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/sources/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/sources/updated_date=2023-04-02/part_000.json index 5ccbedc48..e2dbc29ff 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/sources/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/sources/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7d6636dd5f00b2cd538a926fa1af487c561eef54eac17759dacdc473233337e -size 18598 +oid sha256:0ea484e9a3b91e514dba8fd52c42c9a2930996c78958f6590772faa2c453258b +size 23525 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/works/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/works/updated_date=2023-04-02/part_000.json index 4d43c6936..360aa7b1a 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/works/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/works/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00295e802a7f6c69d5e7ef8682252a12a424f3db16af32e3820726d3b5f061e5 -size 59906 +oid sha256:49b193f066f529d2c64541e05eb9ed43d614a844501e03100f8bee6bb6499d67 +size 64614 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json new file mode 100644 index 000000000..723874c7c --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afc8a5ccf0622898e380acfc448e92c5c58b1dabbc24e89765c34b6509cb8afe +size 3109 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json new file mode 100644 index 000000000..c6867652e --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:988200d47a2735a709a2e3f7ecd2a2c23fe67ea2176cf7baea9f48fbd5e84362 +size 56079 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json new file mode 100644 index 000000000..e4c86bd04 --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feca1645be49678dd98e32e5ec0b4ffd7f6246ea311d12f01f50235851f5cce9 +size 38198 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json new file mode 100644 index 000000000..375e75a25 --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b49ae1aedf81fc4a8dd35379fa8a48dbf2ce13d02319c4691c5ed5a8e981da7 +size 7823 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json new file mode 100644 index 000000000..e2dbc29ff --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ea484e9a3b91e514dba8fd52c42c9a2930996c78958f6590772faa2c453258b +size 23525 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json new file mode 100644 index 000000000..da2c8c66c --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9da5df45d1358debe55ebd123d68ed37d7ac07d01f715cf138609a66733a7c77 +size 64610 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-02/part_000.json index 74fc4f9a4..2fce9b359 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55ca2abd42dbf5d5cde4e662d480e8bbc33261f59d321ee99f3fc9c3d5755399 -size 761 +oid sha256:a74180b315807cdc4a4db41873f349c72bfabdc1d975e5645c549b64d99cc458 +size 911 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-16/part_000.json index cae6cf497..1daecbb6a 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/authors/updated_date=2023-04-16/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03ea20646653538e036414aebab18f350c4a4a175d643fca40f3f0cb7bea1ea4 -size 1679 +oid sha256:bfac4ac0f8d7a83950ccc962d1fc6ad9c570e3a49214cad4980d5c3a8e6b47c6 +size 2312 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/concepts/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/concepts/updated_date=2023-04-02/part_000.json index 1984564fd..b821f59b7 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/concepts/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/concepts/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c9b7159be4346d81e395217a27b2504a417b9a418611c3981275d8085dc8ec0 +oid sha256:124dfdefabf2ffb69b1682fcbfcf1134326b85ddc2ded970d16ad52898a438e3 size 35353 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/concepts/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/concepts/updated_date=2023-04-16/part_000.json index e69de29bb..ec12e8f43 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/concepts/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/concepts/updated_date=2023-04-16/part_000.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b72bfdb555b6322e80808b2a8fc64e8b018b629826196638f338e7161228b9 +size 32089 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/institutions/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/institutions/updated_date=2023-04-02/part_000.json index 859773f8d..18b31f294 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/institutions/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/institutions/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a64be8655d07ba06f9ec69862cee479dd669c8ba12054cfe9de8648a5417f0d4 -size 29477 +oid sha256:6118d6d7d9126f0f366240ee12f3d284e9c2624d4c8ee02baa9a2faac9dba2b4 +size 8724 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/institutions/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/institutions/updated_date=2023-04-16/part_000.json index e69de29bb..d5d738cbb 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/institutions/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/institutions/updated_date=2023-04-16/part_000.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47b838b98e7fe3f2794327ae4770e3be09ce55dd96104ad8e4a59cbe9c81081e +size 28841 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/authors/2023-04-16.csv b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/authors/2023-04-16.csv index f73f2a092..e782afbf7 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/authors/2023-04-16.csv +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/authors/2023-04-16.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25faacf0aaa5208221ffa1e9f57aa128422bbe363f543d8fe99057c1b37be6ee +oid sha256:a1915f5648bb8934f009c7cfeb6473bd005515b4b31b9596c69c594f04ef2b00 size 62 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/institutions/2023-04-16.csv b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/institutions/2023-04-16.csv index 604c60cda..faf7bb667 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/institutions/2023-04-16.csv +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/institutions/2023-04-16.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4800aaa6907be45780764c6520354223f73b459a3ef026664fda4ef5966a0146 -size 40 +oid sha256:388e7ad7a427082f98932135b03caeda52a886b785f4f295e4e3ec1dd20cd5d2 +size 60 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/sources/2023-04-16.csv b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/sources/2023-04-16.csv index 604c60cda..7af441aa8 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/sources/2023-04-16.csv +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/sources/2023-04-16.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4800aaa6907be45780764c6520354223f73b459a3ef026664fda4ef5966a0146 -size 40 +oid sha256:fe56a6d59e69c425295a6952bb0c50f4b541a4802244bbe9cf24b21703aafdbb +size 62 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/works/2023-04-16.csv b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/works/2023-04-16.csv index 604c60cda..a2f38f072 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/works/2023-04-16.csv +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/merged_ids/works/2023-04-16.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4800aaa6907be45780764c6520354223f73b459a3ef026664fda4ef5966a0146 -size 40 +oid sha256:a28f29e363288b4df289930898856874c8ae9885928c7be472e7f344a325ba77 +size 62 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json index 9f1aea390..375e75a25 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d1eabf78a534f9f33eef2676e7630b4021c2e2102a4a0ea7a7bb86c08dae4f4 -size 5992 +oid sha256:0b49ae1aedf81fc4a8dd35379fa8a48dbf2ce13d02319c4691c5ed5a8e981da7 +size 7823 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-16/part_000.json index e69de29bb..8dfb47ae7 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-16/part_000.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d39d6216288c79a6d27354be00e0a8f16a6c47fc30d9c2b40f1cfcdcf5be91fa +size 3735 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/sources/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/sources/updated_date=2023-04-02/part_000.json index 5ccbedc48..5103a7318 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/sources/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/sources/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7d6636dd5f00b2cd538a926fa1af487c561eef54eac17759dacdc473233337e -size 18598 +oid sha256:f3045c7affc7857b72427a0e03da71476ea94835343432a103141888d10897f6 +size 4930 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/sources/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/sources/updated_date=2023-04-16/part_000.json index e69de29bb..2eb500145 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/sources/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/sources/updated_date=2023-04-16/part_000.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348ceea2e96e1bbb9282f0384b1446761159b5e55d14bff9bb308db648c920a0 +size 16769 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-02/part_000.json index 4d43c6936..5601f7af7 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00295e802a7f6c69d5e7ef8682252a12a424f3db16af32e3820726d3b5f061e5 -size 59906 +oid sha256:504398ab6d91d147d71db0caf53ca41e17e14536d848ef003a87ede6878b8333 +size 4711 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json index 2d55faebc..ea1d97d70 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47305be82abdb58a11e35a39a82e7daf198e604b42e102f8587fffc9ce0b2d60 -size 8 +oid sha256:9584a66707b4e6cb62c5b482ac55ea6ff59d37102ca1d8b73ab16eecb373a663 +size 48934 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json new file mode 100644 index 000000000..51b95a259 --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c80d56a1935f59b2fd3c43e0acdbde24151997a72fa4fc4fcd618e4cbe2bcde3 +size 3220 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json new file mode 100644 index 000000000..9344466f3 --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b84fdad17c941c182f7741f59ca528a138dbc863de0fee655107f34a87f9744 +size 67440 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json new file mode 100644 index 000000000..d34754f9b --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf405aa2cda7e680af83f5bed9e3d14ddd2e51eaf679f206fd0f8e3d5a183f53 +size 37563 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json new file mode 100644 index 000000000..cb509b74d --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d84cae48af88638403bb44254e872e9fd7d5a6258dce64ac5d109560645d0fa +size 9725 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json new file mode 100644 index 000000000..16bba62bb --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dc4a2941f730087b8d8ba8668e51e3c35f136db2f68b6f0779e40694903fc79 +size 21697 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json new file mode 100644 index 000000000..ffe930f4b --- /dev/null +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922b592afb5d9b53dcdba737d5695b2447a32074ed62d326ef8456fef61c7e4c +size 53634 diff --git a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py index 19d3505d9..28f162fe6 100644 --- a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py +++ b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py @@ -45,6 +45,7 @@ parse_release_msg, ) from observatory.platform.api import get_dataset_releases +from observatory.platform.bigquery import bq_table_id from observatory.platform.files import save_jsonl_gz, load_file from observatory.platform.observatory_config import Workflow, CloudWorkspace from observatory.platform.observatory_environment import ( @@ -52,6 +53,7 @@ ObservatoryTestCase, aws_bucket_test_env, find_free_port, + load_and_parse_json, ) @@ -474,43 +476,44 @@ def upload_folder_to_s3(bucket_name: str, folder_path: str, s3_prefix=None): print(f"Uploaded {local_path} to s3://{bucket_name}/{s3_path}") -def create_openalex_dataset(input_path: pathlib.Path, bucket_name: str, region_name: str = "ap-southeast-2"): +def create_openalex_dataset(input_path: pathlib.Path, bucket_name: str): with tempfile.TemporaryDirectory() as temp_dir: entry_index = {} # Create part files and merged_ids for root, dirs, files in os.walk(str(input_path)): for filename in files: - file_path = pathlib.Path(root) / filename - s3_object_key = pathlib.Path(root).relative_to(input_path) - output_root = pathlib.Path(temp_dir) / s3_object_key - - os.makedirs(str(output_root), exist_ok=True) - - # For JSON: convert to JSONL, save without extension and gzip - if file_path.suffix == ".json": - file_name = f"{file_path.stem}.gz" - output_path = output_root / file_name - data = json.loads(load_file(file_path)) - save_jsonl_gz(str(output_path), data) - - # Build manifest entries - entity_name = output_root.parts[-2] - if entity_name not in entry_index: - entry_index[entity_name] = [] - entry_index[entity_name].append( - ManifestEntry( - f"s3://{bucket_name}/{s3_object_key}/{file_name}", - Meta(content_length=os.path.getsize(output_path), record_count=len(data)), + if "expected" not in pathlib.Path(root).parts: + file_path = pathlib.Path(root) / filename + s3_object_key = pathlib.Path(root).relative_to(input_path) + output_root = pathlib.Path(temp_dir) / s3_object_key + + os.makedirs(str(output_root), exist_ok=True) + + # For JSON: convert to JSONL, save without extension and gzip + if file_path.suffix == ".json": + file_name = f"{file_path.stem}.gz" + output_path = output_root / file_name + data = json.loads(load_file(file_path)) + save_jsonl_gz(str(output_path), data) + + # Build manifest entries + entity_name = output_root.parts[-2] + if entity_name not in entry_index: + entry_index[entity_name] = [] + entry_index[entity_name].append( + ManifestEntry( + f"s3://{bucket_name}/{s3_object_key}/{file_name}", + Meta(content_length=os.path.getsize(output_path), record_count=len(data)), + ) ) - ) - # For CSV: gzip - elif file_path.suffix == ".csv": - output_path = output_root / f"{file_path.stem}.csv.gz" - data = load_file(file_path) - with gzip.open(output_path, "wt") as f: - f.write(data) + # For CSV: gzip + elif file_path.suffix == ".csv": + output_path = output_root / f"{file_path.stem}.csv.gz" + data = load_file(file_path) + with gzip.open(output_path, "wt") as f: + f.write(data) # Create and save manifests for entity_name, entries in entry_index.items(): @@ -527,8 +530,6 @@ def create_openalex_dataset(input_path: pathlib.Path, bucket_name: str, region_n # Upload data to s3 upload_folder_to_s3(bucket_name, temp_dir) - a = 1 - class TestOpenAlexTelescope(ObservatoryTestCase): """Tests for the OpenAlex telescope""" @@ -617,7 +618,8 @@ def test_telescope(self): with aws_bucket_test_env(prefix=self.dag_id, region_name=self.aws_region_name) as bucket_name: workflow.aws_openalex_bucket = bucket_name create_openalex_dataset( - pathlib.Path(test_fixtures_folder(self.dag_id, "2023-04-02")), bucket_name, self.aws_region_name + pathlib.Path(test_fixtures_folder(self.dag_id, "2023-04-02")), + bucket_name, ) # First run: snapshot @@ -687,6 +689,15 @@ def test_telescope(self): ti = env.run_task(workflow.bq_delete_records.__name__) self.assertEqual(State.SUCCESS, ti.state) + # Assert content + for entity_name, _ in workflow.entities: + table_id = bq_table_id(self.project_id, workflow.bq_dataset_id, entity_name) + expected_data = load_and_parse_json( + test_fixtures_folder(self.dag_id, "2023-04-02", "expected", f"{entity_name}.json"), + date_fields={"created_date", "updated_date", "publication_date"}, + ) + self.assert_table_content(table_id, expected_data, "id") + # Check that there is one dataset release per entity after add_new_dataset_releases for entity_name, _ in workflow.entities: dataset_releases = get_dataset_releases(dag_id=self.dag_id, dataset_id=entity_name) @@ -737,13 +748,103 @@ def test_telescope(self): self.assertEqual(len(dataset_releases), 1) # Create bucket and dataset for use in third run - # with aws_bucket_test_env(prefix=self.dag_id, region_name=self.aws_region_name) as bucket_name: - # workflow.aws_openalex_bucket = bucket_name - # create_openalex_dataset( - # pathlib.Path(test_fixtures_folder(self.dag_id, "2023-04-09")), bucket_name, - # self.aws_region_name - # ) - # Third run: changefiles - # data_interval_start = pendulum.datetime(2023, 4, 16) - # with env.create_dag_run(dag, data_interval_start) as dag_run: - # pass + with aws_bucket_test_env(prefix=self.dag_id, region_name=self.aws_region_name) as bucket_name: + workflow.aws_openalex_bucket = bucket_name + create_openalex_dataset( + pathlib.Path(test_fixtures_folder(self.dag_id, "2023-04-16")), + bucket_name, + ) + + # Third run: changefiles + data_interval_start = pendulum.datetime(2023, 4, 16) + with env.create_dag_run(dag, data_interval_start) as dag_run: + # Mocked and expected data + release = OpenAlexRelease( + dag_id=self.dag_id, + run_id=dag_run.run_id, + entities=[], + download_bucket=workflow.cloud_workspace.download_bucket, + changefile_start_date=data_interval_start, + changefile_end_date=data_interval_start, + is_first_run=True, + ) + + # Wait for the previous DAG run to finish + ti = env.run_task("wait_for_prev_dag_run") + self.assertEqual(State.SUCCESS, ti.state) + + # Check dependencies + ti = env.run_task(workflow.check_dependencies.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + # Fetch releases and check that we have received the expected snapshot date and changefiles + task_id = workflow.fetch_releases.__name__ + ti = env.run_task(task_id) + self.assertEqual(State.SUCCESS, ti.state) + msg = ti.xcom_pull( + key=workflow.RELEASE_INFO, + task_ids=task_id, + include_prior_dates=False, + ) + entities = parse_release_msg(msg) + self.assertEqual(6, len(entities)) + # TODO: assert + + ti = env.run_task(workflow.create_datasets.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + ti = env.run_task(workflow.bq_create_main_table_snapshots.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + ti = env.run_task(workflow.aws_to_gcs_transfer.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + for entity_name, transform in workflow.entities: + if transform: + ti = env.run_task(f"download_{entity_name}") + self.assertEqual(State.SUCCESS, ti.state) + + ti = env.run_task(workflow.transform.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + ti = env.run_task(workflow.upload_upsert_files.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + ti = env.run_task(workflow.bq_load_upsert_tables.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + ti = env.run_task(workflow.bq_upsert_records.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + ti = env.run_task(workflow.bq_load_delete_tables.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + ti = env.run_task(workflow.bq_delete_records.__name__) + self.assertEqual(State.SUCCESS, ti.state) + + # Assert content + for entity_name, _ in workflow.entities: + table_id = bq_table_id(self.project_id, workflow.bq_dataset_id, entity_name) + expected_data = load_and_parse_json( + test_fixtures_folder(self.dag_id, "2023-04-16", "expected", f"{entity_name}.json"), + date_fields={"created_date", "updated_date", "publication_date"}, + ) + self.assert_table_content(table_id, expected_data, "id") + + # Check that there is one dataset release per entity after add_new_dataset_releases + for entity_name, _ in workflow.entities: + dataset_releases = get_dataset_releases(dag_id=self.dag_id, dataset_id=entity_name) + self.assertEqual(len(dataset_releases), 1) + ti = env.run_task(workflow.add_new_dataset_releases.__name__) + self.assertEqual(State.SUCCESS, ti.state) + for entity_name, _ in workflow.entities: + dataset_releases = get_dataset_releases(dag_id=self.dag_id, dataset_id=entity_name) + self.assertEqual(len(dataset_releases), 2) + + # Test that all workflow data deleted + ti = env.run_task(workflow.cleanup.__name__) + self.assertEqual(State.SUCCESS, ti.state) + self.assert_cleanup(release.workflow_folder) + + ti = env.run_task("dag_run_complete") + self.assertEqual(State.SUCCESS, ti.state) diff --git a/academic_observatory_workflows/workflows/tests/test_unpaywall_telescope.py b/academic_observatory_workflows/workflows/tests/test_unpaywall_telescope.py index 35a0d4c26..375f0a7e3 100644 --- a/academic_observatory_workflows/workflows/tests/test_unpaywall_telescope.py +++ b/academic_observatory_workflows/workflows/tests/test_unpaywall_telescope.py @@ -48,7 +48,7 @@ ObservatoryTestCase, find_free_port, HttpServer, - load_json, + load_and_parse_json, ) @@ -367,8 +367,9 @@ def test_telescope(self): ti = env.run_task(workflow.bq_load_main_table.__name__) self.assertEqual(State.SUCCESS, ti.state) self.assert_table_integrity(workflow.bq_main_table_id, expected_rows=10) - expected_content = load_json( - test_fixtures_folder(self.dag_id, "expected", "run1_bq_load_main_table.json") + expected_content = load_and_parse_json( + test_fixtures_folder(self.dag_id, "expected", "run1_bq_load_main_table.json"), + date_fields={"updated", "oa_date", "published_date"}, ) self.assert_table_content(workflow.bq_main_table_id, expected_content, "doi") @@ -422,8 +423,9 @@ def test_telescope(self): ti = env.run_task(workflow.bq_upsert_records.__name__) self.assertEqual(State.SUCCESS, ti.state) self.assert_table_integrity(workflow.bq_main_table_id, expected_rows=10) - expected_content = load_json( - test_fixtures_folder(self.dag_id, "expected", "run1_bq_upsert_records.json") + expected_content = load_and_parse_json( + test_fixtures_folder(self.dag_id, "expected", "run1_bq_upsert_records.json"), + date_fields={"updated", "oa_date", "published_date"}, ) self.assert_table_content(workflow.bq_main_table_id, expected_content, "doi") @@ -601,8 +603,9 @@ def test_telescope(self): ti = env.run_task(workflow.bq_upsert_records.__name__) self.assertEqual(State.SUCCESS, ti.state) self.assert_table_integrity(workflow.bq_main_table_id, expected_rows=12) - expected_content = load_json( - test_fixtures_folder(self.dag_id, "expected", "run3_bq_upsert_records.json") + expected_content = load_and_parse_json( + test_fixtures_folder(self.dag_id, "expected", "run3_bq_upsert_records.json"), + date_fields={"updated", "oa_date", "published_date"}, ) self.assert_table_content(workflow.bq_main_table_id, expected_content, "doi")