From 6a423117cc5c84d46016f0b238354139e29fb55d Mon Sep 17 00:00:00 2001 From: Keegan Smith Date: Tue, 6 Jun 2023 06:08:22 +0800 Subject: [PATCH 1/3] Added .env to gitignore (#165) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6f02fcf1d..2179ea14a 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ ChangeLog .DS_Store /observatory-dags/observatory/dags/workflows/oapen_cloud_function.zip docs/schemas +.env \ No newline at end of file From 88c0ea7a17341434f46298b88c590ea03fad43a9 Mon Sep 17 00:00:00 2001 From: Jamie Diprose <5715104+jdddog@users.noreply.github.com> Date: Tue, 13 Jun 2023 11:22:58 +1200 Subject: [PATCH 2/3] Fix/deploy may 2023 (#166) --- .../crossref_fundref_2014-03-01.json | 38 +- .../database/schema/openalex/authors.json | 54 ++- .../database/schema/openalex/concepts.json | 54 ++- .../schema/openalex/institutions.json | 107 +++++- .../database/schema/openalex/publishers.json | 92 ++++- .../database/schema/openalex/sources.json | 54 ++- .../database/schema/openalex/works.json | 363 +++++++++++++----- .../database/schema/unpaywall/unpaywall.json | 37 ++ .../database/sql/create_aggregate.sql.jinja2 | 132 ++----- .../sql/export_access_types.sql.jinja2 | 1 - .../sql/export_disciplines.sql.jinja2 | 1 - .../updated_date=2023-04-02/part_000.json | 4 +- .../openalex/2023-04-02/expected/authors.json | 4 +- .../2023-04-02/expected/concepts.json | 4 +- .../2023-04-02/expected/institutions.json | 4 +- .../2023-04-02/expected/publishers.json | 4 +- .../openalex/2023-04-02/expected/sources.json | 4 +- .../openalex/2023-04-02/expected/works.json | 4 +- .../updated_date=2023-04-02/part_000.json | 4 +- .../updated_date=2023-04-16/part_000.json | 4 +- .../openalex/2023-04-16/expected/authors.json | 4 +- .../2023-04-16/expected/concepts.json | 4 +- .../2023-04-16/expected/institutions.json | 4 +- .../2023-04-16/expected/publishers.json | 4 +- .../openalex/2023-04-16/expected/sources.json | 4 +- .../openalex/2023-04-16/expected/works.json | 4 +- .../expected/run1_bq_load_main_table.json | 4 +- .../expected/run1_bq_upsert_records.json | 4 +- .../expected/run3_bq_upsert_records.json | 4 +- .../workflows/crossref_events_telescope.py | 6 +- .../workflows/crossref_fundref_telescope.py | 4 +- .../workflows/crossref_metadata_telescope.py | 52 ++- .../workflows/doi_workflow.py | 10 +- .../workflows/oa_web_workflow.py | 7 +- .../workflows/openalex_telescope.py | 58 ++- .../workflows/ror_telescope.py | 7 + .../workflows/scopus_telescope.py | 2 +- .../tests/test_crossref_metadata_telescope.py | 7 +- .../tests/test_openalex_telescope.py | 98 ++--- .../workflows/unpaywall_telescope.py | 8 +- .../workflows/web_of_science_telescope.py | 2 +- 41 files changed, 908 insertions(+), 358 deletions(-) diff --git a/academic_observatory_workflows/database/schema/crossref_fundref/crossref_fundref_2014-03-01.json b/academic_observatory_workflows/database/schema/crossref_fundref/crossref_fundref_2014-03-01.json index cf43f2d78..083e983e2 100644 --- a/academic_observatory_workflows/database/schema/crossref_fundref/crossref_fundref_2014-03-01.json +++ b/academic_observatory_workflows/database/schema/crossref_fundref/crossref_fundref_2014-03-01.json @@ -8,9 +8,26 @@ { "fields": [ { + "fields": [ + { + "mode": "REPEATED", + "name": "parent", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "funder", + "type": "STRING" + } + ], "mode": "REPEATED", "name": "parent", - "type": "STRING" + "type": 
"RECORD" }, { "mode": "NULLABLE", @@ -111,9 +128,26 @@ { "fields": [ { + "fields": [ + { + "mode": "REPEATED", + "name": "children", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "funder", + "type": "STRING" + } + ], "mode": "REPEATED", "name": "children", - "type": "STRING" + "type": "RECORD" }, { "mode": "NULLABLE", diff --git a/academic_observatory_workflows/database/schema/openalex/authors.json b/academic_observatory_workflows/database/schema/openalex/authors.json index 4e329575f..c0b0cfc2d 100644 --- a/academic_observatory_workflows/database/schema/openalex/authors.json +++ b/academic_observatory_workflows/database/schema/openalex/authors.json @@ -16,6 +16,12 @@ "mode": "NULLABLE", "description": "The total number Works that cite a work this author has created." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -154,23 +160,65 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this author. Also known as impact factor." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this author." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this author." + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ], "description": "Citation metrics for this author." diff --git a/academic_observatory_workflows/database/schema/openalex/concepts.json b/academic_observatory_workflows/database/schema/openalex/concepts.json index 1f149b27a..57707ebda 100644 --- a/academic_observatory_workflows/database/schema/openalex/concepts.json +++ b/academic_observatory_workflows/database/schema/openalex/concepts.json @@ -48,6 +48,12 @@ "mode": "NULLABLE", "description": "The number citations to works that have been tagged with this concept. Or less formally: the number of citations to this concept. For example, if there are just two works tagged with this concept and one of them has been cited 10 times, and the other has been cited 1 time, cited_by_count for this concept would be 11." 
}, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -238,23 +244,65 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this concept. Also known as impact factor." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this concept." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this concept." + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ], "description": "Citation metrics for this concept." diff --git a/academic_observatory_workflows/database/schema/openalex/institutions.json b/academic_observatory_workflows/database/schema/openalex/institutions.json index 34e1e4f7c..8d711360c 100644 --- a/academic_observatory_workflows/database/schema/openalex/institutions.json +++ b/academic_observatory_workflows/database/schema/openalex/institutions.json @@ -65,6 +65,12 @@ "mode": "NULLABLE", "description": "The total number Works that cite a work created by an author affiliated with this institution. Or less formally: the number of citations this institution has collected." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -104,6 +110,28 @@ "mode": "REPEATED", "description": "Other names people may use for this institution. " }, + { + "name": "roles", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + { + "name": "role", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE" + } + ] + }, { "name": "geo", "type": "RECORD", @@ -152,7 +180,7 @@ "description": "The sub-national region (state, province) where this institution lives." } ], - "description": "A bunch of stuff we know about the location of this institution:" + "description": "A bunch of stuff we know about the location of this institution" }, { "name": "homepage_url", @@ -258,21 +286,40 @@ { "name": "id", "type": "STRING", + "mode": "NULLABLE", "description": "The OpenAlex ID of the repository." }, + { + "name": "issn_l", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "issn", + "type": "STRING", + "mode": "REPEATED" + }, { "name": "display_name", "type": "STRING", + "mode": "NULLABLE", "description": "The repositories display name." }, + { + "name": "publisher", + "type": "STRING", + "mode": "NULLABLE" + }, { "name": "host_organization", "type": "STRING", + "mode": "NULLABLE", "description": "The OpenAlex ID of the host organisation." 
}, { "name": "host_organization_name", "type": "STRING", + "mode": "NULLABLE", "description": "The host organisations name." }, { @@ -280,6 +327,16 @@ "type": "STRING", "mode": "REPEATED", "description": "The host organisations lineage." + }, + { + "name": "publisher_id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "type", + "type": "STRING", + "mode": "NULLABLE" } ] }, @@ -294,23 +351,65 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this institutions. Also known as impact factor." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this institutions." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this institutions." + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ], "description": "Citation metrics for this institutions." diff --git a/academic_observatory_workflows/database/schema/openalex/publishers.json b/academic_observatory_workflows/database/schema/openalex/publishers.json index 91451bdb6..6a8d3e041 100644 --- a/academic_observatory_workflows/database/schema/openalex/publishers.json +++ b/academic_observatory_workflows/database/schema/openalex/publishers.json @@ -28,6 +28,12 @@ "mode": "NULLABLE", "description": "The total number of Works that cite a Work published by this publisher." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -113,9 +119,43 @@ }, { "name": "parent_publisher", - "type": "STRING", + "type": "RECORD", "mode": "NULLABLE", - "description": "An OpenAlex ID linking to the direct parent of the publisher. This will be null if the publisher's hierarchy_level is 0." + "fields": [ + { + "name": "id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "display_name", + "type": "STRING", + "mode": "NULLABLE" + } + ], + "description": "An OpenAlex ID linking to the direct parent of the publisher and display name. This will be null if the publisher's hierarchy_level is 0." 
+ }, + { + "name": "roles", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + { + "name": "id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "role", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE" + } + ] }, { "name": "sources_api_url", @@ -129,23 +169,65 @@ "mode": "NULLABLE", "description": "Citation metrics for this publisher", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this publisher. Also known as impact factor. While the h-index and the i-10 index are normally author-level metrics and the 2-year mean citedness is normally a journal-level metric, they can be calculated for any set of papers, so we include them for publishers." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this publisher." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this publisher." + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ] }, diff --git a/academic_observatory_workflows/database/schema/openalex/sources.json b/academic_observatory_workflows/database/schema/openalex/sources.json index 100a37b58..2380660b1 100644 --- a/academic_observatory_workflows/database/schema/openalex/sources.json +++ b/academic_observatory_workflows/database/schema/openalex/sources.json @@ -60,6 +60,12 @@ "mode": "NULLABLE", "description": "The total number of Works that cite a Work hosted in this source." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -222,23 +228,65 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this source. Also known as impact factor." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this source." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this source." 
+ "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ], "description": "Citation metrics for this source." diff --git a/academic_observatory_workflows/database/schema/openalex/works.json b/academic_observatory_workflows/database/schema/openalex/works.json index 2c3b70c11..18cc4be6b 100644 --- a/academic_observatory_workflows/database/schema/openalex/works.json +++ b/academic_observatory_workflows/database/schema/openalex/works.json @@ -1,57 +1,32 @@ [ { - "name": "host_venue", + "name": "abstract_inverted_index", "type": "RECORD", - "description": "DEPRECATED", + "mode": "NULLABLE", "fields": [ { - "name": "display_name", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "id", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "is_oa", - "type": "BOOLEAN", - "description": "DEPRECATED" - }, - { - "name": "issn", + "name": "keys", "type": "STRING", "mode": "REPEATED", - "description": "DEPRECATED" - }, - { - "name": "issn_l", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "license", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "publisher", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "type", - "type": "STRING", - "description": "DEPRECATED" + "description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value." }, { - "name": "url", + "name": "values", "type": "STRING", - "description": "DEPRECATED" - }, + "mode": "REPEATED", + "description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value." + } + ], + "description": "The abstract of the work, as an inverted index, which encodes information about the abstract's words and their positions within the text. Like Microsoft Academic Graph, OpenAlex doesn't include plaintext abstracts due to legal constraints." 
+ }, + { + "name": "alternate_host_venues", + "type": "RECORD", + "mode": "REPEATED", + "description": "DEPRECATED", + "fields": [ { - "name": "version", + "name": "display_name", "type": "STRING", "description": "DEPRECATED" }, @@ -61,25 +36,13 @@ "description": "DEPRECATED" }, { - "name": "host_organization_name", + "name": "host_organization_lineage", "type": "STRING", + "mode": "REPEATED", "description": "DEPRECATED" }, { - "name": "publisher_id", - "type": "STRING", - "description": "DEPRECATED" - } - ] - }, - { - "name": "alternate_host_venues", - "type": "RECORD", - "mode": "REPEATED", - "description": "DEPRECATED", - "fields": [ - { - "name": "display_name", + "name": "host_organization_name", "type": "STRING", "description": "DEPRECATED" }, @@ -115,56 +78,55 @@ "description": "DEPRECATED" }, { - "name": "type", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "url", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "version", + "name": "publisher_id", "type": "STRING", "description": "DEPRECATED" }, { - "name": "host_organization", + "name": "type", "type": "STRING", "description": "DEPRECATED" }, { - "name": "host_organization_name", + "name": "url", "type": "STRING", "description": "DEPRECATED" }, { - "name": "publisher_id", + "name": "version", "type": "STRING", "description": "DEPRECATED" } ] }, { - "name": "abstract_inverted_index", + "name": "apc_payment", "type": "RECORD", "mode": "NULLABLE", "fields": [ { - "name": "keys", + "name": "currency", "type": "STRING", - "mode": "REPEATED", - "description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value." + "mode": "NULLABLE" }, { - "name": "values", + "name": "price", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "price_usd", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "APC converted to USD" + }, + { + "name": "provenance", "type": "STRING", - "mode": "REPEATED", - "description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value." + "mode": "NULLABLE" } ], - "description": "The abstract of the work, as an inverted index, which encodes information about the abstract's words and their positions within the text. Like Microsoft Academic Graph, OpenAlex doesn't include plaintext abstracts due to legal constraints." + "description": "Objects containing information about the APC (article processing charge) for this work. If we can get the APC price from OpenAPC, we use that. Those APCs are specific to an article and are the actual APC paid by an author or institution to publish the article. As a fallback, we use the DOAJ APC prices that are available in sources. Those are an estimate of what authors would have had to pay to publish the article, since the DOAJ apc prices apply to an entire journal." }, { "name": "authorships", @@ -283,6 +245,12 @@ "mode": "NULLABLE", "description": "The location's publishing license. This can be a Create Commons license such as cc0 or cc-by, a publisher-specific license, or null which means we are not able to determine a license for this location." }, + { + "name": "pdf_url", + "type": "STRING", + "mode": "NULLABLE", + "description": "A URL where you can find this location as a PDF." 
+ }, { "name": "source", "type": "RECORD", @@ -300,6 +268,11 @@ "mode": "NULLABLE", "description": "The host organization for this source as an OpenAlex ID. This will be an Institution.id if the source is a repository, and a Publisher.id if the source is a journal, conference, or eBook platform (based on the type field)." }, + { + "name": "host_organization_lineage", + "type": "STRING", + "mode": "REPEATED" + }, { "name": "host_organization_name", "type": "STRING", @@ -345,12 +318,6 @@ ], "description": "Information about the source of this location, as a DehydratedSource object." }, - { - "name": "pdf_url", - "type": "STRING", - "mode": "NULLABLE", - "description": "A URL where you can find this location as a PDF." - }, { "name": "version", "type": "STRING", @@ -437,6 +404,18 @@ ], "description": "List of dehydrated Concept objects. \nEach Concept object in the list also has one additional property" }, + { + "name": "corresponding_author_ids", + "type": "STRING", + "mode": "REPEATED", + "description": "OpenAlex IDs of any authors for which authorships.is_corresponding is true." + }, + { + "name": "corresponding_institution_ids", + "type": "STRING", + "mode": "REPEATED", + "description": "OpenAlex IDs of any institutions found within an authorship for which authorships.is_corresponding is true." + }, { "name": "counts_by_year", "type": "RECORD", @@ -448,6 +427,12 @@ "mode": "NULLABLE", "description": "The number of times this work is cited in this year." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "year", "type": "INTEGER", @@ -519,6 +504,108 @@ } ] }, + { + "name": "grants", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + { + "name": "award_id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "funder", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "funder_display_name", + "type": "STRING", + "mode": "NULLABLE" + } + ], + "description": "List of grant objects, which include the Funder and the award ID, if available. Our grants data comes from Crossref, and is currently fairly limited." 
+ }, + { + "name": "host_venue", + "type": "RECORD", + "description": "DEPRECATED", + "fields": [ + { + "name": "display_name", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "host_organization", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "host_organization_lineage", + "type": "STRING", + "mode": "REPEATED", + "description": "DEPRECATED" + }, + { + "name": "host_organization_name", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "id", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "is_oa", + "type": "BOOLEAN", + "description": "DEPRECATED" + }, + { + "name": "issn", + "type": "STRING", + "mode": "REPEATED", + "description": "DEPRECATED" + }, + { + "name": "issn_l", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "license", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "publisher", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "publisher_id", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "type", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "url", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "version", + "type": "STRING", + "description": "DEPRECATED" + } + ] + }, { "name": "id", "type": "STRING", @@ -563,6 +650,12 @@ ], "description": "All the persistent identifiers (PIDs) that we know about for this work, as key: value pairs, where key is the PID namespace, and value is the PID. IDs are expressed as URIs where possible." }, + { + "name": "is_oa", + "type": "BOOLEAN", + "mode": "NULLABLE", + "description": "Set to true if the work hosted here can be read for free, without registration." + }, { "name": "is_paratext", "type": "BOOLEAN", @@ -581,6 +674,12 @@ "mode": "NULLABLE", "description": "The language of the work in ISO 639-1 format. The language is automatically detected using the information we have about the work. We use the langdetect software library on the words in the work's abstract, or the title if we do not have the abstract. The source code for this procedure is here. Keep in mind that this method is not perfect, and that in some cases the language of the title or abstract could be different from the body of the work." }, + { + "name": "license", + "type": "STRING", + "mode": "NULLABLE", + "description": "The license applied to this work at this host. Most toll-access works don't have an explicit license (they're under \"all rights reserved\" copyright), so this field generally has content only if is_oa is true." + }, { "name": "locations", "type": "RECORD", @@ -627,6 +726,11 @@ "mode": "NULLABLE", "description": "The host organization for this source as an OpenAlex ID. This will be an Institution.id if the source is a repository, and a Publisher.id if the source is a journal, conference, or eBook platform (based on the type field)." }, + { + "name": "host_organization_lineage", + "type": "STRING", + "mode": "REPEATED" + }, { "name": "host_organization_name", "type": "STRING", @@ -675,11 +779,17 @@ "name": "version", "type": "STRING", "mode": "NULLABLE", - "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are: publishedVersion: The document’s version of record. This is the most authoritative version.\nacceptedVersion: The document after having completed peer review and being officially accepted for publication. 
It will lack publisher formatting, but the content should be interchangeable with the that of the publishedVersion.\nsubmittedVersion: the document as submitted to the publisher by the authors, but before peer-review. Its content may differ significantly from that of the accepted article." + "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are: publishedVersion: The document\u2019s version of record. This is the most authoritative version.\nacceptedVersion: The document after having completed peer review and being officially accepted for publication. It will lack publisher formatting, but the content should be interchangeable with the that of the publishedVersion.\nsubmittedVersion: the document as submitted to the publisher by the authors, but before peer-review. Its content may differ significantly from that of the accepted article." } ], "description": "A list of Location objects describing all unique places where this work lives." }, + { + "name": "locations_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "Number of locations for this work." + }, { "name": "mesh", "type": "RECORD", @@ -718,6 +828,11 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "any_repository_has_fulltext", + "type": "BOOLEAN", + "mode": "NULLABLE" + }, { "name": "is_oa", "type": "BOOLEAN", @@ -785,6 +900,11 @@ "mode": "NULLABLE", "description": "The host organization for this source as an OpenAlex ID. This will be an Institution.id if the source is a repository, and a Publisher.id if the source is a journal, conference, or eBook platform (based on the type field)." }, + { + "name": "host_organization_lineage", + "type": "STRING", + "mode": "REPEATED" + }, { "name": "host_organization_name", "type": "STRING", @@ -833,7 +953,7 @@ "name": "version", "type": "STRING", "mode": "NULLABLE", - "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are:.\npublishedVersion: The document’s version of record. This is the most authoritative version.\nacceptedVersion: The document after having completed peer review and being officially accepted for publication. It will lack publisher formatting, but the content should be interchangeable with the that of the publishedVersion.\nsubmittedVersion: the document as submitted to the publisher by the authors, but before peer-review. Its content may differ significantly from that of the accepted article." + "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are:.\npublishedVersion: The document\u2019s version of record. This is the most authoritative version.\nacceptedVersion: The document after having completed peer review and being officially accepted for publication. It will lack publisher formatting, but the content should be interchangeable with the that of the publishedVersion.\nsubmittedVersion: the document as submitted to the publisher by the authors, but before peer-review. Its content may differ significantly from that of the accepted article." } ], "description": "A Location object with the primary location of this work." @@ -862,6 +982,61 @@ "mode": "REPEATED", "description": "OpenAlex IDs for works related to this work. 
" }, + { + "name": "summary_stats", + "type": "RECORD", + "mode": "NULLABLE", + "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_mean_citedness", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + } + ] + }, { "name": "title", "type": "STRING", @@ -879,5 +1054,17 @@ "type": "TIMESTAMP", "mode": "NULLABLE", "description": "The last time anything in this Work object changed, expressed as an ISO 8601 date string. This date is updated for any change at all, including increases in various counts." + }, + { + "name": "url", + "type": "STRING", + "mode": "NULLABLE", + "description": "The URL where you can access this work." + }, + { + "name": "version", + "type": "STRING", + "mode": "NULLABLE", + "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are: publishedVersion, acceptedVersion or submittedVersion." } ] \ No newline at end of file diff --git a/academic_observatory_workflows/database/schema/unpaywall/unpaywall.json b/academic_observatory_workflows/database/schema/unpaywall/unpaywall.json index 9c274a8b8..0bc2e58e9 100644 --- a/academic_observatory_workflows/database/schema/unpaywall/unpaywall.json +++ b/academic_observatory_workflows/database/schema/unpaywall/unpaywall.json @@ -398,6 +398,43 @@ "mode": "NULLABLE", "name": "name", "type": "STRING" + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "id", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "id-type", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "asserted-by", + "type": "STRING" + } + ], + "mode": "REPEATED", + "name": "id", + "type": "RECORD" + }, + { + "mode": "REPEATED", + "name": "place", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "department", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "acronym", + "type": "STRING" } ], "mode": "REPEATED", diff --git a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 index 8228a1b4f..32ab6958a 100644 --- a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 @@ -13,23 +13,6 @@ # limitations under the License. 
# Author: Richard Hosking #} -# Helper Function: Processing Output Types -{# -Output Schema: -per_25th FLOAT NULLABLE -median FLOAT NULLABLE -per_90th FLOAT NULLABLE -per_95th FLOAT NULLABLE -#} -CREATE TEMP FUNCTION compute_percentiles(counts ARRAY) AS ( - (SELECT as STRUCT - ROUND(PERCENTILE_CONT(count, 0.25) OVER(), 2) as per_25th, - ROUND(PERCENTILE_CONT(count, 0.50) OVER(), 2) as median, - ROUND(PERCENTILE_CONT(count, 0.90) OVER(), 2) as per_90th, - ROUND(PERCENTILE_CONT(count, 0.95) OVER(), 2) as per_95th - FROM UNNEST(counts) as count LIMIT 1) -); - # Helper Function: Counting Access Types {# Output Schema: @@ -77,11 +60,10 @@ citations RECORD NULLABLE citations.total_openalex_citations INTEGER NULLABLE citations.total_open_citations_citations INTEGER NULLABLE citations.total_crossref_citations INTEGER NULLABLE -citations.total_mag_citations INTEGER NULLABLE #} CREATE TEMP FUNCTION count_single_output_type( output_type STRING, - items ARRAY, + items ARRAY, oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>, measured_type STRING) AS ( (SELECT as STRUCT @@ -97,8 +79,7 @@ CREATE TEMP FUNCTION count_single_output_type( STRUCT( SUM(citations.openalex) as total_openalex_citations, SUM(citations.open_citations) as total_open_citations_citations, - SUM(citations.crossref) as total_crossref_citations, - SUM(citations.mag) as total_mag_citations + SUM(citations.crossref) as total_crossref_citations ) as citations FROM UNNEST(items) as item) ); @@ -119,10 +100,9 @@ citations RECORD NULLABLE citations.total_openalex_citations INTEGER NULLABLE citations.total_open_citations_citations INTEGER NULLABLE citations.total_crossref_citations INTEGER NULLABLE -citations.total_mag_citations INTEGER NULLABLE #} CREATE TEMP FUNCTION count_array_output_type( - output_type STRING, items ARRAY, + output_type STRING, items ARRAY, oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>, measured_type ARRAY) AS ( (SELECT as STRUCT @@ -138,8 +118,7 @@ CREATE TEMP FUNCTION count_array_output_type( STRUCT( SUM(citations.openalex) as total_openalex_citations, SUM(citations.open_citations) as total_open_citations_citations, - SUM(citations.crossref) as total_crossref_citations, - SUM(citations.mag) as total_mag_citations + SUM(citations.crossref) as total_crossref_citations ) as citations FROM UNNEST(items) as item) @@ -161,10 +140,9 @@ citations RECORD NULLABLE citations.total_openalex_citations INTEGER NULLABLE citations.total_open_citations_citations INTEGER NULLABLE citations.total_crossref_citations INTEGER NULLABLE -citations.total_mag_citations INTEGER NULLABLE #} CREATE TEMP FUNCTION count_not_in_array_output_type( - output_type STRING, items ARRAY, + output_type STRING, items ARRAY, oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>, measured_type ARRAY) AS ( (SELECT as STRUCT @@ -180,8 +158,7 @@ CREATE TEMP FUNCTION count_not_in_array_output_type( STRUCT( SUM(citations.openalex) as total_openalex_citations, SUM(citations.open_citations) as total_open_citations_citations, - SUM(citations.crossref) as total_crossref_citations, - SUM(citations.mag) as total_mag_citations + SUM(citations.crossref) as total_crossref_citations ) as citations FROM UNNEST(items) as item) ); @@ -194,7 +171,7 @@ output_types RECORD REPEATED * Each record has the same schema, and is captured in the count_* methods #} CREATE TEMP FUNCTION count_output_types( - items ARRAY, + items ARRAY, oa BOOL, green 
BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>) AS ( [ @@ -221,21 +198,15 @@ outputs_without_citations INTEGER NULLABLE citations RECORD NULLABLE citations.openalex RECORD NULLABLE citations.openalex.total_citations INTEGER NULLABLE -citations.openalex.percentiles RECORD NULLABLE citations.open_citations RECORD NULLABLE citations.open_citations.total_citations INTEGER NULLABLE -citations.open_citations.percentiles RECORD NULLABLE citations.crossref RECORD NULLABLE citations.crossref.total_citations INTEGER NULLABLE -citations.crossref.percentiles RECORD NULLABLE -citations.mag RECORD NULLABLE -citations.mag.total_citations INTEGER NULLABLE -citations.mag.percentiles RECORD NULLABLE *percetiles schema captured above #} CREATE TEMP FUNCTION compute_conditional_citations( - items ARRAY, is_x BOOL>>, + items ARRAY, is_x BOOL>>, access_type STRING, positive_label STRING, negative_label STRING) AS ( ARRAY(( @@ -248,25 +219,18 @@ CREATE TEMP FUNCTION compute_conditional_citations( access_type, is_x as status, COUNT(*) as total_outputs, - COUNTIF(citations.crossref > 0 OR citations.open_citations > 0 OR citations.openalex > 0 OR citations.mag > 0) as outputs_with_citations, - COUNTIF( (citations.crossref IS NULL OR citations.crossref = 0) AND (citations.open_citations IS NULL OR citations.open_citations = 0) AND (citations.openalex IS NULL OR citations.openalex = 0) OR (citations.mag IS NULL OR citations.mag = 0) ) as outputs_without_citations, + COUNTIF(citations.crossref > 0 OR citations.open_citations > 0 OR citations.openalex > 0) as outputs_with_citations, + COUNTIF( (citations.crossref IS NULL OR citations.crossref = 0) AND (citations.open_citations IS NULL OR citations.open_citations = 0) AND (citations.openalex IS NULL OR citations.openalex = 0)) as outputs_without_citations, STRUCT( STRUCT( - SUM(citations.openalex) as total_citations, - compute_percentiles(ARRAY_AGG(citations.openalex)) as percentiles + SUM(citations.openalex) as total_citations ) as openalex, STRUCT( - SUM(citations.open_citations) as total_citations, - compute_percentiles(ARRAY_AGG(citations.open_citations)) as percentiles + SUM(citations.open_citations) as total_citations ) as open_citations, STRUCT( - SUM(citations.crossref) as total_citations, - compute_percentiles(ARRAY_AGG(citations.crossref)) as percentiles - ) as crossref, - STRUCT( - SUM(citations.mag) as total_citations, - compute_percentiles(ARRAY_AGG(citations.mag)) as percentiles - ) as mag + SUM(citations.crossref) as total_citations + ) as crossref ) as citations, FROM UNNEST(items) @@ -295,14 +259,8 @@ open_citations.citations_per_output FLOAT NULLABLE open_citations.outputs_with_citations INTEGER NULLABLE open_citations.outputs_without_citations INTEGER NULLABLE open_citations.citations_per_cited_output FLOAT NULLABLE -mag RECORD NULLABLE -mag.total_citations INTEGER NULLABLE -mag.citations_per_output FLOAT NULLABLE -mag.outputs_with_citations INTEGER NULLABLE -mag.outputs_without_citations INTEGER NULLABLE -mag.citations_per_cited_output FLOAT NULLABLE #} -CREATE TEMP FUNCTION compute_citations(items ARRAY>>) as ( +CREATE TEMP FUNCTION compute_citations(items ARRAY>>) as ( (SELECT AS STRUCT -- Citation counts STRUCT( @@ -325,14 +283,7 @@ CREATE TEMP FUNCTION compute_citations(items ARRAY 0) as outputs_with_citations, COUNTIF(citations.open_citations is null) as outputs_without_citations, ROUND(SAFE_DIVIDE(SUM(citations.open_citations), COUNTIF(citations.open_citations > 0)), 2) as citations_per_cited_output - ) as 
open_citations, - STRUCT( - SUM(citations.mag) as total_citations, - ROUND(SAFE_DIVIDE( SUM(citations.mag) , COUNT(doi)), 2) as citations_per_output, - COUNTIF(citations.mag > 0) as outputs_with_citations, - COUNTIF(citations.mag is null) as outputs_without_citations, - ROUND(SAFE_DIVIDE(SUM(citations.mag), COUNTIF(citations.mag > 0)), 2) as citations_per_cited_output - ) as mag + ) as open_citations FROM UNNEST(items)) ); @@ -365,7 +316,7 @@ breakdown RECORD REPEATED * breakdown object array captured in compute_conditional_citations schema above #} CREATE TEMP FUNCTION compute_access_types( - items ARRAY, + items ARRAY, is_oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>) AS ( (SELECT AS STRUCT @@ -530,16 +481,11 @@ sum_of_scores FLOAT NULLABLE citations RECORD NULLABLE openalex RECORD NULLABLE total_citations INTEGER NULLABLE -percentiles RECORD NULLABLE open_citations RECORD NULLABLE total_citations INTEGER NULLABLE -percentiles RECORD NULLABLE crossref RECORD NULLABLE total_citations INTEGER NULLABLE -percentiles RECORD NULLABLE -mag RECORD NULLABLE total_citations INTEGER NULLABLE -percentiles RECORD NULLABLE num_oa_outputs INTEGER NULLABLE num_green_outputs INTEGER NULLABLE num_gold_outputs INTEGER NULLABLE @@ -559,7 +505,7 @@ num_international_collaboration_outputs INTEGER NULLABLE international_collaboration_with_funding_outputs INTEGER NULLABLE #} CREATE TEMP FUNCTION compute_disciplines( - fields ARRAY, + fields ARRAY, is_oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL, funding BOOL, international_funding BOOL, domestic_funding BOOL, government_funding BOOL, private_funding BOOL, international_colab BOOL>>) AS ( @@ -570,20 +516,13 @@ CREATE TEMP FUNCTION compute_disciplines( SUM(Score) as sum_of_scores, STRUCT( STRUCT( - SUM(citations.mag) as total_citations, - compute_percentiles(ARRAY_AGG(citations.mag)) as percentiles - ) as mag, - STRUCT( - SUM(citations.openalex) as total_citations, - compute_percentiles(ARRAY_AGG(citations.openalex)) as percentiles + SUM(citations.openalex) as total_citations ) as openalex, STRUCT( - SUM(citations.open_citations) as total_citations, - compute_percentiles(ARRAY_AGG(citations.open_citations)) as percentiles + SUM(citations.open_citations) as total_citations ) as open_citations, STRUCT( - SUM(citations.crossref) as total_citations, - compute_percentiles(ARRAY_AGG(citations.crossref)) as percentiles + SUM(citations.crossref) as total_citations ) as crossref ) as citations, COUNTIF(is_oa) as num_oa_outputs, @@ -657,7 +596,6 @@ citations RECORD NULLABLE citations.openalex INTEGER NULLABLE citations.crosssref INTEGER NULLABLE citations.open_citations INTEGER NULLABLE -citations.mag INTEGER NULLABLE disciplines RECORD REPEATED * Schema for disciplines captured above @@ -678,8 +616,7 @@ CREATE TEMP FUNCTION process_relations(relationships ANY TYPE, total INT64, tota STRUCT( SUM(citations.openalex) as openalex, SUM(citations.crossref) as crosssref, - SUM(citations.open_citations) as open_citations, - SUM(citations.mag) as mag + SUM(citations.open_citations) as open_citations ) as citations, group_disciplines(ARRAY_CONCAT_AGG(disciplines)) as disciplines FROM UNNEST(relationships) as relations @@ -784,8 +721,7 @@ WITH tmp_disciplines AS STRUCT( dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, - dois.open_citations.citations_total as open_citations, - dois.mag.CitationCount as mag + dois.open_citations.citations_total as 
open_citations ) as citations, unpaywall.is_oa as is_oa, unpaywall.green as green, unpaywall.gold as gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only, -- Total Funding @@ -818,7 +754,7 @@ tmp_access_types AS ( compute_access_types( ARRAY_AGG( STRUCT( - dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, unpaywall.is_oa, unpaywall.green, unpaywall.gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only ) ) @@ -864,7 +800,7 @@ SELECT compute_citations( ARRAY_AGG( STRUCT( - dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations + dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations ) ) ) as citations, @@ -873,7 +809,7 @@ SELECT count_output_types( ARRAY_AGG( STRUCT( - unpaywall.output_type, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CItationCount as mag) as citations, + unpaywall.output_type, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, unpaywall.is_oa, unpaywall.green, unpaywall.gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only ) ) @@ -906,7 +842,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.institutions) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -930,7 +866,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.countries) as relation WHERE relation.identifier <> aggregrate.country_code OR aggregrate.country_code IS NULL) @@ -954,7 +890,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + 
STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.groupings) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -978,7 +914,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.funders) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -1005,7 +941,7 @@ SELECT ) as relation, unpaywall, STRUCT( - dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag + dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations ) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(aggregrate.members) as relation) @@ -1029,7 +965,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.publishers) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -1053,7 +989,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.journals) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -1074,7 +1010,7 @@ SELECT SELECT AS STRUCT event.source, event.count, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, unpaywall.is_oa as is_oa, unpaywall.green as green, unpaywall.gold 
as gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only FROM UNNEST(dois.events.events) as event))) ) as events, diff --git a/academic_observatory_workflows/database/sql/export_access_types.sql.jinja2 b/academic_observatory_workflows/database/sql/export_access_types.sql.jinja2 index 8583b711e..5d1b6c0af 100644 --- a/academic_observatory_workflows/database/sql/export_access_types.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/export_access_types.sql.jinja2 @@ -23,6 +23,5 @@ SELECT access_type.outputs_with_citations as access_types_outputs_with_citations, access_type.outputs_without_citations as access_types_outputs_without_citations, access_type.citations.openalex.total_citations as access_types_total_citations, - access_type.citations.openalex.percentiles.median as access_types_median_citations_per_output FROM `{{ table_id }}`, UNNEST( access_types.breakdown ) as access_type ORDER BY id, published_year ASC \ No newline at end of file diff --git a/academic_observatory_workflows/database/sql/export_disciplines.sql.jinja2 b/academic_observatory_workflows/database/sql/export_disciplines.sql.jinja2 index fd2ac6fbe..9741fa76a 100644 --- a/academic_observatory_workflows/database/sql/export_disciplines.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/export_disciplines.sql.jinja2 @@ -30,7 +30,6 @@ SELECT ROUND(SAFE_DIVIDE( ( discipline.num_green_outputs ) * 100 , discipline.total_outputs ), 2) as disciplines_percent_green, ROUND(SAFE_DIVIDE( ( discipline.num_gold_outputs ) * 100 , discipline.total_outputs ), 2) as disciplines_percent_gold, discipline.citations.openalex.total_citations as disciplines_total_citations, - discipline.citations.openalex.percentiles.median as disciplines_median_citations_per_output, discipline.funding.total_funded_outputs as disciplines_total_funded_outputs, discipline.funding.num_international_outputs as disciplines_num_international_funded_outputs, discipline.funding.num_domestic_outputs as disciplines_num_domestic_funded_outputs, diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json index 375e75a25..9e2de6d6e 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b49ae1aedf81fc4a8dd35379fa8a48dbf2ce13d02319c4691c5ed5a8e981da7 -size 7823 +oid sha256:befe9a21ace1f764aa2898292df137d17934d1b7d6f7fad4550064bd597b95f5 +size 7831 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json index a90eed227..04bee3b27 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bd7abd62f450450d2a8a17645ee358af0fd4707c37b94ab5e6bbfe93729e072 -size 3549 +oid sha256:5f8156a2d393590d0bdf59542509bdb0104d4e463fb7d3913b4ac77df24f9647 +size 4365 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json 
b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json index f4a60de46..7de1c0ffc 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e169145a2bf24ffcb0e44c20a7c1adcf86d6249f2ac6bd8b1b17af20965d2981 -size 63659 +oid sha256:e7af41245c3279bdef11dce94afa87282ea22463eed22ad692c2a765c1f5cee5 +size 66011 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json index 8f2e78cfd..6e8272084 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15489b3de773811b568079c6faec6eed6b3eae93d5be839cd270ce73df73692f -size 40406 +oid sha256:2ea8182e507898c6e4638c280b38db7b6d197e495c9673a675b11af4530bfdfd +size 42826 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json index 29743dac5..1bc0793f3 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:073423dac00bcc41a554dece518f24ac7647fbc900da1417c8bfaeeb9066ad8b -size 8180 +oid sha256:7a59eabc9986ab52e0abfa11a568043f0cb30d0855babfdbc8f95e0737d622ef +size 10630 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json index 22f864c3f..6466cf2e1 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a75e223b6812f803ae5b13340a53f6cb9703f9bcb0855bc6b0bc83d469e902e9 -size 23983 +oid sha256:eda11d6389dea9d28fa2d523a3e1da4a82aad7c40c43edac4026d9d02597bf3a +size 26239 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json index e1d4fa5cb..f33846a96 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a748ded838d21c426a3a62b9e4c6cad1d29cfe26ed6151b4fa445f5f580ba093 -size 61739 +oid sha256:2b99194cd7199fa282aaa1656f33f3f119734ce7051e5e81a8b626a52cd3d977 +size 64316 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json index 375e75a25..9e2de6d6e 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json +++ 
b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b49ae1aedf81fc4a8dd35379fa8a48dbf2ce13d02319c4691c5ed5a8e981da7 -size 7823 +oid sha256:befe9a21ace1f764aa2898292df137d17934d1b7d6f7fad4550064bd597b95f5 +size 7831 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json index ea1d97d70..2777d7f88 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9584a66707b4e6cb62c5b482ac55ea6ff59d37102ca1d8b73ab16eecb373a663 -size 48934 +oid sha256:2af6dd859abdd351c261e7bb3e43ab90bce6531bf6c5fda0a26302bea59be6c7 +size 49088 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json index 38a5cf053..9cecf53f3 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ade95bb53f2ca4fb19b4cd73becd361fa4099fb5738c412a055008441fd50c57 -size 3660 +oid sha256:cc51e11fa79a49ea33c9e1385d28b99171064ad9374652af39392d972f43b6f7 +size 4476 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json index 2ad233117..66bde0294 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd3b6945eb25e2925921d59465a39c5d848fa1e6bbf9bc5cfc701cbd13871a9c -size 75948 +oid sha256:e1f583377c7d3c34b64615c61f0c9ae78529aa0bdbf5a89475c626712c39b6b5 +size 78888 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json index 62f8ca72d..1ebd87369 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdf6a182144f02c18d0ac02fc813a79f9f78062a2ea72bbf80aac909af60b869 -size 39267 +oid sha256:3dd9bd6a49028df3f3d4de54188d337f9099429d5e41e603abc2f1f65f5641ae +size 41687 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json index 2a185c8fa..9c756b976 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73c4b4795ade705963241dfd4808c2f8f62395194b094ac93442d93b1489b3d9 -size 10176 +oid 
sha256:f8139671554c19329f1b95543db743f8b01ac792f6461bbab24b777e6a6a97c5 +size 13231 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json index 2d0e789bc..f202d0d0a 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34587aa2c068d00e63bc9cac65d9c4562f1090e60221719b01d29df16db415f6 -size 22112 +oid sha256:7c22e51ff3de26e723ae1b25661bfba04b8c96560646395df8c6ace923d467f5 +size 24464 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json index 746eca5de..39e728463 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:569f53208acf8520eb265ddbf4b3e511423e4944d79198f1e2421718abb0cc70 -size 51589 +oid sha256:e6d8c9c486421f6bf238155f62add07542ec0f466b588672cb524032921fce48 +size 53806 diff --git a/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_load_main_table.json b/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_load_main_table.json index ba8a9f3ef..13c03eb99 100644 --- a/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_load_main_table.json +++ b/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_load_main_table.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26e30270c0b0dfb4ee9512c89526c32a2a42bb93bdec557054a79d16660f1197 -size 55555 +oid sha256:ae5d43f64d6225554385980f3a4534b7dab20b1397c344aaefd7fa2d998963fc +size 57427 diff --git a/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_upsert_records.json b/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_upsert_records.json index 0e88a540c..0f485bc0c 100644 --- a/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_upsert_records.json +++ b/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_upsert_records.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3e6f1102a9addd369c709b71c222d8b0d909da5d9be07b124b50bb25761ff4c -size 55568 +oid sha256:39981fb96d12876a528aabc7878910cfa796a208714c95e7a474824c4f27d781 +size 57440 diff --git a/academic_observatory_workflows/fixtures/unpaywall/expected/run3_bq_upsert_records.json b/academic_observatory_workflows/fixtures/unpaywall/expected/run3_bq_upsert_records.json index b06eab2aa..b5ef8e47d 100644 --- a/academic_observatory_workflows/fixtures/unpaywall/expected/run3_bq_upsert_records.json +++ b/academic_observatory_workflows/fixtures/unpaywall/expected/run3_bq_upsert_records.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd759c19ead8b35cf0b6489a6a7460118b3e851bcff51211399a857cd0f88bfd -size 58096 +oid sha256:a54512b3aa901a302ac37f63c9f99b47c34ac8a18191b5b921885fdb0fc227f9 +size 59968 diff --git a/academic_observatory_workflows/workflows/crossref_events_telescope.py b/academic_observatory_workflows/workflows/crossref_events_telescope.py index 7b23d5761..2504fb77b 100644 --- a/academic_observatory_workflows/workflows/crossref_events_telescope.py +++ 
b/academic_observatory_workflows/workflows/crossref_events_telescope.py @@ -165,11 +165,11 @@ def __init__( dag_id: str, cloud_workspace: CloudWorkspace, events_start_date: pendulum.DateTime = pendulum.datetime(2017, 2, 17), - bq_dataset_id: str = "crossref", + bq_dataset_id: str = "crossref_events", bq_table_name: str = "crossref_events", api_dataset_id: str = "crossref_events", schema_folder: str = os.path.join(default_schema_folder(), "crossref_events"), - dataset_description: str = "Datasets created by Crossref: https://www.crossref.org/", + dataset_description: str = "The Crossref Events dataset: https://www.eventdata.crossref.org/guide/", table_description: str = "The Crossref Events dataset: https://www.eventdata.crossref.org/guide/", snapshot_expiry_days: int = 31, n_rows: int = 1000, @@ -515,7 +515,7 @@ def bq_load_main_table(self, release: CrossrefEventsRelease, **kwargs): schema_file_path=self.schema_file_path, source_format=SourceFormat.NEWLINE_DELIMITED_JSON, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) set_task_state(success, self.bq_load_main_table.__name__, release) diff --git a/academic_observatory_workflows/workflows/crossref_fundref_telescope.py b/academic_observatory_workflows/workflows/crossref_fundref_telescope.py index 44942750a..fe30dac1a 100644 --- a/academic_observatory_workflows/workflows/crossref_fundref_telescope.py +++ b/academic_observatory_workflows/workflows/crossref_fundref_telescope.py @@ -76,11 +76,11 @@ def __init__( *, dag_id: str, cloud_workspace: CloudWorkspace, - bq_dataset_id: str = "crossref", + bq_dataset_id: str = "crossref_fundref", bq_table_name: str = "crossref_fundref", api_dataset_id: str = "crossref_fundref", schema_folder: str = os.path.join(default_schema_folder(), "crossref_fundref"), - dataset_description: str = "Datasets created by Crossref: https://www.crossref.org/", + dataset_description: str = "The Crossref Funder Registry dataset: https://www.crossref.org/services/funder-registry/", table_description: str = "The Crossref Funder Registry dataset: https://www.crossref.org/services/funder-registry/", observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, start_date: pendulum.DateTime = pendulum.datetime(2014, 2, 23), diff --git a/academic_observatory_workflows/workflows/crossref_metadata_telescope.py b/academic_observatory_workflows/workflows/crossref_metadata_telescope.py index c1f7dd5e0..6b3ba81c8 100644 --- a/academic_observatory_workflows/workflows/crossref_metadata_telescope.py +++ b/academic_observatory_workflows/workflows/crossref_metadata_telescope.py @@ -18,7 +18,6 @@ from __future__ import annotations import functools -import gzip import json import logging import os @@ -76,7 +75,7 @@ def __init__(self, *, dag_id: str, run_id: str, snapshot_date: pendulum.DateTime self.download_file_name = "crossref_metadata.json.tar.gz" self.download_file_path = os.path.join(self.download_folder, self.download_file_name) self.extract_files_regex = r".*\.json$" - self.transform_files_regex = r".*\.jsonl.gz$" + self.transform_files_regex = r".*\.jsonl$" class CrossrefMetadataTelescope(Workflow): @@ -91,16 +90,16 @@ def __init__( *, dag_id: str, cloud_workspace: CloudWorkspace, - bq_dataset_id: str = "crossref", + bq_dataset_id: str = "crossref_metadata", bq_table_name: str = "crossref_metadata", api_dataset_id: str = "crossref_metadata", schema_folder: str = os.path.join(default_schema_folder(), "crossref_metadata"), - dataset_description: str = "Datasets 
created by Crossref: https://www.crossref.org/", + dataset_description: str = "The Crossref Metadata Plus dataset: https://www.crossref.org/services/metadata-retrieval/metadata-plus/", table_description: str = "The Crossref Metadata Plus dataset: https://www.crossref.org/services/metadata-retrieval/metadata-plus/", crossref_metadata_conn_id: str = "crossref_metadata", observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, max_processes: int = os.cpu_count(), - batch_size: int = 200, + batch_size: int = 20, start_date: pendulum.DateTime = pendulum.datetime(2020, 6, 7), schedule_interval: str = "0 0 7 * *", catchup: bool = True, @@ -241,10 +240,11 @@ def upload_downloaded(self, release: CrossrefMetadataRelease, **kwargs): def transform(self, release: CrossrefMetadataRelease, **kwargs): """Task to transform the CrossrefMetadataRelease release for a given month. - Each extracted file is transformed. This is done in parallel using the ThreadPoolExecutor.""" + Each extracted file is transformed.""" logging.info(f"Transform input folder: {release.extract_folder}, output folder: {release.transform_folder}") clean_dir(release.transform_folder) + finished = 0 # List files and sort so that they are processed in ascending order input_file_paths = natsorted(list_files(release.extract_folder, release.extract_files_regex)) @@ -256,21 +256,16 @@ def transform(self, release: CrossrefMetadataRelease, **kwargs): # Create tasks for each file for input_file in chunk: - future = executor.submit(transform_file, input_file) + output_file = os.path.join(release.transform_folder, os.path.basename(input_file) + "l") + future = executor.submit(transform_file, input_file, output_file) futures.append(future) - # Write data from batch into a single jsonl.gz file - # The output file will be a json lines gzip file, hence adding the 'l.gz' to the file extension - file_path = os.path.join(release.transform_folder, f"crossref_metadata_{i:012}.jsonl.gz") - with gzip.open(file_path, "wb") as gzip_file: - with jsonlines.Writer(gzip_file) as writer: - # Write data to the jsonlines.Writer as it becomes available - for future in as_completed(futures): - data = future.result() - writer.write_all(data) - - if i % 1000 == 0: - logging.info(f"Transformed {i + 1} files") + # Wait for completed tasks + for future in as_completed(futures): + future.result() + finished += 1 + if finished % 1000 == 0: + logging.info(f"Transformed {finished} files") def upload_transformed(self, release: CrossrefMetadataRelease, **kwargs) -> None: """Upload the transformed data to Cloud Storage.""" @@ -294,7 +289,7 @@ def bq_load(self, release: CrossrefMetadataRelease, **kwargs): # subfolders: https://cloud.google.com/bigquery/docs/batch-loading-data#load-wildcards uri = gcs_blob_uri( self.cloud_workspace.transform_bucket, - f"{gcs_blob_name_from_path(release.transform_folder)}/*.jsonl.gz", + f"{gcs_blob_name_from_path(release.transform_folder)}/*.jsonl", ) table_id = bq_sharded_table_id( self.cloud_workspace.output_project_id, self.bq_dataset_id, self.bq_table_name, release.snapshot_date @@ -363,24 +358,23 @@ def check_release_exists(month: pendulum.DateTime, api_key: str) -> bool: return False -def transform_file(input_file_path: str): +def transform_file(input_file_path: str, output_file_path: str): """Transform a single Crossref Metadata json file. The json file is converted to a jsonl file and field names are transformed so they are accepted by BigQuery. :param input_file_path: the path of the file to transform. 
+ :param output_file_path: where to save the transformed file. :return: None. """ # Open json - with open(input_file_path, mode="r") as input_file: - input_data = json.load(input_file) - - # Transform data - output_data = [] - for item in input_data["items"]: - output_data.append(transform_item(item)) + with open(input_file_path, mode="r") as in_file: + input_data = json.load(in_file) - return output_data + # Transform and write + with jsonlines.open(output_file_path, mode="w", compact=True) as out_file: + for item in input_data["items"]: + out_file.write(transform_item(item)) def transform_item(item): diff --git a/academic_observatory_workflows/workflows/doi_workflow.py b/academic_observatory_workflows/workflows/doi_workflow.py index e140243da..b8eb27eb2 100644 --- a/academic_observatory_workflows/workflows/doi_workflow.py +++ b/academic_observatory_workflows/workflows/doi_workflow.py @@ -102,14 +102,14 @@ class Aggregation: def make_dataset_transforms( input_project_id: str, output_project_id: str, - dataset_id_crossref_events: str = "crossref", - dataset_id_crossref_metadata: str = "crossref", - dataset_id_crossref_fundref: str = "crossref", + dataset_id_crossref_events: str = "crossref_events", + dataset_id_crossref_metadata: str = "crossref_metadata", + dataset_id_crossref_fundref: str = "crossref_fundref", dataset_id_ror: str = "ror", dataset_id_mag: str = "mag", dataset_id_orcid: str = "orcid", dataset_id_open_citations: str = "open_citations", - dataset_id_unpaywall: str = "our_research", + dataset_id_unpaywall: str = "unpaywall", dataset_id_openalex: str = "openalex", dataset_id_settings: str = "settings", dataset_id_observatory: str = "observatory", @@ -515,7 +515,7 @@ def __init__( bq_dashboards_dataset_id: str = "coki_dashboards", bq_observatory_dataset_id: str = "observatory", bq_elastic_dataset_id: str = "data_export", - bq_unpaywall_dataset_id: str = "our_research", + bq_unpaywall_dataset_id: str = "unpaywall", bq_ror_dataset_id: str = "ror", api_dataset_id: str = "doi", transforms: Tuple = None, diff --git a/academic_observatory_workflows/workflows/oa_web_workflow.py b/academic_observatory_workflows/workflows/oa_web_workflow.py index 1ad9282cd..916cee745 100644 --- a/academic_observatory_workflows/workflows/oa_web_workflow.py +++ b/academic_observatory_workflows/workflows/oa_web_workflow.py @@ -83,7 +83,7 @@ ("outputs_public", "n_outputs_other_platform_open"), ("outputs_other_internet", "n_outputs_other_platform_open"), ] -INCLUSION_THRESHOLD = {"country": 1, "institution": 700} +INCLUSION_THRESHOLD = {"country": 15, "institution": 800} MAX_REPOSITORIES = 200 START_YEAR = 2000 END_YEAR = pendulum.now().year - 1 @@ -576,6 +576,8 @@ def build_indexes(self, release: OaWebRelease, **kwargs): # Aggregate data file df_index = make_index_df(category, df_index, df_data) + logging.info(f"Total {category} entities: {len(df_index)}") + # Save index to intermediate index_path = os.path.join(release.intermediate_path, index_name) rows: List[Dict] = df_index.to_dict("records") @@ -1826,7 +1828,8 @@ def save_coki_oa_dataset(path: str, countries: List[Entity], institutions: List[ subset = { "id": None, "name": None, - "country": None, + "country_name": None, + "country_code": None, "subregion": None, "region": None, "institution_type": None, diff --git a/academic_observatory_workflows/workflows/openalex_telescope.py b/academic_observatory_workflows/workflows/openalex_telescope.py index 6db9411b4..aea3ed277 100644 --- a/academic_observatory_workflows/workflows/openalex_telescope.py +++ 
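The per-file transform above replaces the old batched jsonl.gz writer: each extracted .json now becomes a sibling uncompressed .jsonl that the *.jsonl load wildcard later picks up. A minimal sketch of that flow, with purely illustrative paths (the real task applies transform_item to each record):

import json
import os

import jsonlines


def transform_one(input_file: str, transform_folder: str) -> str:
    # "0001.json" becomes "<transform_folder>/0001.jsonl" (basename + "l"), mirroring the telescope
    output_file = os.path.join(transform_folder, os.path.basename(input_file) + "l")

    # Each extracted Crossref Metadata file holds one JSON object with an "items" array
    with open(input_file, mode="r") as f:
        items = json.load(f)["items"]

    # Write one record per line; compact=True keeps each record on a single line
    with jsonlines.open(output_file, mode="w", compact=True) as writer:
        for item in items:
            writer.write(item)  # the real workflow calls transform_item() on each record first

    return output_file

# BigQuery then loads every per-file output with a wildcard URI along the lines of:
# gs://<transform-bucket>/<blob-prefix>/*.jsonl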
b/academic_observatory_workflows/workflows/openalex_telescope.py @@ -656,7 +656,7 @@ def bq_load_upsert_tables(self, release: OpenAlexRelease, **kwargs): schema_file_path=entity.schema_file_path, source_format=SourceFormat.NEWLINE_DELIMITED_JSON, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) assert ( success @@ -699,7 +699,7 @@ def bq_load_delete_tables(self, release: OpenAlexRelease, **kwargs): source_format=SourceFormat.CSV, csv_skip_leading_rows=1, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) assert ( success @@ -943,9 +943,12 @@ def fetch_merged_ids( results = [] for page in paginator.paginate(Bucket=bucket, Prefix=f"{prefix}/{entity_name}"): for content in page.get("Contents", []): - url = f"s3://{bucket}/{content['Key']}" - content_length = content["Size"] - results.append(MergedId(url, content_length)) + obj_key = content["Key"] + # There is a dud file in data/merged_ids/sources/ + if obj_key != "data/merged_ids/sources/.csv": + url = f"s3://{bucket}/{obj_key}" + content_length = content["Size"] + results.append(MergedId(url, content_length)) # Sort from oldest to newest results.sort(key=lambda m: m.updated_date, reverse=False) @@ -970,16 +973,13 @@ def transform_file(download_path: str, transform_path: str): with gzip.open(download_path, "rb") as f_in, gzip.open(transform_path, "wt", encoding="ascii") as f_out: reader = jsonlines.Reader(f_in) for obj in reader.iter(skip_empty=True): - if "works" in download_path: - transform_object(obj, "abstract_inverted_index") - else: - transform_object(obj, "international") + transform_object(obj) json.dump(obj, f_out) f_out.write("\n") logging.info(f"Finished transform, saved to {transform_path}") -def transform_object(obj: dict, field: str): +def transform_object(obj: dict): """Transform an entry/object for one of the OpenAlex entities. For the Work entity only the "abstract_inverted_index" field is transformed. For the Concept and Institution entities only the "international" field is transformed. @@ -988,18 +988,40 @@ def transform_object(obj: dict, field: str): :param field: The field of interested that is transformed. :return: None. 
""" - if field == "international": - for nested_field in obj.get(field, {}).keys(): - if not isinstance(obj[field][nested_field], dict): - continue - keys = list(obj[field][nested_field].keys()) - values = list(obj[field][nested_field].values()) - obj[field][nested_field] = {"keys": keys, "values": values} - elif field == "abstract_inverted_index": + # Remove nulls from arrays + # And handle null value + field = "corresponding_institution_ids" + if field in obj: + value = obj.get(field, []) + if value is None: + value = [] + obj[field] = [x for x in value if x is not None] + + # Remove nulls from arrays + # And handle null value + field = "corresponding_author_ids" + if field in obj: + value = obj.get(field, []) + if value is None: + value = [] + obj[field] = [x for x in value if x is not None] + + field = "abstract_inverted_index" + if field in obj: if not isinstance(obj.get(field), dict): return keys = list(obj[field].keys()) values = [str(value)[1:-1] for value in obj[field].values()] obj[field] = {"keys": keys, "values": values} + + field = "international" + if field in obj: + for nested_field in obj.get(field, {}).keys(): + if not isinstance(obj[field][nested_field], dict): + continue + keys = list(obj[field][nested_field].keys()) + values = list(obj[field][nested_field].values()) + + obj[field][nested_field] = {"keys": keys, "values": values} diff --git a/academic_observatory_workflows/workflows/ror_telescope.py b/academic_observatory_workflows/workflows/ror_telescope.py index 1b4be120e..fb9221996 100644 --- a/academic_observatory_workflows/workflows/ror_telescope.py +++ b/academic_observatory_workflows/workflows/ror_telescope.py @@ -20,6 +20,7 @@ import logging import math import os +import shutil import urllib.parse from typing import List, Any, Dict from zipfile import BadZipFile, ZipFile @@ -222,6 +223,12 @@ def extract(self, releases: List[RorRelease], **kwargs): raise AirflowException("Not a zip file") logging.info(f"File extracted to: {release.extract_folder}") + # Remove dud __MACOSX folder that shouldn't be there + try: + shutil.rmtree(os.path.join(release.extract_folder, "__MACOSX")) + except FileNotFoundError: + pass + def transform(self, releases: List[RorRelease], **kwargs): """Task to transform the ROR releases.""" diff --git a/academic_observatory_workflows/workflows/scopus_telescope.py b/academic_observatory_workflows/workflows/scopus_telescope.py index 2fae4fd39..0a686cdc2 100644 --- a/academic_observatory_workflows/workflows/scopus_telescope.py +++ b/academic_observatory_workflows/workflows/scopus_telescope.py @@ -101,7 +101,7 @@ def __init__( scopus_conn_ids: List[str], view: str = "STANDARD", earliest_date: pendulum.DateTime = pendulum.datetime(1800, 1, 1), - bq_dataset_id: str = "elsevier", + bq_dataset_id: str = "scopus", bq_table_name: str = "scopus", api_dataset_id: str = "scopus", schema_folder: str = os.path.join(default_schema_folder(), "scopus"), diff --git a/academic_observatory_workflows/workflows/tests/test_crossref_metadata_telescope.py b/academic_observatory_workflows/workflows/tests/test_crossref_metadata_telescope.py index a768661e4..68cc92042 100644 --- a/academic_observatory_workflows/workflows/tests/test_crossref_metadata_telescope.py +++ b/academic_observatory_workflows/workflows/tests/test_crossref_metadata_telescope.py @@ -173,10 +173,9 @@ def test_telescope(self): ti = env.run_task(workflow.transform.__name__) self.assertEqual(State.SUCCESS, ti.state) file_paths = list_files(release.transform_folder, release.transform_files_regex) - 
self.assertEqual(1, len(file_paths)) + self.assertEqual(5, len(file_paths)) for file_path in file_paths: self.assertTrue(os.path.isfile(file_path)) - self.assertTrue(is_gzip(file_path)) # Test that transformed files uploaded ti = env.run_task(workflow.upload_transformed.__name__) @@ -311,7 +310,9 @@ def test_transform_file(self): "issn_type": [{"value": "0003-987X", "type": "print"}], } ] - actual_results = transform_file(input_file_path) + output_file_path = os.path.join(t, "output.jsonl") + transform_file(input_file_path, output_file_path) + actual_results = load_jsonl(output_file_path) self.assertEqual(expected_results, actual_results) def test_transform_item(self): diff --git a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py index ce3430989..2ad95c33c 100644 --- a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py +++ b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py @@ -369,51 +369,57 @@ def test_fetch_merged_ids(self): actual = fetch_merged_ids(bucket=bucket_name, aws_key=self.aws_key, entity_name="authors") self.assertEqual(expected, actual) - @patch("academic_observatory_workflows.workflows.openalex_telescope.transform_object") - def test_transform_file(self, mock_transform_object): - """Test the transform_file function.""" - - mock_transform_object.return_value = {} - with CliRunner().isolated_filesystem() as t: - transform_path = "transform/out.jsonl.gz" - - # Create works entity file - works = {"works": "content"} - works_download_path = "works.jsonl.gz" - with gzip.open(works_download_path, "wt", encoding="ascii") as f_out: - json.dump(works, f_out) - - # Create other entity file (concepts or institution) - concepts = {"concepts": "content"} - concepts_download_path = "concepts.jsonl.gz" - with gzip.open(concepts_download_path, "wt", encoding="ascii") as f_out: - json.dump(concepts, f_out) - - # Test when dir of transform path does not exist yet, using 'works' entity' - self.assertFalse(os.path.isdir(os.path.dirname(transform_path))) - - transform_file(works_download_path, transform_path) - mock_transform_object.assert_called_once_with(works, "abstract_inverted_index") - mock_transform_object.reset_mock() - os.remove(transform_path) - - # Test when dir of transform path does exist, using 'works' entity - self.assertTrue(os.path.isdir(os.path.dirname(transform_path))) - - transform_file(works_download_path, transform_path) - self.assert_file_integrity(transform_path, "682a6d42", "gzip_crc") - mock_transform_object.assert_called_once_with(works, "abstract_inverted_index") - mock_transform_object.reset_mock() - os.remove(transform_path) - - # Test for "concepts" and "institution" entities - transform_file(concepts_download_path, transform_path) - self.assert_file_integrity(transform_path, "d8cafe16", "gzip_crc") - mock_transform_object.assert_called_once_with(concepts, "international") - def test_transform_object(self): """Test the transform_object function.""" + # Null + obj = { + "corresponding_institution_ids": None + } + transform_object(obj) + self.assertDictEqual( + { + "corresponding_institution_ids": [] + }, + obj, + ) + + # Null + obj = { + "corresponding_author_ids": None + } + transform_object(obj) + self.assertDictEqual( + { + "corresponding_author_ids": [] + }, + obj, + ) + + # Null in array + obj = { + "corresponding_institution_ids": [None] + } + transform_object(obj) + self.assertDictEqual( + { + 
"corresponding_institution_ids": [] + }, + obj, + ) + + # Null in array + obj = { + "corresponding_author_ids": [None] + } + transform_object(obj) + self.assertDictEqual( + { + "corresponding_author_ids": [] + }, + obj, + ) + # Test object with nested "international" fields obj1 = { "international": { @@ -424,7 +430,7 @@ def test_transform_object(self): } } } - transform_object(obj1, "international") + transform_object(obj1) self.assertDictEqual( { "international": { @@ -443,7 +449,7 @@ def test_transform_object(self): # Test object with nested "international" none obj2 = {"international": {"display_name": None}} - transform_object(obj2, "international") + transform_object(obj2) self.assertDictEqual({"international": {"display_name": None}}, obj2) # Test object with nested "abstract_inverted_index" fields @@ -457,7 +463,7 @@ def test_transform_object(self): "primarily": [5], } } - transform_object(obj3, "abstract_inverted_index") + transform_object(obj3) self.assertDictEqual( { "abstract_inverted_index": { @@ -470,7 +476,7 @@ def test_transform_object(self): # Test object with nested "abstract_inverted_index" none obj4 = {"abstract_inverted_index": None} - transform_object(obj4, "abstract_inverted_index") + transform_object(obj4) self.assertDictEqual({"abstract_inverted_index": None}, obj4) diff --git a/academic_observatory_workflows/workflows/unpaywall_telescope.py b/academic_observatory_workflows/workflows/unpaywall_telescope.py index e0c4e2477..94eec490b 100644 --- a/academic_observatory_workflows/workflows/unpaywall_telescope.py +++ b/academic_observatory_workflows/workflows/unpaywall_telescope.py @@ -203,11 +203,11 @@ def __init__( *, dag_id: str, cloud_workspace: CloudWorkspace, - bq_dataset_id: str = "our_research", + bq_dataset_id: str = "unpaywall", bq_table_name: str = "unpaywall", api_dataset_id: str = "unpaywall", schema_folder: str = os.path.join(default_schema_folder(), "unpaywall"), - dataset_description: str = "Our Research datasets: http://ourresearch.org/", + dataset_description: str = "Unpaywall Data Feed: https://unpaywall.org/products/data-feed", table_description: str = "Unpaywall Data Feed: https://unpaywall.org/products/data-feed", primary_key: str = "doi", snapshot_expiry_days: int = 7, @@ -565,7 +565,7 @@ def bq_load_main_table(self, release: UnpaywallRelease, **kwargs) -> None: source_format=SourceFormat.NEWLINE_DELIMITED_JSON, table_description=self.table_description, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) set_task_state(success, self.bq_load_upsert_table.__name__, release) @@ -655,7 +655,7 @@ def bq_load_upsert_table(self, release: UnpaywallRelease, **kwargs) -> None: source_format=SourceFormat.NEWLINE_DELIMITED_JSON, table_description=self.table_description, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) set_task_state(success, self.bq_load_upsert_table.__name__, release) diff --git a/academic_observatory_workflows/workflows/web_of_science_telescope.py b/academic_observatory_workflows/workflows/web_of_science_telescope.py index a7308a682..b256ee552 100644 --- a/academic_observatory_workflows/workflows/web_of_science_telescope.py +++ b/academic_observatory_workflows/workflows/web_of_science_telescope.py @@ -99,7 +99,7 @@ def __init__( institution_ids: List[str], wos_conn_id: str, earliest_date: pendulum.DateTime = pendulum.datetime(1800, 1, 1), - bq_dataset_id: str = "clarivate", + bq_dataset_id: str = 
"web_of_science", bq_table_name: str = "web_of_science", api_dataset_id: str = "web_of_science", schema_folder: str = os.path.join(default_schema_folder(), "web_of_science"), From 736f40fe3572d98e9bbaec26313abf74bab5dace Mon Sep 17 00:00:00 2001 From: Jamie Diprose <5715104+jdddog@users.noreply.github.com> Date: Tue, 20 Jun 2023 09:39:31 +1200 Subject: [PATCH 3/3] Filter Crossref type and fix release_date naming in oa web workflow (#167) --- .../database/sql/create_aggregate.sql.jinja2 | 10 ++++++++++ academic_observatory_workflows/model.py | 4 ++++ .../workflows/oa_web_workflow.py | 8 ++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 index 32ab6958a..56f855e16 100644 --- a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 @@ -13,6 +13,10 @@ # limitations under the License. # Author: Richard Hosking #} +{#Include:journal-article,proceedings-article,report,posted-content,edited-book,book,book-chapter,reference-book,monograph,other,book-section,book-part,reference-entry#} +{#Exclude: the types below and NULL#} +{% set CROSSREF_TYPES_TO_EXCLUDE = '("dataset","database","component","report-component","peer-review","grant","proceedings","journal-issue","report-series","book-track")' %} + # Helper Function: Counting Access Types {# Output Schema: @@ -741,6 +745,8 @@ WITH tmp_disciplines AS UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate WHERE aggregrate.identifier IS NOT NULL + AND dois.crossref.type IS NOT NULL + AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }} GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} @@ -764,6 +770,8 @@ tmp_access_types AS ( UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate WHERE aggregrate.identifier IS NOT NULL + AND dois.crossref.type IS NOT NULL + AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }} GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} @@ -1017,6 +1025,8 @@ SELECT FROM `{{ project_id }}.{{ dataset_id }}.doi{{ snapshot_date.strftime('%Y%m%d') }}` as dois, UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate WHERE aggregrate.identifier IS NOT NULL +AND dois.crossref.type IS NOT NULL +AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }} GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} ) diff --git a/academic_observatory_workflows/model.py b/academic_observatory_workflows/model.py index 46c82bf51..a0e71ad44 100644 --- a/academic_observatory_workflows/model.py +++ b/academic_observatory_workflows/model.py @@ -206,6 +206,7 @@ class Paper: id: int doi: str = None title: str = None + type: str = None published_date: pendulum.Date = None output_type: str = None authors: List[Author] = None @@ -790,6 +791,7 @@ def make_papers( # Make paper paper = Paper( i, + type="journal-article", doi=doi_, title=title_, published_date=published_date_, @@ -1099,6 +1101,7 @@ def make_crossref_metadata(dataset: ObservatoryDataset) -> List[Dict]: # Add Crossref record records.append( { + "type": paper.type, "title": [paper.title], "DOI": paper.doi, "is_referenced_by_count": len(paper.cited_by), @@ -1483,6 +1486,7 @@ def make_doi_table(dataset: ObservatoryDataset) -> List[Dict]: { "doi": doi, "crossref": { + "type": paper.type, "title": paper.title, "published_year": paper.published_date.year, 
"published_month": paper.published_date.month, diff --git a/academic_observatory_workflows/workflows/oa_web_workflow.py b/academic_observatory_workflows/workflows/oa_web_workflow.py index 916cee745..2be7ee851 100644 --- a/academic_observatory_workflows/workflows/oa_web_workflow.py +++ b/academic_observatory_workflows/workflows/oa_web_workflow.py @@ -83,7 +83,7 @@ ("outputs_public", "n_outputs_other_platform_open"), ("outputs_other_internet", "n_outputs_other_platform_open"), ] -INCLUSION_THRESHOLD = {"country": 15, "institution": 800} +INCLUSION_THRESHOLD = {"country": 15, "institution": 1000} MAX_REPOSITORIES = 200 START_YEAR = 2000 END_YEAR = pendulum.now().year - 1 @@ -697,7 +697,7 @@ def build_datasets(self, release: OaWebRelease, **kwargs): ) for version in versions ] - last_updated = zenodo_versions[0].snapshot_date.format("D MMMM YYYY") + last_updated = zenodo_versions[0].release_date.format("D MMMM YYYY") country_stats = make_entity_stats(countries) institution_stats = make_entity_stats(institutions) stats = Stats(START_YEAR, END_YEAR, last_updated, zenodo_versions, country_stats, institution_stats) @@ -960,11 +960,11 @@ def to_dict(self) -> Dict: @dataclasses.dataclass class ZenodoVersion: - snapshot_date: pendulum.DateTime + release_date: pendulum.DateTime download_url: str def to_dict(self) -> Dict: - return {"snapshot_date": self.snapshot_date.strftime("%Y-%m-%d"), "download_url": self.download_url} + return {"release_date": self.release_date.strftime("%Y-%m-%d"), "download_url": self.download_url} @dataclasses.dataclass