From d5f994a68da7742e8173c8861d13f7ea708e0a77 Mon Sep 17 00:00:00 2001 From: Jamie Diprose <5715104+jdddog@users.noreply.github.com> Date: Mon, 3 Jul 2023 17:27:55 +1200 Subject: [PATCH] Refactor DOI table --- .../database/sql/create_aggregate.sql.jinja2 | 79 +++---- .../database/sql/create_doi.sql.jinja2 | 28 +-- .../database/sql/create_mag.sql.jinja2 | 135 ------------ ...ql.jinja2 => create_openaccess.sql.jinja2} | 2 +- ...> create_openaccess_repo_names.sql.jinja2} | 0 academic_observatory_workflows/model.py | 195 +----------------- .../workflows/doi_workflow.py | 15 +- .../workflows/tests/test_doi_workflow.py | 9 +- 8 files changed, 59 insertions(+), 404 deletions(-) delete mode 100644 academic_observatory_workflows/database/sql/create_mag.sql.jinja2 rename academic_observatory_workflows/database/sql/{create_unpaywall.sql.jinja2 => create_openaccess.sql.jinja2} (99%) rename academic_observatory_workflows/database/sql/{create_unpaywall_repo_names.sql.jinja2 => create_openaccess_repo_names.sql.jinja2} (100%) diff --git a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 index 56f855e16..1f6b22ae8 100644 --- a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 @@ -32,19 +32,19 @@ num_green_only_ignoring_bronze_outputs INTEGER NULLABLE num_has_license INTEGER NULLABLE num_is_cclicensed INTEGER NULLABLE #} -CREATE TEMP FUNCTION count_access_types(unpaywall ANY TYPE) as ( +CREATE TEMP FUNCTION count_access_types(coki ANY TYPE) as ( (SELECT as STRUCT - COUNTIF(is_oa) as num_oa_outputs, - COUNTIF(is_in_doaj) as num_in_doaj, - COUNTIF(green) as num_green_outputs, - COUNTIF(gold) as num_gold_outputs, + COUNTIF(oa_coki.open) as num_oa_outputs, +{# COUNTIF(oa_color.is_in_doaj) as num_in_doaj,#} + COUNTIF(oa_color.green) as num_green_outputs, + COUNTIF(oa_color.gold) as num_gold_outputs, COUNTIF(gold_just_doaj) as num_gold_just_doaj_outputs, - COUNTIF(hybrid) as num_hybrid_outputs, - COUNTIF(bronze) as num_bronze_outputs, - COUNTIF(green_only) as num_green_only_outputs, - COUNTIF(green_only_ignoring_bronze) as num_green_only_ignoring_bronze_outputs, - COUNTIF(has_license) as num_has_license, - COUNTIF(is_cclicensed) as num_is_cclicensed + COUNTIF(oa_color.hybrid) as num_hybrid_outputs, + COUNTIF(oa_color.bronze) as num_bronze_outputs, + COUNTIF(oa_color.green_only) as num_green_only_outputs, + COUNTIF(oa_color.green_only_ignoring_bronze) as num_green_only_ignoring_bronze_outputs, + COUNTIF(oa_license.has_license) as num_has_license, + COUNTIF(oa_license.is_cclicensed) as num_is_cclicensed FROM UNNEST(unpaywall)) ); @@ -609,14 +609,14 @@ CREATE TEMP FUNCTION process_relations(relationships ANY TYPE, total INT64, tota relation.identifier as id, COUNT(relation.identifier) as total_outputs, ROUND(SAFE_DIVIDE( COUNT(relation.identifier), total), 3) as percentage_of_all_outputs, - ROUND(SAFE_DIVIDE( COUNTIF(unpaywall.is_oa) , total_oa ), 3) as percentage_of_all_oa, + ROUND(SAFE_DIVIDE( COUNTIF(coki.oa_coki.open) , total_oa ), 3) as percentage_of_all_oa, MAX(relation.name) as name, MAX(relation.country) as country, MAX(relation.country_code) as country_code, MAX(relation.region) as region, MAX(relation.subregion) as subregion, MAX(relation.coordinates) as coordinates, - count_access_types(ARRAY_AGG(unpaywall)).*, + count_access_types(ARRAY_AGG(coki)).*, STRUCT( SUM(citations.openalex) as openalex, SUM(citations.crossref) as crosssref, @@ -727,7 +727,7 @@ WITH tmp_disciplines AS dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations ) as citations, - unpaywall.is_oa as is_oa, unpaywall.green as green, unpaywall.gold as gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only, + coki.oa_coki.open as is_oa, coki.oa_color.green as green, coki.oa_color.gold as gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only, -- Total Funding (SELECT COUNT(funder) > 0 from UNNEST(affiliations.funders) as funder) as funding, -- Domestic, international, both, none or unknown funding @@ -761,7 +761,7 @@ tmp_access_types AS ( ARRAY_AGG( STRUCT( dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - unpaywall.is_oa, unpaywall.green, unpaywall.gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only + coki.oa_coki.open, coki.oa_color.green, coki.oa_color.gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only ) ) ) as access_types @@ -814,11 +814,12 @@ SELECT ) as citations, -- Output Types + count_output_types( ARRAY_AGG( STRUCT( - unpaywall.output_type, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - unpaywall.is_oa, unpaywall.green, unpaywall.gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only + crossref.type, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, + coki.oa_coki.open, coki.oa_color.green, coki.oa_color.gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only ) ) ) as output_types, @@ -849,13 +850,13 @@ SELECT ARRAY( (SELECT as STRUCT relation, - unpaywall, + coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.institutions) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True) + ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) ) as institutions, {% endif %} @@ -873,13 +874,13 @@ SELECT ARRAY( (SELECT as STRUCT relation, - unpaywall, + coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.countries) as relation WHERE relation.identifier <> aggregrate.country_code OR aggregrate.country_code IS NULL) ) - ), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True) + ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) ) as countries, {% endif %} @@ -897,13 +898,13 @@ SELECT ARRAY( (SELECT as STRUCT relation, - unpaywall, + coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.groupings) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True) + ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) ) as groupings, {% endif %} @@ -921,13 +922,13 @@ SELECT ARRAY( (SELECT as STRUCT relation, - unpaywall, + coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.funders) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True) + ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) ) as funders, {% endif %} @@ -947,14 +948,14 @@ SELECT relation as identifier, relation as name, CAST(NULL as STRING) as country, CAST(NULL as STRING) as country_code, CAST(NULL as STRING) as region, CAST(NULL as STRING) as subregion, CAST(NULL as STRING) as coordinates ) as relation, - unpaywall, + coki, STRUCT( dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations ) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(aggregrate.members) as relation) ) - ), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True) + ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) ) as members, {% endif %} @@ -972,13 +973,13 @@ SELECT ARRAY( (SELECT as STRUCT relation, - unpaywall, + coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.publishers) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True) + ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) ) as publishers, {% endif %} @@ -996,13 +997,13 @@ SELECT ARRAY( (SELECT as STRUCT relation, - unpaywall, + coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.journals) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True) + ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) ) as journals, {% endif %} @@ -1019,7 +1020,7 @@ SELECT event.source, event.count, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - unpaywall.is_oa as is_oa, unpaywall.green as green, unpaywall.gold as gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only + coki.oa_coki.open as is_oa, coki.oa_color.green as green, coki.oa_color.gold as gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only FROM UNNEST(dois.events.events) as event))) ) as events, diff --git a/academic_observatory_workflows/database/sql/create_doi.sql.jinja2 b/academic_observatory_workflows/database/sql/create_doi.sql.jinja2 index 789140f99..c461fa016 100644 --- a/academic_observatory_workflows/database/sql/create_doi.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_doi.sql.jinja2 @@ -37,7 +37,7 @@ SELECT *, FROM (SELECT - {# This SQL block links unpaywall, mag, open citations and crossref events to the DOI and the metadata found in the crossref metadata dataset #} + {# This SQL block links unpaywall, open citations and crossref events to the DOI and the metadata found in the crossref metadata dataset #} UPPER(TRIM(ref.doi)) as doi, STRUCT( title, abstract, issued.date_parts[offset(0)] as published_year, @@ -46,9 +46,9 @@ SELECT type, ISSN, ISBN, issn_type, publisher_location, publisher, member, prefix, container_title, short_container_title, group_title, references_count, is_referenced_by_count, subject, published_print, license, volume, funder, page, author, link, clinical_trial_number, alternative_id ) as crossref, - (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.unpaywall{{ snapshot_date.strftime('%Y%m%d') }}` as oa WHERE oa.doi = UPPER(TRIM(ref.doi))) as unpaywall, + (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.openaccess{{ snapshot_date.strftime('%Y%m%d') }}` as oa WHERE oa.doi = UPPER(TRIM(ref.doi))) as openaccess, + (SELECT as STRUCT * from `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` as unpaywall WHERE unpaywall.doi = UPPER(TRIM(ref.doi))) as unpaywall, (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.openalex{{ snapshot_date.strftime('%Y%m%d') }}` as openalex WHERE openalex.doi = UPPER(TRIM(ref.doi))) as openalex, - (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.mag{{ snapshot_date.strftime('%Y%m%d') }}` as mag WHERE mag.doi = UPPER(TRIM(ref.doi))) as mag, (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.open_citations{{ snapshot_date.strftime('%Y%m%d') }}` as oa WHERE oa.doi = UPPER(TRIM(ref.doi))) as open_citations, (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.crossref_events{{ snapshot_date.strftime('%Y%m%d') }}` as events WHERE events.doi = UPPER(TRIM(ref.doi))) as events, (SELECT as STRUCT * from coki_affiliations_temp as coki_affiliations WHERE coki_affiliations.doi = UPPER(TRIM(ref.doi))) as coki_affiliations, @@ -101,7 +101,7 @@ SELECT -- Discipline {# Chooses the values that are passed along in the discipline affiliations, the schema is matching all other affiliations to enable flexibility of later SQL templates. - The values come from the fields of study in the MAG dataset + The values come from the fields of study in the OpenAlex dataset #} ARRAY(SELECT as STRUCT field.display_name as identifier, field.display_name as name, ["Discipline"] as types, CAST(NULL as STRING) as country, CAST(NULL as STRING) as country_code, CAST(NULL as STRING) as country_code_2, CAST(NULL as STRING) as region, @@ -124,7 +124,7 @@ SELECT While there is only ever one Journal for a pubication, this single struct is placed inside of an array '[ ]' to ensure it conforms to the same schema as other affiliation types. This enables templated queries downstream to be greatly simplified #} - [ STRUCT( unpaywall.journal_issn_l as identifier, unpaywall.normalised_journal_name as name, ["Journal"] as types, CAST(NULL as STRING) as country, + [ STRUCT( openaccess.journal_issn_l as identifier, openaccess.normalised_journal_name as name, ["Journal"] as types, CAST(NULL as STRING) as country, CAST(NULL as STRING) as country_code, CAST(NULL as STRING) as country_code_2, CAST(NULL as STRING) as region, CAST(NULL as STRING) as subregion, CAST(NULL as STRING) as coordinates, CAST([] AS ARRAY) as members ) ] as journals, @@ -216,7 +216,7 @@ SELECT ) as ror_groups, {# - dois_temp_table is created as the first sub-query in this script. It contains the data from crossref, unpaywall, openalex, mag, open_citations and crossref events + dois_temp_table is created as the first sub-query in this script. It contains the data from crossref, unpaywall, openalex, open_citations and crossref events This is then LEFT JOINed against the ROR dataset to take the raw ROR IDs that come from the OpenAlex dataset, with a more detailed view of that institution and its location. The county and region information comes from here This Instituional information is JOINed against a COKI created dataset, groupings, which allows for arbitrary grouping of institutions (or really RORs). @@ -236,19 +236,13 @@ Brings together the two temporary tables above. - affiliations is the derived data that neatly organises all the relationships a doi has to authors, their institutions, countries and regions of those institutions, publisher, journal, funders #} SELECT - dois.* EXCEPT (coki_affiliations, unpaywall), - -- Put the unpaywall struct back how it used to be for backwards compatibility - (SELECT as STRUCT - dois.unpaywall.* EXCEPT(repositories, oa_color, oa_license, oa_coki), - dois.unpaywall.oa_color.*, - dois.unpaywall.oa_license.* - ) as unpaywall, + dois.* EXCEPT (coki_affiliations, openaccess), -- The coki struct, which contains fields for a work generated by COKI STRUCT( - dois.unpaywall.oa_color as oa_color, - dois.unpaywall.oa_license as oa_license, - dois.unpaywall.oa_coki as oa_coki, - dois.unpaywall.repositories as repositories, + dois.openaccess.oa_color as oa_color, + dois.openaccess.oa_license as oa_license, + dois.openaccess.oa_coki as oa_coki, + dois.openaccess.repositories as repositories, STRUCT( dois.coki_affiliations.author_institutions as author_institutions, dois.coki_affiliations.genealogical_institutions as genealogical_institutions diff --git a/academic_observatory_workflows/database/sql/create_mag.sql.jinja2 b/academic_observatory_workflows/database/sql/create_mag.sql.jinja2 deleted file mode 100644 index 967ae3ba4..000000000 --- a/academic_observatory_workflows/database/sql/create_mag.sql.jinja2 +++ /dev/null @@ -1,135 +0,0 @@ -{# Copyright 2020 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: Richard Hosking, James Diprose #} - -{# This query allows affiliation IDs recorded in MAG to be modified when problems are discovered #} -WITH affiliations_processed as ( - SELECT - affiliation.AffiliationId, - affiliation.DisplayName, - IF(override.grid_id is not null, override.grid_id, affiliation.GridId) as GridId - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.Affiliations{{ mag.snapshot_date.strftime('%Y%m%d') }}` as affiliation - LEFT JOIN `{{ mag_affiliation_override.project_id }}.{{ mag_affiliation_override.dataset_id }}.{{ mag_affiliation_override.table_id }}` as override on override.AffiliationId = affiliation.AffiliationId -), - -{# This query preprocesses the Fields of Study to simply the later queries #} -fields_of_study as ( - SELECT - fields.*, - ARRAY(SELECT AS STRUCT - extended.AttributeType as AttributeType, - extended.AttributeValue as AttributeValue - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.FieldOfStudyExtendedAttributes{{ mag.snapshot_date.strftime('%Y%m%d') }}` as extended - WHERE extended.FieldOfStudyId = fields.FieldOfStudyId - ) as extended - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.FieldsOfStudy{{ mag.snapshot_date.strftime('%Y%m%d') }}` as fields -) - -{# Main Query that takes the range of normalised MAG tables and creates single, DOI indexed, table that is easier to join with Crossref #} -SELECT - papers.* EXCEPT (journalId, ConferenceSeriesId, ConferenceInstanceId, CreatedDate) REPLACE ( UPPER(TRIM(Doi)) AS Doi), - REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(JSON_EXTRACT(IndexedAbstract, '$.InvertedIndex'), "[0-9]+", ""), ":", ""), ",", " "), '"', ""), "{", ""), "}", ""), "[", ""), "]", "") as abstract, - fields.fields, - mesh.mesh, - authors.authors, - STRUCT(journal.JournalId, journal.DisplayName, journal.Issn, journal.Publisher) as journal, - STRUCT(conferenceInstance.ConferenceInstanceId, - conferenceInstance.NormalizedName, - conferenceInstance.DisplayName, - conferenceInstance.Location, - conferenceInstance.OfficialUrl, - conferenceInstance.StartDate, - conferenceInstance.EndDate, - conferenceInstance.PaperCount, - conferenceInstance.Latitude, - conferenceInstance.Longitude - ) as conferenceInstance, - STRUCT(conferenceSeries.ConferenceSeriesId, conferenceSeries.NormalizedName, conferenceSeries.DisplayName) as conferenceSeries, - extended.attributes, - resources.resources, - urls.urls, - ARRAY((SELECT GridId FROM authors.authors WHERE GridId IS NOT NULL GROUP BY GridID)) as grids -FROM (SELECT UPPER(TRIM(doi)), ARRAY_AGG(Paperid ORDER BY CitationCount DESC)[offset(0)] as PaperId - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.Papers{{ mag.snapshot_date.strftime('%Y%m%d') }}` as papers - WHERE (papers.FamilyId is null OR papers.FamilyId = papers.PaperId) AND papers.doi IS NOT NULL - GROUP BY UPPER(TRIM(doi))) as dois - -LEFT JOIN `{{ mag.project_id }}.{{ mag.dataset_id }}.Papers{{ mag.snapshot_date.strftime('%Y%m%d') }}` as papers ON papers.PaperId = dois.PaperId - --- Abstract -LEFT JOIN `{{ mag.project_id }}.{{ mag.dataset_id }}.PaperAbstractsInvertedIndex{{ mag.snapshot_date.strftime('%Y%m%d') }}` as abstracts ON abstracts.PaperId = papers.PaperId - --- Journal -LEFT JOIN `{{ mag.project_id }}.{{ mag.dataset_id }}.Journals{{ mag.snapshot_date.strftime('%Y%m%d') }}` as journal ON journal.JournalId = papers.JournalId - --- ConferenceInstance -LEFT JOIN `{{ mag.project_id }}.{{ mag.dataset_id }}.ConferenceInstances{{ mag.snapshot_date.strftime('%Y%m%d') }}` as conferenceInstance ON conferenceInstance.ConferenceInstanceId = papers.ConferenceInstanceId - --- ConferenceSeries -LEFT JOIN `{{ mag.project_id }}.{{ mag.dataset_id }}.ConferenceSeries{{ mag.snapshot_date.strftime('%Y%m%d') }}` as conferenceSeries ON conferenceSeries.ConferenceSeriesId = papers.ConferenceSeriesId - --- Fields of Study -LEFT JOIN (SELECT - papers.PaperId, - -- Fields of Study - STRUCT( - ARRAY_AGG(IF(fields.Level = 0, STRUCT(fields.DisplayName,fields.FieldOfStudyId,fields.Rank,fields.MainType,paperFields.Score,extended), null) IGNORE NULLS ORDER BY paperFields.Score DESC) as level_0, - ARRAY_AGG(IF(fields.Level = 1, STRUCT(fields.DisplayName,fields.FieldOfStudyId,fields.Rank,fields.MainType,paperFields.Score,extended), null) IGNORE NULLS ORDER BY paperFields.Score DESC) as level_1, - ARRAY_AGG(IF(fields.Level = 2, STRUCT(fields.DisplayName,fields.FieldOfStudyId,fields.Rank,fields.MainType,paperFields.Score,extended), null) IGNORE NULLS ORDER BY paperFields.Score DESC) as level_2, - ARRAY_AGG(IF(fields.Level = 3, STRUCT(fields.DisplayName,fields.FieldOfStudyId,fields.Rank,fields.MainType,paperFields.Score,extended), null) IGNORE NULLS ORDER BY paperFields.Score DESC) as level_3, - ARRAY_AGG(IF(fields.Level = 4, STRUCT(fields.DisplayName,fields.FieldOfStudyId,fields.Rank,fields.MainType,paperFields.Score,extended), null) IGNORE NULLS ORDER BY paperFields.Score DESC) as level_4, - ARRAY_AGG(IF(fields.Level = 5, STRUCT(fields.DisplayName,fields.FieldOfStudyId,fields.Rank,fields.MainType,paperFields.Score,extended), null) IGNORE NULLS ORDER BY paperFields.Score DESC) as level_5) as fields - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.Papers{{ mag.snapshot_date.strftime('%Y%m%d') }}` as papers - LEFT JOIN `{{ mag.project_id }}.{{ mag.dataset_id }}.PaperFieldsOfStudy{{ mag.snapshot_date.strftime('%Y%m%d') }}` as paperFields on papers.PaperId = paperFields.PaperId - LEFT JOIN fields_of_study as fields on fields.FieldOfStudyId = paperFields.FieldOfStudyId - WHERE papers.Doi IS NOT NULL - GROUP BY papers.PaperId) as fields ON fields.PaperId = papers.PaperId - --- Authors -LEFT JOIN (SELECT - papers.PaperId, - ARRAY_AGG(STRUCT(paperAuthorAffiliations.AuthorSequenceNumber, paperAuthorAffiliations.AuthorID, paperAuthorAffiliations.OriginalAuthor, paperAuthorAffiliations.AffiliationId, paperAuthorAffiliations.OriginalAffiliation, affiliation.GridId, affiliation.DisplayName) IGNORE NULLS ORDER BY paperAuthorAffiliations.AuthorSequenceNumber ASC) as authors - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.Papers{{ mag.snapshot_date.strftime('%Y%m%d') }}` as papers - LEFT JOIN `{{ mag.project_id }}.{{ mag.dataset_id }}.PaperAuthorAffiliations{{ mag.snapshot_date.strftime('%Y%m%d') }}` as paperAuthorAffiliations on paperAuthorAffiliations.PaperId = papers.PaperId - LEFT JOIN affiliations_processed as affiliation on affiliation.AffiliationId = paperAuthorAffiliations.AffiliationId - GROUP BY papers.PaperId) as authors ON authors.PaperId = papers.PaperId - --- Extended Attributes -LEFT JOIN (SELECT - PaperId, - ARRAY_AGG(STRUCT( AttributeType, AttributeValue)) as attributes - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.PaperExtendedAttributes{{ mag.snapshot_date.strftime('%Y%m%d') }}` - GROUP BY PaperId) as extended ON extended.PaperId = papers.PaperId - --- Resources -LEFT JOIN (SELECT - PaperId, - ARRAY_AGG(STRUCT( ResourceType , ResourceUrl )) as resources - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.PaperResources{{ mag.snapshot_date.strftime('%Y%m%d') }}` - GROUP BY PaperId) as resources ON resources.PaperId = papers.PaperId - --- URLs -LEFT JOIN (SELECT - PaperId, - ARRAY_AGG(STRUCT( SourceType , SourceUrl, LanguageCode )) as urls - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.PaperUrls{{ mag.snapshot_date.strftime('%Y%m%d') }}` - GROUP BY PaperId) as urls ON urls.PaperId = papers.PaperId - --- PaperMESH -LEFT JOIN (SELECT - PaperId, - ARRAY_AGG(STRUCT( DescriptorUI, DescriptorName, QualifierUI, QualifierName, IsMajorTopic )) as mesh - FROM `{{ mag.project_id }}.{{ mag.dataset_id }}.PaperMeSH{{ mag.snapshot_date.strftime('%Y%m%d') }}` - GROUP BY PaperId) as mesh ON mesh.PaperId = papers.PaperId \ No newline at end of file diff --git a/academic_observatory_workflows/database/sql/create_unpaywall.sql.jinja2 b/academic_observatory_workflows/database/sql/create_openaccess.sql.jinja2 similarity index 99% rename from academic_observatory_workflows/database/sql/create_unpaywall.sql.jinja2 rename to academic_observatory_workflows/database/sql/create_openaccess.sql.jinja2 index c2bbb0aa5..34b20650c 100644 --- a/academic_observatory_workflows/database/sql/create_unpaywall.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_openaccess.sql.jinja2 @@ -14,7 +14,7 @@ # Author: Richard Hosking, James Diprose, Contributors -## AGGREGATE UNPAYWALL QUERY TEMPLATE +## AGGREGATE OPENACCESS QUERY TEMPLATE This template query contains the SQL that directly interprets Unpaywall data to determine OA categories at the output level. This is therefore diff --git a/academic_observatory_workflows/database/sql/create_unpaywall_repo_names.sql.jinja2 b/academic_observatory_workflows/database/sql/create_openaccess_repo_names.sql.jinja2 similarity index 100% rename from academic_observatory_workflows/database/sql/create_unpaywall_repo_names.sql.jinja2 rename to academic_observatory_workflows/database/sql/create_openaccess_repo_names.sql.jinja2 diff --git a/academic_observatory_workflows/model.py b/academic_observatory_workflows/model.py index a0e71ad44..dfca7918a 100644 --- a/academic_observatory_workflows/model.py +++ b/academic_observatory_workflows/model.py @@ -16,7 +16,6 @@ from __future__ import annotations -import math import os import random import urllib.parse @@ -25,6 +24,7 @@ from datetime import datetime from typing import Dict, List, Tuple +import math import pandas as pd import pendulum from click.testing import CliRunner @@ -1006,59 +1006,6 @@ def make_openalex_dataset(dataset: ObservatoryDataset) -> List[dict]: return result -@dataclass -class MagDataset: - """A container to hold the Microsoft Academic Graph tables. - - :param: Affiliations table rows. - :param: Papers table rows. - :param: PaperAuthorAffiliations rows. - :param: FieldsOfStudy rows. - :param: PaperFieldsOfStudy rows. - """ - - affiliations: List[Dict] - papers: List[Dict] - paper_author_affiliations: List[Dict] - fields_of_study: List[Dict] - paper_fields_of_study: List[Dict] - - -def make_mag(dataset: ObservatoryDataset) -> MagDataset: - """Generate the Microsoft Academic Graph tables from an ObservatoryDataset instance. - - :param dataset: the Observatory Dataset. - :return: the Microsoft Academic Graph dataset. - """ - - # Create affiliations - affiliations = [] - for institute in dataset.institutions: - affiliations.append({"AffiliationId": institute.id, "DisplayName": institute.name, "GridId": institute.grid_id}) - - # Create fields of study - fields_of_study = [] - for fos in dataset.fields_of_study: - fields_of_study.append({"FieldOfStudyId": fos.id, "DisplayName": fos.name, "Level": fos.level}) - - # Create papers, paper_author_affiliations and paper_fields_of_study - papers = [] - paper_author_affiliations = [] - paper_fields_of_study = [] - for paper in dataset.papers: - papers.append({"PaperId": paper.id, "CitationCount": len(paper.cited_by), "Doi": paper.doi}) - - for author in paper.authors: - paper_author_affiliations.append( - {"PaperId": paper.id, "AuthorId": author.id, "AffiliationId": author.institution.id} - ) - - for fos in paper.fields_of_study: - paper_fields_of_study.append({"PaperId": paper.id, "FieldOfStudyId": fos.id}) - - return MagDataset(affiliations, papers, paper_author_affiliations, fields_of_study, paper_fields_of_study) - - def make_crossref_fundref(dataset: ObservatoryDataset) -> List[Dict]: """Generate the Crossref Fundref table from an ObservatoryDataset instance. @@ -1140,7 +1087,6 @@ def bq_load_observatory_dataset( # Generate source datasets open_citations = make_open_citations(observatory_dataset) crossref_events = make_crossref_events(observatory_dataset) - mag: MagDataset = make_mag(observatory_dataset) openalex: List[dict] = make_openalex_dataset(observatory_dataset) crossref_fundref = make_crossref_fundref(observatory_dataset) unpaywall = make_unpaywall(observatory_dataset) @@ -1195,55 +1141,6 @@ def bq_load_observatory_dataset( release_date=snapshot_date, ), ), - Table( - "Affiliations", - True, - dataset_id_all, - mag.affiliations, - bq_find_schema( - path=os.path.join(schema_path, "mag"), table_name="MagAffiliations", release_date=snapshot_date - ), - ), - Table( - "FieldsOfStudy", - True, - dataset_id_all, - mag.fields_of_study, - bq_find_schema( - path=os.path.join(schema_path, "mag"), table_name="MagFieldsOfStudy", release_date=snapshot_date - ), - ), - Table( - "PaperAuthorAffiliations", - True, - dataset_id_all, - mag.paper_author_affiliations, - bq_find_schema( - path=os.path.join(schema_path, "mag"), - table_name="MagPaperAuthorAffiliations", - release_date=snapshot_date, - ), - ), - Table( - "PaperFieldsOfStudy", - True, - dataset_id_all, - mag.paper_fields_of_study, - bq_find_schema( - path=os.path.join(schema_path, "mag"), - table_name="MagPaperFieldsOfStudy", - release_date=snapshot_date, - ), - ), - Table( - "Papers", - True, - dataset_id_all, - mag.papers, - bq_find_schema( - path=os.path.join(schema_path, "mag"), table_name="MagPapers", release_date=snapshot_date - ), - ), Table( "open_citations", True, @@ -1290,95 +1187,6 @@ def bq_load_observatory_dataset( mag_affiliation_override, bq_find_schema(path=os.path.join(schema_path, "doi"), table_name="mag_affiliation_override"), ), - Table( - "PaperAbstractsInvertedIndex", - True, - dataset_id_all, - [], - bq_find_schema( - path=os.path.join(schema_path, "mag"), - table_name="MagPaperAbstractsInvertedIndex", - release_date=snapshot_date, - ), - ), - Table( - "Journals", - True, - dataset_id_all, - [], - bq_find_schema( - path=os.path.join(schema_path, "mag"), table_name="MagJournals", release_date=snapshot_date - ), - ), - Table( - "ConferenceInstances", - True, - dataset_id_all, - [], - bq_find_schema( - path=os.path.join(schema_path, "mag"), - table_name="MagConferenceInstances", - release_date=snapshot_date, - ), - ), - Table( - "ConferenceSeries", - True, - dataset_id_all, - [], - bq_find_schema( - path=os.path.join(schema_path, "mag"), table_name="MagConferenceSeries", release_date=snapshot_date - ), - ), - Table( - "FieldOfStudyExtendedAttributes", - True, - dataset_id_all, - [], - bq_find_schema( - path=os.path.join(schema_path, "mag"), - table_name="MagFieldOfStudyExtendedAttributes", - release_date=snapshot_date, - ), - ), - Table( - "PaperExtendedAttributes", - True, - dataset_id_all, - [], - bq_find_schema( - path=os.path.join(schema_path, "mag"), - table_name="MagPaperExtendedAttributes", - release_date=snapshot_date, - ), - ), - Table( - "PaperResources", - True, - dataset_id_all, - [], - bq_find_schema( - path=os.path.join(schema_path, "mag"), table_name="MagPaperResources", release_date=snapshot_date - ), - ), - Table( - "PaperUrls", - True, - dataset_id_all, - [], - bq_find_schema( - path=os.path.join(schema_path, "mag"), table_name="MagPaperUrls", release_date=snapshot_date - ), - ), - Table( - "PaperMeSH", - True, - dataset_id_all, - [], - bq_find_schema( - path=os.path.join(schema_path, "mag"), table_name="MagPaperMeSH", release_date=snapshot_date - ), - ), Table( "orcid", False, @@ -1495,7 +1303,6 @@ def make_doi_table(dataset: ObservatoryDataset) -> List[Dict]: }, "unpaywall": {}, "unpaywall_history": {}, - "mag": {}, "open_citations": {}, "events": events, "affiliations": { diff --git a/academic_observatory_workflows/workflows/doi_workflow.py b/academic_observatory_workflows/workflows/doi_workflow.py index b8eb27eb2..1008f925a 100644 --- a/academic_observatory_workflows/workflows/doi_workflow.py +++ b/academic_observatory_workflows/workflows/doi_workflow.py @@ -106,7 +106,6 @@ def make_dataset_transforms( dataset_id_crossref_metadata: str = "crossref_metadata", dataset_id_crossref_fundref: str = "crossref_fundref", dataset_id_ror: str = "ror", - dataset_id_mag: str = "mag", dataset_id_orcid: str = "orcid", dataset_id_open_citations: str = "open_citations", dataset_id_unpaywall: str = "unpaywall", @@ -141,16 +140,6 @@ def make_dataset_transforms( }, output_table=Table(output_project_id, dataset_id_observatory_intermediate, "ror"), ), - Transform( - inputs={ - "mag": Table(input_project_id, dataset_id_mag, "Affiliations", sharded=True), - "mag_affiliation_override": Table( - input_project_id, dataset_id_settings, "mag_affiliation_override" - ), - }, - output_table=Table(output_project_id, dataset_id_observatory_intermediate, "mag"), - output_clustering_fields=["Doi"], - ), Transform( inputs={"orcid": Table(input_project_id, dataset_id_orcid, "orcid")}, output_table=Table(output_project_id, dataset_id_observatory_intermediate, "orcid"), @@ -175,7 +164,7 @@ def make_dataset_transforms( sharded=True, ), }, - output_table=Table(output_project_id, dataset_id_observatory_intermediate, "unpaywall"), + output_table=Table(output_project_id, dataset_id_observatory_intermediate, "openaccess"), output_clustering_fields=["doi"], ), Transform( @@ -693,7 +682,7 @@ def create_repo_institution_to_ror_table(self, release: SnapshotRelease, **kwarg """Create the repository_institution_to_ror_table.""" # Fetch unique Unpaywall repository institution names - template_path = os.path.join(sql_folder(), make_sql_jinja2_filename("create_unpaywall_repo_names")) + template_path = os.path.join(sql_folder(), make_sql_jinja2_filename("create_openaccess_repo_names")) sql = render_template(template_path, project_id=self.input_project_id, dataset_id=self.bq_unpaywall_dataset_id) records = bq_run_query(sql) diff --git a/academic_observatory_workflows/workflows/tests/test_doi_workflow.py b/academic_observatory_workflows/workflows/tests/test_doi_workflow.py index 2e3495c7d..e20e69a93 100644 --- a/academic_observatory_workflows/workflows/tests/test_doi_workflow.py +++ b/academic_observatory_workflows/workflows/tests/test_doi_workflow.py @@ -212,19 +212,17 @@ def test_dag_structure(self): "create_crossref_events", "create_crossref_fundref", "create_ror", - "create_mag", "create_orcid", "create_open_citations", - "create_unpaywall", + "create_openaccess", "create_openalex", ], "create_crossref_events": ["create_doi"], "create_crossref_fundref": ["create_doi"], "create_ror": ["create_doi"], - "create_mag": ["create_doi"], "create_orcid": ["create_doi"], "create_open_citations": ["create_doi"], - "create_unpaywall": ["create_doi"], + "create_openaccess": ["create_doi"], "create_openalex": ["create_doi"], "create_doi": ["create_book"], "create_book": [ @@ -327,7 +325,6 @@ def test_telescope(self): dataset_id_crossref_metadata=fake_dataset_id, dataset_id_crossref_fundref=fake_dataset_id, dataset_id_ror=fake_dataset_id, - dataset_id_mag=fake_dataset_id, dataset_id_orcid=fake_dataset_id, dataset_id_open_citations=fake_dataset_id, dataset_id_unpaywall=fake_dataset_id, @@ -710,7 +707,9 @@ def assert_doi_affiliations(self, expected: Dict, actual: Dict): # Subfields fields = ["institutions", "countries", "subregions", "regions", "journals", "publishers", "funders"] + print("assert_doi_affiliations:") for field in fields: + print(f"\t{field}") self.assert_doi_affiliation(expected, actual, field) def assert_doi_affiliation(self, expected: Dict, actual: Dict, key: str):