Skip to content

Commit

Permalink
Refactor DOI table
Browse files Browse the repository at this point in the history
  • Loading branch information
jdddog committed Jul 3, 2023
1 parent 5019bbc commit d5f994a
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 404 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,19 @@ num_green_only_ignoring_bronze_outputs INTEGER NULLABLE
num_has_license INTEGER NULLABLE
num_is_cclicensed INTEGER NULLABLE
#}
-- count_access_types: tallies open access categories over an array of per-output COKI structs.
--
-- coki: an array of structs; each element must expose the nested structs read below:
--         oa_coki    (open)
--         oa_color   (green, gold, gold_just_doaj, hybrid, bronze, green_only, green_only_ignoring_bronze)
--         oa_license (has_license, is_cclicensed)
-- Returns a single STRUCT with one COUNTIF total per access type; callers expand it with `.*`.
CREATE TEMP FUNCTION count_access_types(coki ANY TYPE) as (
  (SELECT as STRUCT
    COUNTIF(oa_coki.open) as num_oa_outputs,
    -- NOTE(review): num_in_doaj is disabled in the template below; confirm that oa_color exposes is_in_doaj before re-enabling it.
    {# COUNTIF(oa_color.is_in_doaj) as num_in_doaj,#}
    COUNTIF(oa_color.green) as num_green_outputs,
    COUNTIF(oa_color.gold) as num_gold_outputs,
    -- gold_just_doaj lives under oa_color like the other colour flags, so it must be qualified too.
    COUNTIF(oa_color.gold_just_doaj) as num_gold_just_doaj_outputs,
    COUNTIF(oa_color.hybrid) as num_hybrid_outputs,
    COUNTIF(oa_color.bronze) as num_bronze_outputs,
    COUNTIF(oa_color.green_only) as num_green_only_outputs,
    COUNTIF(oa_color.green_only_ignoring_bronze) as num_green_only_ignoring_bronze_outputs,
    COUNTIF(oa_license.has_license) as num_has_license,
    COUNTIF(oa_license.is_cclicensed) as num_is_cclicensed
  -- Unnest the function's own argument; the previous name `unpaywall` is no longer a parameter.
  FROM UNNEST(coki))
);

Expand Down Expand Up @@ -609,14 +609,14 @@ CREATE TEMP FUNCTION process_relations(relationships ANY TYPE, total INT64, tota
relation.identifier as id,
COUNT(relation.identifier) as total_outputs,
ROUND(SAFE_DIVIDE( COUNT(relation.identifier), total), 3) as percentage_of_all_outputs,
ROUND(SAFE_DIVIDE( COUNTIF(unpaywall.is_oa) , total_oa ), 3) as percentage_of_all_oa,
ROUND(SAFE_DIVIDE( COUNTIF(coki.oa_coki.open) , total_oa ), 3) as percentage_of_all_oa,
MAX(relation.name) as name,
MAX(relation.country) as country,
MAX(relation.country_code) as country_code,
MAX(relation.region) as region,
MAX(relation.subregion) as subregion,
MAX(relation.coordinates) as coordinates,
count_access_types(ARRAY_AGG(unpaywall)).*,
count_access_types(ARRAY_AGG(coki)).*,
STRUCT(
SUM(citations.openalex) as openalex,
SUM(citations.crossref) as crosssref,
Expand Down Expand Up @@ -727,7 +727,7 @@ WITH tmp_disciplines AS
dois.crossref.references_count as crossref,
dois.open_citations.citations_total as open_citations
) as citations,
unpaywall.is_oa as is_oa, unpaywall.green as green, unpaywall.gold as gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only,
coki.oa_coki.open as is_oa, coki.oa_color.green as green, coki.oa_color.gold as gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only,
-- Total Funding
(SELECT COUNT(funder) > 0 from UNNEST(affiliations.funders) as funder) as funding,
-- Domestic, international, both, none or unknown funding
Expand Down Expand Up @@ -761,7 +761,7 @@ tmp_access_types AS (
ARRAY_AGG(
STRUCT(
dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
unpaywall.is_oa, unpaywall.green, unpaywall.gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only
coki.oa_coki.open, coki.oa_color.green, coki.oa_color.gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only
)
)
) as access_types
Expand Down Expand Up @@ -814,11 +814,12 @@ SELECT
) as citations,

-- Output Types

count_output_types(
ARRAY_AGG(
STRUCT(
unpaywall.output_type, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
unpaywall.is_oa, unpaywall.green, unpaywall.gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only
crossref.type, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
coki.oa_coki.open, coki.oa_color.green, coki.oa_color.gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only
)
)
) as output_types,
Expand Down Expand Up @@ -849,13 +850,13 @@ SELECT
ARRAY(
(SELECT as STRUCT
relation,
unpaywall,
coki,
STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
FROM UNNEST(affiliations.institutions) as relation
WHERE relation.identifier <> aggregrate.identifier)
)
), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True)
), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True)
) as institutions,
{% endif %}

Expand All @@ -873,13 +874,13 @@ SELECT
ARRAY(
(SELECT as STRUCT
relation,
unpaywall,
coki,
STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
FROM UNNEST(affiliations.countries) as relation
WHERE relation.identifier <> aggregrate.country_code OR aggregrate.country_code IS NULL)
)
), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True)
), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True)
) as countries,
{% endif %}

Expand All @@ -897,13 +898,13 @@ SELECT
ARRAY(
(SELECT as STRUCT
relation,
unpaywall,
coki,
STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
FROM UNNEST(affiliations.groupings) as relation
WHERE relation.identifier <> aggregrate.identifier)
)
), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True)
), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True)
) as groupings,
{% endif %}

Expand All @@ -921,13 +922,13 @@ SELECT
ARRAY(
(SELECT as STRUCT
relation,
unpaywall,
coki,
STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
FROM UNNEST(affiliations.funders) as relation
WHERE relation.identifier <> aggregrate.identifier)
)
), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True)
), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True)
) as funders,
{% endif %}

Expand All @@ -947,14 +948,14 @@ SELECT
relation as identifier, relation as name, CAST(NULL as STRING) as country, CAST(NULL as STRING) as country_code,
CAST(NULL as STRING) as region, CAST(NULL as STRING) as subregion, CAST(NULL as STRING) as coordinates
) as relation,
unpaywall,
coki,
STRUCT(
dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations
) as citations,
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
FROM UNNEST(aggregrate.members) as relation)
)
), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True)
), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True)
) as members,
{% endif %}

Expand All @@ -972,13 +973,13 @@ SELECT
ARRAY(
(SELECT as STRUCT
relation,
unpaywall,
coki,
STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
FROM UNNEST(affiliations.publishers) as relation
WHERE relation.identifier <> aggregrate.identifier)
)
), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True)
), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True)
) as publishers,
{% endif %}

Expand All @@ -996,13 +997,13 @@ SELECT
ARRAY(
(SELECT as STRUCT
relation,
unpaywall,
coki,
STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines
FROM UNNEST(affiliations.journals) as relation
WHERE relation.identifier <> aggregrate.identifier)
)
), COUNT(dois.doi), COUNTIF(unpaywall.is_oa = True)
), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True)
) as journals,
{% endif %}

Expand All @@ -1019,7 +1020,7 @@ SELECT
event.source,
event.count,
STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations,
unpaywall.is_oa as is_oa, unpaywall.green as green, unpaywall.gold as gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only
coki.oa_coki.open as is_oa, coki.oa_color.green as green, coki.oa_color.gold as gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only
FROM UNNEST(dois.events.events) as event)))
) as events,

Expand Down
28 changes: 11 additions & 17 deletions academic_observatory_workflows/database/sql/create_doi.sql.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ SELECT
*,
FROM
(SELECT
{# This SQL block links unpaywall, mag, open citations and crossref events to the DOI and the metadata found in the crossref metadata dataset #}
{# This SQL block links unpaywall, open citations and crossref events to the DOI and the metadata found in the crossref metadata dataset #}
UPPER(TRIM(ref.doi)) as doi,
STRUCT(
title, abstract, issued.date_parts[offset(0)] as published_year,
Expand All @@ -46,9 +46,9 @@ SELECT
type, ISSN, ISBN, issn_type, publisher_location, publisher, member, prefix, container_title, short_container_title, group_title, references_count,
is_referenced_by_count, subject, published_print, license, volume, funder, page, author, link, clinical_trial_number, alternative_id
) as crossref,
(SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.unpaywall{{ snapshot_date.strftime('%Y%m%d') }}` as oa WHERE oa.doi = UPPER(TRIM(ref.doi))) as unpaywall,
(SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.openaccess{{ snapshot_date.strftime('%Y%m%d') }}` as oa WHERE oa.doi = UPPER(TRIM(ref.doi))) as openaccess,
(SELECT as STRUCT * from `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` as unpaywall WHERE unpaywall.doi = UPPER(TRIM(ref.doi))) as unpaywall,
(SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.openalex{{ snapshot_date.strftime('%Y%m%d') }}` as openalex WHERE openalex.doi = UPPER(TRIM(ref.doi))) as openalex,
(SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.mag{{ snapshot_date.strftime('%Y%m%d') }}` as mag WHERE mag.doi = UPPER(TRIM(ref.doi))) as mag,
(SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.open_citations{{ snapshot_date.strftime('%Y%m%d') }}` as oa WHERE oa.doi = UPPER(TRIM(ref.doi))) as open_citations,
(SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.crossref_events{{ snapshot_date.strftime('%Y%m%d') }}` as events WHERE events.doi = UPPER(TRIM(ref.doi))) as events,
(SELECT as STRUCT * from coki_affiliations_temp as coki_affiliations WHERE coki_affiliations.doi = UPPER(TRIM(ref.doi))) as coki_affiliations,
Expand Down Expand Up @@ -101,7 +101,7 @@ SELECT
-- Discipline
{#
Chooses the values that are passed along in the discipline affiliations, the schema is matching all other affiliations to enable flexibility of later SQL templates.
The values come from the fields of study in the MAG dataset
The values come from the fields of study in the OpenAlex dataset
#}
ARRAY(SELECT as STRUCT field.display_name as identifier, field.display_name as name, ["Discipline"] as types,
CAST(NULL as STRING) as country, CAST(NULL as STRING) as country_code, CAST(NULL as STRING) as country_code_2, CAST(NULL as STRING) as region,
Expand All @@ -124,7 +124,7 @@ SELECT
While there is only ever one Journal for a publication, this single struct is placed inside of an array '[ ]' to ensure it conforms to the same schema as other affiliation types.
This enables templated queries downstream to be greatly simplified
#}
[ STRUCT( unpaywall.journal_issn_l as identifier, unpaywall.normalised_journal_name as name, ["Journal"] as types, CAST(NULL as STRING) as country,
[ STRUCT( openaccess.journal_issn_l as identifier, openaccess.normalised_journal_name as name, ["Journal"] as types, CAST(NULL as STRING) as country,
CAST(NULL as STRING) as country_code, CAST(NULL as STRING) as country_code_2, CAST(NULL as STRING) as region, CAST(NULL as STRING) as subregion, CAST(NULL as STRING) as coordinates,
CAST([] AS ARRAY<STRING>) as members )
] as journals,
Expand Down Expand Up @@ -216,7 +216,7 @@ SELECT
) as ror_groups,

{#
dois_temp_table is created as the first sub-query in this script. It contains the data from crossref, unpaywall, openalex, mag, open_citations and crossref events
dois_temp_table is created as the first sub-query in this script. It contains the data from crossref, unpaywall, openalex, open_citations and crossref events
This is then LEFT JOINed against the ROR dataset to augment the raw ROR IDs that come from the OpenAlex dataset with a more detailed view of that institution and its location. The country and region information comes from here.
This institutional information is JOINed against a COKI created dataset, groupings, which allows for arbitrary grouping of institutions (or really RORs).
Expand All @@ -236,19 +236,13 @@ Brings together the two temporary tables above.
- affiliations is the derived data that neatly organises all the relationships a doi has to authors, their institutions, countries and regions of those institutions, publisher, journal, funders
#}
SELECT
dois.* EXCEPT (coki_affiliations, unpaywall),
-- Put the unpaywall struct back how it used to be for backwards compatibility
(SELECT as STRUCT
dois.unpaywall.* EXCEPT(repositories, oa_color, oa_license, oa_coki),
dois.unpaywall.oa_color.*,
dois.unpaywall.oa_license.*
) as unpaywall,
dois.* EXCEPT (coki_affiliations, openaccess),
-- The coki struct, which contains fields for a work generated by COKI
STRUCT(
dois.unpaywall.oa_color as oa_color,
dois.unpaywall.oa_license as oa_license,
dois.unpaywall.oa_coki as oa_coki,
dois.unpaywall.repositories as repositories,
dois.openaccess.oa_color as oa_color,
dois.openaccess.oa_license as oa_license,
dois.openaccess.oa_coki as oa_coki,
dois.openaccess.repositories as repositories,
STRUCT(
dois.coki_affiliations.author_institutions as author_institutions,
dois.coki_affiliations.genealogical_institutions as genealogical_institutions
Expand Down
Loading

0 comments on commit d5f994a

Please sign in to comment.