Update

The-Academic-Observatory · Sep 20, 2023 · 591221b · 591221b
1 parent 064def5
commit 591221b
Show file tree

Hide file tree

Showing 7 changed files with 168 additions and 72 deletions.
diff --git a/academic_observatory_workflows/database/sql/create_crossref_fundref.sql.jinja2 b/academic_observatory_workflows/database/sql/create_crossref_fundref.sql.jinja2
@@ -15,8 +15,8 @@
 # Author: Richard Hosking, James Diprose #}
 
 SELECT  -- One output row per crossref.doi, carrying an array of (funder, fundref) pairs
-    UPPER(TRIM(crossref.doi)) as doi,
+    crossref.doi,  -- Used as-is, no UPPER/TRIM in this query. NOTE(review): presumably the source table already holds normalised DOIs - confirm
     ARRAY_AGG(STRUCT(funder, fundref)) as funders
 FROM `{{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.{{ crossref_metadata.table_id }}` as crossref, UNNEST(crossref.funder) as funder  -- Comma join with UNNEST: one row per funder element, so records with an empty or NULL funder array produce no rows
-LEFT JOIN `{{ crossref_fundref.project_id }}.{{ crossref_fundref.dataset_id }}.{{ crossref_fundref.table_id }}` as fundref on SUBSTR(fundref.funder, 19) = funder.doi
-GROUP BY UPPER(TRIM(crossref.doi))
+LEFT JOIN `{{ crossref_fundref.project_id }}.{{ crossref_fundref.dataset_id }}.{{ crossref_fundref.table_id }}` as fundref on UPPER(TRIM(SUBSTR(fundref.funder, 19))) = funder.doi  -- SUBSTR(x, 19) keeps everything from the 19th character on. NOTE(review): presumably this drops a fixed URL prefix from the Fundref funder URI, and funder.doi is already upper-cased upstream - confirm both
+GROUP BY crossref.doi
diff --git a/academic_observatory_workflows/database/sql/create_crossref_metadata.sql.jinja2 b/academic_observatory_workflows/database/sql/create_crossref_metadata.sql.jinja2
@@ -0,0 +1,62 @@
+{# Copyright 2020 Curtin University
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Author: James Diprose #}
+
+{# Produces the cleaned Crossref Metadata table with one row per normalised DOI.
+   Records with no issued date parts are dropped. When several records share the same
+   UPPER(TRIM(DOI)), only the one with the most recent indexed.date_time is kept. #}
+SELECT
+  UPPER(TRIM(cr.DOI)) AS doi,
+  cr.title,
+  cr.abstract,
+  cr.issued.date_parts[OFFSET(0)] AS published_year,
+  {# 13 stands in for the month whenever the issued date has no month part #}
+  CASE
+    WHEN ARRAY_LENGTH(cr.issued.date_parts) > 1 THEN cr.issued.date_parts[OFFSET(1)]
+    ELSE 13
+  END AS published_month,
+  CONCAT(
+    cr.issued.date_parts[OFFSET(0)],
+    "-",
+    CASE
+      WHEN ARRAY_LENGTH(cr.issued.date_parts) > 1 THEN cr.issued.date_parts[OFFSET(1)]
+      ELSE 13
+    END
+  ) AS published_year_month,
+  cr.type,
+  cr.ISSN,
+  cr.ISBN,
+  cr.issn_type,
+  cr.publisher_location,
+  cr.publisher,
+  cr.member,
+  cr.prefix,
+  cr.container_title,
+  cr.short_container_title,
+  cr.group_title,
+  cr.references_count,
+  cr.is_referenced_by_count,
+  cr.subject,
+  cr.published_print,
+  cr.license,
+  cr.volume,
+  {# Rebuild the funder array so that every funder DOI is trimmed and upper-cased #}
+  ARRAY(
+    SELECT AS STRUCT
+      UPPER(TRIM(fnd.DOI)) AS doi,
+      fnd.award,
+      fnd.doi_asserted_by,
+      fnd.name
+    FROM UNNEST(cr.funder) AS fnd
+  ) AS funder,
+  cr.page,
+  cr.author,
+  cr.link,
+  cr.clinical_trial_number,
+  cr.alternative_id
+FROM `{{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.{{ crossref_metadata.table_id }}` AS cr
+WHERE ARRAY_LENGTH(cr.issued.date_parts) > 0
+{# Keep only the most recently indexed record for each normalised DOI #}
+QUALIFY ROW_NUMBER() OVER (
+  PARTITION BY UPPER(TRIM(cr.DOI))
+  ORDER BY cr.indexed.date_time DESC
+) = 1
diff --git a/academic_observatory_workflows/database/sql/create_doi.sql.jinja2 b/academic_observatory_workflows/database/sql/create_doi.sql.jinja2
@@ -25,8 +25,8 @@ WITH coki_affiliations_temp as (
       openalex.doi,
       ARRAY_AGG(DISTINCT ror_hierarchy.child_id IGNORE NULLS) as author_institutions,
       ARRAY_CONCAT_AGG(ror_hierarchy.ror_ids) as genealogical_institutions
-    FROM `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.openalex{{ snapshot_date.strftime('%Y%m%d') }}` as openalex, UNNEST(IF(openalex.authorships is not null, openalex.authorships, [])) AS authors, UNNEST(IF(authors.institutions is not null, authors.institutions, [])) as institution
-    LEFT JOIN `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.ror_hierarchy{{ snapshot_date.strftime('%Y%m%d') }}` as ror_hierarchy ON ror_hierarchy.child_id = institution.ror
+    FROM `{{ openalex.project_id }}.{{ openalex.dataset_id }}.{{ openalex.table_id }}` as openalex, UNNEST(IF(openalex.authorships is not null, openalex.authorships, [])) AS authors, UNNEST(IF(authors.institutions is not null, authors.institutions, [])) as institution
+    LEFT JOIN `{{ ror_hierarchy.project_id }}.{{ ror_hierarchy.dataset_id }}.{{ ror_hierarchy.table_id }}` as ror_hierarchy ON ror_hierarchy.child_id = institution.ror
     GROUP BY openalex.doi) as coki_affiliations
 ),
 
@@ -38,23 +38,16 @@ SELECT
  FROM
   (SELECT
     {# This SQL block links unpaywall, pubmed, open citations and crossref events to the DOI and the metadata found in the crossref metadata dataset #}
-    UPPER(TRIM(ref.doi)) as doi,
-    STRUCT(
-            title, abstract, issued.date_parts[offset(0)] as published_year,
-            CASE WHEN ARRAY_LENGTH(issued.date_parts) > 1 THEN issued.date_parts[offset(1)] ELSE 13 END as published_month,
-            CONCAT(issued.date_parts[offset(0)], "-", CASE WHEN ARRAY_LENGTH(issued.date_parts) > 1 THEN issued.date_parts[offset(1)] ELSE 13 END) as published_year_month,
-            type, ISSN, ISBN, issn_type, publisher_location, publisher, member, prefix, container_title, short_container_title, group_title, references_count,
-            is_referenced_by_count, subject, published_print, license, volume, funder, page, author, link, clinical_trial_number, alternative_id
-    ) as crossref,
-    (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.openaccess{{ snapshot_date.strftime('%Y%m%d') }}` as oa WHERE oa.doi = UPPER(TRIM(ref.doi))) as openaccess,
-    (SELECT as STRUCT * from `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` as unpaywall WHERE unpaywall.doi = UPPER(TRIM(ref.doi))) as unpaywall,
-    (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.openalex{{ snapshot_date.strftime('%Y%m%d') }}` as openalex WHERE openalex.doi = UPPER(TRIM(ref.doi))) as openalex,
-    (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.open_citations{{ snapshot_date.strftime('%Y%m%d') }}` as oa WHERE oa.doi = UPPER(TRIM(ref.doi))) as open_citations,
-    (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.crossref_events{{ snapshot_date.strftime('%Y%m%d') }}` as events WHERE events.doi = UPPER(TRIM(ref.doi))) as events,
-    (SELECT as STRUCT * from `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.pubmed{{ snapshot_date.strftime('%Y%m%d') }}` as pubmed WHERE pubmed.doi = UPPER(TRIM(ref.doi))) as pubmed,
-    (SELECT as STRUCT * from coki_affiliations_temp as coki_affiliations WHERE coki_affiliations.doi = UPPER(TRIM(ref.doi))) as coki_affiliations,
-  FROM `{{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.crossref_metadata{{ crossref_metadata.snapshot_date.strftime('%Y%m%d') }}` as ref
-  WHERE ARRAY_LENGTH(issued.date_parts) > 0)
+    {# Each correlated sub-select below packs the row matching ref.doi from one source table into a single STRUCT.
+       BigQuery raises an error when a scalar subquery returns more than one row, so every source must hold at most one row per DOI. #}
+    ref.doi as doi,
+    {# A star expansion cannot take an alias, so the whole crossref row is wrapped into a STRUCT instead #}
+    (SELECT as STRUCT ref.*) as crossref,
+    (SELECT as STRUCT * from `{{ openaccess.project_id }}.{{ openaccess.dataset_id }}.{{ openaccess.table_id }}` as oa WHERE oa.doi = ref.doi) as openaccess,
+    (SELECT as STRUCT * from `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` as unpaywall WHERE unpaywall.doi = ref.doi) as unpaywall,
+    (SELECT as STRUCT * from `{{ openalex.project_id }}.{{ openalex.dataset_id }}.{{ openalex.table_id }}` as openalex WHERE openalex.doi = ref.doi) as openalex,
+    (SELECT as STRUCT * from `{{ open_citations.project_id }}.{{ open_citations.dataset_id }}.{{ open_citations.table_id }}` as oa WHERE oa.doi = ref.doi) as open_citations,
+    (SELECT as STRUCT * from `{{ crossref_events.project_id }}.{{ crossref_events.dataset_id }}.{{ crossref_events.table_id }}` as events WHERE events.doi = ref.doi) as events,
+    (SELECT as STRUCT * from `{{ pubmed.project_id }}.{{ pubmed.dataset_id }}.{{ pubmed.table_id }}` as pubmed WHERE pubmed.doi = ref.doi) as pubmed,
+    (SELECT as STRUCT * from coki_affiliations_temp as coki_affiliations WHERE coki_affiliations.doi = ref.doi) as coki_affiliations,
+  {# The closing parenthesis ends the sub-select opened by "(SELECT" above; it used to sit on the removed WHERE line #}
+  FROM `{{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.{{ crossref_metadata.table_id }}` as ref)
 ),
 
 {# this query builds the .affiliation section of the final doi table. The primary purpose of this is to allow the aggregrate_doi query #}
@@ -224,11 +217,11 @@ SELECT
    Lastly, Fundref and ORCID are also LEFT JOINed in order to drive the Funder and Author relationships
   #}
   FROM dois_temp_table as dois, UNNEST(coki_affiliations.genealogical_institutions) as ror_id
-  LEFT JOIN `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.ror{{ snapshot_date.strftime('%Y%m%d') }}` as institution on ror_id = institution.id
-  LEFT JOIN (SELECT ror, ARRAY_AGG(STRUCT(group_id, group_name, country_code)) as groupings FROM `{{ groupings.project_id }}.{{ groupings.dataset_id }}.groupings` CROSS JOIN UNNEST(rors) as ror GROUP BY ror) as ror_groups on institution.id = ror_groups.ror
+  LEFT JOIN `{{ ror.project_id }}.{{ ror.dataset_id }}.{{ ror.table_id }}` as institution on ror_id = institution.id
+  LEFT JOIN (SELECT ror, ARRAY_AGG(STRUCT(group_id, group_name, country_code)) as groupings FROM `{{ groupings.project_id }}.{{ groupings.dataset_id }}.{{ groupings.table_id }}` CROSS JOIN UNNEST(rors) as ror GROUP BY ror) as ror_groups on institution.id = ror_groups.ror
   GROUP BY doi) as base on extras.doi = base.doi
-  LEFT JOIN `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.crossref_fundref{{ snapshot_date.strftime('%Y%m%d') }}` as fundref on fundref.doi = extras.doi
-  LEFT JOIN `{{ observatory_intermediate.project_id }}.{{ observatory_intermediate.dataset_id }}.orcid{{ snapshot_date.strftime('%Y%m%d') }}` as orcid on UPPER(TRIM(orcid.doi)) = UPPER(TRIM(extras.doi))
+  LEFT JOIN `{{ crossref_fundref.project_id }}.{{ crossref_fundref.dataset_id }}.{{ crossref_fundref.table_id }}` as fundref on fundref.doi = extras.doi
+  LEFT JOIN `{{ orcid.project_id }}.{{ orcid.dataset_id }}.{{ orcid.table_id }}` as orcid on orcid.doi = extras.doi
 )
 
 {#

diff --git a/academic_observatory_workflows/database/sql/create_orcid.sql.jinja2 b/academic_observatory_workflows/database/sql/create_orcid.sql.jinja2
@@ -34,7 +34,7 @@ WITH
 
 {# This is the main query, which inverts the ORCID centered data to be DOI centric, so it has a list of authors per DOI, instead of a list of DOIs per author#}
 SELECT
-  doi,
+  UPPER(TRIM(doi)) as doi,
   ARRAY_AGG(orcid) AS orcid
 FROM (
   SELECT

diff --git a/academic_observatory_workflows/database/sql/create_pubmed.sql.jinja2 b/academic_observatory_workflows/database/sql/create_pubmed.sql.jinja2
@@ -17,7 +17,7 @@
 --- This is to make an intermediate table of all records that have the highest version number from Pubmed which will be joined onto the DOI table. 
 
 SELECT 
-  recent.doi,
+  UPPER(TRIM(recent.doi)) as doi,
   pubmed.*
 FROM (
   SELECT
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,7 +34,7 @@ WITH @@
     {# This is the main query, which inverts the ORCID centered data to be DOI centric, so it has a list of authors per DOI, instead of a list of DOIs per author#}
     SELECT
-      doi,
+      UPPER(TRIM(doi)) as doi,
       ARRAY_AGG(orcid) AS orcid
     FROM (
       SELECT
@@ Expand Down @@