Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
jdddog committed Sep 21, 2023
1 parent 591221b commit c11cfd1
Show file tree
Hide file tree
Showing 8 changed files with 202 additions and 301 deletions.
174 changes: 0 additions & 174 deletions academic_observatory_workflows/database/sql/create_book.sql.jinja2

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,16 @@
# Author: Richard Hosking, James Diprose #}

{# Normalise the Fundref funder identifier once, up front, so the join below is a plain equality:
   keep the identifier from character 19 onwards (SUBSTR positions are 1-based), trim whitespace and
   upper-case it. Presumably the 18 dropped characters are a DOI URL prefix - TODO confirm against the
   Fundref table. Every other Fundref column passes through unchanged. #}
WITH fundref_transformed AS (
  SELECT
    UPPER(TRIM(SUBSTR(funder, 19))) as funder,
    * EXCEPT(funder)
  FROM `{{ crossref_fundref.project_id }}.{{ crossref_fundref.dataset_id }}.{{ crossref_fundref.table_id }}` as fundref
)

{# One row per Crossref DOI, carrying an array of (funder, fundref) pairs.
   The LEFT JOIN keeps funders that have no matching Fundref row; their fundref value is NULL. #}
SELECT
  crossref.doi as doi,
  ARRAY_AGG(STRUCT(funder, fundref)) as funders
FROM `{{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.{{ crossref_metadata.table_id }}` as crossref, UNNEST(crossref.funder) as funder
LEFT JOIN fundref_transformed as fundref on fundref.funder = funder.doi
GROUP BY crossref.doi
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{# Copyright 2020 Curtin University
{# Copyright 2023 Curtin University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
21 changes: 10 additions & 11 deletions academic_observatory_workflows/database/sql/create_doi.sql.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ SELECT
(SELECT
{# This SQL block links unpaywall, pubmed, open citations and crossref events to the DOI and the metadata found in the crossref metadata dataset #}
ref.doi as doi,
ref.* AS crossref,
(SELECT as STRUCT * from `{{ openaccess.project_id }}.{{ openaccess.dataset_id }}.{{ openaccess.table_id }}` as oa WHERE oa.doi = ref.doi as openaccess,
(SELECT as STRUCT * from `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` as unpaywall WHERE unpaywall.doi = ref.doi as unpaywall,
(SELECT as STRUCT * from `{{ openalex.project_id }}.{{ openalex.dataset_id }}.{{ openalex.table_id }}` as openalex WHERE openalex.doi = ref.doi as openalex,
(SELECT as STRUCT * from `{{ open_citations.project_id }}.{{ open_citations.dataset_id }}.{{ open_citations.table_id }}` as oa WHERE oa.doi = ref.doi as open_citations,
(SELECT as STRUCT * from `{{ crossref_events.project_id }}.{{ crossref_events.dataset_id }}.{{ crossref_events.table_id }}` as events WHERE events.doi = ref.doi as events,
(SELECT as STRUCT * from `{{ pubmed.project_id }}.{{ pubmed.dataset_id }}.{{ pubmed.table_id }}` as pubmed WHERE pubmed.doi = ref.doi as pubmed,
(SELECT as STRUCT * from coki_affiliations_temp as coki_affiliations WHERE coki_affiliations.doi = ref.doi as coki_affiliations,
FROM `{{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.{{ crossref_metadata.table_id }}` as ref
(SELECT as STRUCT * from `{{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.{{ crossref_metadata.table_id }}` as ref_row WHERE ref_row.doi = ref.doi) as crossref,
(SELECT as STRUCT * from `{{ openaccess.project_id }}.{{ openaccess.dataset_id }}.{{ openaccess.table_id }}` as oa WHERE oa.doi = ref.doi) as openaccess,
(SELECT as STRUCT * from `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` as unpaywall WHERE unpaywall.doi = ref.doi) as unpaywall,
(SELECT as STRUCT * from `{{ openalex.project_id }}.{{ openalex.dataset_id }}.{{ openalex.table_id }}` as openalex WHERE openalex.doi = ref.doi) as openalex,
(SELECT as STRUCT * from `{{ open_citations.project_id }}.{{ open_citations.dataset_id }}.{{ open_citations.table_id }}` as oa WHERE oa.doi = ref.doi) as open_citations,
(SELECT as STRUCT * from `{{ crossref_events.project_id }}.{{ crossref_events.dataset_id }}.{{ crossref_events.table_id }}` as events WHERE events.doi = ref.doi) as events,
(SELECT as STRUCT * from `{{ pubmed.project_id }}.{{ pubmed.dataset_id }}.{{ pubmed.table_id }}` as pubmed WHERE pubmed.doi = ref.doi) as pubmed,
(SELECT as STRUCT * from coki_affiliations_temp as coki_affiliations WHERE coki_affiliations.doi = ref.doi) as coki_affiliations,
FROM `{{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.{{ crossref_metadata.table_id }}` as ref)
),

{# this query builds the .affiliation section of the final doi table. The primary purpose of this is to allow the aggregate_doi query #}
Expand Down Expand Up @@ -218,8 +218,7 @@ SELECT
#}
FROM dois_temp_table as dois, UNNEST(coki_affiliations.genealogical_institutions) as ror_id
LEFT JOIN `{{ ror.project_id }}.{{ ror.dataset_id }}.{{ ror.table_id }}` as institution on ror_id = institution.id
LEFT JOIN (SELECT ror, ARRAY_AGG(STRUCT(group_id, group_name, country_code)) as groupings FROM `{{ groupings.project_id }}.{{ groupings.dataset_id }}.{{ groupings.table_id }}` CROSS JOIN UNNEST(rors) as ror GROUP BY ror) as ror_groups on institution.id = ror_groups.ror
GROUP BY doi) as base on extras.doi = base.doi
LEFT JOIN (SELECT ror, ARRAY_AGG(STRUCT(group_id, group_name, country_code)) as groupings FROM `{{ groupings.project_id }}.{{ groupings.dataset_id }}.{{ groupings.table_id }}` CROSS JOIN UNNEST(rors) as ror GROUP BY ror) as ror_groups on institution.id = ror_groups.ror GROUP BY doi) as base on extras.doi = base.doi
LEFT JOIN `{{ crossref_fundref.project_id }}.{{ crossref_fundref.dataset_id }}.{{ crossref_fundref.table_id }}` as fundref on fundref.doi = extras.doi
LEFT JOIN `{{ orcid.project_id }}.{{ orcid.dataset_id }}.{{ orcid.table_id }}` as orcid on orcid.doi = extras.doi
)
Expand Down
105 changes: 99 additions & 6 deletions academic_observatory_workflows/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,15 @@

from __future__ import annotations

import math
import os
import random
import binascii
import urllib.parse
import uuid
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Tuple

import math
import pandas as pd
import pendulum
from click.testing import CliRunner
Expand Down Expand Up @@ -1051,6 +1050,99 @@ def make_openalex_dataset(dataset: ObservatoryDataset) -> List[dict]:
return result


def make_orcid(dataset: ObservatoryDataset) -> List[Dict]:
    """Generate the ORCID table as a single fake record.

    The table cannot be empty, because the get_snapshot_date function needs at least one row to work,
    so one fixed placeholder record is always returned.

    :param dataset: the ObservatoryDataset instance. It is not read: the record is the same for every dataset.
    :return: a list containing the single fake ORCID record.
    """

    # Values that repeat throughout the fake record
    orcid_id = "0000-0000-0000-0000"
    timestamp = "2022-01-01 00:00:00.000000 UTC"

    def external_id(id_type, id_value, relationship):
        # Only the type, value and relationship are populated; every other field is left as None
        return {
            "created_date": None,
            "last_modified_date": None,
            "source": None,
            "put_code": None,
            "visibility": None,
            "display_index": None,
            "path": None,
            "external_id_type": id_type,
            "external_id_value": id_value,
            "external_id_normalized": None,
            "external_id_normalized_error": None,
            "external_id_url": None,
            "external_id_relationship": relationship,
        }

    source = {
        "source_orcid": None,
        "source_client_id": {
            "host": "orcid.org",
            "path": orcid_id,
            "uri": "http://orcid.org/client/0000-0000-0000-0000",
        },
        "source_name": "Crossref Metadata Search",
        "assertion_origin_orcid": None,
        "assertion_origin_client_id": None,
        "assertion_origin_name": None,
    }

    work_summary = {
        "created_date": timestamp,
        "last_modified_date": timestamp,
        "source": source,
        "put_code": "00000000",
        "visibility": "public",
        "display_index": "0",
        "path": "/0000-0000-0000-0000/work/00000000",
        "title": {"title": "A Paper", "subtitle": None, "translated_title": None},
        "external_ids": {
            "external_id": [
                external_id("doi", "10.0000/s00000-000-00000-0", "self"),
                external_id("issn", "0000-0000", "part-of"),
            ]
        },
        "url": None,
        "type": "journal-article",
        "publication_date": {"year": "2020", "month": "9", "day": None},
        "journal_title": None,
    }

    works = {
        "last_modified_date": timestamp,
        "group": [{"last_modified_date": timestamp, "work_summary": [work_summary]}],
        "path": "/0000-0000-0000-0000/works",
    }

    record = {
        "orcid_identifier": {
            "host": "orcid.org",
            "path": orcid_id,
            "uri": "http://orcid.org/0000-0000-0000-0000",
        },
        "person": {
            "last_modified_date": None,
            "name": {
                "given_names": "Joe",
                "family_name": "Blogs",
            },
        },
        "activities_summary": {
            "works": works,
            "path": "/0000-0000-0000-0000/activities",
        },
    }
    return [record]


def make_pubmed(dataset: ObservatoryDataset) -> List[Dict]:
"""Generate the Pubmed table from an ObservatoryDataset instance.
Expand Down Expand Up @@ -1198,6 +1290,7 @@ def bq_load_observatory_dataset(
crossref_metadata = make_crossref_metadata(observatory_dataset)
scihub = make_scihub(observatory_dataset)
pubmed: List[dict] = make_pubmed(observatory_dataset)
orcid: List[dict] = make_orcid(observatory_dataset)

# Load fake ROR and settings datasets
test_doi_path = test_fixtures_folder("doi")
Expand Down Expand Up @@ -1299,8 +1392,8 @@ def bq_load_observatory_dataset(
"orcid",
False,
dataset_id_all,
[],
bq_find_schema(path=os.path.join(schema_path, "orcid"), table_name="orcid", release_date=snapshot_date),
orcid,
bq_find_schema(path=os.path.join(schema_path, "orcid"), table_name="orcid"),
),
Table(
"works",
Expand Down Expand Up @@ -1414,7 +1507,7 @@ def make_doi_table(dataset: ObservatoryDataset) -> List[Dict]:
"published_year": paper.published_date.year,
"published_month": paper.published_date.month,
"published_year_month": f"{paper.published_date.year}-{paper.published_date.month}",
"funder": [{"name": funder.name, "DOI": funder.doi} for funder in paper.funders],
"funder": [{"name": funder.name, "DOI": funder.doi.upper()} for funder in paper.funders],
},
"unpaywall": {},
"unpaywall_history": {},
Expand Down Expand Up @@ -1500,7 +1593,7 @@ def make_doi_journals(in_unpaywall: bool, journal: Journal) -> List[Dict]:
"""Make the journal affiliation list for a DOI table row.
:param in_unpaywall: whether the work is in Unpaywall or not. At the moment the journal IDs come from Unpaywall,
and if the work is not in Unpaywall then the journal id and name will be null.
and if the work is not in Unpaywall then the journal id and name will be None.
:param journal: the paper's journal.
:return: the journal affiliation list.
"""
Expand Down
Loading

0 comments on commit c11cfd1

Please sign in to comment.