From 736f40fe3572d98e9bbaec26313abf74bab5dace Mon Sep 17 00:00:00 2001 From: Jamie Diprose <5715104+jdddog@users.noreply.github.com> Date: Tue, 20 Jun 2023 09:39:31 +1200 Subject: [PATCH] Filter Crossref type and fix release_date naming in oa web workflow (#167) --- .../database/sql/create_aggregate.sql.jinja2 | 10 ++++++++++ academic_observatory_workflows/model.py | 4 ++++ .../workflows/oa_web_workflow.py | 8 ++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 index 32ab6958a..56f855e16 100644 --- a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 @@ -13,6 +13,10 @@ # limitations under the License. # Author: Richard Hosking #} +{#Include:journal-article,proceedings-article,report,posted-content,edited-book,book,book-chapter,reference-book,monograph,other,book-section,book-part,reference-entry#} +{#Exclude: the types below and NULL#} +{% set CROSSREF_TYPES_TO_EXCLUDE = '("dataset","database","component","report-component","peer-review","grant","proceedings","journal-issue","report-series","book-track")' %} + # Helper Function: Counting Access Types {# Output Schema: @@ -741,6 +745,8 @@ WITH tmp_disciplines AS UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate WHERE aggregrate.identifier IS NOT NULL + AND dois.crossref.type IS NOT NULL + AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }} GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} @@ -764,6 +770,8 @@ tmp_access_types AS ( UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate WHERE aggregrate.identifier IS NOT NULL + AND dois.crossref.type IS NOT NULL + AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }} GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} @@ -1017,6 +1025,8 @@ SELECT FROM `{{ project_id }}.{{ dataset_id }}.doi{{ snapshot_date.strftime('%Y%m%d') }}` as dois, UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate WHERE aggregrate.identifier IS NOT NULL +AND dois.crossref.type IS NOT NULL +AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }} GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} ) diff --git a/academic_observatory_workflows/model.py b/academic_observatory_workflows/model.py index 46c82bf51..a0e71ad44 100644 --- a/academic_observatory_workflows/model.py +++ b/academic_observatory_workflows/model.py @@ -206,6 +206,7 @@ class Paper: id: int doi: str = None title: str = None + type: str = None published_date: pendulum.Date = None output_type: str = None authors: List[Author] = None @@ -790,6 +791,7 @@ def make_papers( # Make paper paper = Paper( i, + type="journal-article", doi=doi_, title=title_, published_date=published_date_, @@ -1099,6 +1101,7 @@ def make_crossref_metadata(dataset: ObservatoryDataset) -> List[Dict]: # Add Crossref record records.append( { + "type": paper.type, "title": [paper.title], "DOI": paper.doi, "is_referenced_by_count": len(paper.cited_by), @@ -1483,6 +1486,7 @@ def make_doi_table(dataset: ObservatoryDataset) -> List[Dict]: { "doi": doi, "crossref": { + "type": paper.type, "title": paper.title, "published_year": paper.published_date.year, "published_month": paper.published_date.month, diff --git a/academic_observatory_workflows/workflows/oa_web_workflow.py b/academic_observatory_workflows/workflows/oa_web_workflow.py index 916cee745..2be7ee851 100644 --- a/academic_observatory_workflows/workflows/oa_web_workflow.py +++ b/academic_observatory_workflows/workflows/oa_web_workflow.py @@ -83,7 +83,7 @@ ("outputs_public", "n_outputs_other_platform_open"), ("outputs_other_internet", "n_outputs_other_platform_open"), ] -INCLUSION_THRESHOLD = {"country": 15, "institution": 800} +INCLUSION_THRESHOLD = {"country": 15, "institution": 1000} MAX_REPOSITORIES = 200 START_YEAR = 2000 END_YEAR = pendulum.now().year - 1 @@ -697,7 +697,7 @@ def build_datasets(self, release: OaWebRelease, **kwargs): ) for version in versions ] - last_updated = zenodo_versions[0].snapshot_date.format("D MMMM YYYY") + last_updated = zenodo_versions[0].release_date.format("D MMMM YYYY") country_stats = make_entity_stats(countries) institution_stats = make_entity_stats(institutions) stats = Stats(START_YEAR, END_YEAR, last_updated, zenodo_versions, country_stats, institution_stats) @@ -960,11 +960,11 @@ def to_dict(self) -> Dict: @dataclasses.dataclass class ZenodoVersion: - snapshot_date: pendulum.DateTime + release_date: pendulum.DateTime download_url: str def to_dict(self) -> Dict: - return {"snapshot_date": self.snapshot_date.strftime("%Y-%m-%d"), "download_url": self.download_url} + return {"release_date": self.release_date.strftime("%Y-%m-%d"), "download_url": self.download_url} @dataclasses.dataclass