Skip to content

Commit

Permalink
Filter Crossref type and fix release_date naming in oa web workflow (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jdddog committed Jun 19, 2023
1 parent 88c0ea7 commit 736f40f
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
# limitations under the License.
# Author: Richard Hosking #}

{# Include: journal-article, proceedings-article, report, posted-content, edited-book, book, book-chapter, reference-book, monograph, other, book-section, book-part, reference-entry #}
{# Exclude: the Crossref types listed below, and records whose type is NULL #}
{% set CROSSREF_TYPES_TO_EXCLUDE = '("dataset","database","component","report-component","peer-review","grant","proceedings","journal-issue","report-series","book-track")' %}

# Helper Function: Counting Access Types
{#
Output Schema:
Expand Down Expand Up @@ -741,6 +745,8 @@ WITH tmp_disciplines AS
UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate
WHERE
aggregrate.identifier IS NOT NULL
AND dois.crossref.type IS NOT NULL
AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }}
GROUP BY
aggregrate.identifier,
crossref.{{ group_by_time_field }}
Expand All @@ -764,6 +770,8 @@ tmp_access_types AS (
UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate
WHERE
aggregrate.identifier IS NOT NULL
AND dois.crossref.type IS NOT NULL
AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }}
GROUP BY
aggregrate.identifier,
crossref.{{ group_by_time_field }}
Expand Down Expand Up @@ -1017,6 +1025,8 @@ SELECT

FROM `{{ project_id }}.{{ dataset_id }}.doi{{ snapshot_date.strftime('%Y%m%d') }}` as dois, UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate
WHERE aggregrate.identifier IS NOT NULL
AND dois.crossref.type IS NOT NULL
AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }}
GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }}
)

Expand Down
4 changes: 4 additions & 0 deletions academic_observatory_workflows/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ class Paper:
id: int
doi: str = None
title: str = None
type: str = None
published_date: pendulum.Date = None
output_type: str = None
authors: List[Author] = None
Expand Down Expand Up @@ -790,6 +791,7 @@ def make_papers(
# Make paper
paper = Paper(
i,
type="journal-article",
doi=doi_,
title=title_,
published_date=published_date_,
Expand Down Expand Up @@ -1099,6 +1101,7 @@ def make_crossref_metadata(dataset: ObservatoryDataset) -> List[Dict]:
# Add Crossref record
records.append(
{
"type": paper.type,
"title": [paper.title],
"DOI": paper.doi,
"is_referenced_by_count": len(paper.cited_by),
Expand Down Expand Up @@ -1483,6 +1486,7 @@ def make_doi_table(dataset: ObservatoryDataset) -> List[Dict]:
{
"doi": doi,
"crossref": {
"type": paper.type,
"title": paper.title,
"published_year": paper.published_date.year,
"published_month": paper.published_date.month,
Expand Down
8 changes: 4 additions & 4 deletions academic_observatory_workflows/workflows/oa_web_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
("outputs_public", "n_outputs_other_platform_open"),
("outputs_other_internet", "n_outputs_other_platform_open"),
]
INCLUSION_THRESHOLD = {"country": 15, "institution": 800}
INCLUSION_THRESHOLD = {"country": 15, "institution": 1000}
MAX_REPOSITORIES = 200
START_YEAR = 2000
END_YEAR = pendulum.now().year - 1
Expand Down Expand Up @@ -697,7 +697,7 @@ def build_datasets(self, release: OaWebRelease, **kwargs):
)
for version in versions
]
last_updated = zenodo_versions[0].snapshot_date.format("D MMMM YYYY")
last_updated = zenodo_versions[0].release_date.format("D MMMM YYYY")
country_stats = make_entity_stats(countries)
institution_stats = make_entity_stats(institutions)
stats = Stats(START_YEAR, END_YEAR, last_updated, zenodo_versions, country_stats, institution_stats)
Expand Down Expand Up @@ -960,11 +960,11 @@ def to_dict(self) -> Dict:

@dataclasses.dataclass
class ZenodoVersion:
snapshot_date: pendulum.DateTime
release_date: pendulum.DateTime
download_url: str

def to_dict(self) -> Dict:
return {"snapshot_date": self.snapshot_date.strftime("%Y-%m-%d"), "download_url": self.download_url}
return {"release_date": self.release_date.strftime("%Y-%m-%d"), "download_url": self.download_url}


@dataclasses.dataclass
Expand Down

0 comments on commit 736f40f

Please sign in to comment.