Skip to content

Commit

Permalink
Refactor DOI table some more
Browse files Browse the repository at this point in the history
  • Loading branch information
jdddog committed Jul 7, 2023
1 parent d5f994a commit 8b85186
Show file tree
Hide file tree
Showing 8 changed files with 198 additions and 129 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[
{
"name": "doi",
"description": "A DOI for a work that SciHub covers",
"mode": "REQUIRED",
"type": "STRING"
}
]

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -239,9 +239,11 @@ SELECT
dois.* EXCEPT (coki_affiliations, openaccess),
-- The coki struct, which contains fields for a work generated by COKI
STRUCT(
dois.openaccess.oa_color as oa_color,
dois.openaccess.oa_license as oa_license,
dois.openaccess.oa_coki as oa_coki,
STRUCT(
dois.openaccess.oa_color as color,
dois.openaccess.oa_license as license,
dois.openaccess.oa_coki as coki
) as oa,
dois.openaccess.repositories as repositories,
STRUCT(
dois.coki_affiliations.author_institutions as author_institutions,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ CREATE TEMP FUNCTION PMH_ID_TO_DOMAIN(pmh_id STRING)
WITH

-- Creates a list of ISSN-L to Normalised Journal Names. The name chosen for each ISSN-L is the most common occurrence
name as (
issnl_index as (
SELECT
identifier,
ARRAY_AGG(name IGNORE NULLS ORDER BY count DESC LIMIT 1)[SAFE_OFFSET(0)] as name
Expand Down Expand Up @@ -194,29 +194,24 @@ repositories as (
-- The OA colour and license calculations
base_oa_calcs as (
SELECT
UPPER(TRIM(unpaywall.doi)) as doi,
year,
genre as output_type,
publisher,
journal_name,
name.name as normalised_journal_name,
best_oa_location.url_for_landing_page,
best_oa_location.url_for_pdf,
journal_issn_l,
UPPER(TRIM(crossref.doi)) as doi,
unpaywall.journal_issn_l, -- still used in DOI query
issnl_index.name as normalised_journal_name, -- still used in DOI query

-- ### Is Open Access:
-- We use the is_oa tag from Unpaywall directly to populate general OA status. This includes bronze.
is_oa,
journal_is_in_doaj as is_in_doaj,
journal_is_oa,
oa_status as unpaywall_oa_status,
CASE
WHEN unpaywall.is_oa THEN TRUE
ELSE FALSE
END
as is_oa,

-- ### Gold Open Access:
-- Gold OA is defined as either the journal being in DOAJ or the best_oa_location being a publisher and a
-- license being detected. This works because Unpaywall will set the publisher as the best oa location if
-- it identifies an accessible publisher copy.
CASE
WHEN journal_is_in_doaj OR (best_oa_location.host_type = "publisher" AND best_oa_location.license is not null AND not journal_is_in_doaj) THEN TRUE
WHEN unpaywall.journal_is_in_doaj OR (unpaywall.best_oa_location.host_type = "publisher" AND unpaywall.best_oa_location.license IS NOT NULL AND NOT unpaywall.journal_is_in_doaj) THEN TRUE
ELSE FALSE
END
as gold,
Expand All @@ -226,7 +221,7 @@ base_oa_calcs as (
-- checking is done on this, so articles that Unpaywall does not capture as being accessible that are in DOAJ
-- journals will be characterised as gold_just_doaj.
CASE
WHEN journal_is_in_doaj THEN TRUE
WHEN unpaywall.journal_is_in_doaj THEN TRUE
ELSE FALSE
END
as gold_just_doaj,
Expand All @@ -237,7 +232,7 @@ base_oa_calcs as (
-- license. The use of DOAJ as defining a "fully oa journal" is also narrow and future developments will
-- expand this considering, among other parameters, the Unpaywall tag 'journal-is-oa'.
CASE
WHEN (best_oa_location.host_type = "publisher" AND best_oa_location.license is not null AND not journal_is_in_doaj) THEN TRUE
WHEN (unpaywall.best_oa_location.host_type = "publisher" AND unpaywall.best_oa_location.license IS NOT NULL AND not unpaywall.journal_is_in_doaj) THEN TRUE
ELSE FALSE
END
as hybrid,
Expand All @@ -248,7 +243,7 @@ base_oa_calcs as (
-- paywall) as in these cases a more open license is not generally applied. However, this is a heuristic and
-- there are significant issues distinguishing between different modes by which publishers make content readable.
CASE
WHEN (best_oa_location.host_type = "publisher" AND best_oa_location.license is null AND not journal_is_in_doaj) THEN TRUE
WHEN (unpaywall.best_oa_location.host_type = "publisher" AND unpaywall.best_oa_location.license IS NULL AND NOT unpaywall.journal_is_in_doaj) THEN TRUE
ELSE FALSE
END
as bronze,
Expand All @@ -260,7 +255,7 @@ base_oa_calcs as (
-- defined here also explicitly includes those outputs that are also available via the publisher. For the set
-- of content which is only freely available via a repository see `green_only`.
CASE
WHEN (SELECT COUNT(1) FROM UNNEST(oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 THEN TRUE
WHEN (SELECT COUNT(1) FROM UNNEST(unpaywall.oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 THEN TRUE
ELSE FALSE
END
as green,
Expand All @@ -271,8 +266,8 @@ base_oa_calcs as (
-- in the generation of stacked bar charts that include gold_doaj, green, hybrid and bronze. This corresponds to
-- general usage of the term "green" in some other literature.
CASE
WHEN (SELECT COUNT(1) FROM UNNEST(oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 AND
NOT (journal_is_in_doaj OR best_oa_location.host_type = "publisher") THEN TRUE
WHEN (SELECT COUNT(1) FROM UNNEST(unpaywall.oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 AND
NOT (unpaywall.journal_is_in_doaj OR unpaywall.best_oa_location.host_type = "publisher") THEN TRUE
ELSE FALSE
END
as green_only,
Expand All @@ -282,58 +277,61 @@ base_oa_calcs as (
-- that are green and bronze, but not gold. This category enables analyses of gold and green as mutually
-- exclusive categories, e.g. in the generation of stacked bar charts that include gold_doaj, green and hybrid.
CASE
WHEN (SELECT COUNT(1) FROM UNNEST(oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 AND
NOT (journal_is_in_doaj OR (best_oa_location.host_type = "publisher" AND best_oa_location.license is not null)) THEN TRUE
WHEN (SELECT COUNT(1) FROM UNNEST(unpaywall.oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 AND
NOT (unpaywall.journal_is_in_doaj OR (unpaywall.best_oa_location.host_type = "publisher" AND unpaywall.best_oa_location.license IS NOT NULL)) THEN TRUE
ELSE FALSE
END
as green_only_ignoring_bronze,

-- ### Convenience category for analysing articles that have a license for the best OA location
CASE
WHEN (best_oa_location.license IS NOT NULL) THEN TRUE
WHEN (unpaywall.best_oa_location.license IS NOT NULL) THEN TRUE
ELSE FALSE
END
as has_license,

-- ### Convenience category for analysing articles that have a Creative Commons license for the best OA location
CASE
WHEN ((best_oa_location.license IS NOT NULL) AND (STARTS_WITH(best_oa_location.license, "cc"))) THEN TRUE
WHEN ((unpaywall.best_oa_location.license IS NOT NULL) AND (STARTS_WITH(unpaywall.best_oa_location.license, "cc"))) THEN TRUE
ELSE FALSE
END
as is_cclicensed,

-- Black OA
-- Currently tracks outputs from SciHub. TODO: add Library Genesis.
CASE
WHEN scihub.doi IS NOT NULL THEN TRUE
ELSE FALSE
END
as black,

repo.repositories,
FROM `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` as unpaywall
LEFT JOIN name on name.identifier = unpaywall.journal_issn_l
LEFT JOIN repositories as repo on repo.doi = unpaywall.doi

-- Crossref Metadata is the driving table: each Crossref row is kept, and the Unpaywall, SciHub, ISSN-L index and
-- repository tables are LEFT JOINed onto it, so their columns are NULL where no match exists.
-- The Unpaywall and SciHub joins compare DOIs after UPPER(TRIM(...)) normalisation.
-- All templated table paths are backtick-quoted so that the rendered project, dataset and table IDs are treated
-- as a single quoted path.
FROM `{{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.{{ crossref_metadata.table_id }}` AS crossref
LEFT JOIN `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` AS unpaywall ON UPPER(TRIM(unpaywall.doi)) = UPPER(TRIM(crossref.doi))
LEFT JOIN `{{ scihub.project_id }}.{{ scihub.dataset_id }}.{{ scihub.table_id }}` AS scihub ON UPPER(TRIM(scihub.doi)) = UPPER(TRIM(crossref.doi))
LEFT JOIN issnl_index ON issnl_index.identifier = unpaywall.journal_issn_l
LEFT JOIN repositories AS repo ON repo.doi = unpaywall.doi
)

-- Re-organise the base_oa_calcs table and calculate COKI Open Access categories.
SELECT
doi,
year,
output_type,
publisher,
journal_name,
normalised_journal_name,
url_for_landing_page,
url_for_pdf,
journal_issn_l,
is_oa,
is_in_doaj,
journal_is_oa,
unpaywall_oa_status,
journal_issn_l, -- Still used in DOI table
normalised_journal_name, -- Still used in DOI table
repositories,

-- Open Access colour categories
STRUCT(
is_oa as oa,
gold,
gold_just_doaj,
hybrid,
bronze,
green,
green_only,
green_only_ignoring_bronze
green_only_ignoring_bronze,
black
) as oa_color,

-- Open Access license categories
Expand All @@ -345,7 +343,7 @@ SELECT
-- The COKI Open Access categories
STRUCT(
is_oa as open,
NOT is_oa as closed,
NOT is_oa as closed,
gold_just_doaj OR hybrid OR bronze as publisher,
green as other_platform,
(gold_just_doaj OR hybrid OR bronze) AND NOT green as publisher_only,
Expand Down
38 changes: 37 additions & 1 deletion academic_observatory_workflows/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ class Paper:
publisher_license: str = None
publisher_is_free_to_read: bool = False
repositories: List[Repository] = None
in_scihub: bool = False

@property
def access_type(self) -> AccessType:
Expand All @@ -234,9 +235,17 @@ def access_type(self) -> AccessType:
green = len(self.repositories) > 0
green_only = green and not gold_doaj and not self.publisher_is_free_to_read
oa = gold or hybrid or bronze or green
black = self.in_scihub # Add LibGen etc here

return AccessType(
oa=oa, green=green, gold=gold, gold_doaj=gold_doaj, hybrid=hybrid, bronze=bronze, green_only=green_only
oa=oa,
green=green,
gold=gold,
gold_doaj=gold_doaj,
hybrid=hybrid,
bronze=bronze,
green_only=green_only,
black=black,
)

@property
Expand Down Expand Up @@ -299,6 +308,7 @@ class AccessType:
not open access.
:param bronze: when the paper is free to read at the publisher website however there is no license.
:param green_only: where the paper is not free to read from the publisher, however it is available at an
institutional repository.
:param black: where the paper is available at SciHub.
"""

Expand All @@ -309,6 +319,7 @@ class AccessType:
hybrid: bool = None
bronze: bool = None
green_only: bool = None
black: bool = None


@dataclass
Expand Down Expand Up @@ -805,6 +816,7 @@ def make_papers(
publisher_license=license_,
publisher_is_free_to_read=publisher_is_free_to_read_,
repositories=paper_repos,
in_scihub=bool(random.getrandbits(1)),
)
papers.append(paper)

Expand Down Expand Up @@ -898,6 +910,20 @@ def make_crossref_events(dataset: ObservatoryDataset) -> List[Dict]:
return events


def make_scihub(dataset: ObservatoryDataset) -> List[Dict]:
    """Build the rows of the SciHub table from an ObservatoryDataset instance.

    One row containing the paper's DOI is produced for every paper in the dataset whose access type
    is flagged as black; papers without that flag produce no row.

    :param dataset: the Observatory Dataset.
    :return: table rows.
    """

    return [{"doi": paper.doi} for paper in dataset.papers if paper.access_type.black]


def make_unpaywall(dataset: ObservatoryDataset) -> List[Dict]:
"""Generate the Unpaywall table from an ObservatoryDataset instance.
Expand Down Expand Up @@ -1091,6 +1117,7 @@ def bq_load_observatory_dataset(
crossref_fundref = make_crossref_fundref(observatory_dataset)
unpaywall = make_unpaywall(observatory_dataset)
crossref_metadata = make_crossref_metadata(observatory_dataset)
scihub = make_scihub(observatory_dataset)

# Load fake ROR and settings datasets
test_doi_path = test_fixtures_folder("doi")
Expand Down Expand Up @@ -1152,6 +1179,15 @@ def bq_load_observatory_dataset(
release_date=snapshot_date,
),
),
Table(
"scihub",
True,
dataset_id_all,
scihub,
bq_find_schema(
path=os.path.join(schema_path, "scihub"), release_date=snapshot_date, table_name="scihub"
),
),
Table(
"unpaywall",
False,
Expand Down
5 changes: 5 additions & 0 deletions academic_observatory_workflows/workflows/doi_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def make_dataset_transforms(
dataset_id_orcid: str = "orcid",
dataset_id_open_citations: str = "open_citations",
dataset_id_unpaywall: str = "unpaywall",
dataset_id_scihub: str = "scihub",
dataset_id_openalex: str = "openalex",
dataset_id_settings: str = "settings",
dataset_id_observatory: str = "observatory",
Expand Down Expand Up @@ -154,6 +155,7 @@ def make_dataset_transforms(
),
Transform(
inputs={
"scihub": Table(input_project_id, dataset_id_scihub, "scihub", sharded=True),
"unpaywall": Table(input_project_id, dataset_id_unpaywall, "unpaywall", sharded=False),
"ror": Table(input_project_id, dataset_id_ror, "ror", sharded=True),
"repository": Table(input_project_id, dataset_id_settings, "repository"),
Expand All @@ -163,6 +165,9 @@ def make_dataset_transforms(
"repository_institution_to_ror",
sharded=True,
),
"crossref_metadata": Table(
input_project_id, dataset_id_crossref_metadata, "crossref_metadata", sharded=True
),
},
output_table=Table(output_project_id, dataset_id_observatory_intermediate, "openaccess"),
output_clustering_fields=["doi"],
Expand Down
32 changes: 16 additions & 16 deletions academic_observatory_workflows/workflows/oa_web_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,27 +129,27 @@
agg.total_outputs as n_outputs,
-- COKI OA Categories
agg.oa_coki.open.total AS n_outputs_open,
agg.oa_coki.publisher.total AS n_outputs_publisher_open,
agg.oa_coki.publisher_only.total AS n_outputs_publisher_open_only,
agg.oa_coki.both.total AS n_outputs_both,
agg.oa_coki.other_platform.total AS n_outputs_other_platform_open,
agg.oa_coki.other_platform_only.total AS n_outputs_other_platform_open_only,
agg.oa_coki.closed.total AS n_outputs_closed,
agg.coki.oa.coki.open.total AS n_outputs_open,
agg.coki.oa.coki.publisher.total AS n_outputs_publisher_open,
agg.coki.oa.coki.publisher_only.total AS n_outputs_publisher_open_only,
agg.coki.oa.coki.both.total AS n_outputs_both,
agg.coki.oa.coki.other_platform.total AS n_outputs_other_platform_open,
agg.coki.oa.coki.other_platform_only.total AS n_outputs_other_platform_open_only,
agg.coki.oa.coki.closed.total AS n_outputs_closed,
-- Publisher Open Categories
agg.oa_coki.publisher_categories.oa_journal.total AS n_outputs_oa_journal,
agg.oa_coki.publisher_categories.hybrid.total AS n_outputs_hybrid,
agg.oa_coki.publisher_categories.no_guarantees.total AS n_outputs_no_guarantees,
agg.coki.oa.coki.publisher_categories.oa_journal.total AS n_outputs_oa_journal,
agg.coki.oa.coki.publisher_categories.hybrid.total AS n_outputs_hybrid,
agg.coki.oa.coki.publisher_categories.no_guarantees.total AS n_outputs_no_guarantees,
-- Other Platform Open Categories
agg.oa_coki.other_platform_categories.preprint.total AS n_outputs_preprint,
agg.oa_coki.other_platform_categories.domain.total AS n_outputs_domain,
agg.oa_coki.other_platform_categories.institution.total AS n_outputs_institution,
agg.oa_coki.other_platform_categories.public.total AS n_outputs_public,
agg.oa_coki.other_platform_categories.aggregator.total + agg.oa_coki.other_platform_categories.other_internet.total + agg.oa_coki.other_platform_categories.unknown.total AS n_outputs_other_internet,
agg.coki.oa.coki.other_platform_categories.preprint.total AS n_outputs_preprint,
agg.coki.oa.coki.other_platform_categories.domain.total AS n_outputs_domain,
agg.coki.oa.coki.other_platform_categories.institution.total AS n_outputs_institution,
agg.coki.oa.coki.other_platform_categories.public.total AS n_outputs_public,
agg.coki.oa.coki.other_platform_categories.aggregator.total + agg.coki.oa.coki.other_platform_categories.other_internet.total + agg.coki.oa.coki.other_platform_categories.unknown.total AS n_outputs_other_internet,
agg.repositories
agg.coki.repositories
FROM
`{agg_table_id}` as agg
WHERE agg.time_period >= {start_year} AND agg.time_period <= {end_year}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ def test_telescope(self):
dataset_id_orcid=fake_dataset_id,
dataset_id_open_citations=fake_dataset_id,
dataset_id_unpaywall=fake_dataset_id,
dataset_id_scihub=fake_dataset_id,
dataset_id_settings=bq_settings_dataset_id,
dataset_id_observatory=bq_observatory_dataset_id,
dataset_id_observatory_intermediate=bq_intermediate_dataset_id,
Expand Down Expand Up @@ -367,8 +368,8 @@ def test_telescope(self):
self.assertEqual(expected_state, ti.state)

# Run Dummy Dags
execution_date = pendulum.datetime(year=2021, month=10, day=17)
snapshot_date = pendulum.datetime(year=2021, month=10, day=24)
execution_date = pendulum.datetime(year=2023, month=6, day=18)
snapshot_date = pendulum.datetime(year=2023, month=6, day=25)
expected_state = "success"
for dag_id in DoiWorkflow.SENSOR_DAG_IDS:
dag = make_dummy_dag(dag_id, execution_date)
Expand Down

0 comments on commit 8b85186

Please sign in to comment.