diff --git a/academic_observatory_workflows/database/schema/scihub/scihub_2022-02-12.json b/academic_observatory_workflows/database/schema/scihub/scihub_2022-02-12.json new file mode 100644 index 000000000..9ed0428f8 --- /dev/null +++ b/academic_observatory_workflows/database/schema/scihub/scihub_2022-02-12.json @@ -0,0 +1,8 @@ +[ + { + "name": "doi", + "description": "A DOI for a work that SciHub covers", + "mode": "REQUIRED", + "type": "STRING" + } +] \ No newline at end of file diff --git a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 index 1f6b22ae8..33e520a9d 100644 --- a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 @@ -32,20 +32,20 @@ num_green_only_ignoring_bronze_outputs INTEGER NULLABLE num_has_license INTEGER NULLABLE num_is_cclicensed INTEGER NULLABLE #} -CREATE TEMP FUNCTION count_access_types(coki ANY TYPE) as ( +CREATE TEMP FUNCTION count_access_types(oa ANY TYPE) as ( (SELECT as STRUCT - COUNTIF(oa_coki.open) as num_oa_outputs, -{# COUNTIF(oa_color.is_in_doaj) as num_in_doaj,#} - COUNTIF(oa_color.green) as num_green_outputs, - COUNTIF(oa_color.gold) as num_gold_outputs, - COUNTIF(gold_just_doaj) as num_gold_just_doaj_outputs, - COUNTIF(oa_color.hybrid) as num_hybrid_outputs, - COUNTIF(oa_color.bronze) as num_bronze_outputs, - COUNTIF(oa_color.green_only) as num_green_only_outputs, - COUNTIF(oa_color.green_only_ignoring_bronze) as num_green_only_ignoring_bronze_outputs, - COUNTIF(oa_license.has_license) as num_has_license, - COUNTIF(oa_license.is_cclicensed) as num_is_cclicensed - FROM UNNEST(unpaywall)) + COUNTIF(color.oa) as num_oa_outputs, + COUNTIF(color.green) as num_green_outputs, + COUNTIF(color.gold) as num_gold_outputs, + COUNTIF(color.gold_just_doaj) as num_gold_just_doaj_outputs, + COUNTIF(color.hybrid) as num_hybrid_outputs, + COUNTIF(color.bronze) as num_bronze_outputs, + COUNTIF(color.green_only) as num_green_only_outputs, + COUNTIF(color.green_only_ignoring_bronze) as num_green_only_ignoring_bronze_outputs, + COUNTIF(color.black) as num_black, + COUNTIF(license.has_license) as num_has_license, + COUNTIF(license.is_cclicensed) as num_is_cclicensed + FROM UNNEST(oa)) ); # Helper Function: Count distribution of access types and citations for a single output type @@ -68,7 +68,7 @@ citations.total_crossref_citations INTEGER NULLABLE CREATE TEMP FUNCTION count_single_output_type( output_type STRING, items ARRAY, - oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>, measured_type STRING) + oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL, black BOOL>>, measured_type STRING) AS ( (SELECT as STRUCT output_type, @@ -80,6 +80,7 @@ CREATE TEMP FUNCTION count_single_output_type( COUNTIF(item.type = measured_type AND item.hybrid ) as num_hybrid_outputs, COUNTIF(item.type = measured_type AND item.bronze ) as num_bronze_outputs, COUNTIF(item.type = measured_type AND item.green_only ) as num_green_only_outputs, + COUNTIF(item.type = measured_type AND item.black ) as num_black_outputs, STRUCT( SUM(citations.openalex) as total_openalex_citations, SUM(citations.open_citations) as total_open_citations_citations, @@ -107,7 +108,7 @@ citations.total_crossref_citations INTEGER NULLABLE #} CREATE TEMP FUNCTION count_array_output_type( output_type STRING, items ARRAY, - oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>, measured_type ARRAY) + oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL, black BOOL>>, measured_type ARRAY) AS ( (SELECT as STRUCT output_type, @@ -119,6 +120,7 @@ CREATE TEMP FUNCTION count_array_output_type( COUNTIF(item.type in UNNEST(measured_type) AND item.hybrid ) as num_hybrid_outputs, COUNTIF(item.type in UNNEST(measured_type) AND item.bronze ) as num_bronze_outputs, COUNTIF(item.type in UNNEST(measured_type) AND item.green_only ) as num_green_only_outputs, + COUNTIF(item.type in UNNEST(measured_type) AND item.black ) as num_black_outputs, STRUCT( SUM(citations.openalex) as total_openalex_citations, SUM(citations.open_citations) as total_open_citations_citations, @@ -147,7 +149,7 @@ citations.total_crossref_citations INTEGER NULLABLE #} CREATE TEMP FUNCTION count_not_in_array_output_type( output_type STRING, items ARRAY, - oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>, measured_type ARRAY) + oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL, black BOOL>>, measured_type ARRAY) AS ( (SELECT as STRUCT output_type, @@ -159,6 +161,7 @@ CREATE TEMP FUNCTION count_not_in_array_output_type( COUNTIF(item.type not in UNNEST(measured_type) AND item.hybrid ) as num_hybrid_outputs, COUNTIF(item.type not in UNNEST(measured_type) AND item.bronze ) as num_bronze_outputs, COUNTIF(item.type not in UNNEST(measured_type) AND item.green_only ) as num_green_only_outputs, + COUNTIF(item.type not in UNNEST(measured_type) AND item.black ) as num_black_outputs, STRUCT( SUM(citations.openalex) as total_openalex_citations, SUM(citations.open_citations) as total_open_citations_citations, @@ -176,7 +179,7 @@ output_types RECORD REPEATED #} CREATE TEMP FUNCTION count_output_types( items ARRAY, - oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>) + oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL, black BOOL>>) AS ( [ count_single_output_type("journal_articles", items, 'journal-article'), @@ -321,7 +324,7 @@ breakdown RECORD REPEATED #} CREATE TEMP FUNCTION compute_access_types( items ARRAY, - is_oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>) + is_oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL, black BOOL>>) AS ( (SELECT AS STRUCT STRUCT( @@ -359,6 +362,11 @@ CREATE TEMP FUNCTION compute_access_types( ROUND(SAFE_DIVIDE( (COUNTIF(green_only is True)) * 100 , COUNT(doi)), 2) as percent ) as green_only, + STRUCT( + COUNTIF(black) as total_outputs, + ROUND(SAFE_DIVIDE( (COUNTIF(black is True)) * 100 , COUNT(doi)), 2) as percent + ) as black, + ARRAY_CONCAT( compute_conditional_citations(ARRAY_AGG(STRUCT(citations, IFNULL(is_oa, false) as is_x)), "oa", "Open Access", "Not Open Access"), compute_conditional_citations(ARRAY_AGG(STRUCT(citations, IFNULL(green, false) as is_x)), "green", "Green", "Not Green"), @@ -366,7 +374,8 @@ CREATE TEMP FUNCTION compute_access_types( compute_conditional_citations(ARRAY_AGG(STRUCT(citations, IFNULL(gold_just_doaj, false) as is_x)), "gold_just_doaj", "Gold just DOAJ", "Not Gold just DOAJ"), compute_conditional_citations(ARRAY_AGG(STRUCT(citations, IFNULL(hybrid, false) as is_x)), "hybrid", "Hybrid", "Not Hybrid"), compute_conditional_citations(ARRAY_AGG(STRUCT(citations, IFNULL(bronze, false) as is_x)), "bronze", "Bronze", "Not Bronze"), - compute_conditional_citations(ARRAY_AGG(STRUCT(citations, IFNULL(green_only, false) as is_x)), "green_only", "Green Only", "Not Green Only") + compute_conditional_citations(ARRAY_AGG(STRUCT(citations, IFNULL(green_only, false) as is_x)), "green_only", "Green Only", "Not Green Only"), + compute_conditional_citations(ARRAY_AGG(STRUCT(citations, IFNULL(black, false) as is_x)), "black", "Black", "Not Black") ) as breakdown FROM UNNEST(items)) @@ -510,7 +519,7 @@ international_collaboration_with_funding_outputs INTEGER NULLABLE #} CREATE TEMP FUNCTION compute_disciplines( fields ARRAY, - is_oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL, + is_oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL, black BOOL, funding BOOL, international_funding BOOL, domestic_funding BOOL, government_funding BOOL, private_funding BOOL, international_colab BOOL>>) AS ( ARRAY( @@ -536,6 +545,7 @@ CREATE TEMP FUNCTION compute_disciplines( COUNTIF(hybrid) as num_hybrid_outputs, COUNTIF(bronze) as num_bronze_outputs, COUNTIF(green_only) as num_green_only_outputs, + COUNTIF(black) as num_black_outputs, STRUCT( COUNTIF(funding) as total_funded_outputs, COUNTIF(international_funding) as num_international_outputs, @@ -609,14 +619,14 @@ CREATE TEMP FUNCTION process_relations(relationships ANY TYPE, total INT64, tota relation.identifier as id, COUNT(relation.identifier) as total_outputs, ROUND(SAFE_DIVIDE( COUNT(relation.identifier), total), 3) as percentage_of_all_outputs, - ROUND(SAFE_DIVIDE( COUNTIF(coki.oa_coki.open) , total_oa ), 3) as percentage_of_all_oa, + ROUND(SAFE_DIVIDE( COUNTIF(coki.oa.coki.open) , total_oa ), 3) as percentage_of_all_oa, MAX(relation.name) as name, MAX(relation.country) as country, MAX(relation.country_code) as country_code, MAX(relation.region) as region, MAX(relation.subregion) as subregion, MAX(relation.coordinates) as coordinates, - count_access_types(ARRAY_AGG(coki)).*, + count_access_types(ARRAY_AGG(coki.oa)).*, STRUCT( SUM(citations.openalex) as openalex, SUM(citations.crossref) as crosssref, @@ -727,7 +737,7 @@ WITH tmp_disciplines AS dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations ) as citations, - coki.oa_coki.open as is_oa, coki.oa_color.green as green, coki.oa_color.gold as gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only, + coki.oa.coki.open as is_oa, coki.oa.color.green as green, coki.oa.color.gold as gold, coki.oa.color.gold_just_doaj, coki.oa.color.hybrid, coki.oa.color.bronze, coki.oa.color.green_only, coki.oa.color.black, -- Total Funding (SELECT COUNT(funder) > 0 from UNNEST(affiliations.funders) as funder) as funding, -- Domestic, international, both, none or unknown funding @@ -761,7 +771,7 @@ tmp_access_types AS ( ARRAY_AGG( STRUCT( dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - coki.oa_coki.open, coki.oa_color.green, coki.oa_color.gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only + coki.oa.coki.open, coki.oa.color.green, coki.oa.color.gold, coki.oa.color.gold_just_doaj, coki.oa.color.hybrid, coki.oa.color.bronze, coki.oa.color.green_only, coki.oa.color.black ) ) ) as access_types @@ -794,15 +804,31 @@ SELECT -- Total outputs COUNT(dois.doi) as total_outputs, - -- COKI OA - compute_coki_oa( - ARRAY_AGG( - STRUCT( - dois.doi, - coki.oa_coki - ) - ) - ) as oa_coki, + -- COKI data + STRUCT( + STRUCT( + -- Open Access Types + compute_coki_oa( + ARRAY_AGG( + STRUCT( + dois.doi, + coki.oa.coki + ) + ) + ) as coki + ) as oa, + + -- Repositories + {# + This SQL block aggregrates the repository names, remebering we are inside a larger GROUP BY statement, + #} + process_repositories( + ARRAY_CONCAT_AGG( + coki.repositories + ), + MAX(aggregrate.identifier) + ) as repositories + ) as coki, -- Citations compute_citations( @@ -814,28 +840,15 @@ SELECT ) as citations, -- Output Types - count_output_types( ARRAY_AGG( STRUCT( crossref.type, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - coki.oa_coki.open, coki.oa_color.green, coki.oa_color.gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only + coki.oa.coki.open, coki.oa.color.green, coki.oa.color.gold, coki.oa.color.gold_just_doaj, coki.oa.color.hybrid, coki.oa.color.bronze, coki.oa.color.green_only, coki.oa.color.black ) ) ) as output_types, - -- Repositories - {# - This SQL block aggregrates the repository names, remebering we are inside a larger GROUP BY statement, - #} - -- Repositories - process_repositories( - ARRAY_CONCAT_AGG( - coki.repositories - ), - MAX(aggregrate.identifier) - ) as repositories, - {% if relate_to_institutions %} -- Institutions {# @@ -852,11 +865,11 @@ SELECT relation, coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa.coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.institutions) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) + ), COUNT(dois.doi), COUNTIF(coki.oa.coki.open = True) ) as institutions, {% endif %} @@ -876,11 +889,11 @@ SELECT relation, coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa.coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.countries) as relation WHERE relation.identifier <> aggregrate.country_code OR aggregrate.country_code IS NULL) ) - ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) + ), COUNT(dois.doi), COUNTIF(coki.oa.coki.open = True) ) as countries, {% endif %} @@ -900,11 +913,11 @@ SELECT relation, coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa.coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.groupings) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) + ), COUNT(dois.doi), COUNTIF(coki.oa.coki.open = True) ) as groupings, {% endif %} @@ -924,11 +937,11 @@ SELECT relation, coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa.coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.funders) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) + ), COUNT(dois.doi), COUNTIF(coki.oa.coki.open = True) ) as funders, {% endif %} @@ -952,10 +965,10 @@ SELECT STRUCT( dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations ) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa.coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(aggregrate.members) as relation) ) - ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) + ), COUNT(dois.doi), COUNTIF(coki.oa.coki.open = True) ) as members, {% endif %} @@ -975,11 +988,11 @@ SELECT relation, coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa.coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.publishers) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) + ), COUNT(dois.doi), COUNTIF(coki.oa.coki.open = True) ) as publishers, {% endif %} @@ -999,11 +1012,11 @@ SELECT relation, coki, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa_coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines + ARRAY( SELECT as STRUCT display_name as DisplayName, Score, coki.oa.coki.open FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.journals) as relation WHERE relation.identifier <> aggregrate.identifier) ) - ), COUNT(dois.doi), COUNTIF(coki.oa_coki.open = True) + ), COUNT(dois.doi), COUNTIF(coki.oa.coki.open = True) ) as journals, {% endif %} @@ -1020,7 +1033,7 @@ SELECT event.source, event.count, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, - coki.oa_coki.open as is_oa, coki.oa_color.green as green, coki.oa_color.gold as gold, coki.oa_color.gold_just_doaj, coki.oa_color.hybrid, coki.oa_color.bronze, coki.oa_color.green_only + coki.oa.coki.open as is_oa, coki.oa.color.green as green, coki.oa.color.gold as gold, coki.oa.color.gold_just_doaj, coki.oa.color.hybrid, coki.oa.color.bronze, coki.oa.color.green_only FROM UNNEST(dois.events.events) as event))) ) as events, @@ -1033,9 +1046,15 @@ GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} -- Putting it all together SELECT - tmp_agg_table.*, - disciplines, - access_types, + tmp_agg_table.* EXCEPT (coki), + STRUCT( + STRUCT( + access_types as color, + tmp_agg_table.coki.oa.coki as coki + ) as oa, + tmp_agg_table.coki.repositories as repositories + ) as coki, + disciplines FROM tmp_agg_table INNER JOIN diff --git a/academic_observatory_workflows/database/sql/create_doi.sql.jinja2 b/academic_observatory_workflows/database/sql/create_doi.sql.jinja2 index c461fa016..5a7ec5c13 100644 --- a/academic_observatory_workflows/database/sql/create_doi.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_doi.sql.jinja2 @@ -239,9 +239,11 @@ SELECT dois.* EXCEPT (coki_affiliations, openaccess), -- The coki struct, which contains fields for a work generated by COKI STRUCT( - dois.openaccess.oa_color as oa_color, - dois.openaccess.oa_license as oa_license, - dois.openaccess.oa_coki as oa_coki, + STRUCT( + dois.openaccess.oa_color as color, + dois.openaccess.oa_license as license, + dois.openaccess.oa_coki as coki + ) as oa, dois.openaccess.repositories as repositories, STRUCT( dois.coki_affiliations.author_institutions as author_institutions, diff --git a/academic_observatory_workflows/database/sql/create_openaccess.sql.jinja2 b/academic_observatory_workflows/database/sql/create_openaccess.sql.jinja2 index 34b20650c..650337e33 100644 --- a/academic_observatory_workflows/database/sql/create_openaccess.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_openaccess.sql.jinja2 @@ -79,7 +79,7 @@ CREATE TEMP FUNCTION PMH_ID_TO_DOMAIN(pmh_id STRING) WITH -- Creates a list of ISSN-L to Normalised Journal Names. The name chosen for each ISSN-L is the most common occurance -name as ( +issnl_index as ( SELECT identifier, ARRAY_AGG(name IGNORE NULLS ORDER BY count DESC LIMIT 1)[SAFE_OFFSET(0)] as name @@ -194,29 +194,24 @@ repositories as ( -- The OA colour and license calculations base_oa_calcs as ( SELECT - UPPER(TRIM(unpaywall.doi)) as doi, - year, - genre as output_type, - publisher, - journal_name, - name.name as normalised_journal_name, - best_oa_location.url_for_landing_page, - best_oa_location.url_for_pdf, - journal_issn_l, + UPPER(TRIM(crossref.doi)) as doi, + unpaywall.journal_issn_l, -- still used in DOI query + issnl_index.name as normalised_journal_name, -- still used in DOI query -- ### Is Open Access: -- We use the is_oa tag from Unpaywall directly to populate general OA status. This includes bronze. - is_oa, - journal_is_in_doaj as is_in_doaj, - journal_is_oa, - oa_status as unpaywall_oa_status, + CASE + WHEN unpaywall.is_oa THEN TRUE + ELSE FALSE + END + as is_oa, -- ### Gold Open Access: -- Gold OA is defined as either the journal being in DOAJ or the best_oa_location being a publisher and a -- license being detected. This works because Unpaywall will set the publisher as the best oa location if -- it identifies an accessible publisher copy. CASE - WHEN journal_is_in_doaj OR (best_oa_location.host_type = "publisher" AND best_oa_location.license is not null AND not journal_is_in_doaj) THEN TRUE + WHEN unpaywall.journal_is_in_doaj OR (unpaywall.best_oa_location.host_type = "publisher" AND unpaywall.best_oa_location.license IS NOT NULL AND NOT unpaywall.journal_is_in_doaj) THEN TRUE ELSE FALSE END as gold, @@ -226,7 +221,7 @@ base_oa_calcs as ( -- checking is done on this, so articles that Unpaywall does not capture as being accessible that are in DOAJ -- journals will be characterised as gold_just_doaj. CASE - WHEN journal_is_in_doaj THEN TRUE + WHEN unpaywall.journal_is_in_doaj THEN TRUE ELSE FALSE END as gold_just_doaj, @@ -237,7 +232,7 @@ base_oa_calcs as ( -- license. The use of DOAJ as defining a "fully oa journal" is also narrow and future developments will -- expand this considering, among other parameters, the Unpaywall tag 'journal-is-oa'. CASE - WHEN (best_oa_location.host_type = "publisher" AND best_oa_location.license is not null AND not journal_is_in_doaj) THEN TRUE + WHEN (unpaywall.best_oa_location.host_type = "publisher" AND unpaywall.best_oa_location.license IS NOT NULL AND not unpaywall.journal_is_in_doaj) THEN TRUE ELSE FALSE END as hybrid, @@ -248,7 +243,7 @@ base_oa_calcs as ( -- paywall) as in these cases a more open license is not generally applied. However, this is a heuristic and -- there are significant issues distinguishing between different modes by which publishers make content readable. CASE - WHEN (best_oa_location.host_type = "publisher" AND best_oa_location.license is null AND not journal_is_in_doaj) THEN TRUE + WHEN (unpaywall.best_oa_location.host_type = "publisher" AND unpaywall.best_oa_location.license IS NULL AND NOT unpaywall.journal_is_in_doaj) THEN TRUE ELSE FALSE END as bronze, @@ -260,7 +255,7 @@ base_oa_calcs as ( -- defined here also explicitly includes those outputs that are also available via the publisher. For the set -- of content which is only freely available via a repository see `green_only`. CASE - WHEN (SELECT COUNT(1) FROM UNNEST(oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 THEN TRUE + WHEN (SELECT COUNT(1) FROM UNNEST(unpaywall.oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 THEN TRUE ELSE FALSE END as green, @@ -271,8 +266,8 @@ base_oa_calcs as ( -- in the generation of stacked bar charts that include gold_doaj, green, hybrid and bronze. This corresponds to -- general usage of the term "green" in some other literature. CASE - WHEN (SELECT COUNT(1) FROM UNNEST(oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 AND - NOT (journal_is_in_doaj OR best_oa_location.host_type = "publisher") THEN TRUE + WHEN (SELECT COUNT(1) FROM UNNEST(unpaywall.oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 AND + NOT (unpaywall.journal_is_in_doaj OR unpaywall.best_oa_location.host_type = "publisher") THEN TRUE ELSE FALSE END as green_only, @@ -282,58 +277,61 @@ base_oa_calcs as ( -- that are green and bronze, but not gold. This category enables analyses of gold and green as mutually -- exclusive categories, e.g. in the generation of stacked bar charts that include gold_doaj, green and hybrid. CASE - WHEN (SELECT COUNT(1) FROM UNNEST(oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 AND - NOT (journal_is_in_doaj OR (best_oa_location.host_type = "publisher" AND best_oa_location.license is not null)) THEN TRUE + WHEN (SELECT COUNT(1) FROM UNNEST(unpaywall.oa_locations) AS location WHERE location.host_type IN ('repository')) > 0 AND + NOT (unpaywall.journal_is_in_doaj OR (unpaywall.best_oa_location.host_type = "publisher" AND unpaywall.best_oa_location.license IS NOT NULL)) THEN TRUE ELSE FALSE END as green_only_ignoring_bronze, -- ### Convenience category for analysing articles that have a license for the best OA location CASE - WHEN (best_oa_location.license IS NOT NULL) THEN TRUE + WHEN (unpaywall.best_oa_location.license IS NOT NULL) THEN TRUE ELSE FALSE END as has_license, -- ### Convenience category for analysing articles that have a Creative Commons license for the best OA location CASE - WHEN ((best_oa_location.license IS NOT NULL) AND (STARTS_WITH(best_oa_location.license, "cc"))) THEN TRUE + WHEN ((unpaywall.best_oa_location.license IS NOT NULL) AND (STARTS_WITH(unpaywall.best_oa_location.license, "cc"))) THEN TRUE ELSE FALSE END as is_cclicensed, + -- Black OA + -- Currently tracks outputs from SciHub. TODO: add Library Genesis. + CASE + WHEN scihub.doi IS NOT NULL THEN TRUE + ELSE FALSE + END + as black, + repo.repositories, - FROM `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` as unpaywall - LEFT JOIN name on name.identifier = unpaywall.journal_issn_l - LEFT JOIN repositories as repo on repo.doi = unpaywall.doi + + FROM {{ crossref_metadata.project_id }}.{{ crossref_metadata.dataset_id }}.{{ crossref_metadata.table_id }} AS crossref + LEFT JOIN `{{ unpaywall.project_id }}.{{ unpaywall.dataset_id }}.{{ unpaywall.table_id }}` AS unpaywall ON UPPER(TRIM(unpaywall.doi)) = UPPER(TRIM(crossref.doi)) + LEFT JOIN `{{ scihub.project_id }}.{{ scihub.dataset_id }}.{{ scihub.table_id }}` AS scihub ON UPPER(TRIM(scihub.doi)) = UPPER(TRIM(crossref.doi)) + LEFT JOIN issnl_index ON issnl_index.identifier = unpaywall.journal_issn_l + LEFT JOIN repositories AS repo ON repo.doi = unpaywall.doi ) -- Re-organise the base_oa_calcs table and calculate COKI Open Access categories. SELECT doi, - year, - output_type, - publisher, - journal_name, - normalised_journal_name, - url_for_landing_page, - url_for_pdf, - journal_issn_l, - is_oa, - is_in_doaj, - journal_is_oa, - unpaywall_oa_status, + journal_issn_l, -- Still used in DOI table + normalised_journal_name, -- Still used in DOI table repositories, -- Open Access colour categories STRUCT( + is_oa as oa, gold, gold_just_doaj, hybrid, bronze, green, green_only, - green_only_ignoring_bronze + green_only_ignoring_bronze, + black ) as oa_color, -- Open Access license categories @@ -345,7 +343,7 @@ SELECT -- The COKI Open Access categories STRUCT( is_oa as open, - NOT is_oa as closed, + NOT is_oa as closed, gold_just_doaj OR hybrid OR bronze as publisher, green as other_platform, (gold_just_doaj OR hybrid OR bronze) AND NOT green as publisher_only, diff --git a/academic_observatory_workflows/model.py b/academic_observatory_workflows/model.py index dfca7918a..f2318eaab 100644 --- a/academic_observatory_workflows/model.py +++ b/academic_observatory_workflows/model.py @@ -219,6 +219,7 @@ class Paper: publisher_license: str = None publisher_is_free_to_read: bool = False repositories: List[Repository] = None + in_scihub: bool = False @property def access_type(self) -> AccessType: @@ -234,9 +235,17 @@ def access_type(self) -> AccessType: green = len(self.repositories) > 0 green_only = green and not gold_doaj and not self.publisher_is_free_to_read oa = gold or hybrid or bronze or green + black = self.in_scihub # Add LibGen etc here return AccessType( - oa=oa, green=green, gold=gold, gold_doaj=gold_doaj, hybrid=hybrid, bronze=bronze, green_only=green_only + oa=oa, + green=green, + gold=gold, + gold_doaj=gold_doaj, + hybrid=hybrid, + bronze=bronze, + green_only=green_only, + black=black, ) @property @@ -299,6 +308,7 @@ class AccessType: not open access. :param bronze: when the paper is free to read at the publisher website however there is no license. :param green_only: where the paper is not free to read from the publisher, however it is available at an + :param black: where the paper is available at SciHub. institutional repository. """ @@ -309,6 +319,7 @@ class AccessType: hybrid: bool = None bronze: bool = None green_only: bool = None + black: bool = None @dataclass @@ -805,6 +816,7 @@ def make_papers( publisher_license=license_, publisher_is_free_to_read=publisher_is_free_to_read_, repositories=paper_repos, + in_scihub=bool(random.getrandbits(1)), ) papers.append(paper) @@ -898,6 +910,20 @@ def make_crossref_events(dataset: ObservatoryDataset) -> List[Dict]: return events +def make_scihub(dataset: ObservatoryDataset) -> List[Dict]: + """Generate the SciHub table from an ObservatoryDataset instance. + + :param dataset: the Observatory Dataset. + :return: table rows. + """ + + data = [] + for paper in dataset.papers: + if paper.access_type.black: + data.append({"doi": paper.doi}) + return data + + def make_unpaywall(dataset: ObservatoryDataset) -> List[Dict]: """Generate the Unpaywall table from an ObservatoryDataset instance. @@ -1091,6 +1117,7 @@ def bq_load_observatory_dataset( crossref_fundref = make_crossref_fundref(observatory_dataset) unpaywall = make_unpaywall(observatory_dataset) crossref_metadata = make_crossref_metadata(observatory_dataset) + scihub = make_scihub(observatory_dataset) # Load fake ROR and settings datasets test_doi_path = test_fixtures_folder("doi") @@ -1152,6 +1179,15 @@ def bq_load_observatory_dataset( release_date=snapshot_date, ), ), + Table( + "scihub", + True, + dataset_id_all, + scihub, + bq_find_schema( + path=os.path.join(schema_path, "scihub"), release_date=snapshot_date, table_name="scihub" + ), + ), Table( "unpaywall", False, diff --git a/academic_observatory_workflows/workflows/doi_workflow.py b/academic_observatory_workflows/workflows/doi_workflow.py index 1008f925a..5db3a7c89 100644 --- a/academic_observatory_workflows/workflows/doi_workflow.py +++ b/academic_observatory_workflows/workflows/doi_workflow.py @@ -109,6 +109,7 @@ def make_dataset_transforms( dataset_id_orcid: str = "orcid", dataset_id_open_citations: str = "open_citations", dataset_id_unpaywall: str = "unpaywall", + dataset_id_scihub: str = "scihub", dataset_id_openalex: str = "openalex", dataset_id_settings: str = "settings", dataset_id_observatory: str = "observatory", @@ -154,6 +155,7 @@ def make_dataset_transforms( ), Transform( inputs={ + "scihub": Table(input_project_id, dataset_id_scihub, "scihub", sharded=True), "unpaywall": Table(input_project_id, dataset_id_unpaywall, "unpaywall", sharded=False), "ror": Table(input_project_id, dataset_id_ror, "ror", sharded=True), "repository": Table(input_project_id, dataset_id_settings, "repository"), @@ -163,6 +165,9 @@ def make_dataset_transforms( "repository_institution_to_ror", sharded=True, ), + "crossref_metadata": Table( + input_project_id, dataset_id_crossref_metadata, "crossref_metadata", sharded=True + ), }, output_table=Table(output_project_id, dataset_id_observatory_intermediate, "openaccess"), output_clustering_fields=["doi"], diff --git a/academic_observatory_workflows/workflows/oa_web_workflow.py b/academic_observatory_workflows/workflows/oa_web_workflow.py index ce077c8e7..386f1f6b3 100644 --- a/academic_observatory_workflows/workflows/oa_web_workflow.py +++ b/academic_observatory_workflows/workflows/oa_web_workflow.py @@ -129,27 +129,27 @@ agg.total_outputs as n_outputs, -- COKI OA Categories - agg.oa_coki.open.total AS n_outputs_open, - agg.oa_coki.publisher.total AS n_outputs_publisher_open, - agg.oa_coki.publisher_only.total AS n_outputs_publisher_open_only, - agg.oa_coki.both.total AS n_outputs_both, - agg.oa_coki.other_platform.total AS n_outputs_other_platform_open, - agg.oa_coki.other_platform_only.total AS n_outputs_other_platform_open_only, - agg.oa_coki.closed.total AS n_outputs_closed, + agg.coki.oa.coki.open.total AS n_outputs_open, + agg.coki.oa.coki.publisher.total AS n_outputs_publisher_open, + agg.coki.oa.coki.publisher_only.total AS n_outputs_publisher_open_only, + agg.coki.oa.coki.both.total AS n_outputs_both, + agg.coki.oa.coki.other_platform.total AS n_outputs_other_platform_open, + agg.coki.oa.coki.other_platform_only.total AS n_outputs_other_platform_open_only, + agg.coki.oa.coki.closed.total AS n_outputs_closed, -- Publisher Open Categories - agg.oa_coki.publisher_categories.oa_journal.total AS n_outputs_oa_journal, - agg.oa_coki.publisher_categories.hybrid.total AS n_outputs_hybrid, - agg.oa_coki.publisher_categories.no_guarantees.total AS n_outputs_no_guarantees, + agg.coki.oa.coki.publisher_categories.oa_journal.total AS n_outputs_oa_journal, + agg.coki.oa.coki.publisher_categories.hybrid.total AS n_outputs_hybrid, + agg.coki.oa.coki.publisher_categories.no_guarantees.total AS n_outputs_no_guarantees, -- Other Platform Open Categories - agg.oa_coki.other_platform_categories.preprint.total AS n_outputs_preprint, - agg.oa_coki.other_platform_categories.domain.total AS n_outputs_domain, - agg.oa_coki.other_platform_categories.institution.total AS n_outputs_institution, - agg.oa_coki.other_platform_categories.public.total AS n_outputs_public, - agg.oa_coki.other_platform_categories.aggregator.total + agg.oa_coki.other_platform_categories.other_internet.total + agg.oa_coki.other_platform_categories.unknown.total AS n_outputs_other_internet, + agg.coki.oa.coki.other_platform_categories.preprint.total AS n_outputs_preprint, + agg.coki.oa.coki.other_platform_categories.domain.total AS n_outputs_domain, + agg.coki.oa.coki.other_platform_categories.institution.total AS n_outputs_institution, + agg.coki.oa.coki.other_platform_categories.public.total AS n_outputs_public, + agg.coki.oa.coki.other_platform_categories.aggregator.total + agg.coki.oa.coki.other_platform_categories.other_internet.total + agg.coki.oa.coki.other_platform_categories.unknown.total AS n_outputs_other_internet, - agg.repositories + agg.coki.repositories FROM `{agg_table_id}` as agg WHERE agg.time_period >= {start_year} AND agg.time_period <= {end_year} diff --git a/academic_observatory_workflows/workflows/tests/test_doi_workflow.py b/academic_observatory_workflows/workflows/tests/test_doi_workflow.py index e20e69a93..3254eeaca 100644 --- a/academic_observatory_workflows/workflows/tests/test_doi_workflow.py +++ b/academic_observatory_workflows/workflows/tests/test_doi_workflow.py @@ -328,6 +328,7 @@ def test_telescope(self): dataset_id_orcid=fake_dataset_id, dataset_id_open_citations=fake_dataset_id, dataset_id_unpaywall=fake_dataset_id, + dataset_id_scihub=fake_dataset_id, dataset_id_settings=bq_settings_dataset_id, dataset_id_observatory=bq_observatory_dataset_id, dataset_id_observatory_intermediate=bq_intermediate_dataset_id, @@ -367,8 +368,8 @@ def test_telescope(self): self.assertEqual(expected_state, ti.state) # Run Dummy Dags - execution_date = pendulum.datetime(year=2021, month=10, day=17) - snapshot_date = pendulum.datetime(year=2021, month=10, day=24) + execution_date = pendulum.datetime(year=2023, month=6, day=18) + snapshot_date = pendulum.datetime(year=2023, month=6, day=25) expected_state = "success" for dag_id in DoiWorkflow.SENSOR_DAG_IDS: dag = make_dummy_dag(dag_id, execution_date)