From 6a423117cc5c84d46016f0b238354139e29fb55d Mon Sep 17 00:00:00 2001 From: Keegan Smith Date: Tue, 6 Jun 2023 06:08:22 +0800 Subject: [PATCH 1/3] Added .env to gitignore (#165) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6f02fcf1d..2179ea14a 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ ChangeLog .DS_Store /observatory-dags/observatory/dags/workflows/oapen_cloud_function.zip docs/schemas +.env \ No newline at end of file From 88c0ea7a17341434f46298b88c590ea03fad43a9 Mon Sep 17 00:00:00 2001 From: Jamie Diprose <5715104+jdddog@users.noreply.github.com> Date: Tue, 13 Jun 2023 11:22:58 +1200 Subject: [PATCH 2/3] Fix/deploy may 2023 (#166) --- .../crossref_fundref_2014-03-01.json | 38 +- .../database/schema/openalex/authors.json | 54 ++- .../database/schema/openalex/concepts.json | 54 ++- .../schema/openalex/institutions.json | 107 +++++- .../database/schema/openalex/publishers.json | 92 ++++- .../database/schema/openalex/sources.json | 54 ++- .../database/schema/openalex/works.json | 363 +++++++++++++----- .../database/schema/unpaywall/unpaywall.json | 37 ++ .../database/sql/create_aggregate.sql.jinja2 | 132 ++----- .../sql/export_access_types.sql.jinja2 | 1 - .../sql/export_disciplines.sql.jinja2 | 1 - .../updated_date=2023-04-02/part_000.json | 4 +- .../openalex/2023-04-02/expected/authors.json | 4 +- .../2023-04-02/expected/concepts.json | 4 +- .../2023-04-02/expected/institutions.json | 4 +- .../2023-04-02/expected/publishers.json | 4 +- .../openalex/2023-04-02/expected/sources.json | 4 +- .../openalex/2023-04-02/expected/works.json | 4 +- .../updated_date=2023-04-02/part_000.json | 4 +- .../updated_date=2023-04-16/part_000.json | 4 +- .../openalex/2023-04-16/expected/authors.json | 4 +- .../2023-04-16/expected/concepts.json | 4 +- .../2023-04-16/expected/institutions.json | 4 +- .../2023-04-16/expected/publishers.json | 4 +- .../openalex/2023-04-16/expected/sources.json | 4 +- .../openalex/2023-04-16/expected/works.json | 4 +- .../expected/run1_bq_load_main_table.json | 4 +- .../expected/run1_bq_upsert_records.json | 4 +- .../expected/run3_bq_upsert_records.json | 4 +- .../workflows/crossref_events_telescope.py | 6 +- .../workflows/crossref_fundref_telescope.py | 4 +- .../workflows/crossref_metadata_telescope.py | 52 ++- .../workflows/doi_workflow.py | 10 +- .../workflows/oa_web_workflow.py | 7 +- .../workflows/openalex_telescope.py | 58 ++- .../workflows/ror_telescope.py | 7 + .../workflows/scopus_telescope.py | 2 +- .../tests/test_crossref_metadata_telescope.py | 7 +- .../tests/test_openalex_telescope.py | 98 ++--- .../workflows/unpaywall_telescope.py | 8 +- .../workflows/web_of_science_telescope.py | 2 +- 41 files changed, 908 insertions(+), 358 deletions(-) diff --git a/academic_observatory_workflows/database/schema/crossref_fundref/crossref_fundref_2014-03-01.json b/academic_observatory_workflows/database/schema/crossref_fundref/crossref_fundref_2014-03-01.json index cf43f2d78..083e983e2 100644 --- a/academic_observatory_workflows/database/schema/crossref_fundref/crossref_fundref_2014-03-01.json +++ b/academic_observatory_workflows/database/schema/crossref_fundref/crossref_fundref_2014-03-01.json @@ -8,9 +8,26 @@ { "fields": [ { + "fields": [ + { + "mode": "REPEATED", + "name": "parent", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "funder", + "type": "STRING" + } + ], "mode": "REPEATED", "name": "parent", - "type": "STRING" + "type": 
"RECORD" }, { "mode": "NULLABLE", @@ -111,9 +128,26 @@ { "fields": [ { + "fields": [ + { + "mode": "REPEATED", + "name": "children", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "funder", + "type": "STRING" + } + ], "mode": "REPEATED", "name": "children", - "type": "STRING" + "type": "RECORD" }, { "mode": "NULLABLE", diff --git a/academic_observatory_workflows/database/schema/openalex/authors.json b/academic_observatory_workflows/database/schema/openalex/authors.json index 4e329575f..c0b0cfc2d 100644 --- a/academic_observatory_workflows/database/schema/openalex/authors.json +++ b/academic_observatory_workflows/database/schema/openalex/authors.json @@ -16,6 +16,12 @@ "mode": "NULLABLE", "description": "The total number Works that cite a work this author has created." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -154,23 +160,65 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this author. Also known as impact factor." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this author." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this author." + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ], "description": "Citation metrics for this author." diff --git a/academic_observatory_workflows/database/schema/openalex/concepts.json b/academic_observatory_workflows/database/schema/openalex/concepts.json index 1f149b27a..57707ebda 100644 --- a/academic_observatory_workflows/database/schema/openalex/concepts.json +++ b/academic_observatory_workflows/database/schema/openalex/concepts.json @@ -48,6 +48,12 @@ "mode": "NULLABLE", "description": "The number citations to works that have been tagged with this concept. Or less formally: the number of citations to this concept. For example, if there are just two works tagged with this concept and one of them has been cited 10 times, and the other has been cited 1 time, cited_by_count for this concept would be 11." 
}, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -238,23 +244,65 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this concept. Also known as impact factor." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this concept." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this concept." + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ], "description": "Citation metrics for this concept." diff --git a/academic_observatory_workflows/database/schema/openalex/institutions.json b/academic_observatory_workflows/database/schema/openalex/institutions.json index 34e1e4f7c..8d711360c 100644 --- a/academic_observatory_workflows/database/schema/openalex/institutions.json +++ b/academic_observatory_workflows/database/schema/openalex/institutions.json @@ -65,6 +65,12 @@ "mode": "NULLABLE", "description": "The total number Works that cite a work created by an author affiliated with this institution. Or less formally: the number of citations this institution has collected." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -104,6 +110,28 @@ "mode": "REPEATED", "description": "Other names people may use for this institution. " }, + { + "name": "roles", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + { + "name": "role", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE" + } + ] + }, { "name": "geo", "type": "RECORD", @@ -152,7 +180,7 @@ "description": "The sub-national region (state, province) where this institution lives." } ], - "description": "A bunch of stuff we know about the location of this institution:" + "description": "A bunch of stuff we know about the location of this institution" }, { "name": "homepage_url", @@ -258,21 +286,40 @@ { "name": "id", "type": "STRING", + "mode": "NULLABLE", "description": "The OpenAlex ID of the repository." }, + { + "name": "issn_l", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "issn", + "type": "STRING", + "mode": "REPEATED" + }, { "name": "display_name", "type": "STRING", + "mode": "NULLABLE", "description": "The repositories display name." }, + { + "name": "publisher", + "type": "STRING", + "mode": "NULLABLE" + }, { "name": "host_organization", "type": "STRING", + "mode": "NULLABLE", "description": "The OpenAlex ID of the host organisation." 
}, { "name": "host_organization_name", "type": "STRING", + "mode": "NULLABLE", "description": "The host organisations name." }, { @@ -280,6 +327,16 @@ "type": "STRING", "mode": "REPEATED", "description": "The host organisations lineage." + }, + { + "name": "publisher_id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "type", + "type": "STRING", + "mode": "NULLABLE" } ] }, @@ -294,23 +351,65 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this institutions. Also known as impact factor." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this institutions." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this institutions." + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ], "description": "Citation metrics for this institutions." diff --git a/academic_observatory_workflows/database/schema/openalex/publishers.json b/academic_observatory_workflows/database/schema/openalex/publishers.json index 91451bdb6..6a8d3e041 100644 --- a/academic_observatory_workflows/database/schema/openalex/publishers.json +++ b/academic_observatory_workflows/database/schema/openalex/publishers.json @@ -28,6 +28,12 @@ "mode": "NULLABLE", "description": "The total number of Works that cite a Work published by this publisher." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -113,9 +119,43 @@ }, { "name": "parent_publisher", - "type": "STRING", + "type": "RECORD", "mode": "NULLABLE", - "description": "An OpenAlex ID linking to the direct parent of the publisher. This will be null if the publisher's hierarchy_level is 0." + "fields": [ + { + "name": "id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "display_name", + "type": "STRING", + "mode": "NULLABLE" + } + ], + "description": "An OpenAlex ID linking to the direct parent of the publisher and display name. This will be null if the publisher's hierarchy_level is 0." 
+ }, + { + "name": "roles", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + { + "name": "id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "role", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE" + } + ] }, { "name": "sources_api_url", @@ -129,23 +169,65 @@ "mode": "NULLABLE", "description": "Citation metrics for this publisher", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this publisher. Also known as impact factor. While the h-index and the i-10 index are normally author-level metrics and the 2-year mean citedness is normally a journal-level metric, they can be calculated for any set of papers, so we include them for publishers." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this publisher." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this publisher." + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ] }, diff --git a/academic_observatory_workflows/database/schema/openalex/sources.json b/academic_observatory_workflows/database/schema/openalex/sources.json index 100a37b58..2380660b1 100644 --- a/academic_observatory_workflows/database/schema/openalex/sources.json +++ b/academic_observatory_workflows/database/schema/openalex/sources.json @@ -60,6 +60,12 @@ "mode": "NULLABLE", "description": "The total number of Works that cite a Work hosted in this source." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "works_count", "type": "INTEGER", @@ -222,23 +228,65 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "2yr_mean_citedness", "type": "FLOAT", "mode": "NULLABLE", - "description": "The 2-year mean citedness for this source. Also known as impact factor." + "description": "" + }, + { + "name": "2yr_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" }, { "name": "h_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The h-index for this source." + "description": "" }, { "name": "i10_index", "type": "INTEGER", "mode": "NULLABLE", - "description": "The i-10 index for this source." 
+ "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" } ], "description": "Citation metrics for this source." diff --git a/academic_observatory_workflows/database/schema/openalex/works.json b/academic_observatory_workflows/database/schema/openalex/works.json index 2c3b70c11..18cc4be6b 100644 --- a/academic_observatory_workflows/database/schema/openalex/works.json +++ b/academic_observatory_workflows/database/schema/openalex/works.json @@ -1,57 +1,32 @@ [ { - "name": "host_venue", + "name": "abstract_inverted_index", "type": "RECORD", - "description": "DEPRECATED", + "mode": "NULLABLE", "fields": [ { - "name": "display_name", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "id", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "is_oa", - "type": "BOOLEAN", - "description": "DEPRECATED" - }, - { - "name": "issn", + "name": "keys", "type": "STRING", "mode": "REPEATED", - "description": "DEPRECATED" - }, - { - "name": "issn_l", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "license", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "publisher", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "type", - "type": "STRING", - "description": "DEPRECATED" + "description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value." }, { - "name": "url", + "name": "values", "type": "STRING", - "description": "DEPRECATED" - }, + "mode": "REPEATED", + "description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value." + } + ], + "description": "The abstract of the work, as an inverted index, which encodes information about the abstract's words and their positions within the text. Like Microsoft Academic Graph, OpenAlex doesn't include plaintext abstracts due to legal constraints." 
+ }, + { + "name": "alternate_host_venues", + "type": "RECORD", + "mode": "REPEATED", + "description": "DEPRECATED", + "fields": [ { - "name": "version", + "name": "display_name", "type": "STRING", "description": "DEPRECATED" }, @@ -61,25 +36,13 @@ "description": "DEPRECATED" }, { - "name": "host_organization_name", + "name": "host_organization_lineage", "type": "STRING", + "mode": "REPEATED", "description": "DEPRECATED" }, { - "name": "publisher_id", - "type": "STRING", - "description": "DEPRECATED" - } - ] - }, - { - "name": "alternate_host_venues", - "type": "RECORD", - "mode": "REPEATED", - "description": "DEPRECATED", - "fields": [ - { - "name": "display_name", + "name": "host_organization_name", "type": "STRING", "description": "DEPRECATED" }, @@ -115,56 +78,55 @@ "description": "DEPRECATED" }, { - "name": "type", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "url", - "type": "STRING", - "description": "DEPRECATED" - }, - { - "name": "version", + "name": "publisher_id", "type": "STRING", "description": "DEPRECATED" }, { - "name": "host_organization", + "name": "type", "type": "STRING", "description": "DEPRECATED" }, { - "name": "host_organization_name", + "name": "url", "type": "STRING", "description": "DEPRECATED" }, { - "name": "publisher_id", + "name": "version", "type": "STRING", "description": "DEPRECATED" } ] }, { - "name": "abstract_inverted_index", + "name": "apc_payment", "type": "RECORD", "mode": "NULLABLE", "fields": [ { - "name": "keys", + "name": "currency", "type": "STRING", - "mode": "REPEATED", - "description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value." + "mode": "NULLABLE" }, { - "name": "values", + "name": "price", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "price_usd", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "APC converted to USD" + }, + { + "name": "provenance", "type": "STRING", - "mode": "REPEATED", - "description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value." + "mode": "NULLABLE" } ], - "description": "The abstract of the work, as an inverted index, which encodes information about the abstract's words and their positions within the text. Like Microsoft Academic Graph, OpenAlex doesn't include plaintext abstracts due to legal constraints." + "description": "Objects containing information about the APC (article processing charge) for this work. If we can get the APC price from OpenAPC, we use that. Those APCs are specific to an article and are the actual APC paid by an author or institution to publish the article. As a fallback, we use the DOAJ APC prices that are available in sources. Those are an estimate of what authors would have had to pay to publish the article, since the DOAJ apc prices apply to an entire journal." }, { "name": "authorships", @@ -283,6 +245,12 @@ "mode": "NULLABLE", "description": "The location's publishing license. This can be a Create Commons license such as cc0 or cc-by, a publisher-specific license, or null which means we are not able to determine a license for this location." }, + { + "name": "pdf_url", + "type": "STRING", + "mode": "NULLABLE", + "description": "A URL where you can find this location as a PDF." 
+ }, { "name": "source", "type": "RECORD", @@ -300,6 +268,11 @@ "mode": "NULLABLE", "description": "The host organization for this source as an OpenAlex ID. This will be an Institution.id if the source is a repository, and a Publisher.id if the source is a journal, conference, or eBook platform (based on the type field)." }, + { + "name": "host_organization_lineage", + "type": "STRING", + "mode": "REPEATED" + }, { "name": "host_organization_name", "type": "STRING", @@ -345,12 +318,6 @@ ], "description": "Information about the source of this location, as a DehydratedSource object." }, - { - "name": "pdf_url", - "type": "STRING", - "mode": "NULLABLE", - "description": "A URL where you can find this location as a PDF." - }, { "name": "version", "type": "STRING", @@ -437,6 +404,18 @@ ], "description": "List of dehydrated Concept objects. \nEach Concept object in the list also has one additional property" }, + { + "name": "corresponding_author_ids", + "type": "STRING", + "mode": "REPEATED", + "description": "OpenAlex IDs of any authors for which authorships.is_corresponding is true." + }, + { + "name": "corresponding_institution_ids", + "type": "STRING", + "mode": "REPEATED", + "description": "OpenAlex IDs of any institutions found within an authorship for which authorships.is_corresponding is true." + }, { "name": "counts_by_year", "type": "RECORD", @@ -448,6 +427,12 @@ "mode": "NULLABLE", "description": "The number of times this work is cited in this year." }, + { + "name": "oa_works_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, { "name": "year", "type": "INTEGER", @@ -519,6 +504,108 @@ } ] }, + { + "name": "grants", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + { + "name": "award_id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "funder", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "funder_display_name", + "type": "STRING", + "mode": "NULLABLE" + } + ], + "description": "List of grant objects, which include the Funder and the award ID, if available. Our grants data comes from Crossref, and is currently fairly limited." 
+ }, + { + "name": "host_venue", + "type": "RECORD", + "description": "DEPRECATED", + "fields": [ + { + "name": "display_name", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "host_organization", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "host_organization_lineage", + "type": "STRING", + "mode": "REPEATED", + "description": "DEPRECATED" + }, + { + "name": "host_organization_name", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "id", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "is_oa", + "type": "BOOLEAN", + "description": "DEPRECATED" + }, + { + "name": "issn", + "type": "STRING", + "mode": "REPEATED", + "description": "DEPRECATED" + }, + { + "name": "issn_l", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "license", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "publisher", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "publisher_id", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "type", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "url", + "type": "STRING", + "description": "DEPRECATED" + }, + { + "name": "version", + "type": "STRING", + "description": "DEPRECATED" + } + ] + }, { "name": "id", "type": "STRING", @@ -563,6 +650,12 @@ ], "description": "All the persistent identifiers (PIDs) that we know about for this work, as key: value pairs, where key is the PID namespace, and value is the PID. IDs are expressed as URIs where possible." }, + { + "name": "is_oa", + "type": "BOOLEAN", + "mode": "NULLABLE", + "description": "Set to true if the work hosted here can be read for free, without registration." + }, { "name": "is_paratext", "type": "BOOLEAN", @@ -581,6 +674,12 @@ "mode": "NULLABLE", "description": "The language of the work in ISO 639-1 format. The language is automatically detected using the information we have about the work. We use the langdetect software library on the words in the work's abstract, or the title if we do not have the abstract. The source code for this procedure is here. Keep in mind that this method is not perfect, and that in some cases the language of the title or abstract could be different from the body of the work." }, + { + "name": "license", + "type": "STRING", + "mode": "NULLABLE", + "description": "The license applied to this work at this host. Most toll-access works don't have an explicit license (they're under \"all rights reserved\" copyright), so this field generally has content only if is_oa is true." + }, { "name": "locations", "type": "RECORD", @@ -627,6 +726,11 @@ "mode": "NULLABLE", "description": "The host organization for this source as an OpenAlex ID. This will be an Institution.id if the source is a repository, and a Publisher.id if the source is a journal, conference, or eBook platform (based on the type field)." }, + { + "name": "host_organization_lineage", + "type": "STRING", + "mode": "REPEATED" + }, { "name": "host_organization_name", "type": "STRING", @@ -675,11 +779,17 @@ "name": "version", "type": "STRING", "mode": "NULLABLE", - "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are: publishedVersion: The document’s version of record. This is the most authoritative version.\nacceptedVersion: The document after having completed peer review and being officially accepted for publication. 
It will lack publisher formatting, but the content should be interchangeable with the that of the publishedVersion.\nsubmittedVersion: the document as submitted to the publisher by the authors, but before peer-review. Its content may differ significantly from that of the accepted article." + "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are: publishedVersion: The document\u2019s version of record. This is the most authoritative version.\nacceptedVersion: The document after having completed peer review and being officially accepted for publication. It will lack publisher formatting, but the content should be interchangeable with the that of the publishedVersion.\nsubmittedVersion: the document as submitted to the publisher by the authors, but before peer-review. Its content may differ significantly from that of the accepted article." } ], "description": "A list of Location objects describing all unique places where this work lives." }, + { + "name": "locations_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "Number of locations for this work." + }, { "name": "mesh", "type": "RECORD", @@ -718,6 +828,11 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "name": "any_repository_has_fulltext", + "type": "BOOLEAN", + "mode": "NULLABLE" + }, { "name": "is_oa", "type": "BOOLEAN", @@ -785,6 +900,11 @@ "mode": "NULLABLE", "description": "The host organization for this source as an OpenAlex ID. This will be an Institution.id if the source is a repository, and a Publisher.id if the source is a journal, conference, or eBook platform (based on the type field)." }, + { + "name": "host_organization_lineage", + "type": "STRING", + "mode": "REPEATED" + }, { "name": "host_organization_name", "type": "STRING", @@ -833,7 +953,7 @@ "name": "version", "type": "STRING", "mode": "NULLABLE", - "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are:.\npublishedVersion: The document’s version of record. This is the most authoritative version.\nacceptedVersion: The document after having completed peer review and being officially accepted for publication. It will lack publisher formatting, but the content should be interchangeable with the that of the publishedVersion.\nsubmittedVersion: the document as submitted to the publisher by the authors, but before peer-review. Its content may differ significantly from that of the accepted article." + "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are:.\npublishedVersion: The document\u2019s version of record. This is the most authoritative version.\nacceptedVersion: The document after having completed peer review and being officially accepted for publication. It will lack publisher formatting, but the content should be interchangeable with the that of the publishedVersion.\nsubmittedVersion: the document as submitted to the publisher by the authors, but before peer-review. Its content may differ significantly from that of the accepted article." } ], "description": "A Location object with the primary location of this work." @@ -862,6 +982,61 @@ "mode": "REPEATED", "description": "OpenAlex IDs for works related to this work. 
" }, + { + "name": "summary_stats", + "type": "RECORD", + "mode": "NULLABLE", + "fields": [ + { + "name": "2yr_cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "2yr_mean_citedness", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "cited_by_count", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "h_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "i10_index", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + }, + { + "name": "oa_percent", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "" + } + ] + }, { "name": "title", "type": "STRING", @@ -879,5 +1054,17 @@ "type": "TIMESTAMP", "mode": "NULLABLE", "description": "The last time anything in this Work object changed, expressed as an ISO 8601 date string. This date is updated for any change at all, including increases in various counts." + }, + { + "name": "url", + "type": "STRING", + "mode": "NULLABLE", + "description": "The URL where you can access this work." + }, + { + "name": "version", + "type": "STRING", + "mode": "NULLABLE", + "description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are: publishedVersion, acceptedVersion or submittedVersion." } ] \ No newline at end of file diff --git a/academic_observatory_workflows/database/schema/unpaywall/unpaywall.json b/academic_observatory_workflows/database/schema/unpaywall/unpaywall.json index 9c274a8b8..0bc2e58e9 100644 --- a/academic_observatory_workflows/database/schema/unpaywall/unpaywall.json +++ b/academic_observatory_workflows/database/schema/unpaywall/unpaywall.json @@ -398,6 +398,43 @@ "mode": "NULLABLE", "name": "name", "type": "STRING" + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "id", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "id-type", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "asserted-by", + "type": "STRING" + } + ], + "mode": "REPEATED", + "name": "id", + "type": "RECORD" + }, + { + "mode": "REPEATED", + "name": "place", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "department", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "acronym", + "type": "STRING" } ], "mode": "REPEATED", diff --git a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 index 8228a1b4f..32ab6958a 100644 --- a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 @@ -13,23 +13,6 @@ # limitations under the License. 
# Author: Richard Hosking #} -# Helper Function: Processing Output Types -{# -Output Schema: -per_25th FLOAT NULLABLE -median FLOAT NULLABLE -per_90th FLOAT NULLABLE -per_95th FLOAT NULLABLE -#} -CREATE TEMP FUNCTION compute_percentiles(counts ARRAY) AS ( - (SELECT as STRUCT - ROUND(PERCENTILE_CONT(count, 0.25) OVER(), 2) as per_25th, - ROUND(PERCENTILE_CONT(count, 0.50) OVER(), 2) as median, - ROUND(PERCENTILE_CONT(count, 0.90) OVER(), 2) as per_90th, - ROUND(PERCENTILE_CONT(count, 0.95) OVER(), 2) as per_95th - FROM UNNEST(counts) as count LIMIT 1) -); - # Helper Function: Counting Access Types {# Output Schema: @@ -77,11 +60,10 @@ citations RECORD NULLABLE citations.total_openalex_citations INTEGER NULLABLE citations.total_open_citations_citations INTEGER NULLABLE citations.total_crossref_citations INTEGER NULLABLE -citations.total_mag_citations INTEGER NULLABLE #} CREATE TEMP FUNCTION count_single_output_type( output_type STRING, - items ARRAY, + items ARRAY, oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>, measured_type STRING) AS ( (SELECT as STRUCT @@ -97,8 +79,7 @@ CREATE TEMP FUNCTION count_single_output_type( STRUCT( SUM(citations.openalex) as total_openalex_citations, SUM(citations.open_citations) as total_open_citations_citations, - SUM(citations.crossref) as total_crossref_citations, - SUM(citations.mag) as total_mag_citations + SUM(citations.crossref) as total_crossref_citations ) as citations FROM UNNEST(items) as item) ); @@ -119,10 +100,9 @@ citations RECORD NULLABLE citations.total_openalex_citations INTEGER NULLABLE citations.total_open_citations_citations INTEGER NULLABLE citations.total_crossref_citations INTEGER NULLABLE -citations.total_mag_citations INTEGER NULLABLE #} CREATE TEMP FUNCTION count_array_output_type( - output_type STRING, items ARRAY, + output_type STRING, items ARRAY, oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>, measured_type ARRAY) AS ( (SELECT as STRUCT @@ -138,8 +118,7 @@ CREATE TEMP FUNCTION count_array_output_type( STRUCT( SUM(citations.openalex) as total_openalex_citations, SUM(citations.open_citations) as total_open_citations_citations, - SUM(citations.crossref) as total_crossref_citations, - SUM(citations.mag) as total_mag_citations + SUM(citations.crossref) as total_crossref_citations ) as citations FROM UNNEST(items) as item) @@ -161,10 +140,9 @@ citations RECORD NULLABLE citations.total_openalex_citations INTEGER NULLABLE citations.total_open_citations_citations INTEGER NULLABLE citations.total_crossref_citations INTEGER NULLABLE -citations.total_mag_citations INTEGER NULLABLE #} CREATE TEMP FUNCTION count_not_in_array_output_type( - output_type STRING, items ARRAY, + output_type STRING, items ARRAY, oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>, measured_type ARRAY) AS ( (SELECT as STRUCT @@ -180,8 +158,7 @@ CREATE TEMP FUNCTION count_not_in_array_output_type( STRUCT( SUM(citations.openalex) as total_openalex_citations, SUM(citations.open_citations) as total_open_citations_citations, - SUM(citations.crossref) as total_crossref_citations, - SUM(citations.mag) as total_mag_citations + SUM(citations.crossref) as total_crossref_citations ) as citations FROM UNNEST(items) as item) ); @@ -194,7 +171,7 @@ output_types RECORD REPEATED * Each record has the same schema, and is captured in the count_* methods #} CREATE TEMP FUNCTION count_output_types( - items ARRAY, + items ARRAY, oa BOOL, green 
BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>) AS ( [ @@ -221,21 +198,15 @@ outputs_without_citations INTEGER NULLABLE citations RECORD NULLABLE citations.openalex RECORD NULLABLE citations.openalex.total_citations INTEGER NULLABLE -citations.openalex.percentiles RECORD NULLABLE citations.open_citations RECORD NULLABLE citations.open_citations.total_citations INTEGER NULLABLE -citations.open_citations.percentiles RECORD NULLABLE citations.crossref RECORD NULLABLE citations.crossref.total_citations INTEGER NULLABLE -citations.crossref.percentiles RECORD NULLABLE -citations.mag RECORD NULLABLE -citations.mag.total_citations INTEGER NULLABLE -citations.mag.percentiles RECORD NULLABLE *percetiles schema captured above #} CREATE TEMP FUNCTION compute_conditional_citations( - items ARRAY, is_x BOOL>>, + items ARRAY, is_x BOOL>>, access_type STRING, positive_label STRING, negative_label STRING) AS ( ARRAY(( @@ -248,25 +219,18 @@ CREATE TEMP FUNCTION compute_conditional_citations( access_type, is_x as status, COUNT(*) as total_outputs, - COUNTIF(citations.crossref > 0 OR citations.open_citations > 0 OR citations.openalex > 0 OR citations.mag > 0) as outputs_with_citations, - COUNTIF( (citations.crossref IS NULL OR citations.crossref = 0) AND (citations.open_citations IS NULL OR citations.open_citations = 0) AND (citations.openalex IS NULL OR citations.openalex = 0) OR (citations.mag IS NULL OR citations.mag = 0) ) as outputs_without_citations, + COUNTIF(citations.crossref > 0 OR citations.open_citations > 0 OR citations.openalex > 0) as outputs_with_citations, + COUNTIF( (citations.crossref IS NULL OR citations.crossref = 0) AND (citations.open_citations IS NULL OR citations.open_citations = 0) AND (citations.openalex IS NULL OR citations.openalex = 0)) as outputs_without_citations, STRUCT( STRUCT( - SUM(citations.openalex) as total_citations, - compute_percentiles(ARRAY_AGG(citations.openalex)) as percentiles + SUM(citations.openalex) as total_citations ) as openalex, STRUCT( - SUM(citations.open_citations) as total_citations, - compute_percentiles(ARRAY_AGG(citations.open_citations)) as percentiles + SUM(citations.open_citations) as total_citations ) as open_citations, STRUCT( - SUM(citations.crossref) as total_citations, - compute_percentiles(ARRAY_AGG(citations.crossref)) as percentiles - ) as crossref, - STRUCT( - SUM(citations.mag) as total_citations, - compute_percentiles(ARRAY_AGG(citations.mag)) as percentiles - ) as mag + SUM(citations.crossref) as total_citations + ) as crossref ) as citations, FROM UNNEST(items) @@ -295,14 +259,8 @@ open_citations.citations_per_output FLOAT NULLABLE open_citations.outputs_with_citations INTEGER NULLABLE open_citations.outputs_without_citations INTEGER NULLABLE open_citations.citations_per_cited_output FLOAT NULLABLE -mag RECORD NULLABLE -mag.total_citations INTEGER NULLABLE -mag.citations_per_output FLOAT NULLABLE -mag.outputs_with_citations INTEGER NULLABLE -mag.outputs_without_citations INTEGER NULLABLE -mag.citations_per_cited_output FLOAT NULLABLE #} -CREATE TEMP FUNCTION compute_citations(items ARRAY>>) as ( +CREATE TEMP FUNCTION compute_citations(items ARRAY>>) as ( (SELECT AS STRUCT -- Citation counts STRUCT( @@ -325,14 +283,7 @@ CREATE TEMP FUNCTION compute_citations(items ARRAY 0) as outputs_with_citations, COUNTIF(citations.open_citations is null) as outputs_without_citations, ROUND(SAFE_DIVIDE(SUM(citations.open_citations), COUNTIF(citations.open_citations > 0)), 2) as citations_per_cited_output - ) as 
open_citations, - STRUCT( - SUM(citations.mag) as total_citations, - ROUND(SAFE_DIVIDE( SUM(citations.mag) , COUNT(doi)), 2) as citations_per_output, - COUNTIF(citations.mag > 0) as outputs_with_citations, - COUNTIF(citations.mag is null) as outputs_without_citations, - ROUND(SAFE_DIVIDE(SUM(citations.mag), COUNTIF(citations.mag > 0)), 2) as citations_per_cited_output - ) as mag + ) as open_citations FROM UNNEST(items)) ); @@ -365,7 +316,7 @@ breakdown RECORD REPEATED * breakdown object array captured in compute_conditional_citations schema above #} CREATE TEMP FUNCTION compute_access_types( - items ARRAY, + items ARRAY, is_oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL>>) AS ( (SELECT AS STRUCT @@ -530,16 +481,11 @@ sum_of_scores FLOAT NULLABLE citations RECORD NULLABLE openalex RECORD NULLABLE total_citations INTEGER NULLABLE -percentiles RECORD NULLABLE open_citations RECORD NULLABLE total_citations INTEGER NULLABLE -percentiles RECORD NULLABLE crossref RECORD NULLABLE total_citations INTEGER NULLABLE -percentiles RECORD NULLABLE -mag RECORD NULLABLE total_citations INTEGER NULLABLE -percentiles RECORD NULLABLE num_oa_outputs INTEGER NULLABLE num_green_outputs INTEGER NULLABLE num_gold_outputs INTEGER NULLABLE @@ -559,7 +505,7 @@ num_international_collaboration_outputs INTEGER NULLABLE international_collaboration_with_funding_outputs INTEGER NULLABLE #} CREATE TEMP FUNCTION compute_disciplines( - fields ARRAY, + fields ARRAY, is_oa BOOL, green BOOL, gold BOOL, gold_just_doaj BOOL, hybrid BOOL, bronze BOOL, green_only BOOL, funding BOOL, international_funding BOOL, domestic_funding BOOL, government_funding BOOL, private_funding BOOL, international_colab BOOL>>) AS ( @@ -570,20 +516,13 @@ CREATE TEMP FUNCTION compute_disciplines( SUM(Score) as sum_of_scores, STRUCT( STRUCT( - SUM(citations.mag) as total_citations, - compute_percentiles(ARRAY_AGG(citations.mag)) as percentiles - ) as mag, - STRUCT( - SUM(citations.openalex) as total_citations, - compute_percentiles(ARRAY_AGG(citations.openalex)) as percentiles + SUM(citations.openalex) as total_citations ) as openalex, STRUCT( - SUM(citations.open_citations) as total_citations, - compute_percentiles(ARRAY_AGG(citations.open_citations)) as percentiles + SUM(citations.open_citations) as total_citations ) as open_citations, STRUCT( - SUM(citations.crossref) as total_citations, - compute_percentiles(ARRAY_AGG(citations.crossref)) as percentiles + SUM(citations.crossref) as total_citations ) as crossref ) as citations, COUNTIF(is_oa) as num_oa_outputs, @@ -657,7 +596,6 @@ citations RECORD NULLABLE citations.openalex INTEGER NULLABLE citations.crosssref INTEGER NULLABLE citations.open_citations INTEGER NULLABLE -citations.mag INTEGER NULLABLE disciplines RECORD REPEATED * Schema for disciplines captured above @@ -678,8 +616,7 @@ CREATE TEMP FUNCTION process_relations(relationships ANY TYPE, total INT64, tota STRUCT( SUM(citations.openalex) as openalex, SUM(citations.crossref) as crosssref, - SUM(citations.open_citations) as open_citations, - SUM(citations.mag) as mag + SUM(citations.open_citations) as open_citations ) as citations, group_disciplines(ARRAY_CONCAT_AGG(disciplines)) as disciplines FROM UNNEST(relationships) as relations @@ -784,8 +721,7 @@ WITH tmp_disciplines AS STRUCT( dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, - dois.open_citations.citations_total as open_citations, - dois.mag.CitationCount as mag + dois.open_citations.citations_total as 
open_citations ) as citations, unpaywall.is_oa as is_oa, unpaywall.green as green, unpaywall.gold as gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only, -- Total Funding @@ -818,7 +754,7 @@ tmp_access_types AS ( compute_access_types( ARRAY_AGG( STRUCT( - dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, unpaywall.is_oa, unpaywall.green, unpaywall.gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only ) ) @@ -864,7 +800,7 @@ SELECT compute_citations( ARRAY_AGG( STRUCT( - dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations + dois.doi, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations ) ) ) as citations, @@ -873,7 +809,7 @@ SELECT count_output_types( ARRAY_AGG( STRUCT( - unpaywall.output_type, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CItationCount as mag) as citations, + unpaywall.output_type, STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, unpaywall.is_oa, unpaywall.green, unpaywall.gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only ) ) @@ -906,7 +842,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.institutions) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -930,7 +866,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.countries) as relation WHERE relation.identifier <> aggregrate.country_code OR aggregrate.country_code IS NULL) @@ -954,7 +890,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + 
STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.groupings) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -978,7 +914,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.funders) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -1005,7 +941,7 @@ SELECT ) as relation, unpaywall, STRUCT( - dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag + dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations ) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(aggregrate.members) as relation) @@ -1029,7 +965,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.publishers) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -1053,7 +989,7 @@ SELECT (SELECT as STRUCT relation, unpaywall, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, ARRAY( SELECT as STRUCT display_name as DisplayName, Score, unpaywall.is_oa FROM UNNEST(openalex.concepts) as fields where fields.level = 0) as disciplines FROM UNNEST(affiliations.journals) as relation WHERE relation.identifier <> aggregrate.identifier) @@ -1074,7 +1010,7 @@ SELECT SELECT AS STRUCT event.source, event.count, - STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations, dois.mag.CitationCount as mag) as citations, + STRUCT(dois.openalex.cited_by_count as openalex, dois.crossref.references_count as crossref, dois.open_citations.citations_total as open_citations) as citations, unpaywall.is_oa as is_oa, unpaywall.green as green, unpaywall.gold 
as gold, unpaywall.gold_just_doaj, unpaywall.hybrid, unpaywall.bronze, unpaywall.green_only FROM UNNEST(dois.events.events) as event))) ) as events, diff --git a/academic_observatory_workflows/database/sql/export_access_types.sql.jinja2 b/academic_observatory_workflows/database/sql/export_access_types.sql.jinja2 index 8583b711e..5d1b6c0af 100644 --- a/academic_observatory_workflows/database/sql/export_access_types.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/export_access_types.sql.jinja2 @@ -23,6 +23,5 @@ SELECT access_type.outputs_with_citations as access_types_outputs_with_citations, access_type.outputs_without_citations as access_types_outputs_without_citations, access_type.citations.openalex.total_citations as access_types_total_citations, - access_type.citations.openalex.percentiles.median as access_types_median_citations_per_output FROM `{{ table_id }}`, UNNEST( access_types.breakdown ) as access_type ORDER BY id, published_year ASC \ No newline at end of file diff --git a/academic_observatory_workflows/database/sql/export_disciplines.sql.jinja2 b/academic_observatory_workflows/database/sql/export_disciplines.sql.jinja2 index fd2ac6fbe..9741fa76a 100644 --- a/academic_observatory_workflows/database/sql/export_disciplines.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/export_disciplines.sql.jinja2 @@ -30,7 +30,6 @@ SELECT ROUND(SAFE_DIVIDE( ( discipline.num_green_outputs ) * 100 , discipline.total_outputs ), 2) as disciplines_percent_green, ROUND(SAFE_DIVIDE( ( discipline.num_gold_outputs ) * 100 , discipline.total_outputs ), 2) as disciplines_percent_gold, discipline.citations.openalex.total_citations as disciplines_total_citations, - discipline.citations.openalex.percentiles.median as disciplines_median_citations_per_output, discipline.funding.total_funded_outputs as disciplines_total_funded_outputs, discipline.funding.num_international_outputs as disciplines_num_international_funded_outputs, discipline.funding.num_domestic_outputs as disciplines_num_domestic_funded_outputs, diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json index 375e75a25..9e2de6d6e 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/data/publishers/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b49ae1aedf81fc4a8dd35379fa8a48dbf2ce13d02319c4691c5ed5a8e981da7 -size 7823 +oid sha256:befe9a21ace1f764aa2898292df137d17934d1b7d6f7fad4550064bd597b95f5 +size 7831 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json index a90eed227..04bee3b27 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/authors.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bd7abd62f450450d2a8a17645ee358af0fd4707c37b94ab5e6bbfe93729e072 -size 3549 +oid sha256:5f8156a2d393590d0bdf59542509bdb0104d4e463fb7d3913b4ac77df24f9647 +size 4365 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json 
b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json index f4a60de46..7de1c0ffc 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/concepts.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e169145a2bf24ffcb0e44c20a7c1adcf86d6249f2ac6bd8b1b17af20965d2981 -size 63659 +oid sha256:e7af41245c3279bdef11dce94afa87282ea22463eed22ad692c2a765c1f5cee5 +size 66011 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json index 8f2e78cfd..6e8272084 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/institutions.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15489b3de773811b568079c6faec6eed6b3eae93d5be839cd270ce73df73692f -size 40406 +oid sha256:2ea8182e507898c6e4638c280b38db7b6d197e495c9673a675b11af4530bfdfd +size 42826 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json index 29743dac5..1bc0793f3 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/publishers.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:073423dac00bcc41a554dece518f24ac7647fbc900da1417c8bfaeeb9066ad8b -size 8180 +oid sha256:7a59eabc9986ab52e0abfa11a568043f0cb30d0855babfdbc8f95e0737d622ef +size 10630 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json index 22f864c3f..6466cf2e1 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/sources.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a75e223b6812f803ae5b13340a53f6cb9703f9bcb0855bc6b0bc83d469e902e9 -size 23983 +oid sha256:eda11d6389dea9d28fa2d523a3e1da4a82aad7c40c43edac4026d9d02597bf3a +size 26239 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json index e1d4fa5cb..f33846a96 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a748ded838d21c426a3a62b9e4c6cad1d29cfe26ed6151b4fa445f5f580ba093 -size 61739 +oid sha256:2b99194cd7199fa282aaa1656f33f3f119734ce7051e5e81a8b626a52cd3d977 +size 64316 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json index 375e75a25..9e2de6d6e 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json +++ 
b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/publishers/updated_date=2023-04-02/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b49ae1aedf81fc4a8dd35379fa8a48dbf2ce13d02319c4691c5ed5a8e981da7 -size 7823 +oid sha256:befe9a21ace1f764aa2898292df137d17934d1b7d6f7fad4550064bd597b95f5 +size 7831 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json index ea1d97d70..2777d7f88 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/data/works/updated_date=2023-04-16/part_000.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9584a66707b4e6cb62c5b482ac55ea6ff59d37102ca1d8b73ab16eecb373a663 -size 48934 +oid sha256:2af6dd859abdd351c261e7bb3e43ab90bce6531bf6c5fda0a26302bea59be6c7 +size 49088 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json index 38a5cf053..9cecf53f3 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/authors.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ade95bb53f2ca4fb19b4cd73becd361fa4099fb5738c412a055008441fd50c57 -size 3660 +oid sha256:cc51e11fa79a49ea33c9e1385d28b99171064ad9374652af39392d972f43b6f7 +size 4476 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json index 2ad233117..66bde0294 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/concepts.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd3b6945eb25e2925921d59465a39c5d848fa1e6bbf9bc5cfc701cbd13871a9c -size 75948 +oid sha256:e1f583377c7d3c34b64615c61f0c9ae78529aa0bdbf5a89475c626712c39b6b5 +size 78888 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json index 62f8ca72d..1ebd87369 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/institutions.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdf6a182144f02c18d0ac02fc813a79f9f78062a2ea72bbf80aac909af60b869 -size 39267 +oid sha256:3dd9bd6a49028df3f3d4de54188d337f9099429d5e41e603abc2f1f65f5641ae +size 41687 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json index 2a185c8fa..9c756b976 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/publishers.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73c4b4795ade705963241dfd4808c2f8f62395194b094ac93442d93b1489b3d9 -size 10176 +oid 
sha256:f8139671554c19329f1b95543db743f8b01ac792f6461bbab24b777e6a6a97c5 +size 13231 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json index 2d0e789bc..f202d0d0a 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/sources.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34587aa2c068d00e63bc9cac65d9c4562f1090e60221719b01d29df16db415f6 -size 22112 +oid sha256:7c22e51ff3de26e723ae1b25661bfba04b8c96560646395df8c6ace923d467f5 +size 24464 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json index 746eca5de..39e728463 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:569f53208acf8520eb265ddbf4b3e511423e4944d79198f1e2421718abb0cc70 -size 51589 +oid sha256:e6d8c9c486421f6bf238155f62add07542ec0f466b588672cb524032921fce48 +size 53806 diff --git a/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_load_main_table.json b/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_load_main_table.json index ba8a9f3ef..13c03eb99 100644 --- a/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_load_main_table.json +++ b/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_load_main_table.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26e30270c0b0dfb4ee9512c89526c32a2a42bb93bdec557054a79d16660f1197 -size 55555 +oid sha256:ae5d43f64d6225554385980f3a4534b7dab20b1397c344aaefd7fa2d998963fc +size 57427 diff --git a/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_upsert_records.json b/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_upsert_records.json index 0e88a540c..0f485bc0c 100644 --- a/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_upsert_records.json +++ b/academic_observatory_workflows/fixtures/unpaywall/expected/run1_bq_upsert_records.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3e6f1102a9addd369c709b71c222d8b0d909da5d9be07b124b50bb25761ff4c -size 55568 +oid sha256:39981fb96d12876a528aabc7878910cfa796a208714c95e7a474824c4f27d781 +size 57440 diff --git a/academic_observatory_workflows/fixtures/unpaywall/expected/run3_bq_upsert_records.json b/academic_observatory_workflows/fixtures/unpaywall/expected/run3_bq_upsert_records.json index b06eab2aa..b5ef8e47d 100644 --- a/academic_observatory_workflows/fixtures/unpaywall/expected/run3_bq_upsert_records.json +++ b/academic_observatory_workflows/fixtures/unpaywall/expected/run3_bq_upsert_records.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd759c19ead8b35cf0b6489a6a7460118b3e851bcff51211399a857cd0f88bfd -size 58096 +oid sha256:a54512b3aa901a302ac37f63c9f99b47c34ac8a18191b5b921885fdb0fc227f9 +size 59968 diff --git a/academic_observatory_workflows/workflows/crossref_events_telescope.py b/academic_observatory_workflows/workflows/crossref_events_telescope.py index 7b23d5761..2504fb77b 100644 --- a/academic_observatory_workflows/workflows/crossref_events_telescope.py +++ 
b/academic_observatory_workflows/workflows/crossref_events_telescope.py @@ -165,11 +165,11 @@ def __init__( dag_id: str, cloud_workspace: CloudWorkspace, events_start_date: pendulum.DateTime = pendulum.datetime(2017, 2, 17), - bq_dataset_id: str = "crossref", + bq_dataset_id: str = "crossref_events", bq_table_name: str = "crossref_events", api_dataset_id: str = "crossref_events", schema_folder: str = os.path.join(default_schema_folder(), "crossref_events"), - dataset_description: str = "Datasets created by Crossref: https://www.crossref.org/", + dataset_description: str = "The Crossref Events dataset: https://www.eventdata.crossref.org/guide/", table_description: str = "The Crossref Events dataset: https://www.eventdata.crossref.org/guide/", snapshot_expiry_days: int = 31, n_rows: int = 1000, @@ -515,7 +515,7 @@ def bq_load_main_table(self, release: CrossrefEventsRelease, **kwargs): schema_file_path=self.schema_file_path, source_format=SourceFormat.NEWLINE_DELIMITED_JSON, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) set_task_state(success, self.bq_load_main_table.__name__, release) diff --git a/academic_observatory_workflows/workflows/crossref_fundref_telescope.py b/academic_observatory_workflows/workflows/crossref_fundref_telescope.py index 44942750a..fe30dac1a 100644 --- a/academic_observatory_workflows/workflows/crossref_fundref_telescope.py +++ b/academic_observatory_workflows/workflows/crossref_fundref_telescope.py @@ -76,11 +76,11 @@ def __init__( *, dag_id: str, cloud_workspace: CloudWorkspace, - bq_dataset_id: str = "crossref", + bq_dataset_id: str = "crossref_fundref", bq_table_name: str = "crossref_fundref", api_dataset_id: str = "crossref_fundref", schema_folder: str = os.path.join(default_schema_folder(), "crossref_fundref"), - dataset_description: str = "Datasets created by Crossref: https://www.crossref.org/", + dataset_description: str = "The Crossref Funder Registry dataset: https://www.crossref.org/services/funder-registry/", table_description: str = "The Crossref Funder Registry dataset: https://www.crossref.org/services/funder-registry/", observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, start_date: pendulum.DateTime = pendulum.datetime(2014, 2, 23), diff --git a/academic_observatory_workflows/workflows/crossref_metadata_telescope.py b/academic_observatory_workflows/workflows/crossref_metadata_telescope.py index c1f7dd5e0..6b3ba81c8 100644 --- a/academic_observatory_workflows/workflows/crossref_metadata_telescope.py +++ b/academic_observatory_workflows/workflows/crossref_metadata_telescope.py @@ -18,7 +18,6 @@ from __future__ import annotations import functools -import gzip import json import logging import os @@ -76,7 +75,7 @@ def __init__(self, *, dag_id: str, run_id: str, snapshot_date: pendulum.DateTime self.download_file_name = "crossref_metadata.json.tar.gz" self.download_file_path = os.path.join(self.download_folder, self.download_file_name) self.extract_files_regex = r".*\.json$" - self.transform_files_regex = r".*\.jsonl.gz$" + self.transform_files_regex = r".*\.jsonl$" class CrossrefMetadataTelescope(Workflow): @@ -91,16 +90,16 @@ def __init__( *, dag_id: str, cloud_workspace: CloudWorkspace, - bq_dataset_id: str = "crossref", + bq_dataset_id: str = "crossref_metadata", bq_table_name: str = "crossref_metadata", api_dataset_id: str = "crossref_metadata", schema_folder: str = os.path.join(default_schema_folder(), "crossref_metadata"), - dataset_description: str = "Datasets 
created by Crossref: https://www.crossref.org/", + dataset_description: str = "The Crossref Metadata Plus dataset: https://www.crossref.org/services/metadata-retrieval/metadata-plus/", table_description: str = "The Crossref Metadata Plus dataset: https://www.crossref.org/services/metadata-retrieval/metadata-plus/", crossref_metadata_conn_id: str = "crossref_metadata", observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, max_processes: int = os.cpu_count(), - batch_size: int = 200, + batch_size: int = 20, start_date: pendulum.DateTime = pendulum.datetime(2020, 6, 7), schedule_interval: str = "0 0 7 * *", catchup: bool = True, @@ -241,10 +240,11 @@ def upload_downloaded(self, release: CrossrefMetadataRelease, **kwargs): def transform(self, release: CrossrefMetadataRelease, **kwargs): """Task to transform the CrossrefMetadataRelease release for a given month. - Each extracted file is transformed. This is done in parallel using the ThreadPoolExecutor.""" + Each extracted file is transformed.""" logging.info(f"Transform input folder: {release.extract_folder}, output folder: {release.transform_folder}") clean_dir(release.transform_folder) + finished = 0 # List files and sort so that they are processed in ascending order input_file_paths = natsorted(list_files(release.extract_folder, release.extract_files_regex)) @@ -256,21 +256,16 @@ def transform(self, release: CrossrefMetadataRelease, **kwargs): # Create tasks for each file for input_file in chunk: - future = executor.submit(transform_file, input_file) + output_file = os.path.join(release.transform_folder, os.path.basename(input_file) + "l") + future = executor.submit(transform_file, input_file, output_file) futures.append(future) - # Write data from batch into a single jsonl.gz file - # The output file will be a json lines gzip file, hence adding the 'l.gz' to the file extension - file_path = os.path.join(release.transform_folder, f"crossref_metadata_{i:012}.jsonl.gz") - with gzip.open(file_path, "wb") as gzip_file: - with jsonlines.Writer(gzip_file) as writer: - # Write data to the jsonlines.Writer as it becomes available - for future in as_completed(futures): - data = future.result() - writer.write_all(data) - - if i % 1000 == 0: - logging.info(f"Transformed {i + 1} files") + # Wait for completed tasks + for future in as_completed(futures): + future.result() + finished += 1 + if finished % 1000 == 0: + logging.info(f"Transformed {finished} files") def upload_transformed(self, release: CrossrefMetadataRelease, **kwargs) -> None: """Upload the transformed data to Cloud Storage.""" @@ -294,7 +289,7 @@ def bq_load(self, release: CrossrefMetadataRelease, **kwargs): # subfolders: https://cloud.google.com/bigquery/docs/batch-loading-data#load-wildcards uri = gcs_blob_uri( self.cloud_workspace.transform_bucket, - f"{gcs_blob_name_from_path(release.transform_folder)}/*.jsonl.gz", + f"{gcs_blob_name_from_path(release.transform_folder)}/*.jsonl", ) table_id = bq_sharded_table_id( self.cloud_workspace.output_project_id, self.bq_dataset_id, self.bq_table_name, release.snapshot_date @@ -363,24 +358,23 @@ def check_release_exists(month: pendulum.DateTime, api_key: str) -> bool: return False -def transform_file(input_file_path: str): +def transform_file(input_file_path: str, output_file_path: str): """Transform a single Crossref Metadata json file. The json file is converted to a jsonl file and field names are transformed so they are accepted by BigQuery. :param input_file_path: the path of the file to transform. 
+ :param output_file_path: where to save the transformed file. :return: None. """ # Open json - with open(input_file_path, mode="r") as input_file: - input_data = json.load(input_file) - - # Transform data - output_data = [] - for item in input_data["items"]: - output_data.append(transform_item(item)) + with open(input_file_path, mode="r") as in_file: + input_data = json.load(in_file) - return output_data + # Transform and write + with jsonlines.open(output_file_path, mode="w", compact=True) as out_file: + for item in input_data["items"]: + out_file.write(transform_item(item)) def transform_item(item): diff --git a/academic_observatory_workflows/workflows/doi_workflow.py b/academic_observatory_workflows/workflows/doi_workflow.py index e140243da..b8eb27eb2 100644 --- a/academic_observatory_workflows/workflows/doi_workflow.py +++ b/academic_observatory_workflows/workflows/doi_workflow.py @@ -102,14 +102,14 @@ class Aggregation: def make_dataset_transforms( input_project_id: str, output_project_id: str, - dataset_id_crossref_events: str = "crossref", - dataset_id_crossref_metadata: str = "crossref", - dataset_id_crossref_fundref: str = "crossref", + dataset_id_crossref_events: str = "crossref_events", + dataset_id_crossref_metadata: str = "crossref_metadata", + dataset_id_crossref_fundref: str = "crossref_fundref", dataset_id_ror: str = "ror", dataset_id_mag: str = "mag", dataset_id_orcid: str = "orcid", dataset_id_open_citations: str = "open_citations", - dataset_id_unpaywall: str = "our_research", + dataset_id_unpaywall: str = "unpaywall", dataset_id_openalex: str = "openalex", dataset_id_settings: str = "settings", dataset_id_observatory: str = "observatory", @@ -515,7 +515,7 @@ def __init__( bq_dashboards_dataset_id: str = "coki_dashboards", bq_observatory_dataset_id: str = "observatory", bq_elastic_dataset_id: str = "data_export", - bq_unpaywall_dataset_id: str = "our_research", + bq_unpaywall_dataset_id: str = "unpaywall", bq_ror_dataset_id: str = "ror", api_dataset_id: str = "doi", transforms: Tuple = None, diff --git a/academic_observatory_workflows/workflows/oa_web_workflow.py b/academic_observatory_workflows/workflows/oa_web_workflow.py index 1ad9282cd..916cee745 100644 --- a/academic_observatory_workflows/workflows/oa_web_workflow.py +++ b/academic_observatory_workflows/workflows/oa_web_workflow.py @@ -83,7 +83,7 @@ ("outputs_public", "n_outputs_other_platform_open"), ("outputs_other_internet", "n_outputs_other_platform_open"), ] -INCLUSION_THRESHOLD = {"country": 1, "institution": 700} +INCLUSION_THRESHOLD = {"country": 15, "institution": 800} MAX_REPOSITORIES = 200 START_YEAR = 2000 END_YEAR = pendulum.now().year - 1 @@ -576,6 +576,8 @@ def build_indexes(self, release: OaWebRelease, **kwargs): # Aggregate data file df_index = make_index_df(category, df_index, df_data) + logging.info(f"Total {category} entities: {len(df_index)}") + # Save index to intermediate index_path = os.path.join(release.intermediate_path, index_name) rows: List[Dict] = df_index.to_dict("records") @@ -1826,7 +1828,8 @@ def save_coki_oa_dataset(path: str, countries: List[Entity], institutions: List[ subset = { "id": None, "name": None, - "country": None, + "country_name": None, + "country_code": None, "subregion": None, "region": None, "institution_type": None, diff --git a/academic_observatory_workflows/workflows/openalex_telescope.py b/academic_observatory_workflows/workflows/openalex_telescope.py index 6db9411b4..aea3ed277 100644 --- a/academic_observatory_workflows/workflows/openalex_telescope.py +++ 
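The per-file transform above replaces the old batched jsonl.gz writer: each extracted .json now becomes a sibling uncompressed .jsonl that the *.jsonl load wildcard later picks up. A minimal sketch of that flow, with purely illustrative paths (the real task applies transform_item to each record):

import json
import os

import jsonlines


def transform_one(input_file: str, transform_folder: str) -> str:
    # "0001.json" becomes "<transform_folder>/0001.jsonl" (basename + "l"), mirroring the telescope
    output_file = os.path.join(transform_folder, os.path.basename(input_file) + "l")

    # Each extracted Crossref Metadata file holds one JSON object with an "items" array
    with open(input_file, mode="r") as f:
        items = json.load(f)["items"]

    # Write one record per line; compact=True keeps each record on a single line
    with jsonlines.open(output_file, mode="w", compact=True) as writer:
        for item in items:
            writer.write(item)  # the real workflow calls transform_item() on each record first

    return output_file

# BigQuery then loads every per-file output with a wildcard URI along the lines of:
# gs://<transform-bucket>/<blob-prefix>/*.jsonl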
b/academic_observatory_workflows/workflows/openalex_telescope.py @@ -656,7 +656,7 @@ def bq_load_upsert_tables(self, release: OpenAlexRelease, **kwargs): schema_file_path=entity.schema_file_path, source_format=SourceFormat.NEWLINE_DELIMITED_JSON, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) assert ( success @@ -699,7 +699,7 @@ def bq_load_delete_tables(self, release: OpenAlexRelease, **kwargs): source_format=SourceFormat.CSV, csv_skip_leading_rows=1, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) assert ( success @@ -943,9 +943,12 @@ def fetch_merged_ids( results = [] for page in paginator.paginate(Bucket=bucket, Prefix=f"{prefix}/{entity_name}"): for content in page.get("Contents", []): - url = f"s3://{bucket}/{content['Key']}" - content_length = content["Size"] - results.append(MergedId(url, content_length)) + obj_key = content["Key"] + # There is a dud file in data/merged_ids/sources/ + if obj_key != "data/merged_ids/sources/.csv": + url = f"s3://{bucket}/{obj_key}" + content_length = content["Size"] + results.append(MergedId(url, content_length)) # Sort from oldest to newest results.sort(key=lambda m: m.updated_date, reverse=False) @@ -970,16 +973,13 @@ def transform_file(download_path: str, transform_path: str): with gzip.open(download_path, "rb") as f_in, gzip.open(transform_path, "wt", encoding="ascii") as f_out: reader = jsonlines.Reader(f_in) for obj in reader.iter(skip_empty=True): - if "works" in download_path: - transform_object(obj, "abstract_inverted_index") - else: - transform_object(obj, "international") + transform_object(obj) json.dump(obj, f_out) f_out.write("\n") logging.info(f"Finished transform, saved to {transform_path}") -def transform_object(obj: dict, field: str): +def transform_object(obj: dict): """Transform an entry/object for one of the OpenAlex entities. For the Work entity only the "abstract_inverted_index" field is transformed. For the Concept and Institution entities only the "international" field is transformed. @@ -988,18 +988,40 @@ def transform_object(obj: dict, field: str): :param field: The field of interested that is transformed. :return: None. 
""" - if field == "international": - for nested_field in obj.get(field, {}).keys(): - if not isinstance(obj[field][nested_field], dict): - continue - keys = list(obj[field][nested_field].keys()) - values = list(obj[field][nested_field].values()) - obj[field][nested_field] = {"keys": keys, "values": values} - elif field == "abstract_inverted_index": + # Remove nulls from arrays + # And handle null value + field = "corresponding_institution_ids" + if field in obj: + value = obj.get(field, []) + if value is None: + value = [] + obj[field] = [x for x in value if x is not None] + + # Remove nulls from arrays + # And handle null value + field = "corresponding_author_ids" + if field in obj: + value = obj.get(field, []) + if value is None: + value = [] + obj[field] = [x for x in value if x is not None] + + field = "abstract_inverted_index" + if field in obj: if not isinstance(obj.get(field), dict): return keys = list(obj[field].keys()) values = [str(value)[1:-1] for value in obj[field].values()] obj[field] = {"keys": keys, "values": values} + + field = "international" + if field in obj: + for nested_field in obj.get(field, {}).keys(): + if not isinstance(obj[field][nested_field], dict): + continue + keys = list(obj[field][nested_field].keys()) + values = list(obj[field][nested_field].values()) + + obj[field][nested_field] = {"keys": keys, "values": values} diff --git a/academic_observatory_workflows/workflows/ror_telescope.py b/academic_observatory_workflows/workflows/ror_telescope.py index 1b4be120e..fb9221996 100644 --- a/academic_observatory_workflows/workflows/ror_telescope.py +++ b/academic_observatory_workflows/workflows/ror_telescope.py @@ -20,6 +20,7 @@ import logging import math import os +import shutil import urllib.parse from typing import List, Any, Dict from zipfile import BadZipFile, ZipFile @@ -222,6 +223,12 @@ def extract(self, releases: List[RorRelease], **kwargs): raise AirflowException("Not a zip file") logging.info(f"File extracted to: {release.extract_folder}") + # Remove dud __MACOSX folder that shouldn't be there + try: + shutil.rmtree(os.path.join(release.extract_folder, "__MACOSX")) + except FileNotFoundError: + pass + def transform(self, releases: List[RorRelease], **kwargs): """Task to transform the ROR releases.""" diff --git a/academic_observatory_workflows/workflows/scopus_telescope.py b/academic_observatory_workflows/workflows/scopus_telescope.py index 2fae4fd39..0a686cdc2 100644 --- a/academic_observatory_workflows/workflows/scopus_telescope.py +++ b/academic_observatory_workflows/workflows/scopus_telescope.py @@ -101,7 +101,7 @@ def __init__( scopus_conn_ids: List[str], view: str = "STANDARD", earliest_date: pendulum.DateTime = pendulum.datetime(1800, 1, 1), - bq_dataset_id: str = "elsevier", + bq_dataset_id: str = "scopus", bq_table_name: str = "scopus", api_dataset_id: str = "scopus", schema_folder: str = os.path.join(default_schema_folder(), "scopus"), diff --git a/academic_observatory_workflows/workflows/tests/test_crossref_metadata_telescope.py b/academic_observatory_workflows/workflows/tests/test_crossref_metadata_telescope.py index a768661e4..68cc92042 100644 --- a/academic_observatory_workflows/workflows/tests/test_crossref_metadata_telescope.py +++ b/academic_observatory_workflows/workflows/tests/test_crossref_metadata_telescope.py @@ -173,10 +173,9 @@ def test_telescope(self): ti = env.run_task(workflow.transform.__name__) self.assertEqual(State.SUCCESS, ti.state) file_paths = list_files(release.transform_folder, release.transform_files_regex) - 
self.assertEqual(1, len(file_paths)) + self.assertEqual(5, len(file_paths)) for file_path in file_paths: self.assertTrue(os.path.isfile(file_path)) - self.assertTrue(is_gzip(file_path)) # Test that transformed files uploaded ti = env.run_task(workflow.upload_transformed.__name__) @@ -311,7 +310,9 @@ def test_transform_file(self): "issn_type": [{"value": "0003-987X", "type": "print"}], } ] - actual_results = transform_file(input_file_path) + output_file_path = os.path.join(t, "output.jsonl") + transform_file(input_file_path, output_file_path) + actual_results = load_jsonl(output_file_path) self.assertEqual(expected_results, actual_results) def test_transform_item(self): diff --git a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py index ce3430989..2ad95c33c 100644 --- a/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py +++ b/academic_observatory_workflows/workflows/tests/test_openalex_telescope.py @@ -369,51 +369,57 @@ def test_fetch_merged_ids(self): actual = fetch_merged_ids(bucket=bucket_name, aws_key=self.aws_key, entity_name="authors") self.assertEqual(expected, actual) - @patch("academic_observatory_workflows.workflows.openalex_telescope.transform_object") - def test_transform_file(self, mock_transform_object): - """Test the transform_file function.""" - - mock_transform_object.return_value = {} - with CliRunner().isolated_filesystem() as t: - transform_path = "transform/out.jsonl.gz" - - # Create works entity file - works = {"works": "content"} - works_download_path = "works.jsonl.gz" - with gzip.open(works_download_path, "wt", encoding="ascii") as f_out: - json.dump(works, f_out) - - # Create other entity file (concepts or institution) - concepts = {"concepts": "content"} - concepts_download_path = "concepts.jsonl.gz" - with gzip.open(concepts_download_path, "wt", encoding="ascii") as f_out: - json.dump(concepts, f_out) - - # Test when dir of transform path does not exist yet, using 'works' entity' - self.assertFalse(os.path.isdir(os.path.dirname(transform_path))) - - transform_file(works_download_path, transform_path) - mock_transform_object.assert_called_once_with(works, "abstract_inverted_index") - mock_transform_object.reset_mock() - os.remove(transform_path) - - # Test when dir of transform path does exist, using 'works' entity - self.assertTrue(os.path.isdir(os.path.dirname(transform_path))) - - transform_file(works_download_path, transform_path) - self.assert_file_integrity(transform_path, "682a6d42", "gzip_crc") - mock_transform_object.assert_called_once_with(works, "abstract_inverted_index") - mock_transform_object.reset_mock() - os.remove(transform_path) - - # Test for "concepts" and "institution" entities - transform_file(concepts_download_path, transform_path) - self.assert_file_integrity(transform_path, "d8cafe16", "gzip_crc") - mock_transform_object.assert_called_once_with(concepts, "international") - def test_transform_object(self): """Test the transform_object function.""" + # Null + obj = { + "corresponding_institution_ids": None + } + transform_object(obj) + self.assertDictEqual( + { + "corresponding_institution_ids": [] + }, + obj, + ) + + # Null + obj = { + "corresponding_author_ids": None + } + transform_object(obj) + self.assertDictEqual( + { + "corresponding_author_ids": [] + }, + obj, + ) + + # Null in array + obj = { + "corresponding_institution_ids": [None] + } + transform_object(obj) + self.assertDictEqual( + { + 
"corresponding_institution_ids": [] + }, + obj, + ) + + # Null in array + obj = { + "corresponding_author_ids": [None] + } + transform_object(obj) + self.assertDictEqual( + { + "corresponding_author_ids": [] + }, + obj, + ) + # Test object with nested "international" fields obj1 = { "international": { @@ -424,7 +430,7 @@ def test_transform_object(self): } } } - transform_object(obj1, "international") + transform_object(obj1) self.assertDictEqual( { "international": { @@ -443,7 +449,7 @@ def test_transform_object(self): # Test object with nested "international" none obj2 = {"international": {"display_name": None}} - transform_object(obj2, "international") + transform_object(obj2) self.assertDictEqual({"international": {"display_name": None}}, obj2) # Test object with nested "abstract_inverted_index" fields @@ -457,7 +463,7 @@ def test_transform_object(self): "primarily": [5], } } - transform_object(obj3, "abstract_inverted_index") + transform_object(obj3) self.assertDictEqual( { "abstract_inverted_index": { @@ -470,7 +476,7 @@ def test_transform_object(self): # Test object with nested "abstract_inverted_index" none obj4 = {"abstract_inverted_index": None} - transform_object(obj4, "abstract_inverted_index") + transform_object(obj4) self.assertDictEqual({"abstract_inverted_index": None}, obj4) diff --git a/academic_observatory_workflows/workflows/unpaywall_telescope.py b/academic_observatory_workflows/workflows/unpaywall_telescope.py index e0c4e2477..94eec490b 100644 --- a/academic_observatory_workflows/workflows/unpaywall_telescope.py +++ b/academic_observatory_workflows/workflows/unpaywall_telescope.py @@ -203,11 +203,11 @@ def __init__( *, dag_id: str, cloud_workspace: CloudWorkspace, - bq_dataset_id: str = "our_research", + bq_dataset_id: str = "unpaywall", bq_table_name: str = "unpaywall", api_dataset_id: str = "unpaywall", schema_folder: str = os.path.join(default_schema_folder(), "unpaywall"), - dataset_description: str = "Our Research datasets: http://ourresearch.org/", + dataset_description: str = "Unpaywall Data Feed: https://unpaywall.org/products/data-feed", table_description: str = "Unpaywall Data Feed: https://unpaywall.org/products/data-feed", primary_key: str = "doi", snapshot_expiry_days: int = 7, @@ -565,7 +565,7 @@ def bq_load_main_table(self, release: UnpaywallRelease, **kwargs) -> None: source_format=SourceFormat.NEWLINE_DELIMITED_JSON, table_description=self.table_description, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) set_task_state(success, self.bq_load_upsert_table.__name__, release) @@ -655,7 +655,7 @@ def bq_load_upsert_table(self, release: UnpaywallRelease, **kwargs) -> None: source_format=SourceFormat.NEWLINE_DELIMITED_JSON, table_description=self.table_description, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, - ignore_unknown_values=False, + ignore_unknown_values=True, ) set_task_state(success, self.bq_load_upsert_table.__name__, release) diff --git a/academic_observatory_workflows/workflows/web_of_science_telescope.py b/academic_observatory_workflows/workflows/web_of_science_telescope.py index a7308a682..b256ee552 100644 --- a/academic_observatory_workflows/workflows/web_of_science_telescope.py +++ b/academic_observatory_workflows/workflows/web_of_science_telescope.py @@ -99,7 +99,7 @@ def __init__( institution_ids: List[str], wos_conn_id: str, earliest_date: pendulum.DateTime = pendulum.datetime(1800, 1, 1), - bq_dataset_id: str = "clarivate", + bq_dataset_id: str = 
"web_of_science", bq_table_name: str = "web_of_science", api_dataset_id: str = "web_of_science", schema_folder: str = os.path.join(default_schema_folder(), "web_of_science"), From 736f40fe3572d98e9bbaec26313abf74bab5dace Mon Sep 17 00:00:00 2001 From: Jamie Diprose <5715104+jdddog@users.noreply.github.com> Date: Tue, 20 Jun 2023 09:39:31 +1200 Subject: [PATCH 3/3] Filter Crossref type and fix release_date naming in oa web workflow (#167) --- .../database/sql/create_aggregate.sql.jinja2 | 10 ++++++++++ academic_observatory_workflows/model.py | 4 ++++ .../workflows/oa_web_workflow.py | 8 ++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 index 32ab6958a..56f855e16 100644 --- a/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 +++ b/academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2 @@ -13,6 +13,10 @@ # limitations under the License. # Author: Richard Hosking #} +{#Include:journal-article,proceedings-article,report,posted-content,edited-book,book,book-chapter,reference-book,monograph,other,book-section,book-part,reference-entry#} +{#Exclude: the types below and NULL#} +{% set CROSSREF_TYPES_TO_EXCLUDE = '("dataset","database","component","report-component","peer-review","grant","proceedings","journal-issue","report-series","book-track")' %} + # Helper Function: Counting Access Types {# Output Schema: @@ -741,6 +745,8 @@ WITH tmp_disciplines AS UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate WHERE aggregrate.identifier IS NOT NULL + AND dois.crossref.type IS NOT NULL + AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }} GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} @@ -764,6 +770,8 @@ tmp_access_types AS ( UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate WHERE aggregrate.identifier IS NOT NULL + AND dois.crossref.type IS NOT NULL + AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }} GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} @@ -1017,6 +1025,8 @@ SELECT FROM `{{ project_id }}.{{ dataset_id }}.doi{{ snapshot_date.strftime('%Y%m%d') }}` as dois, UNNEST(dois.affiliations.{{ aggregation_field }}) as aggregrate WHERE aggregrate.identifier IS NOT NULL +AND dois.crossref.type IS NOT NULL +AND dois.crossref.type NOT IN {{ CROSSREF_TYPES_TO_EXCLUDE }} GROUP BY aggregrate.identifier, crossref.{{ group_by_time_field }} ) diff --git a/academic_observatory_workflows/model.py b/academic_observatory_workflows/model.py index 46c82bf51..a0e71ad44 100644 --- a/academic_observatory_workflows/model.py +++ b/academic_observatory_workflows/model.py @@ -206,6 +206,7 @@ class Paper: id: int doi: str = None title: str = None + type: str = None published_date: pendulum.Date = None output_type: str = None authors: List[Author] = None @@ -790,6 +791,7 @@ def make_papers( # Make paper paper = Paper( i, + type="journal-article", doi=doi_, title=title_, published_date=published_date_, @@ -1099,6 +1101,7 @@ def make_crossref_metadata(dataset: ObservatoryDataset) -> List[Dict]: # Add Crossref record records.append( { + "type": paper.type, "title": [paper.title], "DOI": paper.doi, "is_referenced_by_count": len(paper.cited_by), @@ -1483,6 +1486,7 @@ def make_doi_table(dataset: ObservatoryDataset) -> List[Dict]: { "doi": doi, "crossref": { + "type": paper.type, "title": paper.title, "published_year": paper.published_date.year, 
"published_month": paper.published_date.month, diff --git a/academic_observatory_workflows/workflows/oa_web_workflow.py b/academic_observatory_workflows/workflows/oa_web_workflow.py index 916cee745..2be7ee851 100644 --- a/academic_observatory_workflows/workflows/oa_web_workflow.py +++ b/academic_observatory_workflows/workflows/oa_web_workflow.py @@ -83,7 +83,7 @@ ("outputs_public", "n_outputs_other_platform_open"), ("outputs_other_internet", "n_outputs_other_platform_open"), ] -INCLUSION_THRESHOLD = {"country": 15, "institution": 800} +INCLUSION_THRESHOLD = {"country": 15, "institution": 1000} MAX_REPOSITORIES = 200 START_YEAR = 2000 END_YEAR = pendulum.now().year - 1 @@ -697,7 +697,7 @@ def build_datasets(self, release: OaWebRelease, **kwargs): ) for version in versions ] - last_updated = zenodo_versions[0].snapshot_date.format("D MMMM YYYY") + last_updated = zenodo_versions[0].release_date.format("D MMMM YYYY") country_stats = make_entity_stats(countries) institution_stats = make_entity_stats(institutions) stats = Stats(START_YEAR, END_YEAR, last_updated, zenodo_versions, country_stats, institution_stats) @@ -960,11 +960,11 @@ def to_dict(self) -> Dict: @dataclasses.dataclass class ZenodoVersion: - snapshot_date: pendulum.DateTime + release_date: pendulum.DateTime download_url: str def to_dict(self) -> Dict: - return {"snapshot_date": self.snapshot_date.strftime("%Y-%m-%d"), "download_url": self.download_url} + return {"release_date": self.release_date.strftime("%Y-%m-%d"), "download_url": self.download_url} @dataclasses.dataclass