Skip to content

Commit

Permalink
Fix/OpenAlex: abstract_inverted_index field now a JSON structure (#183)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexmassen-hane authored Sep 15, 2023
1 parent 9991d03 commit 6970db5
Show file tree
Hide file tree
Showing 8 changed files with 173 additions and 44 deletions.
87 changes: 76 additions & 11 deletions academic_observatory_workflows/database/schema/openalex/works.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,28 @@
"mode": "NULLABLE",
"fields": [
{
"name": "keys",
"type": "STRING",
"mode": "REPEATED",
"description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value."
"name": "IndexLength",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "values",
"type": "STRING",
"mode": "REPEATED",
"description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value."
"name": "InvertedIndex",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "keys",
"type": "STRING",
"mode": "REPEATED",
"description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value."
},
{
"name": "values",
"type": "STRING",
"mode": "REPEATED",
"description": "Custom field created by COKI. Originally each word in the abstract was a key and the indices of where this word occurred inside the abstract the corresponding value."
}
]
}
],
"description": "The abstract of the work, as an inverted index, which encodes information about the abstract's words and their positions within the text. Like Microsoft Academic Graph, OpenAlex doesn't include plaintext abstracts due to legal constraints."
Expand Down Expand Up @@ -100,7 +112,36 @@
]
},
{
"name": "apc_payment",
"name": "apc_list",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "currency",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "price",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "price_usd",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "APC converted to USD"
},
{
"name": "provenance",
"type": "STRING",
"mode": "NULLABLE"
}
],
"description": "Objects containing information about the APC (article processing charge) for this work. This value is the APC list price–the price as listed by the journal’s publisher. That’s not always the price actually paid, because publishers may offer various discounts to authors. Unfortunately we don’t always know this discounted price, but when we do you can find it in apc_paid. Currently our only source for this data is DOAJ, and so doaj is the only value for apc_list.provenance, but we’ll add other sources over time."
},
{
"name": "apc_paid",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
Expand All @@ -126,7 +167,7 @@
"mode": "NULLABLE"
}
],
"description": "Objects containing information about the APC (article processing charge) for this work. If we can get the APC price from OpenAPC, we use that. Those APCs are specific to an article and are the actual APC paid by an author or institution to publish the article. As a fallback, we use the DOAJ APC prices that are available in sources. Those are an estimate of what authors would have had to pay to publish the article, since the DOAJ apc prices apply to an entire journal."
"description": "Object: Information about the paid APC (article processing charge) for this work. You can find the listed APC price (when we know it) for a given work using apc_list. However, authors don’t always pay the listed price; often they get a discounted price from publishers. So it’s useful to know the APC actually paid by authors, as distinct from the list price. This is our effort to provide this. Our best source for the actually paid price is the OpenAPC project. Where available, we use that data, and so apc_paid.provenance is openapc. Where OpenAPC data is unavailable (and unfortunately this is common) we make our best guess by assuming the author paid the APC list price, and apc_paid.provenance will be set to wherever we got the list price from."
},
{
"name": "authorships",
Expand Down Expand Up @@ -285,6 +326,12 @@
"mode": "NULLABLE",
"description": "The OpenAlex ID for this source."
},
{
"name": "is_in_doaj",
"type": "BOOLEAN",
"mode": "NULLABLE",
"description": "Whether this is a journal listed in the Directory of Open Access Journals (DOAJ)."
},
{
"name": "issn",
"type": "STRING",
Expand Down Expand Up @@ -743,6 +790,12 @@
"mode": "NULLABLE",
"description": "The OpenAlex ID for this source."
},
{
"name": "is_in_doaj",
"type": "BOOLEAN",
"mode": "NULLABLE",
"description": "Whether this is a journal listed in the Directory of Open Access Journals (DOAJ)."
},
{
"name": "issn",
"type": "STRING",
Expand Down Expand Up @@ -917,6 +970,12 @@
"mode": "NULLABLE",
"description": "The OpenAlex ID for this source."
},
{
"name": "is_in_doaj",
"type": "BOOLEAN",
"mode": "NULLABLE",
"description": "Whether this is a journal listed in the Directory of Open Access Journals (DOAJ)."
},
{
"name": "issn",
"type": "STRING",
Expand Down Expand Up @@ -1049,6 +1108,12 @@
"mode": "NULLABLE",
"description": "The type or genre of the work. This field uses Crossref's \"type\" controlled vocabulary; you can see all possible values via the Crossref api here: https://api.crossref.org/types. Where possible, we just pass along Crossref's type value for each work. When that's impossible (eg the work isn't in Crossref), we do our best to figure out the type ourselves. Unfortunately the accuracy of Crossref's data for this isn't great, and ours isn't much better. We're working to develop better type classification."
},
{
"name": "type_crossref",
"type": "STRING",
"mode": "NULLABLE",
"description": "Legacy type information, using Crossref's \"type\" controlled vocabulary."
},
{
"name": "updated_date",
"type": "TIMESTAMP",
Expand All @@ -1067,4 +1132,4 @@
"mode": "NULLABLE",
"description": "The version of the work, based on the DRIVER Guidelines versioning scheme. Possible values are: publishedVersion, acceptedVersion or submittedVersion."
}
]
]
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
26 changes: 13 additions & 13 deletions academic_observatory_workflows/workflows/openalex_telescope.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import pendulum
from airflow.hooks.base import BaseHook
from airflow.models.taskinstance import TaskInstance
from airflow.operators.dummy import DummyOperator
from airflow.operators.empty import EmptyOperator
from google.cloud import bigquery
from google.cloud.bigquery import SourceFormat

Expand Down Expand Up @@ -399,7 +399,7 @@ def __init__(

# The last task that the next DAG run's ExternalTaskSensor waits for.
self.add_operator(
DummyOperator(
EmptyOperator(
task_id=external_task_id,
)
)
Expand Down Expand Up @@ -1014,17 +1014,17 @@ def transform_object(obj: dict):
# TODO: when re-ingesting entire dataset: change schema to new version
field = "abstract_inverted_index"
if field in obj:

def parse_abstract(dict_: dict):
keys_ = list(dict_.keys())
values_ = [str(value_)[1:-1] for value_ in dict_.values()]
return {"keys": keys_, "values": values_}

if isinstance(obj.get(field), str):
data = json.loads(obj[field])
obj[field] = parse_abstract(data["InvertedIndex"])
elif isinstance(obj.get(field), dict):
obj[field] = parse_abstract(obj[field])
if isinstance(obj.get(field), (str, dict)):
load_field = json.loads(obj[field]) if isinstance(obj[field], str) else obj[field]
data = load_field.get("InvertedIndex", load_field)

# Clear object to only have required fields.
obj[field] = {}
obj[field]["InvertedIndex"] = {
"keys": list(data.keys()),
"values": [str(value)[1:-1] for value in data.values()],
}
obj[field]["IndexLength"] = load_field.get("IndexLength", None)
else:
return

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ def test_transform_object(self):
transform_object(obj2)
self.assertDictEqual({"international": {"display_name": None}}, obj2)

# Test object with nested "abstract_inverted_index" fields
# Test object with nested "abstract_inverted_index" fields, without InvertedIndex key in obj
obj3 = {
"abstract_inverted_index": {
"Malignant": [0],
Expand All @@ -455,32 +455,96 @@ def test_transform_object(self):
self.assertDictEqual(
{
"abstract_inverted_index": {
"keys": ["Malignant", "hyperthermia", "susceptibility", "(MHS)", "is", "primarily"],
"values": ["0", "1", "2", "3", "4, 6", "5"],
"IndexLength": None,
"InvertedIndex": {
"keys": ["Malignant", "hyperthermia", "susceptibility", "(MHS)", "is", "primarily"],
"values": ["0", "1", "2", "3", "4, 6", "5"],
},
}
},
obj3,
)

# Test object when "abstract_inverted_index" is a json string dump
# Test object when "abstract_inverted_index" is a json object
obj4 = {
"abstract_inverted_index": '{"IndexLength": 7, "InvertedIndex": { "Malignant": [0], "hyperthermia": [1], "susceptibility": [2],"(MHS)": [3], "is": [4, 6], "primarily": [5]}}'
"abstract_inverted_index": {
"IndexLength": 7,
"InvertedIndex": {
"Malignant": [0],
"hyperthermia": [1],
"susceptibility": [2],
"(MHS)": [3],
"is": [4, 6],
"primarily": [5],
},
}
}

transform_object(obj4)
self.assertDictEqual(
{
"abstract_inverted_index": {
"keys": ["Malignant", "hyperthermia", "susceptibility", "(MHS)", "is", "primarily"],
"values": ["0", "1", "2", "3", "4, 6", "5"],
"IndexLength": 7,
"InvertedIndex": {
"keys": ["Malignant", "hyperthermia", "susceptibility", "(MHS)", "is", "primarily"],
"values": ["0", "1", "2", "3", "4, 6", "5"],
},
}
},
obj4,
)

# Test object with nested "abstract_inverted_index" none
obj5 = {"abstract_inverted_index": None}
# Test object when "abstract_inverted_index" is a json object and IndexLength is None
obj5 = {
"abstract_inverted_index": {
"IndexLength": None,
"InvertedIndex": {
"Malignant": [0],
"hyperthermia": [1],
"susceptibility": [2],
"(MHS)": [3],
"is": [4, 6],
"primarily": [5],
},
}
}

transform_object(obj5)
self.assertDictEqual({"abstract_inverted_index": None}, obj5)
self.assertDictEqual(
{
"abstract_inverted_index": {
"IndexLength": None,
"InvertedIndex": {
"keys": ["Malignant", "hyperthermia", "susceptibility", "(MHS)", "is", "primarily"],
"values": ["0", "1", "2", "3", "4, 6", "5"],
},
}
},
obj5,
)

# Test object when "abstract_inverted_index" is a json string dump
obj6 = {
"abstract_inverted_index": '{"IndexLength": 7, "InvertedIndex": { "Malignant": [0], "hyperthermia": [1], "susceptibility": [2],"(MHS)": [3], "is": [4, 6], "primarily": [5]}}'
}
transform_object(obj6)
self.assertDictEqual(
{
"abstract_inverted_index": {
"IndexLength": 7,
"InvertedIndex": {
"keys": ["Malignant", "hyperthermia", "susceptibility", "(MHS)", "is", "primarily"],
"values": ["0", "1", "2", "3", "4, 6", "5"],
},
}
},
obj6,
)

# Test object with nested "abstract_inverted_index" none
obj7 = {"abstract_inverted_index": None}
transform_object(obj7)
self.assertDictEqual({"abstract_inverted_index": None}, obj7)


def upload_folder_to_s3(bucket_name: str, folder_path: str, s3_prefix=None):
Expand Down

0 comments on commit 6970db5

Please sign in to comment.