Skip to content

Commit

Permalink
Add suggested changes and add funders table to openalex_telescope.py
Browse files Browse the repository at this point in the history
  • Loading branch information
jdddog committed May 26, 2023
1 parent 13bb4f0 commit 369000b
Show file tree
Hide file tree
Showing 15 changed files with 405 additions and 34 deletions.
226 changes: 226 additions & 0 deletions academic_observatory_workflows/database/schema/openalex/funders.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
[
{
"name": "alternate_titles",
"type": "STRING",
"mode": "REPEATED",
"description": "A list of alternate titles for this funder."
},
{
"name": "cited_by_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The total number of works that cite a work linked to this funder."
},
{
"name": "country_code",
"type": "STRING",
"mode": "NULLABLE",
"description": "The country where this funder is located, represented as an ISO two-letter country code."
},
{
"name": "counts_by_year",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "cited_by_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
},
{
"name": "oa_works_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
},
{
"name": "works_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
},
{
"name": "year",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
}
],
"description": "The values of works_count and cited_by_count for each of the last ten years, binned by year. To put it another way: for every listed year, you can see how many new works are linked to this funder, and how many times any work linked to this funder was cited.\nYears with zero citations and zero works have been removed so you will need to add those back in if you need them."
},
{
"name": "created_date",
"type": "DATE",
"mode": "NULLABLE",
"description": "The date this Funder object was created in the OpenAlex dataset, expressed as an ISO 8601 date string."
},
{
"name": "description",
"type": "STRING",
"mode": "NULLABLE",
"description": "A short description of this funder, taken from Wikidata."
},
{
"name": "display_name",
"type": "STRING",
"mode": "NULLABLE",
"description": "The primary name of the funder."
},
{
"name": "homepage_url",
"type": "STRING",
"mode": "NULLABLE",
"description": "The URL for this funder's primary homepage."
},
{
"name": "id",
"type": "STRING",
"mode": "NULLABLE",
"description": "The OpenAlex ID for this funder."
},
{
"name": "ids",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "openalex",
"type": "STRING",
"mode": "NULLABLE",
"description": "this funder's OpenAlex ID"
},
{
"name": "ror",
"type": "STRING",
"mode": "NULLABLE",
"description": "this funder's ROR ID"
},
{
"name": "wikidata",
"type": "STRING",
"mode": "NULLABLE",
"description": "this funder's Wikidata ID"
}
],
"description": "All the external identifiers that we know about for this funder. IDs are expressed as URIs whenever possible."
},
{
"name": "image_thumbnail_url",
"type": "STRING",
"mode": "NULLABLE",
"description": "Same as image_url, but it's a smaller image.\nThis is usually a hotlink to a Wikimedia image. You can change the width=300 parameter in the URL if you want a different thumbnail size."
},
{
"name": "image_url",
"type": "STRING",
"mode": "NULLABLE",
"description": "URL where you can get an image representing this funder. Usually this is a hotlink to a Wikimedia image, and usually it's a seal or logo."
},
{
"name": "roles",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "id",
"type": "STRING",
"mode": "NULLABLE",
"description": ""
},
{
"name": "role",
"type": "STRING",
"mode": "NULLABLE",
"description": ""
},
{
"name": "works_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
}
],
"description": "List of role objects, which include the role (one of institution, funder, or publisher), the id (OpenAlex ID), and the works_count.\nIn many cases, a single organization does not fit neatly into one role. For example, Yale University is a single organization that is a research university, funds research studies, and publishes an academic journal. The roles property links the OpenAlex entities together for a single organization, and includes counts for the works associated with each role.\nThe roles list of an entity (Funder, Publisher, or Institution) always includes itself. In the case where an organization only has one role, the roles will be a list of length one, with itself as the only item."
},
{
"name": "summary_stats",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "2yr_cited_by_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
},
{
"name": "2yr_h_index",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
},
{
"name": "2yr_i10_index",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
},
{
"name": "2yr_mean_citedness",
"type": "FLOAT",
"mode": "NULLABLE",
"description": "The 2-year mean citedness for this funder. Also known as impact factor."
},
{
"name": "2yr_works_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
},
{
"name": "cited_by_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
},
{
"name": "h_index",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The h-index for this funder."
},
{
"name": "i10_index",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The i-10 index for this funder."
},
{
"name": "oa_percent",
"type": "FLOAT",
"mode": "NULLABLE",
"description": ""
},
{
"name": "works_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": ""
}
],
"description": "Citation metrics for this funder. While the h-index and the i-10 index are normally author-level metrics and the 2-year mean citedness is normally a journal-level metric, they can be calculated for any set of papers, so we include them for funders."
},
{
"name": "updated_date",
"type": "TIMESTAMP",
"mode": "NULLABLE",
"description": "The last time anything in this funder object changed, expressed as an ISO 8601 date string. This date is updated for any change at all, including increases in various counts."
},
{
"name": "works_count",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The number of works linked to this funder."
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,12 @@
"mode": "NULLABLE",
"description": "True if we know this work has been retracted. This field has high precision but low recall. In other words, if is_retracted is true, the article is definitely retracted. But if is_retracted is False, it still might be retracted, but we just don't know. This is because unfortunately, the open sources for retraction data aren't currently very comprehensive, and the more comprehensive ones aren't sufficiently open for us to use here."
},
{
"name": "language",
"type": "STRING",
"mode": "NULLABLE",
"description": "The language of the work in ISO 639-1 format. The language is automatically detected using the information we have about the work. We use the langdetect software library on the words in the work's abstract, or the title if we do not have the abstract. The source code for this procedure is here. Keep in mind that this method is not perfect, and that in some cases the language of the title or abstract could be different from the body of the work."
},
{
"name": "locations",
"type": "RECORD",
Expand Down
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import datetime
import logging
import os
import pathlib
import time
from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
from datetime import timedelta
Expand Down Expand Up @@ -62,8 +63,8 @@
CROSSREF_EVENTS_HOST = "https://api.eventdata.crossref.org/v1/events"
DATE_FORMAT = "YYYY-MM-DD"

backend = storage_from_string("memory://")
moving_window = FixedWindowElasticExpiryRateLimiter(backend)
BACKEND = storage_from_string("memory://")
MOVING_WINDOW = FixedWindowElasticExpiryRateLimiter(BACKEND)


class CrossrefEventsRelease(ChangefileRelease):
Expand Down Expand Up @@ -754,14 +755,23 @@ def download_events(request: EventRequest, download_folder: str, n_rows: int):
logging.info(f"{request}: skipped, already finished")
return

# If data file exists then the previous request must have failed
# Remove them both and start again
if os.path.isfile(data_path) and os.path.isfile(cursor_path):
# If cursor exists then the previous request must have failed
# Remove data file and cursor and start again
if os.path.isfile(cursor_path):
logging.warning(f"{request}: deleting data and trying again")
logging.warning(f"{request}: deleting {data_path}")
os.remove(data_path)
try:
os.remove(data_path)
except FileNotFoundError:
pass

Check warning on line 766 in academic_observatory_workflows/workflows/crossref_events_telescope.py

View check run for this annotation

Codecov / codecov/patch

academic_observatory_workflows/workflows/crossref_events_telescope.py#L765-L766

Added lines #L765 - L766 were not covered by tests
logging.warning(f"{request}: deleting {cursor_path}")
os.remove(cursor_path)
try:
os.remove(cursor_path)
except FileNotFoundError:
pass

Check warning on line 771 in academic_observatory_workflows/workflows/crossref_events_telescope.py

View check run for this annotation

Codecov / codecov/patch

academic_observatory_workflows/workflows/crossref_events_telescope.py#L770-L771

Added lines #L770 - L771 were not covered by tests

# Create empty cursor file before doing anything else
pathlib.Path(cursor_path).touch()

logging.info(f"{request}: downloading")
next_cursor = None
Expand Down Expand Up @@ -829,12 +839,12 @@ def crossref_events_limiter(calls_per_second: int = 10):
item = RateLimitItemPerSecond(calls_per_second) # 10 per second

while True:
if not moving_window.test(item, identifier):
if not MOVING_WINDOW.test(item, identifier):
time.sleep(0.01)
else:
break

Check warning on line 845 in academic_observatory_workflows/workflows/crossref_events_telescope.py

View check run for this annotation

Codecov / codecov/patch

academic_observatory_workflows/workflows/crossref_events_telescope.py#L845

Added line #L845 was not covered by tests

moving_window.hit(item, identifier)
MOVING_WINDOW.hit(item, identifier)


def transform_events(download_path: str, transform_folder: str):
Expand Down
2 changes: 1 addition & 1 deletion academic_observatory_workflows/workflows/doi_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ def get_snapshot_date(project_id: str, dataset_id: str, table_id: str, snapshot_
shard_date = table_shard_dates[0]
else:
raise AirflowException(
f"{project_id}.{dataset_id}.{table_id} " f"with a table shard date <= {snapshot_date} not found"
f"{table_id} with a table shard date <= {snapshot_date} not found"
)

return shard_date
Expand Down
Loading

0 comments on commit 369000b

Please sign in to comment.