## OpenAlex API data gathering


In [None]:
# imports
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders, config
import pandas as pd
import json
from pathlib import Path
import time
import os

In [None]:
# Configure the shared pyalex settings used by every request in this notebook.
# Contact email handed to pyalex so the API can identify the caller.
config.email = "thoc@itu.dk"
# Retry transient failures a few times. With max_retries = 0 the backoff
# factor and retry codes below would never be used, and a single 429/5xx
# would abort the per-term download loops further down.
config.max_retries = 3
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]

In [None]:
# Query works whose institutions sit in the US, and works whose institutions
# sit in any of the listed EU member states ("|" separates alternatives).
# NOTE(review): .get() appears to return only one page of results rather than
# every matching work -- confirm against the pyalex documentation.
us_data = (
    Works()
    .filter(institutions={"country_code": ["US"]})
    .get()
)
eu_data = (
    Works()
    .filter(
        institutions={
            "country_code": [
                "AT|BE|BG|HR|CY|CZ|DK|EE|FI|FR|DE|GR|HU|IE|IT|LV|LT|LU|MT|NL|PL|PT|RO|SK|SI|ES|SE"
            ]
        }
    )
    .get()
)

In [None]:
# Show the response metadata attached to each result set (US first, then EU).
for result_set in (us_data, eu_data):
    print(result_set.meta)

In [None]:
# Display the name of the first listed author of the first US work.
# Left as the cell's trailing expression so the notebook renders its value.
first_authorship = us_data[0]["authorships"][0]
first_authorship["author"]["display_name"]

In [None]:
# For the first US work, print the first author's name followed by the work's
# id, DOI, title and publication date, one per line.
first_work = us_data[0]
print(first_work["authorships"][0]["author"]["display_name"])
for field in ("id", "doi", "title", "publication_date"):
    print(first_work[field])

In [None]:
# Presidential terms (list obtained from Gemini), in chronological order.
# Each row below is (president, start_date, end_date) with ISO "YYYY-MM-DD"
# dates; rows are expanded into dicts keyed "president", "start_date" and
# "end_date". Every term's end_date equals the next term's start_date.
PRESIDENTIAL_TERMS = [
    dict(zip(("president", "start_date", "end_date"), row))
    for row in (
        ("George Washington", "1789-04-30", "1797-03-04"),
        ("John Adams", "1797-03-04", "1801-03-04"),
        ("Thomas Jefferson", "1801-03-04", "1809-03-04"),
        ("James Madison", "1809-03-04", "1817-03-04"),
        ("James Monroe", "1817-03-04", "1825-03-04"),
        ("John Quincy Adams", "1825-03-04", "1829-03-04"),
        ("Andrew Jackson", "1829-03-04", "1837-03-04"),
        ("Martin Van Buren", "1837-03-04", "1841-03-04"),
        ("William Henry Harrison", "1841-03-04", "1841-04-04"),  # died in office
        ("John Tyler", "1841-04-04", "1845-03-04"),  # completed Harrison's term
        ("James K. Polk", "1845-03-04", "1849-03-04"),
        ("Zachary Taylor", "1849-03-04", "1850-07-09"),  # died in office
        ("Millard Fillmore", "1850-07-09", "1853-03-04"),  # completed Taylor's term
        ("Franklin Pierce", "1853-03-04", "1857-03-04"),
        ("James Buchanan", "1857-03-04", "1861-03-04"),
        ("Abraham Lincoln", "1861-03-04", "1865-04-15"),  # assassinated
        ("Andrew Johnson", "1865-04-15", "1869-03-04"),  # completed Lincoln's term
        ("Ulysses S. Grant", "1869-03-04", "1877-03-04"),
        ("Rutherford B. Hayes", "1877-03-04", "1881-03-04"),
        ("James A. Garfield", "1881-03-04", "1881-09-19"),  # assassinated
        ("Chester A. Arthur", "1881-09-19", "1885-03-04"),  # completed Garfield's term
        ("Grover Cleveland", "1885-03-04", "1889-03-04"),  # first term
        ("Benjamin Harrison", "1889-03-04", "1893-03-04"),
        ("Grover Cleveland", "1893-03-04", "1897-03-04"),  # second, non-consecutive term
        ("William McKinley", "1897-03-04", "1901-09-14"),  # assassinated
        ("Theodore Roosevelt", "1901-09-14", "1909-03-04"),  # rest of McKinley's term + own
        ("William Howard Taft", "1909-03-04", "1913-03-04"),
        ("Woodrow Wilson", "1913-03-04", "1921-03-04"),
        ("Warren G. Harding", "1921-03-04", "1923-08-02"),  # died in office
        ("Calvin Coolidge", "1923-08-02", "1929-03-04"),  # rest of Harding's term + own
        ("Herbert Hoover", "1929-03-04", "1933-03-04"),
        ("Franklin D. Roosevelt", "1933-03-04", "1945-04-12"),  # died during 4th term
        ("Harry S. Truman", "1945-04-12", "1953-01-20"),  # rest of FDR's term + own
        ("Dwight D. Eisenhower", "1953-01-20", "1961-01-20"),
        ("John F. Kennedy", "1961-01-20", "1963-11-22"),  # assassinated
        ("Lyndon B. Johnson", "1963-11-22", "1969-01-20"),  # rest of JFK's term + own
        ("Richard Nixon", "1969-01-20", "1974-08-09"),  # resigned
        ("Gerald Ford", "1974-08-09", "1977-01-20"),  # completed Nixon's term
        ("Jimmy Carter", "1977-01-20", "1981-01-20"),
        ("Ronald Reagan", "1981-01-20", "1989-01-20"),
        ("George H.W. Bush", "1989-01-20", "1993-01-20"),
        ("Bill Clinton", "1993-01-20", "2001-01-20"),
        ("George W. Bush", "2001-01-20", "2009-01-20"),
        ("Barack Obama", "2009-01-20", "2017-01-20"),
        ("Donald Trump", "2017-01-20", "2021-01-20"),
        ("Joe Biden", "2021-01-20", "2025-01-20"),  # last term in this list
    )
]

In [None]:
# Trial query: US-affiliated works with a publication date inside
# Barack Obama's presidency (2009-01-20 through 2017-01-20).
test_data = (
    Works()
    .filter(institutions={"country_code": ["US"]})
    .filter(from_publication_date="2009-01-20")
    .filter(to_publication_date="2017-01-20")
    .get()
)


In [None]:
# Write one JSON file per presidential term into US_data/, holding the
# US-affiliated works returned for that term's publication-date window.
# Files are named "<index>_<president>.json".
# NOTE(review): .get() appears to return a single page of results, not every
# matching work -- confirm against the pyalex docs and paginate if the full
# set is wanted.
# NOTE(review): both date bounds look inclusive on the API side, so a work
# dated on a hand-over day may be matched by two consecutive terms -- confirm.
os.makedirs("US_data", exist_ok=True)
for i, term in enumerate(PRESIDENTIAL_TERMS):
    # Fetch first: if the request fails, no empty or truncated file is left.
    data = (
        Works()
        .filter(institutions={"country_code": ["US"]})
        .filter(from_publication_date=term["start_date"])
        .filter(to_publication_date=term["end_date"])
        .get()
    )
    # Single quotes inside the f-string: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    out_path = Path("US_data") / f"{i}_{term['president']}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f)
    # Short pause between consecutive API requests.
    time.sleep(0.5)

In [None]:
# Write one JSON file per presidential term into EU_data/, holding the works
# returned for institutions in the listed EU member states within that term's
# publication-date window. Files are named "<index>_<president>.json".
# NOTE(review): .get() appears to return a single page of results, not every
# matching work -- confirm against the pyalex docs and paginate if the full
# set is wanted.
# NOTE(review): both date bounds look inclusive on the API side, so a work
# dated on a hand-over day may be matched by two consecutive terms -- confirm.
os.makedirs("EU_data", exist_ok=True)
for i, term in enumerate(PRESIDENTIAL_TERMS):
    # Fetch first: if the request fails, no empty or truncated file is left.
    data = (
        Works()
        .filter(
            institutions={
                "country_code": [
                    "AT|BE|BG|HR|CY|CZ|DK|EE|FI|FR|DE|GR|HU|IE|IT|LV|LT|LU|MT|NL|PL|PT|RO|SK|SI|ES|SE"
                ]
            }
        )
        .filter(from_publication_date=term["start_date"])
        .filter(to_publication_date=term["end_date"])
        .get()
    )
    # Single quotes inside the f-string: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    out_path = Path("EU_data") / f"{i}_{term['president']}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f)
    # Short pause between consecutive API requests.
    time.sleep(0.5)

In [None]:
# Re-print the first US work's summary: first author's name, then the work's
# id, DOI, title and publication date.
work = us_data[0]
lead_author = work["authorships"][0]["author"]
print(lead_author["display_name"])
print(work["id"])
print(work["doi"])
print(work["title"])
print(work["publication_date"])