In [76]:
import pandas as pd
import requests
import json
import re

In [77]:
# OpenAlex "works" endpoint, narrowed with `select` to the single field this
# notebook consumes. The field must be spelled `authorships` (plural): that is
# the key the extraction cell reads from every work, and `authorship` is not a
# field of the work object.
API_URL = "https://api.openalex.org/works?select=authorships"


In [78]:
# Fetch one page of works from OpenAlex. An explicit timeout is passed because
# `requests` otherwise waits indefinitely, which would hang the kernel on a
# stalled connection.
response = requests.get(API_URL, timeout=30)

In [79]:
# Rows of [name, orcid, institution, coauthors], one per work that lists at
# least one author. Created before the status check so the name always exists:
# a failed request then yields an empty table instead of a NameError in the
# DataFrame cell.
extracted_data = []

if response.status_code == 200:
    data = response.json()
    results = data.get("results", [])

    for work in results:
        # `or []` also covers a key that is present but null.
        authorships = work.get("authorships") or []

        # Works without any listed author contribute no row.
        if not authorships:
            continue

        # The first entry of `authorships` is taken as the first author.
        first_author = authorships[0]
        author_info = first_author.get("author") or {}

        # `or` is used instead of a .get() default: when the key is present
        # with a null value, .get() returns None and the placeholder is skipped.
        name = author_info.get("display_name") or "Unknown"
        orcid = author_info.get("orcid") or "N/A"

        # Only the first listed institution of the first author is kept.
        institutions = first_author.get("institutions") or []
        if institutions:
            institution = institutions[0].get("display_name") or "Unknown"
        else:
            institution = "Unknown"

        # Coauthors are every author after the first. Null names fall back to
        # "Unknown" so that str.join never receives a None.
        coauthors = [
            (auth.get("author") or {}).get("display_name") or "Unknown"
            for auth in authorships[1:]
        ]
        coauthors_str = ", ".join(coauthors) if coauthors else "N/A"

        extracted_data.append([name, orcid, institution, coauthors_str])
else:
    # Make the failure visible rather than silently producing nothing.
    print(f"OpenAlex request failed with HTTP status {response.status_code}; no rows extracted.")


In [80]:
# Tabulate the extracted rows; the column order matches the order in which
# each row's values were appended.
df = pd.DataFrame(
    data=extracted_data,
    columns=["Name", "Orcid", "Institution", "Coauthors"],
)


In [81]:
# Bare expression as the last line of the cell: the notebook renders the
# DataFrame as the cell's output.
df

Unnamed: 0,Name,Orcid,Institution,Coauthors
0,OliverH. Lowry,,Washington University in St. Louis,"NiraJ. Rosebrough, A. Farr, RoseJ. Randall"
1,R Core Team,,Unknown,
2,Ulrich K. Laemmli,,MRC Laboratory of Molecular Biology,
3,Mark A. Bradford,https://orcid.org/0000-0002-2022-8331,University of Georgia,
4,Marion M. Bradford,,University of Georgia,
...,...,...,...,...
92,Matthias Egger,https://orcid.org/0000-0001-7462-5132,University of Bristol,"George Davey Smith, Martin Schneider, C. Minder"
93,E. G. Bligh,,Unknown,W. J. Dyer
94,Haider Raza,https://orcid.org/0000-0001-9680-5388,Unknown,"Annie John, Abderrahim Nemmar"
95,A. P. Dempster,,Educational Testing Service,"N. M. Laird, Donald B. Rubin"
