In [93]:
import requests
import csv
import pandas as pd

# Get Data from API

In [None]:
# Define the base URL for OpenAlex API works endpoint with a larger per-page limit
base_url = "https://api.openalex.org/works?select=authorships&per-page=200"

In [None]:
# Send request to OpenAlex API
response = requests.get(base_url)

if response.status_code != 200:
    print(f"Error: {response.status_code}")

In [96]:
data = response.json()
results = data.get("results", [])

In [97]:
# List to hold data for DataFrame
rows = []

In [98]:
for work in results:
    authorships = work.get("authorships", [])
    
    if not authorships:
        continue
    
    # Extract first author
    first_author = authorships[0]
    name = first_author["author"].get("display_name", "Unknown")
    orcid = first_author["author"].get("orcid", "N/A")
    institution = (
        first_author["institutions"][0]["display_name"]
        if first_author.get("institutions") else "Unknown"
    )
    
    # Extract coauthors (middle and last authors)
    coauthors = [auth["author"].get("display_name", "Unknown") for auth in authorships[1:]]
    coauthors_str = ", ".join(coauthors) if coauthors else "N/A"
    
    # Append data to rows list
    rows.append({
        'Name': name,
        'Orcid': orcid,
        'Institution': institution,
        'Coauthors': coauthors_str
    })


In [99]:
# Create DataFrame
df = pd.DataFrame(rows)

In [100]:
# Save DataFrame to CSV
df.to_csv('alldata-csv.csv', index=False)

In [101]:
# Display the DataFrame
print(df)

                   Name                                  Orcid  \
0        OliverH. Lowry                                   None   
1           R Core Team                                   None   
2     Ulrich K. Laemmli                                   None   
3      Mark A. Bradford  https://orcid.org/0000-0002-2022-8331   
4    Marion M. Bradford                                   None   
..                  ...                                    ...   
189   Nitish Srivastava  https://orcid.org/0000-0002-3442-5352   
190    Barney G. Glaser                                   None   
191  Zbyszek Otwinowski  https://orcid.org/0000-0003-3640-8545   
192     David P. Bartel  https://orcid.org/0000-0002-3872-2856   
193        John E. Ware  https://orcid.org/0000-0002-0744-2149   

                                     Institution  \
0             Washington University in St. Louis   
1                                        Unknown   
2            MRC Laboratory of Molecular Biology   

# Refine Dataset

## Split into known and unknown ORCID

In [102]:
# Refine Dataset
authors_with_orcid = df[df['Orcid'].notna() & (df['Orcid'] != "N/A")]
authors_without_orcid = df[df['Orcid'].isna() | (df['Orcid'] == "N/A")]

authors_with_orcid.to_csv('authors_with_orcid.csv', index=False)
authors_without_orcid.to_csv('authors_without_orcid.csv', index=False)

In [103]:
print(authors_with_orcid)

                       Name                                  Orcid  \
3          Mark A. Bradford  https://orcid.org/0000-0002-2022-8331   
5                Kaiming He  https://orcid.org/0000-0001-7318-9658   
6            John P. Perdew  https://orcid.org/0000-0003-4237-824X   
7          Kenneth J. Livak  https://orcid.org/0000-0001-9105-5856   
8            Virginia Braun  https://orcid.org/0000-0002-3435-091X   
..                      ...                                    ...   
186  Albert‐László Barabási  https://orcid.org/0000-0002-4028-3522   
189       Nitish Srivastava  https://orcid.org/0000-0002-3442-5352   
191      Zbyszek Otwinowski  https://orcid.org/0000-0003-3640-8545   
192         David P. Bartel  https://orcid.org/0000-0002-3872-2856   
193            John E. Ware  https://orcid.org/0000-0002-0744-2149   

                                     Institution  \
3                          University of Georgia   
5            Microsoft Research (United Kingdom)   
6  

In [104]:
print(authors_without_orcid)

                       Name Orcid  \
0            OliverH. Lowry  None   
1               R Core Team  None   
2         Ulrich K. Laemmli  None   
4        Marion M. Bradford  None   
9           Joseph Sambrook  None   
..                      ...   ...   
184               Venkatesh  None   
185  Laurens van der Maaten  None   
187         Daniel Kahneman  None   
188            Da Wei Huang  None   
190        Barney G. Glaser  None   

                                           Institution  \
0                   Washington University in St. Louis   
1                                              Unknown   
2                  MRC Laboratory of Molecular Biology   
4                                University of Georgia   
9                                              Unknown   
..                                                 ...   
184               University of Maryland, College Park   
185                                            Unknown   
187                     Universit

In [105]:
authors_with_orcid.to_csv('withorcid-csv.csv', index=False)
authors_without_orcid.to_csv('withoutorcid-csv.csv', index=False)