In [11]:
import requests
import pandas as pd
import os
from dotenv import load_dotenv

# Data Ingestion and Processing

### Set up API

In [13]:
# Load environment variables
load_dotenv('.env')
my_api_key = os.getenv('API_KEY')
if my_api_key is None:
    print("No API key found!!!")
    exit()
else:
    print("API key is set!")

API key is set!


### Get Data

In [14]:
# Base URL for OpenAlex API authors endpoint
base_url = "https://api.openalex.org/authors"


In [15]:
# Parameters for pagination and selecting data
#params = "per-page=200&select=display_name,orcid,works_count,affiliations,x_concepts"

params = "per-page=200&select=id,display_name,orcid,works_count,affiliations,x_concepts"


In [16]:
# Headers with User-Agent
headers = {"User-Agent": "MyApp/1.0 (rlee379@gatech.edu)"}

In [17]:
# List to hold data for DataFrame
rows = []

In [18]:
# Pagination
cursor = '*'
while cursor and len(rows) < 1:
    url = f"{base_url}?{params}&cursor={cursor}&api_key={my_api_key}"
    print(f"Requesting data with URL: {url}")
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        break

    data = response.json()
    results = data.get("results", [])

    for author in results:
        author_id = author.get("id")
        name = author.get("display_name", "Unknown")
        # print(f"Author ID for {name}: {author_id}")
        orcid = author.get("orcid")
        orcid = orcid.split("/")[-1] if orcid else "N/A"

        # Extract institutions
        institutions = [affiliation['institution']['display_name'] for affiliation in author.get("affiliations", [])]
        institutions_str = ", ".join(set(institutions)) if institutions else "N/A"

        # Extract concepts
        concepts = [concept['display_name'] for concept in author.get("x_concepts", [])]
        concepts_str = ", ".join(set(concepts)) if concepts else "N/A"

        # Request author's works to get coauthors
        if author_id:
            works_url = f"https://api.openalex.org/works?filter=author.id:{author_id}&per-page=200&api_key={my_api_key}"
            print(f"Fetching works for author {name} from URL: {works_url}")
            works_response = requests.get(works_url, headers=headers)
            if works_response.status_code == 200:
                works_data = works_response.json()
                # print(f"Works data fetched for {name}: {works_data}")
                coauthors = set()
                for work in works_data.get("results", []):
                    authorships = work.get("authorships", [])
                    # print(f"Authorships in work: {authorships}")
                    for authorship in authorships:
                        coauthor_name = authorship.get("author", {}).get("display_name")
                        if coauthor_name and coauthor_name != name:
                            coauthors.add(coauthor_name)
                            # print(f"Found coauthor: {coauthor_name}")
                coauthors_str = ", ".join(coauthors) if coauthors else "N/A"
                print(f"Coauthors for {name}: {coauthors_str}")
            else:
                print(f"Error fetching works: {works_response.status_code} - {works_response.text}")
                coauthors_str = "N/A"
        else:
            coauthors_str = "N/A"

        rows.append({
            'Name': name,
            'Orcid': orcid,
            'Institutions': institutions_str,
            'Concepts': concepts_str,
            'Coauthors': coauthors_str
        })

    cursor = data['meta'].get('next_cursor', False)



Requesting data with URL: https://api.openalex.org/authors?per-page=200&select=id,display_name,orcid,works_count,affiliations,x_concepts&cursor=*&api_key=NFsUeovmrxadcjQyaoxEf8
Fetching works for author A Boyle from URL: https://api.openalex.org/works?filter=author.id:https://openalex.org/A5010062957&per-page=200&api_key=NFsUeovmrxadcjQyaoxEf8
Coauthors for A Boyle: N/A
Fetching works for author Terry Law from URL: https://api.openalex.org/works?filter=author.id:https://openalex.org/A5022654839&per-page=200&api_key=NFsUeovmrxadcjQyaoxEf8
Coauthors for Terry Law: Joshua Adkins, Haylie Kimball, Scott Baker, Sarah Leichty
Fetching works for author Josh Adkins from URL: https://api.openalex.org/works?filter=author.id:https://openalex.org/A5025810381&per-page=200&api_key=NFsUeovmrxadcjQyaoxEf8
Coauthors for Josh Adkins: John Cort, Yinyin Ye, Ernesto Nakayasu, Justine Nguyen, Jason McDermott, Katherine Graham, Terry Law, Kristie Oxford
Fetching works for author George M Garrity from URL: htt

Coauthors for Charles Thomas Parker: Neil Byers, Paul Gilna, Mehmet Ulaş Çınar, Brieanne Forbes, Anne Lichtenwalner, T. Dai, B. V. Jacak, Brian J. Beck, Ellen Cassidy, Y. He, Paul De Vos, Ronald M. Hansen, Yi Ji, JOHN P. MOORE, T. B. K. Reddy, C. Nattrass, J. Latessa, Ys Barker, Pierre‐Edouard Fournier, Sarah Wigley, David J. Holtschlag, Loren Schwiebert, Cody W. Thompson, Dylan O’Ryan, WILLIAM D. LIDSTER, Bo Li, P. M. Jacobs, Mark Morrison, Jingfeng Wu, Gerald P. Roston, Rashmi Datta, Mikayla Borton, António Ventosa, L. Bomson, Saeed Roshan, Claire M. Fraser, Hans‐Peter Klenk, B. Maidak, Sikorski, Frank Oliver Glö, Nikos C. Kyrpides, Nan Qin, Marcella McIntyre-Redden, Y.-J. Lee, Kjiersten Fagnan, Ryan D. Oliveira, Y. -J. Lee, Ismail Soudi, Frank R. Masiarz, A. Majumder, Stephen Coates, Joseph Latessa, I. Soudi, R. A. Soltz, Hans‐Jürgen Busse, Anne Thessen, J.M. Tiedje, H. Elfner, Randelle M. Bundy, Susan Schoenian, Konstantinos Konstantinidis, Ulrich Heinz, U. Heinz, Carolee T. Bull, 

In [19]:
# Create DataFrame
df = pd.DataFrame(rows)

In [20]:
df

Unnamed: 0,Name,Orcid,Institutions,Concepts,Coauthors
0,A Boyle,,"Macquarie University, University of Michigan–A...",Computer science,
1,Terry Law,0000-0001-8278-6729,Environmental Molecular Sciences Laboratory,"Computer science, Environmental science","Joshua Adkins, Haylie Kimball, Scott Baker, Sa..."
2,Josh Adkins,,Pacific Northwest National Laboratory,"Geography, Computer science, Environmental sci...","John Cort, Yinyin Ye, Ernesto Nakayasu, Justin..."
3,George M Garrity,0000-0002-4465-7034,"Michigan Public Health Institute, Michigan Uni...","Geography, Zoology, Biology, Ecology, Taxonomy...","Neil Byers, Jeffrey L. Boore, Cheryl D. Schwar..."
4,Charles Thomas Parker,0000-0002-7436-3176,"The Ohio State University, Cyclotron (Netherla...","Geography, Zoology, Biology, Ecology, Taxonomy...","Neil Byers, Paul Gilna, Mehmet Ulaş Çınar, Bri..."
...,...,...,...,...,...
195,Christine Chang,0000-0002-9198-4855,"California Institute of Technology, Pacific No...","Computer science, Environmental science","Abby Jerger, Aaron J. Rossini, Antony Williams..."
196,Curtis A. Bradley,,"Singer (United States), East Tennessee State U...","Business, Biotechnology, Political science, Ma...","Stephen Dycus, Cynthia Adams, Tod Lindberg, El..."
197,Allison Myers‐Pigg,0000-0002-6905-6841,"Smithsonian Environmental Research Center, Uni...","Computer science, Environmental science","Robert Griffin, Roisin McCallum, Nicholas Ward..."
198,Jack L. Goldsmith,,"University of Western Australia, University of...","Business, Biotechnology, Political science, Ma...","Stephen Dycus, Cynthia Adams, Elihu Lauterpach..."


In [21]:
# Save DataFrame to CSV
df.to_csv('all_authors_data.csv', index=False)

### Refine Dataset

#### Split into known and unknown ORCID

In [23]:
# Refine Dataset
authors_with_orcid = df[df['Orcid'].notna() & (df['Orcid'] != "N/A")]
authors_without_orcid = df[df['Orcid'].isna() | (df['Orcid'] == "N/A")]

authors_with_orcid.to_csv('authors_with_orcid.csv', index=False)
authors_without_orcid.to_csv('authors_without_orcid.csv', index=False)

In [24]:
authors_with_orcid

Unnamed: 0,Name,Orcid,Institutions,Concepts,Coauthors
1,Terry Law,0000-0001-8278-6729,Environmental Molecular Sciences Laboratory,"Computer science, Environmental science","Joshua Adkins, Haylie Kimball, Scott Baker, Sa..."
3,George M Garrity,0000-0002-4465-7034,"Michigan Public Health Institute, Michigan Uni...","Geography, Zoology, Biology, Ecology, Taxonomy...","Neil Byers, Jeffrey L. Boore, Cheryl D. Schwar..."
4,Charles Thomas Parker,0000-0002-7436-3176,"The Ohio State University, Cyclotron (Netherla...","Geography, Zoology, Biology, Ecology, Taxonomy...","Neil Byers, Paul Gilna, Mehmet Ulaş Çınar, Bri..."
5,Tujin Shi,0000-0002-5592-3588,"Environmental Molecular Sciences Laboratory, H...","Computer science, Geography","Bo Peng, Jordan B. Fishman, Yongli Bao, Joseph..."
6,Karin Rodland,0000-0001-7070-6541,"Environmental Molecular Sciences Laboratory, C...","Computer science, Geography","Martha Campbell‐Thompson, Jianbo Pan, H. Thoma..."
...,...,...,...,...,...
192,Hao Chen,0009-0001-6480-7976,"Peking University, Sichuan University, Wenzhou...","Quantum mechanics, Composite material, Thermod...","Tong Wu, Haokai Shen, Bradley M. Broom, Zong‐W..."
194,Ashley Robinson,0000-0002-7561-9276,"Environmental Molecular Sciences Laboratory, D...","Database, Computer science, Environmental science","William Halliday, D. J. Hooper, Kenneth J. Aub..."
195,Christine Chang,0000-0002-9198-4855,"California Institute of Technology, Pacific No...","Computer science, Environmental science","Abby Jerger, Aaron J. Rossini, Antony Williams..."
197,Allison Myers‐Pigg,0000-0002-6905-6841,"Smithsonian Environmental Research Center, Uni...","Computer science, Environmental science","Robert Griffin, Roisin McCallum, Nicholas Ward..."


In [25]:
authors_without_orcid

Unnamed: 0,Name,Orcid,Institutions,Concepts,Coauthors
0,A Boyle,,"Macquarie University, University of Michigan–A...",Computer science,
2,Josh Adkins,,Pacific Northwest National Laboratory,"Geography, Computer science, Environmental sci...","John Cort, Yinyin Ye, Ernesto Nakayasu, Justin..."
12,Ron Moore,,"Knoxville College, Texas State University, Pac...","Computer science, Geography","Tujin Shi, Rosey Chu, Yi‐Ting Wang, Marina Gri..."
13,Rosey Chu,,"Environmental Molecular Sciences Laboratory, P...","Computer science, Geography","Tamás Varga, Ruonan Wu, Giorgis Isaac, Swarup ..."
14,CF Tsai,,"Chung Shan Medical University, Chung Shan Medi...","Computer science, Geography","YT Chuang, Ron Moore, Tujin Shi, Marina Gritse..."
...,...,...,...,...,...
181,Li,,"Gene Therapy Laboratory, Army Medical Universi...","Gene, Internal medicine, Physics, Biology, Eco...","Gui-tong, Zhonghui, Ji-mei, van, Hongtao, Zhen..."
189,John B. Heppner,,"University of Florida, Plant (United States), ...","Computer science, Botany, Biology, Ecology","Monica Juarez, Holly Downing, Norman G. Gratz,..."
193,PowerTAC,,,Computer science,
196,Curtis A. Bradley,,"Singer (United States), East Tennessee State U...","Business, Biotechnology, Political science, Ma...","Stephen Dycus, Cynthia Adams, Tod Lindberg, El..."
