# Import libraries

In [35]:
import os
import re
import time
import zipfile
from urllib.parse import quote_plus

import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.common.by import By

# Load NSF data


In [3]:
# Unpack the 2018 NSF award archive

# Make sure the main directory that stores the NSF data (xml files) is present
os.makedirs("NSF_data", exist_ok=True)

# Extract the archive into its own subdirectory; skipped when that
# subdirectory already exists so re-running the cell is cheap
if not os.path.isdir("NSF_data/2018"):
    with zipfile.ZipFile("NSF_zip/2018.zip", mode='r') as archive:
        archive.extractall(path="NSF_data/2018")

This page introduces the NSF directorates and their affiliated divisions: https://new.nsf.gov/about/directorates-offices

In [36]:
# Extracts data from downloaded NSF files
def extract_data_from_file(file_path):
    """Parse one NSF award XML file into a flat record.

    Parameters
    ----------
    file_path : str
        Path to an award XML file. The awarded year is taken from the name of
        the file's parent directory when that name is exactly four digits
        (e.g. ``NSF_data/2018/1234567.xml`` -> ``"2018"``).

    Returns
    -------
    dict
        Keys ``first_name``, ``middle_name``, ``last_name``, ``email``,
        ``directorate``, ``division``, ``effective_date``, ``expiration_date``,
        ``award_amount``, ``abstract`` and ``awarded_year``. Every value is a
        string; a field that is absent or empty in the XML is ``""``. When the
        document has no ``Award`` element all XML-derived fields are ``""``.
    """
    # Output field -> element path relative to the <Award> element.
    # Insertion order defines the column order of the resulting table.
    field_paths = {
        "first_name": 'Investigator/FirstName',
        "middle_name": 'Investigator/PI_MID_INIT',
        "last_name": 'Investigator/LastName',
        "email": 'Investigator/EmailAddress',
        "directorate": 'Organization/Directorate/LongName',
        "division": 'Organization/Division/LongName',
        "effective_date": 'AwardEffectiveDate',
        "expiration_date": 'AwardExpirationDate',
        "award_amount": 'AwardTotalIntnAmount',
        "abstract": 'AbstractNarration',
    }
    extracted_data = {field: "" for field in field_paths}
    extracted_data["awarded_year"] = ""

    root = etree.parse(file_path).getroot()
    award = root.find('Award')

    # A file without an <Award> element yields an all-blank record instead of
    # crashing on None.findtext(...)
    if award is not None:
        for field, element_path in field_paths.items():
            # findtext returns None for a missing element and '' for an empty one
            extracted_data[field] = award.findtext(element_path) or ''

    # Also retrieve the awarded year. Use the parent directory name rather than
    # a regex tied to the literal "NSF_data/" prefix and "/" separators, so any
    # data folder (and Windows-style paths) works.
    parent_name = os.path.basename(os.path.dirname(os.path.normpath(file_path)))
    if re.fullmatch(r"\d{4}", parent_name):
        extracted_data["awarded_year"] = parent_name

    return extracted_data

In [37]:
# Processes files in a folder
def process_folder(folder_path, filter_directorate=None):
    """Parse every ``.xml`` file in a folder into one DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory containing the award XML files (not searched recursively).
    filter_directorate : str, optional
        When given, only records whose ``directorate`` equals this value are
        kept. ``None`` (the default) keeps every record.

    Returns
    -------
    pandas.DataFrame
        One row per kept file, with the fields returned by
        ``extract_data_from_file``. Rows are ordered by file name so repeated
        runs produce the same table. The frame is empty when nothing matches.
    """
    records = []
    # os.listdir returns names in arbitrary order; sort for reproducible output
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.xml'):
            continue
        record = extract_data_from_file(os.path.join(folder_path, filename))
        if filter_directorate is None or record["directorate"] == filter_directorate:
            records.append(record)

    return pd.DataFrame(records)

In [39]:
# example use for extracting
# Build the 2018 funding_info table once and cache it as a CSV file.
folder_path = 'NSF_data/2018' # path to folder where NSF files are stored
file_path = 'funding_info_2018.csv' # cached table

# Only parse the XML files when no cached CSV exists yet
if not os.path.isfile(file_path):
    process_folder(
        folder_path,
        filter_directorate="Direct For Social, Behav & Economic Scie",
    ).to_csv(file_path, index=False) # save the dataframe in a csv file

# Always load from the CSV, even right after creating it. A freshly parsed
# frame holds '' for missing fields and strings for numbers, while read_csv
# yields NaN and numeric dtypes; loading in both cases keeps first and later
# runs identical for downstream code.
nsf_df = pd.read_csv(file_path)

- `funding_info` table
    - first_name: first name of the funded author
    - middle_name: middle initial of the funded author (may be empty)
    - last_name: last name of the funded author
    - email: email address of the funded author
    - directorate: NSF directorate that made the award
    - division: division within that NSF directorate
    - effective_date: the date when the funding begins
    - expiration_date: the date when the funding expires
    - award_amount: NSF funding amount
    - abstract: abstract of the funded NSF proposal
    - awarded_year: year in which the award was made (taken from the data folder name)

In [40]:
# Preview the first rows of the funding table
nsf_df.head()

Unnamed: 0,first_name,middle_name,last_name,email,directorate,division,effective_date,expiration_date,award_amount,abstract,awarded_year
0,Keith,M,Murphy,kmmurphy@uci.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,03/01/2019,02/29/2024,209801.0,Communication among humans is known to be comp...,2018
1,Scott,,StGeorge,stgeorge@umn.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,09/01/2018,04/30/2023,349934.0,This research project will examine the degree ...,2018
2,Shauna,M,Cooper,scooper1@live.unc.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,07/01/2017,06/30/2019,96141.0,Little is known about African American fathers...,2018
3,Alan,C,Yu,aclyu@uchicago.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,09/15/2018,02/29/2020,30648.0,Language change is inevitable and constant: al...,2018
4,Wilson,,Silva,wdelimasilva@email.arizona.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,06/01/2018,12/31/2020,90383.0,A finely balanced linguistic ecology is needed...,2018


In [8]:
# Number of awards per division within the selected directorate
nsf_df["division"].value_counts()

Division Of Behavioral and Cognitive Sci    481
Divn Of Social and Economic Sciences        388
SBE Off Of Multidisciplinary Activities      87
National Center For S&E Statistics            8
Name: division, dtype: int64

# Dynamically scrape the author_info and publication_info about the author on the NSF award list

## Define helper functions to dynamically scrape citation metrics, publication details, and research interests of a given author

In [63]:
# Finds Google Scholar urls (author's Google Scholar profile)
def find_url(driver, full_name, email_domain):
    """Search Google Scholar authors by name and return the matching profile link.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load the search page.
    full_name : str
        Author name to search for. It is URL-encoded before being placed in
        the query string, so spaces, apostrophes, ``&`` etc. are safe.
    email_domain : str
        Domain part of the author's email. A result matches when the text
        after "Verified email at " equals this domain (case-insensitive,
        surrounding whitespace ignored).

    Returns
    -------
    str or None
        ``href`` of the first matching result's profile link, or ``None`` when
        no result has a matching verified email domain.
    """
    search_url = (
        "https://scholar.google.com/citations?hl=en&view_op=search_authors"
        f"&mauthors={quote_plus(full_name)}"
    )
    driver.get(search_url)
    time.sleep(3)  # give the result list time to render

    marker = 'Verified email at '
    wanted_domain = email_domain.strip().lower()

    for author in driver.find_elements(By.CSS_SELECTOR, "div.gs_ai.gs_scl.gs_ai_chpr"):
        # find_elements returns [] instead of raising, so a result card
        # without an email node is skipped rather than aborting the search
        email_nodes = author.find_elements(By.CSS_SELECTOR, "div.gs_ai_eml")
        if not email_nodes:
            continue
        email_text = email_nodes[0].text
        if marker not in email_text:
            continue
        author_domain = email_text.split(marker, 1)[1].strip().lower()
        if author_domain == wanted_domain:
            link_element = author.find_element(By.CSS_SELECTOR, "a.gs_ai_pho")
            return link_element.get_attribute('href')

    return None

# Finds author's total and yearly citations, as well as h_index
def find_citations(driver, url):
    """Read citation statistics from an author's profile page.

    Resizes the browser window to 800x1000, loads ``url``, clicks the element
    with id ``gsc_prf_t-cit`` and then reads the statistics table and the
    per-year histogram.

    Returns
    -------
    tuple
        ``(total_citations, h_index, year_citations)`` where the first two are
        the cell texts of rows 1 and 2 of the ``gsc_rsb_st`` table and
        ``year_citations`` maps each year label text to the ``textContent`` of
        the histogram bar paired with it. All values are strings.
    """
    def read_stat(row_number):
        # Second cell of the requested row in the statistics table
        cell = driver.find_element(
            By.XPATH, f'//*[@id="gsc_rsb_st"]/tbody/tr[{row_number}]/td[2]'
        )
        return cell.text

    driver.set_window_size(800, 1000)
    driver.get(url)
    time.sleep(3)

    # Open the "cited by" view before reading the numbers
    driver.find_element(By.ID, "gsc_prf_t-cit").click()
    time.sleep(3)

    total_citations = read_stat(1)
    h_index = read_stat(2)

    year_labels = driver.find_elements(By.CSS_SELECTOR, "div.gsc_md_hist_w .gsc_g_t")
    bars = driver.find_elements(By.CSS_SELECTOR, "div.gsc_md_hist_w .gsc_g_a")

    # NOTE(review): labels and bars are paired purely by position; this assumes
    # the page renders exactly one bar per year label - confirm for years
    # without citations.
    year_citations = {}
    for label, bar in zip(year_labels, bars):
        # textContent is read through JavaScript rather than via .text
        count = driver.execute_script("return arguments[0].textContent", bar)
        year_citations[label.text] = count

    return total_citations, h_index, year_citations

# Finds interests
def find_interests(driver, url):
    """Return the research interests listed on an author's profile page.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load the page.
    url : str
        Profile page to load.

    Returns
    -------
    list of str
        Text of every ``a.gsc_prf_inta`` link inside ``div#gsc_prf_int``.
        Always a list: empty when the profile lists no interests or when
        reading them fails (the error is printed, not raised).
    """
    driver.get(url)
    time.sleep(3)

    try:
        interest_links = driver.find_elements(By.CSS_SELECTOR, "div#gsc_prf_int a.gsc_prf_inta")
        return [link.text for link in interest_links]
    except Exception as e:
        # Deliberately best-effort: a profile without readable interests must
        # not abort the whole scraping run.
        print(f"Error occurred: {e}")
        return []

# Finds affiliation
def find_affiliation(driver, url):
    """Return the affiliation shown on an author's profile page.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load the page.
    url : str
        Profile page to load.

    Returns
    -------
    str or None
        Text of the first ``a.gsc_prf_ila`` element, or ``None`` when the
        element cannot be read (the error is printed, not raised).
    """
    driver.get(url)
    time.sleep(3)

    # Bind a default first: if the lookup below raises, the name must still
    # exist at the return statement (previously an UnboundLocalError).
    affiliation = None
    try:
        affiliation = driver.find_element(By.CSS_SELECTOR, "a.gsc_prf_ila").text

    except Exception as e:
        print(f"Error occurred: {e}")

    return affiliation

def _first_text(parent, selector, default):
    """Return the text of the first element under ``parent`` matching ``selector``, or ``default`` if none matches."""
    matches = parent.find_elements(By.CSS_SELECTOR, selector)
    return matches[0].text if matches else default


# Finds publications => retrieve not only the title of the publication,
# but also its citation count, year of publication, and paper abstract
def find_publications(driver, url):
    """Collect every publication listed on an author's profile page.

    The listing is fully expanded by clicking the ``gsc_bpf_more`` button until
    it is no longer displayed/enabled. All row data is read from the listing
    first; only afterwards is each publication page visited for its abstract,
    so no row element is used after the browser has navigated away from the
    listing. On return the driver is left on the last publication page visited.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load the pages.
    url : str
        Profile page to load; also stored in the ``url`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``url``, ``title``, ``year``, ``coauthors``, ``n_citation``,
        ``abstract`` (all text as shown on the page). Missing pieces are filled
        with ``"Coauthors not found"``, ``"0"``, ``"Year not found"`` and
        ``"Abstract not found"``. The columns are present even when the
        profile lists no publications.
    """
    columns = ["url", "title", "year", "coauthors", "n_citation", "abstract"]

    driver.get(url)
    time.sleep(3)

    # Expand the listing until the "show more" button stops being usable
    while True:
        try:
            show_more_button = driver.find_element(By.ID, "gsc_bpf_more")
            if not (show_more_button.is_displayed() and show_more_button.is_enabled()):
                break
            show_more_button.click()
            time.sleep(2)
        except (NoSuchElementException, ElementClickInterceptedException):
            break

    # Pass 1: read everything available on the listing page
    listing = []
    for row in driver.find_elements(By.CSS_SELECTOR, "tr.gsc_a_tr"):
        title_links = row.find_elements(By.CSS_SELECTOR, "a.gsc_a_at")
        if not title_links:
            # Not a publication row (no title link) - nothing to record
            continue
        title_element = title_links[0]
        listing.append({
            "title": title_element.text,
            "coauthors": _first_text(row, "td.gsc_a_t div.gs_gray", "Coauthors not found"),
            "n_citation": _first_text(row, "td.gsc_a_c a.gsc_a_ac.gs_ibl", "0"),
            "year": _first_text(row, "td.gsc_a_y span.gsc_a_h.gsc_a_hc.gs_ibl", "Year not found"),
            "publication_url": title_element.get_attribute('href'),
        })

    # Pass 2: visit each publication page to extract its abstract
    publications = []
    for entry in listing:
        abstract = "Abstract not found"
        if entry["publication_url"]:
            driver.get(entry["publication_url"])
            time.sleep(3)  # Wait for the page to load
            try:
                abstract = driver.find_element(By.CSS_SELECTOR, "div.gsh_csp").text
            except NoSuchElementException:
                pass  # keep the placeholder

        publications.append({
            "url": url,
            "title": entry["title"],
            "year": entry["year"],
            "coauthors": entry["coauthors"],
            "n_citation": entry["n_citation"],
            "abstract": abstract,
        })

    # Convert the list of dictionaries to a pandas DataFrame
    return pd.DataFrame(publications, columns=columns)

- `author_info` table
    - email: funded author's email address
    - url: funded author's Google Scholar page
    - interests: funded author's research interests
    - affiliation: funded author's affiliation (university or institution)
    - total_citations: funded author's total number of citations
    - h_index: funded author's h-index
    - citation_5_year_before_sum: funded author's total number of citations in the 5 years before the award year
    - citation_5_year_after_sum: funded author's total number of citations in the 5 years after the award year
    

- `publication_info` table
    - url: funded author's Google Scholar page
    - title: title of one specific paper by the funded author
    - abstract: abstract of that paper
    - year: publication year of that paper
    - coauthors: author list of that paper as shown on Google Scholar
    - n_citation: citation count of that paper


## Define the function that scrapes author and publication info for the authors in the `nsf_df` dataframe created above

In [64]:
def _citation_window_sums(year_citations, awarded_year, window=5):
    """Sum yearly citation counts in the ``window`` years before and after ``awarded_year``.

    The award year itself belongs to neither sum. Entries whose year or count
    cannot be parsed as an integer are skipped.
    """
    awarded_year = int(awarded_year)
    before_sum = 0
    after_sum = 0
    for year, citations in year_citations.items():
        try:
            year = int(year)
            citations = int(citations)
        except (TypeError, ValueError):
            continue
        if awarded_year - window <= year < awarded_year:
            before_sum += citations
        elif awarded_year < year <= awarded_year + window:
            after_sum += citations
    return before_sum, after_sum


# Scrape the funded authors' author_info and publication_info from nsf_data
def scrape_author(nsf_data):
    """Scrape Google Scholar for every funded author in ``nsf_data``.

    Parameters
    ----------
    nsf_data : pandas.DataFrame
        Funding table with at least ``first_name``, ``middle_name``,
        ``last_name`` and ``email`` columns. ``awarded_year`` is used for the
        5-year citation sums; rows without it get no sums.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``author_info`` - one row per author whose profile was found, indexed
        like ``nsf_data``, with columns ``email`` (the full address), ``url``,
        ``interests``, ``affiliation``, ``total_citations``, ``h_index``,
        ``citation_5_year_before_sum`` and ``citation_5_year_after_sum``.
        ``publication_info`` - the concatenated per-author publication tables
        (empty, with its columns, when no profile was found).

    Notes
    -----
    Opens a Chrome window and always closes it again, even when scraping
    fails part-way.
    """
    author_columns = ['email', 'url', 'interests', 'affiliation',
                      'total_citations', 'h_index',
                      'citation_5_year_before_sum',
                      'citation_5_year_after_sum']
    publication_columns = ['url', 'title', 'year', 'coauthors', 'n_citation', 'abstract']

    # Remove rows with no usable email: NaN (table loaded from CSV) as well as
    # empty strings (table freshly parsed from XML)
    emails = nsf_data['email']
    nsf_data = nsf_data[emails.notna() & emails.astype(str).str.strip().ne("")]

    # Set up Chrome Driver options (the browser window stays visible; add
    # '--headless' here to run without a GUI)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')  # Bypass OS security model
    chrome_options.add_argument('--disable-dev-shm-usage')  # Overcome limited resource problems

    driver = webdriver.Chrome(options=chrome_options)

    author_records = {}        # nsf_data index -> author_info row
    awarded_years = {}         # nsf_data index -> that row's awarded year
    publication_info_lst = []  # one publication DataFrame per author

    try:
        # Phase 1: locate each author's profile; authors without one are dropped
        for index, row in nsf_data.iterrows():
            name_parts = (row['first_name'], row['middle_name'], row['last_name'])
            # Skip missing/blank parts so an absent middle initial does not
            # leave a double space (or the text "nan") in the search query
            full_name = " ".join(
                str(part).strip() for part in name_parts
                if pd.notna(part) and str(part).strip()
            )
            email = str(row['email']).strip()
            email_domain = email.split('@')[-1]

            url = find_url(driver, full_name, email_domain)
            if url is not None:
                author_records[index] = {"email": email, "url": url}
                awarded_years[index] = row.get("awarded_year")

            time.sleep(3)  # pause between searches

        # Phase 2: scrape every located profile
        for index, record in author_records.items():
            url = record["url"]

            # Updates interests (left missing when the profile lists none)
            interests = find_interests(driver, url)
            if interests:
                record["interests"] = list(interests)

            # Updates affiliation
            record["affiliation"] = find_affiliation(driver, url)

            # Updates author's citations
            total_citations, h_index, year_citations = find_citations(driver, url)
            record["total_citations"] = total_citations
            record["h_index"] = h_index

            # Use the year of this author's own award, not the whole column
            awarded_year = awarded_years[index]
            if pd.notna(awarded_year):
                before_sum, after_sum = _citation_window_sums(year_citations, awarded_year)
                record["citation_5_year_before_sum"] = before_sum
                record["citation_5_year_after_sum"] = after_sum

            # Retrieve each individual author's publication records
            publication_info_lst.append(find_publications(driver, url))

            time.sleep(3)  # pause between authors
    finally:
        # Close the browser even if a scraping step raised
        driver.quit()

    author_info = pd.DataFrame(
        list(author_records.values()),
        index=list(author_records.keys()),
        columns=author_columns,
    )

    # pd.concat raises on an empty list, so handle "no profile found" explicitly
    if publication_info_lst:
        publication_info = pd.concat(publication_info_lst, ignore_index=True)
    else:
        publication_info = pd.DataFrame(columns=publication_columns)

    return author_info, publication_info

In [24]:
# Show the single award row that the example below is run on
nsf_df[:1]

Unnamed: 0,first_name,middle_name,last_name,email,directorate,division,effective_date,expiration_date,award_amount,abstract
0,Keith,M,Murphy,kmmurphy@uci.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,03/01/2019,02/29/2024,209801.0,Communication among humans is known to be comp...


In [66]:
# example use
# Run the scraper on the first award row only; it returns the author table
# and the publication table
author_info, publication_info = scrape_author(nsf_df[:1])


# Display the scraped publication table
publication_info

Unnamed: 0,url,title,year,coauthors,n_citation,abstract
0,https://scholar.google.com/citations?hl=en&use...,Bourdieu and phenomenology: A critical assessment,2002.0,"CJ Throop, KM Murphy",410.0,This article sets out to examine and criticall...
1,https://scholar.google.com/citations?hl=en&use...,Collaborative imagining: The interactive use o...,2005.0,KM Murphy,235.0,This article examines the use of imagination a...
2,https://scholar.google.com/citations?hl=en&use...,Imagination as joint activity: The case of arc...,2004.0,KM Murphy,152.0,This article draws from the insights offered b...
3,https://scholar.google.com/citations?hl=en&use...,Design and anthropology,2016.0,KM Murphy,124.0,"In this review, I examine the recent turn to d..."
4,https://scholar.google.com/citations?hl=en&use...,Swedish design: an ethnography,2019.0,KM Murphy,117.0,Abstract not found
5,https://scholar.google.com/citations?hl=en&use...,"Epilogue: Ethnography and design, ethnography ...",2020.0,"KM Murphy, GE Marcus",76.0,Abstract not found
6,https://scholar.google.com/citations?hl=en&use...,Transmodality and temporality in design intera...,2012.0,KM Murphy,76.0,Abstract not found
7,https://scholar.google.com/citations?hl=en&use...,Embodied reasoning in architectural critique,2012.0,"KM Murphy, J Ivarsson, G Lymer",76.0,Abstract not found
8,https://scholar.google.com/citations?hl=en&use...,A cultural geometry: Designing political thing...,2013.0,KM Murphy,50.0,"In Sweden, a long‐standing and pervasive disco..."
9,https://scholar.google.com/citations?hl=en&use...,Building stories: The embodied narration of wh...,2011.0,KM Murphy,43.0,In this chapter I examine what I am calling “e...
