# Import libraries

In [8]:
import os
import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import zipfile

# Set up Chromdriver for Dynamic Scraping

In [None]:
# Set up chromedriver
!pip install chromedriver-autoinstaller

import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
import chromedriver_autoinstaller

# set path to chromedriver as per your configuration
chromedriver_autoinstaller.install()

# Load NSF data


In [10]:
# Unzip the file

# Create the main directory to store OSF data (xml files) if it doesn't exist
if not os.path.isdir("NSF_data"):
    os.mkdir("NSF_data")

# Unzip the file into a subdirectory
if not os.path.isdir("NSF_data/2018"):
    with zipfile.ZipFile("NSF_zip/2018.zip", 'r') as zip_ref:
        zip_ref.extractall("NSF_data/2018")

This is a website introducing the directory and affiliated divisions: https://new.nsf.gov/about/directorates-offices

In [12]:
# Extracts data from downloaded NSF files
def extract_data_from_file(file_path):
    tree = etree.parse(file_path)
    root = tree.getroot()

    extracted_data = {
        "first_name": "",
        "middle_name": "",
        "last_name": "",
        "email": "",
        "directorate": "",
        "division": "",
        "effective_date": "",
        "expiration_date": "",
        "award_amount": "",
        "abstract": ""
    }

    award = root.find('Award')

    extracted_data["effective_date"] = award.findtext('AwardEffectiveDate') or ''
    extracted_data["expiration_date"] = award.findtext('AwardExpirationDate') or ''
    extracted_data["award_amount"] = award.findtext('AwardTotalIntnAmount') or ''
    extracted_data["abstract"] = award.findtext('AbstractNarration') or ''
    extracted_data["directorate"] = award.findtext('Organization/Directorate/LongName') or ''
    extracted_data["division"] = award.findtext('Organization/Division/LongName') or ''
    extracted_data["first_name"] = award.findtext('Investigator/FirstName') or ''
    extracted_data["middle_name"] = award.findtext('Investigator/PI_MID_INIT') or ''
    extracted_data["last_name"] = award.findtext('Investigator/LastName') or ''
    extracted_data["email"] = award.findtext('Investigator/EmailAddress') or ''

    return extracted_data

In [13]:
# Processes files in a folder
def process_folder(folder_path, filter_directorate):
    all_data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.xml'):
            file_path = os.path.join(folder_path, filename)
            data = extract_data_from_file(file_path)
            if data["directorate"] == filter_directorate:
              all_data.append(data)

    return pd.DataFrame(all_data)

In [14]:
# example use for extracting
folder_path = 'NSF_data/2018' # path to folder where NSF files are stored
file_path = 'funding_info_2018.csv'

if os.path.isfile(file_path):
  nsf_df = pd.read_csv(file_path)
else:
  nsf_df = process_folder(folder_path, filter_directorate="Direct For Social, Behav & Economic Scie") # process the folder
  nsf_df.to_csv(file_path, index=False) # save the dataframe in a csv file

In [15]:
nsf_df.head()

Unnamed: 0,first_name,middle_name,last_name,email,directorate,division,effective_date,expiration_date,award_amount,abstract
0,Keith,M,Murphy,kmmurphy@uci.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,03/01/2019,02/29/2024,209801.0,Communication among humans is known to be comp...
1,Scott,,StGeorge,stgeorge@umn.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,09/01/2018,04/30/2023,349934.0,This research project will examine the degree ...
2,Shauna,M,Cooper,scooper1@live.unc.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,07/01/2017,06/30/2019,96141.0,Little is known about African American fathers...
3,Alan,C,Yu,aclyu@uchicago.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,09/15/2018,02/29/2020,30648.0,Language change is inevitable and constant: al...
4,Wilson,,Silva,wdelimasilva@email.arizona.edu,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,06/01/2018,12/31/2020,90383.0,A finely balanced linguistic ecology is needed...


In [16]:
nsf_df["division"].value_counts()

Division Of Behavioral and Cognitive Sci    481
Divn Of Social and Economic Sciences        388
SBE Off Of Multidisciplinary Activities      87
National Center For S&E Statistics            8
Name: division, dtype: int64

# Dynamically scrape the publishing-related info about the author on the NSF award list

## Define helper functions to dynamically scrape citation metrics, publication details, and research interests of a given author

In [18]:
# Finds Google Scholar urls
def find_url(driver, full_name, email_domain):
    url = f"https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors={full_name}"
    driver.get(url)
    time.sleep(3)

    authors = driver.find_elements(By.CSS_SELECTOR, "div.gs_ai.gs_scl.gs_ai_chpr")
    for author in authors:
        author_email_text = author.find_element(By.CSS_SELECTOR, "div.gs_ai_eml").text
        if 'Verified email at ' in author_email_text:
            author_email_domain = author_email_text.split('Verified email at ')[1]
            if email_domain == author_email_domain:
                link_element = author.find_element(By.CSS_SELECTOR, "a.gs_ai_pho")
                return link_element.get_attribute('href')

    return None


# Finds citations
def find_citations(driver, url):
    driver.set_window_size(800, 1000)
    driver.get(url)
    time.sleep(3)

    cited_by_tab = driver.find_element(By.ID, "gsc_prf_t-cit")
    cited_by_tab.click()
    time.sleep(3)

    total_citations = driver.find_element(By.XPATH, '//*[@id="gsc_rsb_st"]/tbody/tr[1]/td[2]').text
    h_index = driver.find_element(By.XPATH, '//*[@id="gsc_rsb_st"]/tbody/tr[2]/td[2]').text

    year_citations = {}
    year_elements = driver.find_elements(By.CSS_SELECTOR, "div.gsc_md_hist_w .gsc_g_t")
    citation_elements = driver.find_elements(By.CSS_SELECTOR, "div.gsc_md_hist_w .gsc_g_a")

    for year, citation in zip(year_elements, citation_elements):
        citation_count = driver.execute_script("return arguments[0].textContent", citation)
        year_citations[year.text] = citation_count

    return total_citations, h_index, year_citations


# Finds publications => retreive not only the tile of the publication, 
# but also its citation count, year of publication, and paper abstract
from selenium.webdriver.support.ui import WebDriverWait
def find_publications(driver, url):
    driver.get(url)
    time.sleep(3)

    while True:
        try:
            show_more_button = driver.find_element(By.ID, "gsc_bpf_more")
            if show_more_button.is_displayed() and show_more_button.is_enabled():
                show_more_button.click()
                time.sleep(2)
            else:
                break
        except (NoSuchElementException, ElementClickInterceptedException):
            break

    publications = []
    rows = driver.find_elements(By.CSS_SELECTOR, "tr.gsc_a_tr")

    for row in rows:
        # Extract title
        title_element = row.find_element(By.CSS_SELECTOR, "a.gsc_a_at")
        title = title_element.text

        # Extract coauthors
        coauthors_element = row.find_elements(By.CSS_SELECTOR, "td.gsc_a_t div.gs_gray")[0]
        coauthors = coauthors_element.text if coauthors_element else "Coauthors not found"

        # Extract citation count
        citation_element = row.find_element(By.CSS_SELECTOR, "td.gsc_a_c a.gsc_a_ac.gs_ibl")
        n_citation = citation_element.text if citation_element else "0"

        # Extract year of publication
        year_element = row.find_element(By.CSS_SELECTOR, "td.gsc_a_y span.gsc_a_h.gsc_a_hc.gs_ibl")
        year = year_element.text if year_element else "Year not found"

        # Navigate to the citation link page to extract abstract
        publication_url = title_element.get_attribute('href')
        driver.get(publication_url)
        time.sleep(3)  # Wait for the page to load

        # Extract the abstract
        try:
            abstract_element = driver.find_element(By.CSS_SELECTOR, "div.gsh_csp") # div.gsh_small
            abstract = abstract_element.text
        except NoSuchElementException:
            abstract = "Abstract not found"

        driver.back()

        publications.append({
            "title": title,
            "year": year,
            "coauthors": coauthors,
            "n_citation": n_citation,
            "abstract": abstract
        })

    return publications

# Finds interests
def find_interests(driver, url):
    driver.get(url)
    time.sleep(3)

    interests = []

    try:
        interest = driver.find_elements(By.CSS_SELECTOR, "div#gsc_prf_int a.gsc_prf_inta")
        interests = [i.text for i in interest] if interest else None

    except Exception as e:
        print(f"Error occurred: {e}")

    return interests

## Define the function that scrapes publishing info of authors by relating to the `nsf_data` dataframe just created

In [19]:
# Cleans and updates the dataframe
from selenium.webdriver.chrome.service import Service as ChromeService

def update_and_save_dataframe(df, chrome_driver_path):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # Ensure GUI is off
    chrome_options.add_argument('--no-sandbox')  # Bypass OS security model
    chrome_options.add_argument('--disable-dev-shm-usage')  # Overcome limited resource problems

    # Set up ChromeService using the specified path
    service = ChromeService(executable_path=chrome_driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Updates Google Scholar urls
    df = df.dropna(subset=['email'])
    df['url'] = None

    for index, row in df.iterrows():
        full_name = f"{row['first_name']} {row['middle_name']} {row['last_name']}".strip() if pd.notna(row['middle_name']) else f"{row['first_name']} {row['last_name']}"
        email_domain = row['email'].split('@')[-1]
        url = find_url(driver, full_name, email_domain)
        df.loc[index, 'url'] = url
        time.sleep(3)

    df = df.dropna(subset=['url'])
    df['publications'] = None
    df['interests'] = None

    for index, row in df.iterrows():
        url = row['url']

        # Updates citations
        total_citations, h_index, year_citations = find_citations(driver, url)
        df.at[index, 'total_citations'] = total_citations
        df.at[index, 'h_index'] = h_index

        for year, citations in year_citations.items():
            col_name = f'citations_{year}'
            df.at[index, col_name] = citations

        # Updates publications
        publications = find_publications(driver, url)
        df.at[index, 'publications'] = publications # For now I store the whole dictionary of title, n_citation, year, and abstract in a whole cell (which definitely could be improved in future)

        # Updates interests
        interests = find_interests(driver, url)
        if interests:
            df.at[index, 'interests'] = [interest for interest in interests]

        time.sleep(3)

    driver.quit()

    df.to_csv('publication_info_2018.csv', index=False)


In [29]:
# Define path to chromedriver
chrome_driver_path = 'chromedriver-mac-x64/chromedriver'

In [30]:
# example use
update_and_save_dataframe(nsf_df[:1], chrome_driver_path) # updates and saves the dataframe into a new csv file

StaleElementReferenceException: Message: stale element reference: stale element not found
  (Session info: chrome-headless-shell=121.0.6167.139); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
0   chromedriver                        0x0000000100962168 chromedriver + 4673896
1   chromedriver                        0x00000001009599c3 chromedriver + 4639171
2   chromedriver                        0x000000010054dfdd chromedriver + 397277
3   chromedriver                        0x000000010055d803 chromedriver + 460803
4   chromedriver                        0x000000010055426a chromedriver + 422506
5   chromedriver                        0x000000010055246e chromedriver + 414830
6   chromedriver                        0x000000010055579f chromedriver + 427935
7   chromedriver                        0x000000010055584c chromedriver + 428108
8   chromedriver                        0x0000000100599a03 chromedriver + 707075
9   chromedriver                        0x0000000100599dd1 chromedriver + 708049
10  chromedriver                        0x000000010058e156 chromedriver + 659798
11  chromedriver                        0x00000001005bc8ed chromedriver + 850157
12  chromedriver                        0x000000010058e038 chromedriver + 659512
13  chromedriver                        0x00000001005bca7e chromedriver + 850558
14  chromedriver                        0x00000001005db796 chromedriver + 976790
15  chromedriver                        0x00000001005bc663 chromedriver + 849507
16  chromedriver                        0x000000010058c1cf chromedriver + 651727
17  chromedriver                        0x000000010058d1ae chromedriver + 655790
18  chromedriver                        0x0000000100922380 chromedriver + 4412288
19  chromedriver                        0x0000000100927798 chromedriver + 4433816
20  chromedriver                        0x0000000100906d71 chromedriver + 4300145
21  chromedriver                        0x00000001009284e6 chromedriver + 4437222
22  chromedriver                        0x00000001008f8d3c chromedriver + 4242748
23  chromedriver                        0x0000000100948208 chromedriver + 4567560
24  chromedriver                        0x00000001009483be chromedriver + 4567998
25  chromedriver                        0x0000000100959603 chromedriver + 4638211
26  libsystem_pthread.dylib             0x00007ff80bda71d3 _pthread_start + 125
27  libsystem_pthread.dylib             0x00007ff80bda2bd3 thread_start + 15
