# Import libraries

In [1]:
import os
import pandas as pd

# Import web scraping helper functions

In [2]:
import scraping_helper_functions.get_all_NSF as get_all_NSF
import scraping_helper_functions.get_author_info as get_author_info
import scraping_helper_functions.get_pub_info as get_pub_info

# Load NSF data (from 2011 to 2020)

The NSF website lists its directorates and their affiliated divisions: https://new.nsf.gov/about/directorates-offices

In this project, we focus on the Division of Behavioral and Cognitive Sciences, which belongs to the Directorate for Social, Behavioral and Economic Sciences. We use the 10-year range from 2011 to 2020 and scrape the NSF award information for those years.

- `funding_info_2011_2020` table
    - first_name: first name of funded authors
    - middle_name: middle name of funded authors
    - last_name: last name of funded authors
    - email: email address of funded authors
    - institution: affiliation (university or institution) of funded authors
    - directorate: NSF directorate
    - division: division under the NSF directorate
    - effective_date: the date when the funding begins
    - expiration_date: the date when the funding expires
    - award_amount: NSF funding amount
    - award_title: title of the awarded project
    - abstract: abstract of the NSF funding proposal
    - year: year in which the author received the award

In [3]:
# Cached copy of the filtered NSF award table; delete this file to force a rebuild.
nsf_data_file_path = 'database/funding_info_2011_2020.csv'

if os.path.exists(nsf_data_file_path):
    # Reuse the previously saved table instead of re-processing the raw data.
    nsf_df = pd.read_csv(nsf_data_file_path)
else:
    # Build the table via the helper module. The filter strings are the
    # directorate/division labels exactly as they appear in the resulting table.
    nsf_df = get_all_NSF.process_all_folders(base_path="nsf_data",
                                             start_year=2011, end_year=2020,
                                             filter_directorate="Direct For Social, Behav & Economic Scie",
                                             filter_division="Division Of Behavioral and Cognitive Sci")
    # to_csv does not create missing parent directories; create the output
    # directory first so the processing above is not wasted on a write error.
    os.makedirs(os.path.dirname(nsf_data_file_path), exist_ok=True)
    nsf_df.to_csv(nsf_data_file_path, index=False)

In [4]:
# Preview the first five award records.
nsf_df.head(n=5)

Unnamed: 0,first_name,middle_name,last_name,email,institution,directorate,division,effective_date,expiration_date,award_amount,award_title,abstract,year
0,Jean,,Comaroff,jcomaro@uchicago.edu,University of Chicago,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,02/01/2011,01/31/2013,20000,Doctoral Dissertation Research: The Role of Kn...,University of Chicago doctoral student Brenden...,2011
1,Talal,,Asad,talalasad@earthlink.net,CUNY Graduate School University Center,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,01/01/2011,09/30/2012,19901,Doctoral Dissertation Research: Mental Disorde...,Doctoral student Ana Maria Vinea (Graduate Cen...,2011
2,John,,Cherry,John_Cherry@brown.edu,Brown University,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,01/01/2011,06/30/2012,14820,Doctoral Dissertation Improvement Grant: Rethi...,In collaboration with the Central Lydia Archae...,2011
3,Leaf,,Van Boven,vanboven@colorado.edu,University of Colorado at Boulder,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,04/01/2011,03/31/2015,250000,EAGER: Perceiving Political Distributions,The present research focuses on motivated poli...,2011
4,Steven,,Bedrick,bedricks@ohsu.edu,Oregon Health & Science University,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,04/01/2011,09/30/2013,114140,"Corpora of Non-Linguistic Symbol Systems, and ...","Throughout the millenia, humans have used grap...",2011


# Dynamically scrape author info for the authors on the NSF award list

- `author_info_<year>` table
    - first_name: first name of funded authors
    - middle_name: middle name of funded authors
    - last_name: last name of funded authors
    - email: funded authors' email address
    - institution: funded authors' affiliation (university or institution)
    - url: funded authors' Google Scholar Page
    - interests: funded authors' research interests
    - total_citations: funded authors' total number of citations
    - h_index: funded authors' h-index
    - The remaining seven columns are the yearly citation counts from 3 years before to 3 years after the award year

In [5]:
# Sample: get author info for one award year.
# To use another year, change `author_info_year` only; the cache path and the
# scraper call both read it, so the guard always checks the year being scraped.
# (The result name `author_info_18` is kept unchanged for any later cells.)
author_info_year = 2018
author_info_path = f"database/author_info_{author_info_year}.csv"

if os.path.exists(author_info_path):
    # Load the saved table so `author_info_18` is defined on either branch.
    author_info_18 = pd.read_csv(author_info_path)
else:
    # NOTE(review): the recorded run of this cell ended in
    # "TypeError: retrieve_author_info() missing 1 required positional argument: 'driver'".
    # That call is not made in this cell, so the error apparently originates in
    # the helper module and has to be fixed there.
    # Presumably the helper also writes the CSV checked above -- confirm.
    author_info_18 = get_author_info.safe_retrieve_author_info(nsf_df, author_info_year)

TypeError: retrieve_author_info() missing 1 required positional argument: 'driver'

# Dynamically scrape publication info for the authors on the NSF award list

- `pub_info_<year1>_<year2>` table (3 years before and after the awarded year)
    - first_name: first name of funded authors
    - middle_name: middle name of funded authors
    - last_name: last name of funded authors
    - email: email address of funded authors
    - Title: title of the publication (paper)
    - Year: year of publication
    - Cited by: total number of citations
    - Paper URL: URL of the page with the paper's details (e.g., abstract)
    - Authors: all authors of the publication
    - Publication date: specific date of publication
    - Journal: journal of the article
    - Abstract: abstract of the publication
    - Citations: yearly breakdown of citations (a dictionary)

In [None]:
# Sample: generate the publication-info table for one award year.
# The year is set once so the cache check and the helper call always agree.
pub_info_year = 2018
pub_info_path = f"database/pub_info_{pub_info_year}.csv"

# NOTE(review): the notebook text names the table `pub_info_<year1>_<year2>`,
# while this guard checks a single-year file name -- confirm which file the
# helper actually writes, otherwise the guard may never skip the scrape.
if not os.path.exists(pub_info_path):
    get_pub_info.generate_pub_info_table(pub_info_year)