# Import libraries

In [1]:
import os
import pandas as pd

# Import web scraping helper functions

In [2]:
import scraping_helper_functions.get_all_NSF as get_all_NSF
import scraping_helper_functions.get_author_info as get_author_info
import scraping_helper_functions.get_pub_info as get_pub_info

# Load NSF data (from 2011 to 2020)

The NSF directorates and their affiliated divisions are listed here: https://new.nsf.gov/about/directorates-offices

In this project, we focus on the Division of Behavioral and Cognitive Sciences, which belongs to the Directorate for Social, Behavioral and Economic Sciences. We use 2011 to 2020 as a 10-year window and scrape the NSF award information for that period.

- `funding_info_2011_2020` table
    - first_name: first name of funded authors
    - middle_name: middle name of funded authors
    - last_name: last name of funded authors
    - email: email address of funded authors
    - institution: institution of funded authors
    - directorate: NSF directorate of the award
    - division: division under that NSF directorate
    - effective_date: the date when the funding begins
    - expiration_date: the date when the funding expires
    - award_amount: NSF funding amount
    - award_title: title of awarded projects
    - abstract: abstract used for NSF funding proposal
    - year: year in which the author was awarded

In [4]:
# Location of the cached NSF award table (2011-2020).
nsf_data_file_path = 'database/funding_info_2011_2020.csv'

if os.path.exists(nsf_data_file_path):
    # Cache hit: load the previously saved CSV instead of re-processing.
    nsf_df = pd.read_csv(nsf_data_file_path)
else:
    # Cache miss: build the table via the helper, passing the `nsf_data`
    # base path, the 2011-2020 year range and the directorate/division filters.
    nsf_df = get_all_NSF.process_all_folders(
        base_path="nsf_data",
        start_year=2011,
        end_year=2020,
        filter_directorate="Direct For Social, Behav & Economic Scie",
        filter_division="Division Of Behavioral and Cognitive Sci",
    )
    # pandas does not create missing parent directories; make sure `database/`
    # exists so the freshly built table is not lost to an OSError on save.
    os.makedirs(os.path.dirname(nsf_data_file_path), exist_ok=True)
    nsf_df.to_csv(nsf_data_file_path, index=False)

In [5]:
# Preview the first rows of the NSF award table.
nsf_df.head()

Unnamed: 0,first_name,middle_name,last_name,email,institution,directorate,division,effective_date,expiration_date,award_amount,award_title,abstract,year
0,Jean,,Comaroff,jcomaro@uchicago.edu,University of Chicago,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,02/01/2011,01/31/2013,20000,Doctoral Dissertation Research: The Role of Kn...,University of Chicago doctoral student Brenden...,2011
1,Talal,,Asad,talalasad@earthlink.net,CUNY Graduate School University Center,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,01/01/2011,09/30/2012,19901,Doctoral Dissertation Research: Mental Disorde...,Doctoral student Ana Maria Vinea (Graduate Cen...,2011
2,John,,Cherry,John_Cherry@brown.edu,Brown University,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,01/01/2011,06/30/2012,14820,Doctoral Dissertation Improvement Grant: Rethi...,In collaboration with the Central Lydia Archae...,2011
3,Leaf,,Van Boven,vanboven@colorado.edu,University of Colorado at Boulder,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,04/01/2011,03/31/2015,250000,EAGER: Perceiving Political Distributions,The present research focuses on motivated poli...,2011
4,Steven,,Bedrick,bedricks@ohsu.edu,Oregon Health & Science University,"Direct For Social, Behav & Economic Scie",Division Of Behavioral and Cognitive Sci,04/01/2011,09/30/2013,114140,"Corpora of Non-Linguistic Symbol Systems, and ...","Throughout the millenia, humans have used grap...",2011


# Dynamically scrape the author_info about the author on the NSF award list

- `author_info_<year>` table
    - email: funded authors' email address
    - url: funded authors' Google Scholar Page
    - interests: funded authors' research interests
    - affiliation: funded authors' affiliation (university or institution)
    - total_citations: funded authors' total number of citations
    - h_index: funded authors' h-index
    - citation_5_year_before_sum: funded authors' total number of citations in the 5 years before they were awarded
    - citation_5_year_after_sum: funded authors' total number of citations in the 5 years after they were awarded

In [4]:
# Year of interest; the author-info and publication-info cells below pass this value to the helpers
year = 2018

In [None]:
# Example: retrieve author info for the selected `year`.
# The call is skipped when `author_info_<year>.csv` already exists in the
# current working directory (edit `year` above to target a different year).
author_info_csv = f"author_info_{year}.csv"
author_info_cached = os.path.exists(author_info_csv)
if not author_info_cached:
    get_author_info.retrieve_author_info(nsf_df, year)

In [None]:
# Retrieve author info for only the first 52 rows of the selected year.
# The row filter uses `year` (rather than a hard-coded 2018) so the rows passed
# in always match the `year` argument given to the helper.
# NOTE(review): the 52-row cap looks like a partial/resumed run — confirm it is intended.
awards_for_year = nsf_df[nsf_df["year"] == year]
get_author_info.retrieve_author_info(awards_for_year.iloc[:52], year)

# Dynamically scrape the pub_info about the author on the NSF award list

- `pub_info_<year1>_<year2>` table
    - Title: title of the publication (paper)
    - Year: year of publication
    - Cited by: number of citation in total
    - Paper URL: url that leads to paper details (e.g., abstract) after one click
    - email: funded authors' email address
    - url: funded authors' Google Scholar Page
    - Publication date: date of publication
    - Journal: journal of the article
    - abstract: abstract of funded authors' one specific paper
    - year: publication year of funded authors' one specific paper
    - Citations: yearly breakdown of citation

**TODO:** Change it tomorrow.

In [5]:
# Generate the publication-info table for the selected `year`.
# NOTE(review): presumably relies on the author info produced above — verify against get_pub_info.
get_pub_info.generate_pub_info_table(year)

{'Authors': 'AN Underhill, CD Hirsch, MD Clark', 'Publication Date': '2020/4/24', 'Journal': 'Plant Phenomics', 'Abstract': 'Grape berry color is an economically important trait that is controlled by two major genes influencing anthocyanin synthesis in the skin. Color is often described qualitatively using six major categories; however, this is a subjective rating that often fails to describe variation within these six classes. To investigate minor genes influencing berry color, image analysis was used to quantify berry color using different color spaces. An image analysis pipeline was developed and utilized to quantify color in a segregating hybrid wine grape population across two years. Images were collected from grape clusters immediately after harvest and segmented by color to determine the red, green, and blue (RGB); hue, saturation, and intensity (HSI); and lightness, red-green, and blue-yellow values (L∗ a∗ b∗) of berries. QTL analysis identified known major QTL for color on chr