# Import libraries

In [None]:
import os
import pandas as pd

# Import web scraping helper functions

In [None]:
import scraping_helper_functions.get_all_NSF as get_all_NSF
import scraping_helper_functions.get_author_info as get_author_info
import scraping_helper_functions.get_pub_info as get_pub_info

# Load NSF data (from 2011 to 2020)

This website introduces the NSF directorates and their affiliated divisions: https://new.nsf.gov/about/directorates-offices

In this project, we focus on the Division of Behavioral and Cognitive Sciences, which belongs to the Directorate for Social, Behavioral and Economic Sciences. We use 2011 to 2020 as the 10-year range and scrape the NSF award information for those years.

- `funding_info_2011_2020` table
    - first_name: first name of funded authors
    - middle_name: middle name of funded authors
    - last_name: last name of funded authors
    - email: email address of funded authors 
    - directorate: directorate of NSF foundation
    - division: division under the directorate of NSF foundation
    - effective_date: the date when the funding begins
    - expiration_date: the date when the funding expires
    - award_amount: NSF funding amount
    - award_title: title of awarded projects
    - abstract: abstract used for NSF funding proposal
    - year: year in which the author was awarded

In [None]:
# Location of the cached NSF award table. When this CSV already exists it is
# loaded directly; otherwise the table is rebuilt and saved here for next time.
nsf_data_file_path = 'database/funding_info_2011_2020.csv'

if os.path.exists(nsf_data_file_path):
    # Cache hit: reuse the previously saved table.
    nsf_df = pd.read_csv(nsf_data_file_path)
else:
    # Cache miss: build the table from the `nsf_data` folder for 2011-2020,
    # restricted to one directorate and one division.
    # NOTE(review): the filter strings look like the abbreviated labels used
    # in the raw NSF data; confirm before "correcting" their spelling.
    nsf_df = get_all_NSF.process_all_folders(
        base_path="nsf_data",
        start_year=2011,
        end_year=2020,
        filter_directorate="Direct For Social, Behav & Economic Scie",
        filter_division="Division Of Behavioral and Cognitive Sci",
    )
    # `to_csv` does not create missing parent directories, so make sure the
    # output folder exists; otherwise the freshly built table is lost to an
    # OSError on a clean checkout.
    os.makedirs(os.path.dirname(nsf_data_file_path), exist_ok=True)
    nsf_df.to_csv(nsf_data_file_path, index=False)

In [None]:
# Preview the first rows of the NSF award table as a quick sanity check.
nsf_df.head()

# Dynamically scrape the author_info for the authors on the NSF award list

- `author_info_<year>` table
    - email: funded authors' email address
    - url: funded authors' Google Scholar Page
    - interests: funded authors' research interests
    - affiliation: funded authors' affiliation (university or institution)
    - total_citations: funded authors' total number of citations
    - h_index: funded authors' h-index
    - citation_5_year_before_sum: funded authors' total number of citations within the 5 years before they were awarded
    - citation_5_year_after_sum: funded authors' total number of citations within the 5 years after they were awarded

In [None]:
# Define the year of interest. The author-info and pub-info cells below read
# this `year` variable, so change it here to target a different year.
year = 2018

In [None]:
# Example: collect author info for the award year held in `year`
# (set `year` to the year you want before running this cell).
# The scrape is skipped when the per-year CSV is already on disk.
# NOTE(review): presumably `retrieve_author_info` writes that CSV itself;
# confirm in the helper module.
author_info_csv = f"database/author_info_{year}.csv"
author_info_is_cached = os.path.exists(author_info_csv)
if not author_info_is_cached:
    get_author_info.retrieve_author_info(nsf_df, year)

# Dynamically scrape the pub_info for the authors on the NSF award list

- `pub_info_<year1>_<year2>` table
    - first_name: first name of funded authors
    - middle_name: middle name of funded authors
    - last_name: last name of funded authors
    - email: email address of funded authors
    - Title: title of the publication (paper)
    - Year: year of publication
    - Cited by: total number of citations
    - Paper URL: url that leads to paper details (e.g., abstract) after one click
    - Authors: all authors of the publication
    - Publication date: specific date of publication
    - Journal: journal of the article
    - Abstract: abstract of funded authors' one specific paper
    - Citations: yearly breakdown of citations (a dictionary)

In [None]:
# Build the publication-info table for `year` unless its CSV is already
# present on disk.
# NOTE(review): the table description above names the table
# `pub_info_<year1>_<year2>`, while this check looks for
# `pub_info_{year}.csv`. Confirm which file name `generate_pub_info_table`
# actually writes; if they differ, this guard never skips the scrape.
pub_info_csv = f"database/pub_info_{year}.csv"
pub_info_is_cached = os.path.exists(pub_info_csv)
if not pub_info_is_cached:
    get_pub_info.generate_pub_info_table(year)