In [4]:
import pandas as pd
import utils
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Read the combined, cleaned XML export and add the location column to it.
xml_df = utils.add_location_column(
    utils.load_dataset(
        "C:/workspace/NSF_Vitor_24/raw_data/cleaned_xml_combined_08112024.csv"
    )
)

# Show each distinct location on its own line.
unique_locations = utils.get_unique_locations(xml_df)
print("Unique locations in the dataset:")
for location in unique_locations:
    print(location)

# Keep only the rows for the city "NEW YORK" and display them.
nyc_df = utils.filter_by_city(xml_df, "NEW YORK")
print("Grants for institutions in NEW YORK:", nyc_df, sep="\n")

# List the distinct schools found among the New York City rows.
unique_schools_nyc = utils.get_unique_schools(nyc_df)
print("Unique schools in NEW YORK:")
for school in unique_schools_nyc:
    print(school)

# Cities treated as the Albany region.
# NOTE(review): the location listing printed by this cell shows mixed-case
# city spellings (e.g. "PITTSBURGH" and "Pittsburgh"); confirm that
# utils.filter_by_region_cities matches case-insensitively, otherwise rows
# with differently-cased city names would be missed.
albany_region_cities = [
    "ALBANY",
    "TROY",
    "SCHENECTADY",
    "LOUDONVILLE",
    "SARATOGA SPRINGS",
]

# Keep only the rows located in one of those cities and display them.
albany_region_df = utils.filter_by_region_cities(xml_df, albany_region_cities)
print("Grants for institutions in the Albany region:", albany_region_df, sep="\n")

# List the distinct schools found among the Albany-region rows.
unique_schools_albany_region = utils.get_unique_schools(albany_region_df)
print("Unique schools in the Albany region:")
for school in unique_schools_albany_region:
    print(school)

# Filter data for RPI
rpi_df = utils.filter_by_institution_name(xml_df, "Rensselaer Polytechnic Institute")
print("Grants for Rensselaer Polytechnic Institute (RPI):")
print(rpi_df)

# Get unique investigators for RPI.
# The values are read straight from the Investigator1_FullName column (the
# same column used for sorting below). The earlier code called
# utils.get_unique_schools() here, which -- judging by its name and its other
# uses in this script -- lists schools, not investigators, so the output did
# not match the label printed on the next line. Missing names are dropped.
unique_investigators = rpi_df["Investigator1_FullName"].dropna().unique()
print("Unique Investigator1_FullName for Rensselaer Polytechnic Institute (RPI):")
print(unique_investigators)

# Count grants per investigator (the sort below relies on a GrantCount column;
# presumably this helper adds it -- verify against utils).
rpi_df = utils.count_grants_per_investigator(rpi_df)

# Sort data for RPI: institution ascending, grant count descending, then
# investigator name and award effective date ascending.
rpi_df_sorted = utils.sort_by_columns(
    rpi_df,
    ['Institution_Name', 'GrantCount', 'Investigator1_FullName', 'AwardEffectiveDate'],
    [True, False, True, True],
)
print("Sorted grants for Rensselaer Polytechnic Institute (RPI):")
print(rpi_df_sorted)

# Save sorted RPI data to CSV
utils.save_to_csv(rpi_df_sorted, "C:/workspace/nsf-award-extract/Data/sorted_grants_rpi.csv")

# Filter and combine data for Albany region and specified schools.
# The names below are joined with "|" and handed to
# utils.filter_by_institution_name as a single alternation pattern, so each
# entry must be free of regex metacharacters (presumably the helper does a
# regex "contains" match -- verify against utils). The RPI entry therefore
# uses the plain institution name, the same spelling used for the RPI filter
# earlier in this script; the previous "... (RPI)" spelling would have been
# read as a regex group and could not match that name.
specified_schools = [
    "University at Albany, SUNY",
    "Rensselaer Polytechnic Institute",
    "Union College",
    "Siena College",
    "The College of Saint Rose",
    "Albany College of Pharmacy and Health Sciences",
    "Russell Sage College",
    "Hudson Valley Community College",
    "Albany Law School",
    "Maria College",
    "Skidmore College",
    "SUNY Empire State College",
    "Schenectady County Community College",
    "Clarkson University Capital Region Campus",
    "Excelsior College",
]
specified_schools_df = utils.filter_by_institution_name(xml_df, '|'.join(specified_schools))

# Rows can be selected by both the city filter and the name filter, so drop
# exact duplicates after stacking the two frames.
combined_df = pd.concat([albany_region_df, specified_schools_df]).drop_duplicates()
combined_df = utils.count_grants_per_investigator(combined_df)

# Sort combined data: institution ascending, grant count descending, then
# investigator name and award effective date ascending.
combined_df_sorted = utils.sort_by_columns(
    combined_df,
    ['Institution_Name', 'GrantCount', 'Investigator1_FullName', 'AwardEffectiveDate'],
    [True, False, True, True],
)
print("Sorted grants for specified schools in the Albany region:")
print(combined_df_sorted)

# Save sorted combined data to CSV
utils.save_to_csv(combined_df_sorted, "C:/workspace/nsf-award-extract/Data/sorted_nsf_grants_albany_schools.csv")

# Convert the sorted, combined frame from wide to long investigator layout
# (as named by the helper) and display the result.
reshaped_df = utils.reshape_investigator_data(combined_df_sorted)
print("Reshaped data:", reshaped_df, sep="\n")

# Write the reshaped frame out as CSV.
utils.save_to_csv(
    reshaped_df,
    "C:/workspace/nsf-award-extract/Data/reshaped_investigators.csv",
)

Unique locations in the dataset:
NEW YORK, New York
RENO, Nevada
ANN ARBOR, Michigan
MINNEAPOLIS, Minnesota
RALEIGH, North Carolina
Grand Prairie, Texas
SEATTLE, Washington
LA JOLLA, California
EAST LANSING, Michigan
TEMPE, Arizona
BOSTON, Massachusetts
MORGANTOWN, West Virginia
TUCSON, Arizona
PITTSBURGH, Pennsylvania
WASHINGTON, District of Columbia
IRVINE, California
BLACKSBURG, Virginia
DAVIS, California
La Jolla, California
CHARLOTTE, North Carolina
SANTA FE, New Mexico
UNIVERSITY PARK, Pennsylvania
CHAPEL HILL, North Carolina
SAN ANTONIO, Texas
Danville, Virginia
SANTA CRUZ, California
PORTLAND, Oregon
NEWARK, Delaware
ITHACA, New York
PHILADELPHIA, Pennsylvania
POTSDAM, New York
NORTHAMPTON, Massachusetts
LOGAN, Utah
WOODS HOLE, Massachusetts
DURHAM, North Carolina
TALLAHASSEE, Florida
KNOXVILLE, Tennessee
MACY, Nebraska
Pittsburgh, Pennsylvania
PROVO, Utah
BLOOMINGTON, Indiana
Berkeley, California
CORVALLIS, Oregon
ROCHESTER, New York
WILLIAMSPORT, Pennsylvania
Brooklyn, New Yo