# 0. USA-NPN Data Processing

In [1]:
import pandas as pd

In [2]:
# Load the raw USA-NPN status/intensity observations; -9999 entries are read as NaN
usa_npn = pd.read_csv("../data/raw/USA-NPN_status_intensity_observations_data.csv", na_values=[-9999.00])

In [3]:
def format_df(df):
    """Return the first bloom observation per species, site and year.

    A bloom observation is a row whose ``Phenophase_Status`` equals 1. For
    every (``Species_ID``, ``Site_ID``, ``Year``) combination the row with the
    smallest ``Day_of_Year`` is kept; ties resolve to the row that appears
    first in ``df``.

    Args:
        df: Observations with at least the columns ``Species_ID``,
            ``Site_ID``, ``Observation_Date``, ``Day_of_Year`` and
            ``Phenophase_Status``. The frame is left untouched.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index whose columns are
        ``location`` (a copy of ``Site_ID``), ``Year`` (taken from
        ``Observation_Date``) and then the input columns without
        ``Phenophase_Status``. Rows are ordered by first appearance of each
        species/site pair in ``df`` and, within a pair, by first appearance
        of each year. The result is empty (columns preserved) when there is
        no bloom observation.
    """
    # Work on a private copy: callers usually pass a slice of a larger frame,
    # and adding columns in place would both mutate their data and make a
    # second call fail because the columns already exist.
    obs = df.copy()

    # Cast Observation_Date column as datetime
    obs['Observation_Date'] = pd.to_datetime(obs['Observation_Date'])

    # Extract the year and create a new column
    obs.insert(0, "Year", obs['Observation_Date'].dt.year)

    # Add the location name
    obs.insert(0, "location", obs['Site_ID'])

    # Collect one single-row frame per group and concatenate once at the end;
    # growing a DataFrame inside the loop copies it on every iteration.
    first_blooms = []

    # sort=False keeps groups in order of first appearance; rows whose key is
    # missing are skipped by groupby.
    for _, site_obs in obs.groupby(['Species_ID', 'Site_ID'], sort=False):
        for _, year_obs in site_obs.groupby('Year', sort=False):
            blooming = year_obs[year_obs['Phenophase_Status'] == 1]
            if blooming.empty:
                continue

            # Stable sort so that equal Day_of_Year values keep input order
            first_blooms.append(
                blooming.sort_values(by='Day_of_Year', kind='mergesort').head(1)
            )

    if first_blooms:
        full_bloom_df = pd.concat(first_blooms)
    else:
        # No bloom at all: return an empty frame that still has the columns
        full_bloom_df = obs.iloc[0:0]

    # Drop unnecessary column
    full_bloom_df = full_bloom_df.drop(columns=['Phenophase_Status'])

    return full_bloom_df.reset_index(drop=True)

In [8]:
# Get NYC-specific data: observations of species 228 at site 32789.
# Rows and columns are selected in one .loc call and copied, so format_df
# receives an independent frame rather than a slice of usa_npn.
nyc_df = usa_npn.loc[
    (usa_npn['Site_ID'] == 32789) & (usa_npn['Species_ID'] == 228),
    ['Species_ID', 'Site_ID', 'Latitude', 'Longitude', 'Elevation_in_Meters', 'Observation_Date', 'Day_of_Year', 'Phenophase_Status'],
].copy()

# Reduce the observations to the first bloom row per year
nyc_bloom_df = format_df(nyc_df)

# Keep only the output columns, in order, and give them their output names
nyc_bloom_df = nyc_bloom_df[['location', 'Latitude', 'Longitude', 'Elevation_in_Meters', 'Year', 'Observation_Date', 'Day_of_Year']].rename(
    columns={
        'Latitude': 'lat',
        'Longitude': 'long',
        'Elevation_in_Meters': 'alt',
        'Year': 'year',
        'Observation_Date': 'bloom_date',
        'Day_of_Year': 'bloom_doy',
    }
)

# Output the dataframe to CSV
nyc_bloom_df.to_csv("../data/interim/main_locations/validation/nyc.csv", index=False)

In [9]:
# Get USA data: every observation except species 228 at site 32789 (the
# combination exported separately as the NYC data).
# Rows and columns are selected in one .loc call and copied, so format_df
# receives an independent frame rather than a slice of usa_npn.
usa_df = usa_npn.loc[
    (usa_npn['Site_ID'] != 32789) | (usa_npn['Species_ID'] != 228),
    ['Species_ID', 'Site_ID', 'Latitude', 'Longitude', 'Elevation_in_Meters', 'Observation_Date', 'Day_of_Year', 'Phenophase_Status'],
].copy()

# Reduce the observations to the first bloom row per species, site and year
usa_bloom_df = format_df(usa_df)

# Keep only the output columns, in order, and give them their output names
usa_bloom_df = usa_bloom_df[['location', 'Latitude', 'Longitude', 'Elevation_in_Meters', 'Year', 'Observation_Date', 'Day_of_Year']].rename(
    columns={
        'Latitude': 'lat',
        'Longitude': 'long',
        'Elevation_in_Meters': 'alt',
        'Year': 'year',
        'Observation_Date': 'bloom_date',
        'Day_of_Year': 'bloom_doy',
    }
)

# Output the dataframe to CSV
usa_bloom_df.to_csv("../data/raw/usa_formatted.csv", index=False)