Initial Data Exploration

In [1]:
#Import dependencies

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Read in CSV
raw_data_2022_mn_pm25 = "Data/2022_MN_PM_25.csv"
df_raw_2022_mn_pm25 = pd.read_csv(raw_data_2022_mn_pm25)

In [5]:
# Drop unneeded columns
df_raw_2022_mn_pm25_dropped = df_raw_2022_mn_pm25.drop(columns = ["Source", "Site ID", "DAILY_OBS_COUNT", "PERCENT_COMPLETE", 
                                                                  "AQS_PARAMETER_CODE", "AQS_PARAMETER_DESC", "CBSA_CODE", 
                                                                  "CBSA_NAME", "COUNTY_CODE", "STATE_CODE", "STATE"])
#df_raw_2022_mn_pm25_dropped

In [6]:
# Rename columns
columns = {"Date": "Date", "POC": "POC", "Daily Mean PM2.5 Concentration": "Day_Mean_PM25_Conc", "UNITS": "Units", 
           "DAILY_AQI_VALUE":"Daily_AQI", "Site Name": "Site_Name", "COUNTY": "County", "SITE_LATITUDE": "Latitude",
          "SITE_LONGITUDE": "Longitude"}

df_raw_2022_mn_pm25_rename = df_raw_2022_mn_pm25_dropped.rename(columns = columns)
#df_raw_2022_mn_pm25_rename

In [7]:
# Add column "AQI_Concern" to reflect AQI category names
df_raw_2022_mn_pm25_rename["AQI_Concern"] = np.where(df_raw_2022_mn_pm25_rename["Daily_AQI"] <= 50, "Good", 
                                                     np.where(df_raw_2022_mn_pm25_rename["Daily_AQI"] <= 100, "Moderate", 
                                                              np.where(df_raw_2022_mn_pm25_rename["Daily_AQI"] <= 150, "Unhealthy for Sensitive Groups", 
                                                                       np.where(df_raw_2022_mn_pm25_rename["Daily_AQI"] <= 200, "Unhealthy", 
                                                                                np.where(df_raw_2022_mn_pm25_rename["Daily_AQI"] <= 300, "Very Unhealthy", "Hazardous")))))

In [8]:
# Change Date column to datetime
df_raw_2022_mn_pm25_rename["Date"] = pd.to_datetime(df_raw_2022_mn_pm25_rename["Date"])
df_raw_2022_mn_pm25_rename.dtypes

Date                  datetime64[ns]
POC                            int64
Day_Mean_PM25_Conc           float64
Units                         object
Daily_AQI                      int64
Site_Name                     object
County                        object
Latitude                     float64
Longitude                    float64
AQI_Concern                   object
dtype: object

In [9]:
df_raw_2022_mn_pm25_rename.groupby(["Site_Name"]).count()

Unnamed: 0_level_0,Date,POC,Day_Mean_PM25_Conc,Units,Daily_AQI,County,Latitude,Longitude,AQI_Concern
Site_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Andersen School,458,458,458,458,458,458,458,458,458
Anoka County Airport,468,468,468,468,468,468,468,468,468
Apple Valley,491,491,491,491,491,491,491,491,491
B.F. Pearson School,355,355,355,355,355,355,355,355,355
Ben Franklin School,359,359,359,359,359,359,359,359,359
Boundary Waters,413,413,413,413,413,413,413,413,413
Brainerd Lakes Regional Airport,343,343,343,343,343,343,343,343,343
FWS Wetland Management District,338,338,338,338,338,338,338,338,338
Fond du Lac Band,356,356,356,356,356,356,356,356,356
Grand Portage Band,335,335,335,335,335,335,335,335,335


In [None]:
# Look at Anoka separately

anoka_2022_pm25 = df_raw_2022_mn_pm25_rename.loc[df_raw_2022_mn_pm25_rename["Site_Name"] == "Anoka County Airport"]
anoka_2022_pm25

In [None]:
anoka_2022_pm25.loc[anoka_2022_pm25["Date"] == "2022-01-02"]

In [None]:
anoka_2022_pm25_daily = anoka_2022_pm25[anoka_2022_pm25["POC"] == 3]
anoka_2022_pm25_daily

In [None]:
# Graph Anoka County Airport AQI
plt.figure(figsize=(10, 6))
x_values = anoka_2022_pm25_daily["Date"]
y_values = anoka_2022_pm25_daily["Daily_AQI"]
plt.scatter(x_values, y_values, linewidth = 0.5, marker = '.')

plt.title("Anoka County Airport Air Quality Index")
plt.xlabel("Date")
plt.ylabel("Air Quality Index")


plt.show()

In [None]:
# Make color-coded scatter plot for Anoka County Airport
moderate_bound = 50
unhealthy_for_special_groups_bound = 100
unhealthy_bound = 150
very_unhealthy_bound = 200
hazardous_bound = 300

x = anoka_2022_pm25_daily["Date"]
y = anoka_2022_pm25_daily["Daily_AQI"]

good = np.ma.masked_where(y > moderate_bound, y)
moderate = np.ma.masked_where((y <= moderate_bound) | (y > unhealthy_for_special_groups_bound), y)
unhealthy_special_groups = np.ma.masked_where((y <= unhealthy_for_special_groups_bound) | (y > unhealthy_bound), y)
unhealthy = np.ma.masked_where((y <= unhealthy_bound) | (y > very_unhealthy_bound), y)
very_unhealthy = np.ma.masked_where((y <= very_unhealthy_bound) | (y > hazardous_bound), y)
hazardous = np.ma.masked_where(y <= hazardous_bound, y)

fig, ax = plt.subplots()
ax.scatter(x, good, color = 'green', marker = '.')
ax.scatter(x, moderate, color = 'yellow', marker = '.')
ax.scatter(x, unhealthy_special_groups, color = 'orange', marker = '.')
ax.scatter(x, unhealthy, color = 'red', marker = '.')
ax.scatter(x, very_unhealthy, color = 'purple', marker = '.')
ax.scatter(x, hazardous, color = 'maroon', marker = '.')

plt.xlabel("Date")
plt.ylabel("Air Quality Index")
plt.title("AQI for Anoka County Airport in 2022")

plt.show()

In [10]:
# Generalize the color-coded scatter plot process

def scatter_plot_color(df_name, pollutant_name, location_name, year):
    moderate_bound = 50
    unhealthy_for_special_groups_bound = 100
    unhealthy_bound = 150
    very_unhealthy_bound = 200
    hazardous_bound = 300

    x_values = df_name["Date"]
    y_values = df_name["Daily_AQI"]
    
    good = np.ma.masked_where(y > moderate_bound, y)
    moderate = np.ma.masked_where((y <= moderate_bound) | (y > unhealthy_for_special_groups_bound), y)
    unhealthy_special_groups = np.ma.masked_where((y <= unhealthy_for_special_groups_bound) | (y > unhealthy_bound), y)
    unhealthy = np.ma.masked_where((y <= unhealthy_bound) | (y > very_unhealthy_bound), y)
    very_unhealthy = np.ma.masked_where((y <= very_unhealthy_bound) | (y > hazardous_bound), y)
    hazardous = np.ma.masked_where(y <= hazardous_bound, y)
    
    fig, ax = plt.subplots()
    ax.scatter(x, good, color = 'green', marker = '.')
    ax.scatter(x, moderate, color = 'yellow', marker = '.')
    ax.scatter(x, unhealthy_special_groups, color = 'orange', marker = '.')
    ax.scatter(x, unhealthy, color = 'red', marker = '.')
    ax.scatter(x, very_unhealthy, color = 'purple', marker = '.')
    ax.scatter(x, hazardous, color = 'maroon', marker = '.')

    plt.xlabel("Date")
    plt.ylabel("Air Quality Index")
    plt.title(f'{pollutant_name} AQI for {location_name} in {year}')
    
    plt.show()

In [None]:
# Test function on Andersen School data

andersen_df_pm25_2022 = df_raw_2022_mn_pm25_rename.loc[df_raw_2022_mn_pm25_rename["Site_Name"] == "Andersen School"]
andersen_df_pm25_2022

In [None]:
andersen_df_pm25_2022.groupby(["POC"]).count()

In [None]:
andersen_df_pm25_2022 = andersen_df_pm25_2022[andersen_df_pm25_2022["POC"] == 3]

In [None]:
scatter_plot_color(andersen_df_pm25_2022, "PM 2.5", "Andersen School", 2022)

In [11]:
# Generalize separating location data and identifying POC with most data

def data_split(raw_df_name, pollutant_name, year):
    # Clear variables
    location_list = []
    location_list_renamed = []
    df_names_list = []
    
    # Create list of unique locations
    location_list = raw_df_name["Site_Name"].unique().tolist()
    
        
    for index in range(len(location_list)):
        # Rename locations for consistency
        location_list[index] = location_list[index].replace(' ', '_').replace('/', "_").replace(":", "")
        # Create names for dataframes    
        df_names_list.append(f"df_{location_list_renamed[index]}_pm25_2022")
        # Create location dataframes
        for index2 in range(len(df_names_list)):
            
        # Group each dataframe by POC

        # Choose POC with most data

SyntaxError: incomplete input (3896899315.py, line 21)

In [None]:
location_0_df = df_raw_2022_mn_pm25_rename[df_raw_2022_mn_pm25_rename["Site_Name"] == location_list[0]]
location_0_df

TESTING TO CREATE FUNCTION

In [19]:
location_list = []
location_list_renamed = []
df_names_list = []

location_list = df_raw_2022_mn_pm25_rename["Site_Name"].unique().tolist()

In [20]:
for index in range(len(location_list)):
    # Rename locations for consistency
    location_list_renamed.append(location_list[index].replace(' ', '_').replace('/', "_").replace(":", ""))
    # Create names for dataframes    
    df_names_list.append(f"df_{location_list_renamed[index]}_pm25_2022")

# location_list_renamed    
df_names_list

['df_Anoka_County_Airport_pm25_2022',
 'df_FWS_Wetland_Management_District_pm25_2022',
 'df_Red_Lake_Nation_pm25_2022',
 'df_Fond_du_Lac_Band_pm25_2022',
 'df_Leech_Lake_Nation_pm25_2022',
 'df_Grand_Portage_Band_pm25_2022',
 'df_Brainerd_Lakes_Regional_Airport_pm25_2022',
 'df_Apple_Valley_pm25_2022',
 'df_Near_Road_I-35_pm25_2022',
 'df_Near_Road_I-35_I-94_pm25_2022',
 'df_Andersen_School_pm25_2022',
 'df_St._Louis_Park_City_Hall_pm25_2022',
 'df_Boundary_Waters_pm25_2022',
 'df_Southwest_Minnesota_Regional_Airport_pm25_2022',
 'df_Ben_Franklin_School_pm25_2022',
 'df_Ramsey_Health_Center_pm25_2022',
 'df_Harding_High_School_pm25_2022',
 'df_Voyageurs_NP_-_Sullivan_Bay_pm25_2022',
 'df_Virginia_City_Hall_pm25_2022',
 'df_U_of_M_-_Duluth_pm25_2022',
 'df_Laura_MacArthur_School_pm25_2022',
 'df_B.F._Pearson_School_pm25_2022',
 'df_Talahi_School_pm25_2022',
 'df_Great_River_Bluffs_pm25_2022',
 'df_St._Michael_Elementary_School_pm25_2022']