# Air Quality Analysis
## Contributor: Sam Espe
---

This is a self-guided project that explores EPA air quality data.

#### Functions

In [1]:
#Import dependencies

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Reusable helper that cleans a raw EPA CSV export

def clean_data(raw_csv_path):
    """Load a raw EPA daily CSV export and return a tidied DataFrame.

    The function reads the file, drops bookkeeping columns, renames the
    remaining columns to snake-case-style names, parses ``Date`` as a
    datetime, and adds an ``AQI_Concern`` column holding the AQI category
    name for each row.

    Parameters
    ----------
    raw_csv_path : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns that survive the drop, renamed, plus ``AQI_Concern``.
        Rows whose ``Daily_AQI`` is missing get a missing ``AQI_Concern``
        instead of being labelled "Hazardous".

    Raises
    ------
    KeyError
        If one of the columns to drop, ``Date`` or ``DAILY_AQI_VALUE`` is
        absent from the file.
    """
    dropped_columns = [
        "Source", "Site ID", "DAILY_OBS_COUNT", "PERCENT_COMPLETE",
        "AQS_PARAMETER_CODE", "AQS_PARAMETER_DESC", "CBSA_CODE",
        "CBSA_NAME", "COUNTY_CODE", "STATE_CODE", "STATE",
    ]
    # "Date" and "POC" already have their final names, so they are not listed.
    renamed_columns = {
        "Daily Mean PM2.5 Concentration": "Day_Mean_PM25_Conc",
        "UNITS": "Units",
        "DAILY_AQI_VALUE": "Daily_AQI",
        "Site Name": "Site_Name",
        "COUNTY": "County",
        "SITE_LATITUDE": "Latitude",
        "SITE_LONGITUDE": "Longitude",
    }

    cleaned = (
        pd.read_csv(raw_csv_path)
        .drop(columns=dropped_columns)
        .rename(columns=renamed_columns)
    )
    cleaned["Date"] = pd.to_datetime(cleaned["Date"])

    # np.select picks the first matching condition, so the ascending upper
    # bounds reproduce the "<= 50, <= 100, ..." cascade; anything above 300
    # falls through to the default.
    aqi = cleaned["Daily_AQI"]
    labels = np.select(
        [aqi <= 50, aqi <= 100, aqi <= 150, aqi <= 200, aqi <= 300],
        [
            "Good",
            "Moderate",
            "Unhealthy for Sensitive Groups",
            "Unhealthy",
            "Very Unhealthy",
        ],
        default="Hazardous",
    )
    # A missing AQI fails every comparison above and would otherwise be
    # reported as "Hazardous"; blank those rows out instead.
    cleaned["AQI_Concern"] = pd.Series(
        labels, index=cleaned.index, dtype=object
    ).where(aqi.notna())

    return cleaned

In [3]:
# Reusable helper that splits the data by location and keeps the POC with the most data

def data_split(raw_df_name, state, pollutant_name, year):
    """Split a cleaned DataFrame into one DataFrame per monitoring site.

    For every distinct ``Site_Name`` the matching rows are collected. When a
    site reports under more than one ``POC`` value, only the rows of the POC
    with the most rows are kept.

    Parameters
    ----------
    raw_df_name : pandas.DataFrame
        Must contain ``Site_Name`` and ``POC`` columns.
    state, pollutant_name, year
        Interpolated into each dictionary key after the site name.

    Returns
    -------
    dict
        Maps ``"<site>_<state>_<pollutant_name>_<year>"`` to that site's
        DataFrame, in order of first appearance. In ``<site>`` spaces and
        slashes become underscores, and colons, hyphens and periods are
        removed. Rows with a missing ``Site_Name`` are left out because
        they cannot be keyed (previously they raised ``AttributeError``).
        Two site names that sanitise to the same key overwrite each other.

    Raises
    ------
    KeyError
        If ``Site_Name`` or ``POC`` is not a column of ``raw_df_name``.
    """
    # Single-pass equivalent of the chained replacements:
    # " " and "/" -> "_"; ":", "-" and "." are deleted.
    name_cleanup = str.maketrans(" /", "__", ":-.")
    data_dict = {}

    # sort=False keeps sites in order of first appearance; groupby skips
    # missing keys by default, which is what guards against NaN site names.
    for location, location_df in raw_df_name.groupby("Site_Name", sort=False):
        poc_counts = location_df.value_counts("POC")

        # Choose POC with most data
        if len(poc_counts) > 1:
            location_df = location_df.loc[
                location_df["POC"] == poc_counts.idxmax()
            ]

        location_standard_name = str(location).translate(name_cleanup)
        new_name = f"{location_standard_name}_{state}_{pollutant_name}_{year}"
        data_dict[new_name] = location_df

    return data_dict

In [4]:
# Reusable helper that draws the color-coded AQI scatter plot

def scatter_plot_color(df_name, pollutant_name, location_name, year):
    """Show a scatter plot of daily AQI over time, colored by AQI band.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Must provide ``Date`` (x axis) and ``Daily_AQI`` (y axis) columns.
    pollutant_name, location_name, year
        Interpolated into the plot title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    dates = df_name["Date"]
    aqi = df_name["Daily_AQI"]

    # (exclusive lower bound, inclusive upper bound, marker color);
    # None marks an open-ended side of the band.
    bands = (
        (None, 50, 'green'),
        (50, 100, 'yellow'),
        (100, 150, 'orange'),
        (150, 200, 'red'),
        (200, 300, 'purple'),
        (300, None, 'maroon'),
    )

    # Build every masked series first so that the figure is only created
    # once all of the masking has succeeded.
    layers = []
    for lower, upper, shade in bands:
        if lower is None:
            outside_band = aqi > upper
        elif upper is None:
            outside_band = aqi <= lower
        else:
            outside_band = (aqi <= lower) | (aqi > upper)
        layers.append((np.ma.masked_where(outside_band, aqi), shade))

    _, axes = plt.subplots()
    for band_values, shade in layers:
        axes.scatter(dates, band_values, color=shade, marker='.')

    # The freshly created axes are the current axes, so labelling them
    # directly matches what the pyplot-level helpers would do.
    axes.set_xlabel("Date")
    axes.set_ylabel("Air Quality Index")
    axes.set_title(f'{pollutant_name} AQI for {location_name} in {year}')

    plt.show()

#### MN PM 2.5 data for 2022