## Data Cleaning - Pre-Generated Data Files  
### EPA - United States Environmental Protection Agency  

**Link:** [EPA Air Data Download Files](https://aqs.epa.gov/aqsweb/airdata/download_files.html)


In [1]:
import pandas as pd

def calculate_aqi_from_concentration(concentration, breakpoints):
    """Convert a pollutant concentration to an AQI value.

    ``breakpoints`` maps ``(conc_low, conc_high)`` tuples to
    ``(aqi_low, aqi_high)`` tuples. The first range (in dict order) that
    contains ``concentration`` (bounds inclusive) is used, and the AQI is
    linearly interpolated inside it and rounded with the built-in ``round``.

    Returns 500 when no range contains the concentration; this covers values
    above the table, below it, or in a gap between two ranges.
    """
    for conc_range, aqi_range in breakpoints.items():
        conc_low, conc_high = conc_range
        index_low, index_high = aqi_range
        if not (conc_low <= concentration <= conc_high):
            continue
        # AQI points gained per unit of concentration inside this range.
        slope = (index_high - index_low) / (conc_high - conc_low)
        return round(slope * (concentration - conc_low) + index_low)
    # No matching range: fall back to the maximum AQI value.
    return 500

# Per-pollutant configuration, listed in the order the output columns appear.
# Each entry is:
#   (column label,
#    'Parameter Name' value to select,
#    'Sample Duration' value to select,
#    rounding digits applied to the state average (None -> nearest integer),
#    {(conc_low, conc_high): (aqi_low, aqi_high)} breakpoint table)
# NOTE(review): the ozone breakpoints (0-200) look like parts-per-billion.
# Confirm the ozone 'Arithmetic Mean' in the input file uses the same unit;
# if it is reported in ppm, every ozone average rounds to 0 and its AQI is 0.
_POLLUTANT_SPECS = (
    ('Ozone', 'Ozone', '8-HR RUN AVG BEGIN HOUR', None, {
        (0, 54): (0, 50),
        (55, 70): (51, 100),
        (71, 85): (101, 150),
        (86, 105): (151, 200),
        (106, 200): (201, 300),
    }),
    ('Carbon', 'Carbon monoxide', '8-HR RUN AVG END HOUR', 1, {
        (0, 4.4): (0, 50),
        (4.5, 9.4): (51, 100),
        (9.5, 12.4): (101, 150),
        (12.5, 15.4): (151, 200),
        (15.5, 30.4): (201, 300),
    }),
    ('Nitrogen', 'Nitrogen dioxide (NO2)', '1 HOUR', None, {
        (0, 53): (0, 50),
        (54, 100): (51, 100),
        (101, 360): (101, 150),
        (361, 649): (151, 200),
        (650, 1249): (201, 300),
        (1250, 1649): (301, 400),
        (1650, 2049): (401, 500),
    }),
    ('Sulfur', 'Sulfur dioxide', '1 HOUR', None, {
        (0, 35.9): (0, 50),
        (36, 76.9): (51, 100),
        (77, 185.9): (101, 150),
        (186, 304.9): (151, 200),
        (305, 604.9): (201, 300),
    }),
    ('PM2.5', 'PM2.5 - Local Conditions', '24-HR BLK AVG', 1, {
        (0.0, 9): (0, 50),
        (9.1, 35.4): (51, 100),
        (35.5, 55.4): (101, 150),
        (55.5, 150.4): (151, 200),
        (150.5, 250.4): (201, 300),
        (250.5, 350.4): (301, 400),
        (350.5, 500.4): (401, 500),
    }),
    ('PM10', 'PM10 Total 0-10um STP', '24-HR BLK AVG', 1, {
        (0.0, 54.9): (0, 50),
        (55, 154.9): (51, 100),
        (155, 254.9): (101, 150),
        (255, 354.9): (151, 200),
        (355, 424.9): (201, 300),
        (425, 504.9): (301, 400),
        (505, 609): (401, 500),
    }),
)


def cleanData(year: str) -> pd.DataFrame:
    """Build the per-state pollutant averages and AQI table for one year.

    Reads ``annual_conc_by_monitor_{year}.csv``. For each pollutant in
    ``_POLLUTANT_SPECS`` it averages 'Arithmetic Mean' per 'State Name' over
    the rows matching that pollutant's 'Parameter Name' and 'Sample Duration',
    then converts the rounded average to an AQI with
    ``calculate_aqi_from_concentration``.

    Args:
        year: Year suffix of the input file name, e.g. ``'2015'``.

    Returns:
        A DataFrame with one row per distinct ('Year', 'State Name') pair and
        the columns '<pollutant> Average' (six), '<pollutant> AQI' (six) and
        'Overall AQI' (the row-wise maximum of the six AQI columns).

    A state with no rows for a pollutant gets an average of 0, and therefore
    an AQI of 0, for that pollutant.
    """
    # change to respective path
    raw = pd.read_csv(f'../data/raw-data/annual_conc_by_monitor/individual-data/annual_conc_by_monitor_{year}.csv', low_memory=False)

    # Keep only the identifying columns (in the file's own column order) and
    # collapse them to one row per distinct (Year, State Name) pair.
    key_cols = [col for col in raw.columns if col in ('Year', 'State Name')]
    result = raw[key_cols].drop_duplicates(ignore_index=True)

    # Attach every '<pollutant> Average' column first so all the averages
    # precede the AQI columns in the output.
    for label, parameter, duration, _, _ in _POLLUTANT_SPECS:
        selected = raw[(raw['Parameter Name'] == parameter) & (raw['Sample Duration'] == duration)]
        # Mean of 'Arithmetic Mean' grouped by 'State Name'.
        state_mean = (
            selected.groupby('State Name')['Arithmetic Mean']
            .mean()
            .reset_index(name=f'{label} Average')
        )
        result = result.merge(state_mean, on='State Name', how='left')

    aqi_cols = []
    for label, _, _, ndigits, breakpoints in _POLLUTANT_SPECS:
        avg_col = f'{label} Average'
        aqi_col = f'{label} AQI'

        # Assign the filled column back instead of calling
        # fillna(inplace=True) on a column selection: that chained in-place
        # form is deprecated and does not modify the frame under pandas
        # Copy-on-Write.
        filled = result[avg_col].fillna(0)

        # Round each concentration to the precision of its breakpoint table.
        result[avg_col] = filled.apply(lambda value, nd=ndigits: round(value, nd))

        result[aqi_col] = result[avg_col].apply(
            lambda value, table=breakpoints: calculate_aqi_from_concentration(value, table)
        )
        aqi_cols.append(aqi_col)

    # The overall AQI is the maximum AQI value across all pollutants.
    result['Overall AQI'] = result[aqi_cols].max(axis=1).round().astype(int)

    return result
    



In [2]:
# Years to process, 2000 through 2024 inclusive, as strings.
years = list(map(str, range(2000, 2025)))

In [None]:
# Clean every year's file and stack the per-year tables into one frame.
# The frames are collected first and concatenated once: concatenating inside
# the loop re-copies the accumulated rows on every iteration and starts from
# an empty DataFrame, which pandas deprecates as a concat operand.
yearly_frames = [cleanData(year) for year in years]

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

df.to_csv('../Processed_Data/AQI_Ratings.csv', index=False)

: 