## Data Cleaning - Pre-Generated Data Files  
### EPA - United States Environmental Protection Agency  

**Link:** [EPA Air Data Download Files](https://aqs.epa.gov/aqsweb/airdata/download_files.html)


In [2]:
import pandas as pd

# Directory the yearly EPA files are read from unless the caller overrides it.
_DEFAULT_DATA_DIR = '../data/raw-data/annual_conc_by_monitor/individual-data'

# (output column, 'Parameter Name' value, 'Sample Duration' value), listed in
# the order the average columns are appended to the result.
_POLLUTANT_SPECS = (
    ('Ozone Average', 'Ozone', '8-HR RUN AVG BEGIN HOUR'),
    ('Carbon Average', 'Carbon monoxide', '8-HR RUN AVG END HOUR'),
    ('Nitrogen Average', 'Nitrogen dioxide (NO2)', '1 HOUR'),
    ('Sulfur Average', 'Sulfur dioxide', '1 HOUR'),
    ('PM2.5 Average', 'PM2.5 - Local Conditions', '24-HR BLK AVG'),
    ('PM10 Average', 'PM10 Total 0-10um STP', '24-HR BLK AVG'),
)


def cleanData(year: str, data_dir: str = _DEFAULT_DATA_DIR) -> pd.DataFrame:
    """Build per-state pollutant averages from one annual monitor file.

    Reads ``annual_conc_by_monitor_{year}.csv`` from ``data_dir`` and, for
    each pollutant / sample-duration pair in ``_POLLUTANT_SPECS``, averages
    the 'Arithmetic Mean' column per 'State Name'.

    Args:
        year: Year used to build the input file name.
        data_dir: Directory containing the yearly CSV files. Pass a
            different directory instead of editing the path in the code.

    Returns:
        A DataFrame with one row per distinct combination of the file's
        'Year' and 'State Name' values, followed by one column per
        pollutant average. A state with no matching rows for a pollutant
        gets NaN in that column, because the averages are left-merged.
    """
    raw = pd.read_csv(f'{data_dir}/annual_conc_by_monitor_{year}.csv', low_memory=False)

    # Keep only the identifying columns (in file order) and collapse the
    # per-monitor rows down to one row per year/state.
    id_cols = [col for col in raw.columns if col in ('Year', 'State Name')]
    result = raw[id_cols].drop_duplicates(ignore_index=True)

    for out_col, parameter, duration in _POLLUTANT_SPECS:
        matching = raw[(raw['Parameter Name'] == parameter)
                       & (raw['Sample Duration'] == duration)]
        # Average of 'Arithmetic Mean' grouped by 'State Name'.
        state_avg = (
            matching.groupby('State Name')['Arithmetic Mean']
            .mean()
            .reset_index(name=out_col)
        )
        result = result.merge(state_avg, on='State Name', how='left')

    return result
    



In [3]:
# Years to process, as strings: '2000' through '2023'.
years = list(map(str, range(2000, 2024)))

In [6]:
# Clean every year's file and stack the results into one table.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all previously accumulated rows on every pass.
yearly_frames = [cleanData(year) for year in years]

# pd.concat raises on an empty list, so fall back to an empty frame when
# there are no years to process.
df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

df.to_csv('../Processed_Data/AQI_Ratings.csv', index=False)