## Data Cleaning - Pre-Generated Data Files  
### EPA - United States Environmental Protection Agency  

**Link:** [EPA Air Data Download Files](https://aqs.epa.gov/aqsweb/airdata/download_files.html)


In [13]:
import pandas as pd

In [48]:
def cleanData(year:str):
    """Reduce one year's EPA annual monitor file to New York City rows and save it.

    Reads ``annual_conc_by_monitor_{year}.csv`` from the raw-data folder,
    keeps only rows whose state is New York and whose county is one of the
    five NYC counties, keeps only the columns of interest, renames the three
    counties whose borough name differs from the county name, and writes the
    result under the same file name in the clean-data folder.

    Args:
        year: Year suffix used in the input and output file names, e.g. ``'2000'``.

    Returns:
        The cleaned DataFrame that was written to disk.
    """
    df = pd.read_csv(f'../data/raw-data/annual_conc_by_monitor/individual-data/annual_conc_by_monitor_{year}.csv', low_memory=False)

    boro = ['Bronx', 'Kings', 'New York', 'Queens', 'Richmond']
    selected_cols = ['Latitude', 'Longitude', 'Datum', 'Parameter Name', 'Sample Duration', 'Pollutant Standard', 'Metric Used', 'Method Name', 'Year', 'Units of Measure', 'Observation Count', 'Observation Percent', 'Arithmetic Mean', 'Arithmetic Standard Dev', 'Local Site Name', 'Address', 'State Name', 'County Name','City Name']

    # Keep the file's own column order and tolerate selected columns that are
    # absent from the file, exactly as the earlier drop-based version did.
    keep_cols = [col for col in df.columns if col in selected_cols]
    in_nyc = (df['State Name'] == 'New York') & (df['County Name'].isin(boro))
    # .copy() gives an independent frame, so the rename below does not write
    # into a slice of the raw data (avoids SettingWithCopy problems).
    df = df.loc[in_nyc, keep_cols].copy()

    # Bronx and Queens already match their borough names; only these differ.
    borough_names = {
        'Kings': 'Brooklyn',
        'New York': 'Manhattan',
        'Richmond': 'Staten Island',
    }
    df['County Name'] = df['County Name'].replace(borough_names)

    df.to_csv(f'../data/clean-data/annual_conc_by_monitor/individual-data/annual_conc_by_monitor_{year}.csv', index=False)
    return df

In [49]:
#Years to process (2000 through 2007), kept as strings because cleanData takes the year as a str
years = [str(y) for y in range(2000, 2008)]

In [50]:
#Clean every year listed in `years`: each cleanData call reads that year's
#raw CSV and writes the matching cleaned CSV.
for year in years:
    cleanData(year)

In [51]:
def mergeData(start:int, stop:int):
    """Stack the cleaned per-year files for ``start``..``stop`` into one CSV.

    Each cleaned ``annual_conc_by_monitor_{year}.csv`` in the individual-data
    folder is read, its ``Year`` column is set to the year taken from the file
    name, and the frames are concatenated in year order. The result is written
    to the combined-data folder as ``annual_conc_by_monitor_{start}-{stop}.csv``.

    Args:
        start: First year to include.
        stop: Last year to include (inclusive).
    """
    yearly_frames = [
        pd.read_csv(f'../data/clean-data/annual_conc_by_monitor/individual-data/annual_conc_by_monitor_{year}.csv', low_memory=False).assign(Year=year)
        for year in range(start, stop + 1)
    ]

    combined = pd.concat(yearly_frames)
    combined.to_csv(f'../data/clean-data/annual_conc_by_monitor/combined-data/annual_conc_by_monitor_{start}-{stop}.csv', index=False)

    

In [53]:
#Build the combined 2000-2007 file from the per-year cleaned files.
mergeData(start=2000, stop=2007)