## Data Cleaning - Pre-Generated Data Files  
### EPA - United States Environmental Protection Agency  

**Link:** [EPA Air Data Download Files](https://aqs.epa.gov/aqsweb/airdata/download_files.html)


In [12]:
import pandas as pd

In [2]:
import pandas as pd

def cleanData(year: str, data_dir: str = '/Users/rahib/ctp_data'):
    """Reduce one year's EPA annual-concentration file to the columns and gases of interest.

    Reads ``annual_conc_by_monitor_{year}.csv`` from ``data_dir``, keeps only the
    columns named in ``selected_cols`` (in the order they appear in the file) and
    only the rows whose 'Parameter Name' is one of the entries in
    ``parameter_list``, then writes the result to ``filtered_data_{year}.csv`` in
    the same directory without the DataFrame index.

    Args:
        year: Year token spliced into both file names, e.g. ``"2023"``.
        data_dir: Directory that holds the input CSV and receives the output CSV.
            Defaults to the location the notebook was originally written against;
            pass your own directory instead of editing the function.

    Returns:
        None. The filtered data is written to disk.

    Raises:
        FileNotFoundError: If the input CSV is not present in ``data_dir``.
        KeyError: If the input CSV has no 'Parameter Name' column.
    """
    from pathlib import Path  # local import: keeps the notebook's import cell untouched

    base_dir = Path(data_dir)

    selected_cols = ['Latitude', 'Longitude', 'Datum', 'Parameter Name', 'Sample Duration',
                     'Pollutant Standard', 'Metric Used', 'Method Name', 'Year',
                     'Units of Measure', 'Observation Count', 'Observation Percent',
                     'Arithmetic Mean', 'Arithmetic Standard Dev', 'Local Site Name',
                     'Address', 'State Name', 'County Name', 'City Name']
    wanted = set(selected_cols)

    # Select the columns while parsing instead of loading everything and dropping
    # the rest in place. A callable (rather than the list itself) is used so that
    # a selected column missing from the file is simply skipped, exactly as the
    # previous drop-by-set-difference behaved.
    df = pd.read_csv(
        base_dir / f'annual_conc_by_monitor_{year}.csv',
        usecols=lambda col: col in wanted,
        low_memory=False,
    )

    parameter_list = ["Ozone", "Sulfur dioxide", "Carbon monoxide", "Nitrogen dioxide (NO2)"]
    df = df[df["Parameter Name"].isin(parameter_list)]  # Exclude unwanted parameters

    df.to_csv(base_dir / f'filtered_data_{year}.csv', index=False)



In [13]:
# Ozone: filter the 2023 file and compute the per-state average

df = pd.read_csv('/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

# Keep only the ozone rows reported with the '8-HR RUN AVG BEGIN HOUR' sample duration
filtered_df = df.loc[
    df['Parameter Name'].eq('Ozone')
    & df['Sample Duration'].eq('8-HR RUN AVG BEGIN HOUR')
]

# Average the 'Arithmetic Mean' column within each 'State Name' group
ozone_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean()

# Show the per-state averages
print(ozone_average_by_state)

State Name
Alabama                 0.043222
Alaska                  0.028366
Arizona                 0.049037
Arkansas                0.041512
California              0.042853
Colorado                0.049375
Connecticut             0.044706
Country Of Mexico       0.042174
Delaware                0.044349
District Of Columbia    0.039905
Florida                 0.038653
Georgia                 0.042953
Hawaii                  0.026811
Idaho                   0.046905
Illinois                0.046454
Indiana                 0.044837
Iowa                    0.045449
Kansas                  0.047159
Kentucky                0.044757
Louisiana               0.040354
Maine                   0.036762
Maryland                0.045658
Massachusetts           0.040997
Michigan                0.042968
Minnesota               0.042678
Mississippi             0.044182
Missouri                0.047129
Montana                 0.044070
Nebraska                0.046293
Nevada                  0.047099

In [14]:
# Sulfur dioxide: filter the 2023 file and compute the per-state average

df = pd.read_csv('/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

# Keep only the sulfur dioxide rows reported with the '1 HOUR' sample duration
filtered_df = df.loc[
    df['Parameter Name'].eq('Sulfur dioxide')
    & df['Sample Duration'].eq('1 HOUR')
]

# Average the 'Arithmetic Mean' column within each 'State Name' group
sulfur_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean()

# Show the per-state averages
print(sulfur_average_by_state)

State Name
Alabama                  1.951925
Alaska                   2.182044
Arizona                  1.755658
Arkansas                 0.741973
California               0.654154
Colorado                 2.059463
Connecticut              0.245845
Delaware                 0.420791
District Of Columbia     0.655256
Florida                  1.574574
Georgia                  2.311342
Hawaii                   2.731039
Idaho                    3.881806
Illinois                 1.806726
Indiana                  1.805423
Iowa                     0.808401
Kansas                   0.272587
Kentucky                 1.452255
Louisiana                1.940870
Maine                    0.076555
Maryland                 0.326567
Massachusetts            0.374783
Michigan                 1.540018
Minnesota                0.929310
Mississippi              0.255040
Missouri                 9.891772
Montana                  1.782442
Nebraska                 1.572441
Nevada                   0.666151
New

In [15]:
# Carbon monoxide: filter the 2023 file and compute the per-state average

df = pd.read_csv('/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

# Keep only the carbon monoxide rows reported with the '8-HR RUN AVG END HOUR' sample duration
filtered_df = df.loc[
    df['Parameter Name'].eq('Carbon monoxide')
    & df['Sample Duration'].eq('8-HR RUN AVG END HOUR')
]

# Average the 'Arithmetic Mean' column within each 'State Name' group
carbon_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean()

# Show the per-state averages
print(carbon_average_by_state)

State Name
Alabama                 0.179913
Alaska                  0.325719
Arizona                 0.287274
Arkansas                0.420219
California              0.309614
Colorado                0.262584
Connecticut             0.261225
District Of Columbia    0.325275
Florida                 0.278191
Georgia                 0.472353
Hawaii                  0.113790
Idaho                   0.193458
Illinois                0.233717
Indiana                 0.315926
Iowa                    0.217986
Kansas                  0.143523
Kentucky                0.260309
Louisiana               0.296406
Maine                   0.178426
Maryland                0.209562
Massachusetts           0.287923
Michigan                0.332531
Minnesota               0.314648
Mississippi             0.238401
Missouri                0.271601
Montana                 0.171858
Nebraska                0.238679
Nevada                  0.259397
New Hampshire           0.153393
New Jersey              0.376049

In [16]:
# Nitrogen dioxide: filter the 2023 file and compute the per-state average

df = pd.read_csv('/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

# Keep only the 'Nitrogen dioxide (NO2)' rows reported with the '1 HOUR' sample duration
filtered_df = df.loc[
    df['Parameter Name'].eq('Nitrogen dioxide (NO2)')
    & df['Sample Duration'].eq('1 HOUR')
]

# Average the 'Arithmetic Mean' column within each 'State Name' group
nitrogen_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean()

# Show the per-state averages
print(nitrogen_average_by_state)

State Name
Alabama                 14.857208
Arizona                 19.716058
Arkansas                 6.375365
California              12.780099
Colorado                11.641546
Connecticut             15.998629
Delaware                13.673875
District Of Columbia    16.369841
Florida                 12.104429
Georgia                 19.490053
Hawaii                   6.145184
Idaho                   14.984825
Illinois                17.692747
Indiana                 11.464871
Iowa                     6.355422
Kansas                   7.636685
Kentucky                11.314929
Louisiana                8.641623
Maine                    6.726784
Maryland                13.653340
Massachusetts           11.251638
Michigan                14.653077
Minnesota               10.801407
Mississippi              6.582739
Missouri                13.550632
Montana                  2.950513
Nevada                  20.307559
New Hampshire            3.545907
New Jersey              17.622178
New

In [17]:
# PM2.5: filter the 2023 file and compute the per-state average

df = pd.read_csv('/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

# Keep only the 'PM2.5 - Local Conditions' rows reported with the '24-HR BLK AVG' sample duration
filtered_df = df.loc[
    df['Parameter Name'].eq('PM2.5 - Local Conditions')
    & df['Sample Duration'].eq('24-HR BLK AVG')
]

# Average the 'Arithmetic Mean' column within each 'State Name' group
small_particulate_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean()

# Show the per-state averages
print(small_particulate_average_by_state)

State Name
Alabama                  8.370108
Alaska                   4.545506
Arizona                  7.319577
Arkansas                11.000141
California               7.661331
Colorado                 6.030074
Connecticut              7.549358
Country Of Mexico       12.605847
Delaware                 8.936746
District Of Columbia     8.722264
Florida                  7.057733
Georgia                 10.016676
Hawaii                   3.271501
Illinois                 9.100609
Indiana                 10.438122
Iowa                    10.845811
Kansas                   8.144290
Kentucky                 8.498699
Louisiana                9.032349
Maine                    6.166939
Maryland                 7.039947
Massachusetts            7.023527
Michigan                10.817855
Minnesota                7.864462
Mississippi             10.001916
Missouri                 9.631491
Montana                  7.567665
Nebraska                 7.244999
Nevada                   5.079409
New

In [18]:
# PM10: filter the 2023 file and compute the per-state average

df = pd.read_csv('/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

# Keep only the 'PM10 Total 0-10um STP' rows reported with the '24-HR BLK AVG' sample duration
filtered_df = df.loc[
    df['Parameter Name'].eq('PM10 Total 0-10um STP')
    & df['Sample Duration'].eq('24-HR BLK AVG')
]

# Average the 'Arithmetic Mean' column within each 'State Name' group
large_particulate_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean()

# Show the per-state averages
print(large_particulate_average_by_state)

State Name
Alabama                 19.918663
Alaska                  11.625622
Arizona                 32.005115
California              25.249740
Colorado                18.993431
Connecticut             14.896639
Country Of Mexico       54.398329
District Of Columbia    17.661729
Florida                 18.530195
Georgia                 18.415520
Hawaii                  14.537993
Idaho                   45.367379
Illinois                29.173494
Indiana                 21.001673
Iowa                    34.920504
Kansas                  24.712202
Kentucky                20.314208
Louisiana               19.700003
Maine                   14.365160
Massachusetts           13.733037
Michigan                20.087767
Minnesota               21.764107
Mississippi             20.474926
Missouri                24.652847
Montana                 18.555313
Nebraska                21.063439
Nevada                  21.037694
New Hampshire           11.245287
New Mexico              27.593661
Nor

KeyError: 'id'

In [3]:
# Years to process, kept as strings because each one is spliced into a file name.
years = [
    "2023",
]

In [4]:
# Run cleanData once for every entry in `years`; each call writes that
# year's filtered CSV to disk.
for year in years:
    cleanData(year)