## Data Cleaning - Pre-Generated Data Files  
### EPA - United States Environmental Protection Agency  

**Link:** [EPA Air Data Download Files](https://aqs.epa.gov/aqsweb/airdata/download_files.html)


In [20]:
import pandas as pd

In [51]:
import pandas as pd

def cleanData(year: str):
    # change to respective path
    df = pd.read_csv(f'../data/clean-data/annual_conc_by_monitor/individual-data/annual_conc_by_monitor_{year}.csv', low_memory=False)
    
    #Ozone Data Cleaning and Processing

    filtered_Ozone_df = df[(df['Parameter Name'] == 'Ozone') & (df['Sample Duration'] == '8-HR RUN AVG BEGIN HOUR')]

    # Calculating the average of 'Arithmetic Mean' grouped by 'State Code'

    ozone_average_by_state = filtered_Ozone_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='Ozone Average')

    filtered_Sulfur_df = df[(df['Parameter Name'] == 'Sulfur dioxide') & (df['Sample Duration'] == '1 HOUR')]

    #Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
    sulfur_average_by_state = filtered_Sulfur_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='Sulfur Average')

    filtered_Carbon_df = df[(df['Parameter Name'] == 'Carbon monoxide') & (df['Sample Duration'] == '8-HR RUN AVG END HOUR')]

    # Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
    carbon_average_by_state = filtered_Carbon_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='Carbon Average')

    filtered_Nitrogen_df = df[(df['Parameter Name'] == 'Nitrogen dioxide (NO2)') & (df['Sample Duration'] == '1 HOUR')]

    # Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
    nitrogen_average_by_state = filtered_Nitrogen_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='Nitrogen Average')

    filtered_Small_df = df[(df['Parameter Name'] == 'PM2.5 - Local Conditions') & (df['Sample Duration'] == '24-HR BLK AVG')]

    # Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
    small_particulate_average_by_state = filtered_Small_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='PM2.5 Average')

    filtered_df = df[(df['Parameter Name'] == 'PM10 Total 0-10um STP') & (df['Sample Duration'] == '24-HR BLK AVG')]

    # Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
    large_particulate_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='PM10 Average')

    all_cols = df.columns.tolist()
    selected_cols = [   'State Name',
                     "Ozone Average","Carbon Average",	"Nitrogen Average",	
                     "Sulfur Average",	"PM2.5 Average","PM10 Average",]
    
    remove_cols = list(set(all_cols) - set(selected_cols))
    df.drop(remove_cols, axis=1, inplace=True)

    df.drop_duplicates(inplace=True, ignore_index=True)

    df = df.merge(ozone_average_by_state, on='State Name', how='left')
    df = df.merge(carbon_average_by_state, on='State Name', how='left')
    df = df.merge(nitrogen_average_by_state, on='State Name', how='left')
    df = df.merge(sulfur_average_by_state, on='State Name', how='left')
    df = df.merge(small_particulate_average_by_state, on='State Name', how='left')
    df = df.merge(large_particulate_average_by_state, on='State Name', how='left')
    

    #Breakpoints

    df['Carbon Average'].fillna(0, inplace= True)
    df['Nitrogen Average'].fillna(0, inplace= True)
    df['Ozone Average'].fillna(0, inplace= True)
    df['Sulfur Average'].fillna(0, inplace= True)
    df['PM2.5 Average'].fillna(0, inplace= True)
    df['PM10 Average'].fillna(0, inplace= True)


    pm25_breakpoints = {
        (0.0, 9): (0, 50),
        (9.1, 35.4): (51, 100),
        (35.5, 55.4): (101, 150),
        (55.5, 150.4): (151, 200),
        (150.5, 250.4): (201, 300),
        (250.5, 350.4): (301, 400),
        (350.5, 500.4): (401, 500)
    }
    pm10_breakpoints = {
        (0.0, 54.9): (0, 50),
        (55, 154.9): (51, 100),
        (155, 254.9): (101, 150),
        (255, 354.9): (151, 200),
        (355, 424.9): (201, 300),
        (425, 504.9): (301, 400),
        (505, 609): (401, 500)
    }

    no2_breakpoints = {
        (0, 53): (0, 50),
        (54, 100): (51, 100),
        (101, 360): (101, 150),
        (361, 649): (151, 200),
        (650, 1249): (201, 300),
        (1250, 1649): (301, 400),
        (1650, 2049): (401, 500)
    }
    ozone_breakpoints = {
        (0, 54): (0, 50),
        (55, 70): (51, 100),
        (71, 85): (101, 150),
        (86, 105): (151, 200),
        (106, 200): (201, 300)
    }
    carbon_breakpoints = {
        (0, 4.4): (0, 50),
        (4.5, 9.4): (51, 100),
        (9.5, 12.4): (101, 150),
        (12.5, 15.4): (151, 200),
        (15.5, 30.4): (201, 300)
    }

    sulphur_breakpoints = {
        (0, 35.9): (0, 50),
        (36, 76.9): (51, 100),
        (77, 185.9): (101, 150),
        (186, 304.9): (151, 200),
        (305, 604.9): (201, 300)
    }

    def calculate_aqi_from_concentration(concentration, breakpoints):
        for (bp_lo, bp_hi), (aqi_lo, aqi_hi) in breakpoints.items():
            if bp_lo <= concentration <= bp_hi:
                return round(((aqi_hi - aqi_lo) / (bp_hi - bp_lo)) * (concentration - bp_lo) + aqi_lo)
        # Return maximum AQI value otherwise
        return 500

    # Round each concentration to match the breakpoints
    df['Carbon Average'] = df['Carbon Average'].apply(lambda x: round(x, 1))
    df['Nitrogen Average'] = df['Nitrogen Average'].apply(lambda x: round(x))
    df['Ozone Average'] = df['Ozone Average'].apply(lambda x: round(x))
    df['Sulfur Average'] = df['Sulfur Average'].apply(lambda x: round(x))
    df['PM2.5 Average'] = df['PM2.5 Average'].apply(lambda x: round(x, 1))
    df['PM10 Average'] = df['PM10 Average'].apply(lambda x: round(x, 1))


    # Calculate the AQI for each pollutant
    df['Ozone AQI'] = df['Ozone Average'].apply(lambda x: calculate_aqi_from_concentration(x, ozone_breakpoints))
    df['Carbon AQI'] = df['Carbon Average'].apply(lambda x: calculate_aqi_from_concentration(x, carbon_breakpoints))
    df['Nitrogen AQI'] = df['Nitrogen Average'].apply(lambda x: calculate_aqi_from_concentration(x, no2_breakpoints))
    df['Sulfur AQI'] = df['Sulfur Average'].apply(lambda x: calculate_aqi_from_concentration(x, sulphur_breakpoints))
    df['PM2.5 AQI'] = df['PM2.5 Average'].apply(lambda x: calculate_aqi_from_concentration(x, pm25_breakpoints))
    df['PM10 AQI'] = df['PM10 Average'].apply(lambda x: calculate_aqi_from_concentration(x, pm10_breakpoints))


    # The overall AQI for the date would be the maximum AQI value across all pollutants
    df['Overall AQI'] = df[['PM2.5 AQI', 'Nitrogen AQI', 'Ozone AQI', 'PM10 AQI', 'Sulfur AQI','Carbon AQI']].max(axis=1).round().astype(int)






    # change to respective path
    df.to_csv(f'../Processed_Data/filtered_data_{year}.csv', index=False)



In [18]:
# Ozone filteration and average calulation by state

import pandas as pd

df = pd.read_csv(f'/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

filtered_df = df[(df['Parameter Name'] == 'Ozone') & (df['Sample Duration'] == '8-HR RUN AVG BEGIN HOUR')]

# Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
#ozone_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean()

ozone_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='Ozone Average')

#df = df.merge(ozone_average_by_state, on='State Name', how='left')

#df

# Display the result
#print(ozone_average_by_state)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv'

In [19]:
#Sulphur averagecalulcation by state

df = pd.read_csv(f'/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

filtered_df = df[(df['Parameter Name'] == 'Sulfur dioxide') & (df['Sample Duration'] == '1 HOUR')]

# Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
sulfur_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='Sulfur Average')


# Display the result
print(sulfur_average_by_state)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv'

In [52]:
#Carbon Average Calculation by State

df = pd.read_csv(f'/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

filtered_df = df[(df['Parameter Name'] == 'Carbon monoxide') & (df['Sample Duration'] == '8-HR RUN AVG END HOUR')]

# Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
carbon_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='Carbon Average')

# Display the result
print(carbon_average_by_state)

              State Name  Carbon Average
0                Alabama        0.179913
1                 Alaska        0.325719
2                Arizona        0.287274
3               Arkansas        0.420219
4             California        0.309614
5               Colorado        0.262584
6            Connecticut        0.261225
7   District Of Columbia        0.325275
8                Florida        0.278191
9                Georgia        0.472353
10                Hawaii        0.113790
11                 Idaho        0.193458
12              Illinois        0.233717
13               Indiana        0.315926
14                  Iowa        0.217986
15                Kansas        0.143523
16              Kentucky        0.260309
17             Louisiana        0.296406
18                 Maine        0.178426
19              Maryland        0.209562
20         Massachusetts        0.287923
21              Michigan        0.332531
22             Minnesota        0.314648
23           Mis

In [53]:
#Nitrogen Average Calculation by State

df = pd.read_csv(f'/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

filtered_df = df[(df['Parameter Name'] == 'Nitrogen dioxide (NO2)') & (df['Sample Duration'] == '1 HOUR')]

# Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
nitrogen_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='Nitrogen Average')

# Display the result
print(nitrogen_average_by_state)

              State Name  Nitrogen Average
0                Alabama         14.857208
1                Arizona         19.716058
2               Arkansas          6.375365
3             California         12.780099
4               Colorado         11.641546
5            Connecticut         15.998629
6               Delaware         13.673875
7   District Of Columbia         16.369841
8                Florida         12.104429
9                Georgia         19.490053
10                Hawaii          6.145184
11                 Idaho         14.984825
12              Illinois         17.692747
13               Indiana         11.464871
14                  Iowa          6.355422
15                Kansas          7.636685
16              Kentucky         11.314929
17             Louisiana          8.641623
18                 Maine          6.726784
19              Maryland         13.653340
20         Massachusetts         11.251638
21              Michigan         14.653077
22         

In [54]:
#PM2.5 Average Calculation by State

df = pd.read_csv(f'/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

filtered_df = df[(df['Parameter Name'] == 'PM2.5 - Local Conditions') & (df['Sample Duration'] == '24-HR BLK AVG')]

# Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
small_particulate_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='PM2.5 Average')

# Display the result
print(small_particulate_average_by_state)

              State Name  PM2.5 Average
0                Alabama       8.370108
1                 Alaska       4.545506
2                Arizona       7.319577
3               Arkansas      11.000141
4             California       7.661331
5               Colorado       6.030074
6            Connecticut       7.549358
7      Country Of Mexico      12.605847
8               Delaware       8.936746
9   District Of Columbia       8.722264
10               Florida       7.057733
11               Georgia      10.016676
12                Hawaii       3.271501
13              Illinois       9.100609
14               Indiana      10.438122
15                  Iowa      10.845811
16                Kansas       8.144290
17              Kentucky       8.498699
18             Louisiana       9.032349
19                 Maine       6.166939
20              Maryland       7.039947
21         Massachusetts       7.023527
22              Michigan      10.817855
23             Minnesota       7.864462


In [55]:
#PM10 Average Calculation by State

df = pd.read_csv(f'/Users/rahib/ctp_data/annual_conc_by_monitor_2023.csv', low_memory=False)

filtered_df = df[(df['Parameter Name'] == 'PM10 Total 0-10um STP') & (df['Sample Duration'] == '24-HR BLK AVG')]


# Calculating the average of 'Arithmetic Mean' grouped by 'State Code'
large_particulate_average_by_state = filtered_df.groupby('State Name')['Arithmetic Mean'].mean().reset_index(name='PM10 Average')

# Display the result
print(large_particulate_average_by_state)

              State Name  PM10 Average
0                Alabama     19.918663
1                 Alaska     11.625622
2                Arizona     32.005115
3             California     25.249740
4               Colorado     18.993431
5            Connecticut     14.896639
6      Country Of Mexico     54.398329
7   District Of Columbia     17.661729
8                Florida     18.530195
9                Georgia     18.415520
10                Hawaii     14.537993
11                 Idaho     45.367379
12              Illinois     29.173494
13               Indiana     21.001673
14                  Iowa     34.920504
15                Kansas     24.712202
16              Kentucky     20.314208
17             Louisiana     19.700003
18                 Maine     14.365160
19         Massachusetts     13.733037
20              Michigan     20.087767
21             Minnesota     21.764107
22           Mississippi     20.474926
23              Missouri     24.652847
24               Montana 

In [56]:
df = df.merge(ozone_average_by_state, on='State Name', how='left')
df = df.merge(carbon_average_by_state, on='State Name', how='left')
df = df.merge(nitrogen_average_by_state, on='State Name', how='left')
df = df.merge(sulfur_average_by_state, on='State Name', how='left')
df = df.merge(small_particulate_average_by_state, on='State Name', how='left')
df = df.merge(large_particulate_average_by_state, on='State Name', how='left')
df

Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration,...,County Name,City Name,CBSA Name,Date of Last Change,Ozone Average,Carbon Average,Nitrogen Average,Sulfur Average,PM2.5 Average,PM10 Average
0,1,3,10,44201,1,30.497478,-87.880258,NAD83,Ozone,1 HOUR,...,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-05-24,0.043222,0.179913,14.857208,1.951925,8.370108,19.918663
1,1,3,10,44201,1,30.497478,-87.880258,NAD83,Ozone,8-HR RUN AVG BEGIN HOUR,...,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-05-24,0.043222,0.179913,14.857208,1.951925,8.370108,19.918663
2,1,3,10,44201,1,30.497478,-87.880258,NAD83,Ozone,8-HR RUN AVG BEGIN HOUR,...,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-05-24,0.043222,0.179913,14.857208,1.951925,8.370108,19.918663
3,1,3,10,44201,1,30.497478,-87.880258,NAD83,Ozone,8-HR RUN AVG BEGIN HOUR,...,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-05-24,0.043222,0.179913,14.857208,1.951925,8.370108,19.918663
4,1,3,10,88101,3,30.497478,-87.880258,NAD83,PM2.5 - Local Conditions,1 HOUR,...,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-08-06,0.043222,0.179913,14.857208,1.951925,8.370108,19.918663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77955,80,26,6,88101,21,31.291293,-110.951513,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,SONORA,,,2024-08-07,0.042174,,,,12.605847,54.398329
77956,80,26,8012,44201,1,32.466389,-114.768611,WGS84,Ozone,1 HOUR,...,SONORA,,,2024-05-24,0.042174,,,,12.605847,54.398329
77957,80,26,8012,44201,1,32.466389,-114.768611,WGS84,Ozone,8-HR RUN AVG BEGIN HOUR,...,SONORA,,,2024-05-24,0.042174,,,,12.605847,54.398329
77958,80,26,8012,44201,1,32.466389,-114.768611,WGS84,Ozone,8-HR RUN AVG BEGIN HOUR,...,SONORA,,,2024-05-24,0.042174,,,,12.605847,54.398329


In [57]:
all_cols = df.columns.tolist()
selected_cols = [   'State Name',
                     "Ozone Average","Carbon Average",	"Nitrogen Average",	
                     "Sulfur Average",	"PM2.5 Average","PM10 Average",]
    
remove_cols = list(set(all_cols) - set(selected_cols))
df.drop(remove_cols, axis=1, inplace=True)

df.drop_duplicates(inplace=True, ignore_index=True)


In [58]:
df.drop_duplicates(inplace=True, ignore_index=True)
df

Unnamed: 0,State Name,Ozone Average,Carbon Average,Nitrogen Average,Sulfur Average,PM2.5 Average,PM10 Average
0,Alabama,0.043222,0.179913,14.857208,1.951925,8.370108,19.918663
1,Alaska,0.028366,0.325719,,2.182044,4.545506,11.625622
2,Arizona,0.049037,0.287274,19.716058,1.755658,7.319577,32.005115
3,Arkansas,0.041512,0.420219,6.375365,0.741973,11.000141,
4,California,0.042853,0.309614,12.780099,0.654154,7.661331,25.24974
5,Colorado,0.049375,0.262584,11.641546,2.059463,6.030074,18.993431
6,Connecticut,0.044706,0.261225,15.998629,0.245845,7.549358,14.896639
7,Delaware,0.044349,,13.673875,0.420791,8.936746,
8,District Of Columbia,0.039905,0.325275,16.369841,0.655256,8.722264,17.661729
9,Florida,0.038653,0.278191,12.104429,1.574574,7.057733,18.530195


In [62]:
#Breakpoints

df['Carbon Average'].fillna(0, inplace= True)
df['Nitrogen Average'].fillna(0, inplace= True)
df['Ozone Average'].fillna(0, inplace= True)
df['Sulfur Average'].fillna(0, inplace= True)
df['PM2.5 Average'].fillna(0, inplace= True)
df['PM10 Average'].fillna(0, inplace= True)


pm25_breakpoints = {
    (0.0, 9): (0, 50),
    (9.1, 35.4): (51, 100),
    (35.5, 55.4): (101, 150),
    (55.5, 150.4): (151, 200),
    (150.5, 250.4): (201, 300),
    (250.5, 350.4): (301, 400),
    (350.5, 500.4): (401, 500)
}
pm10_breakpoints = {
    (0.0, 54.9): (0, 50),
    (55, 154.9): (51, 100),
    (155, 254.9): (101, 150),
    (255, 354.9): (151, 200),
    (355, 424.9): (201, 300),
    (425, 504.9): (301, 400),
    (505, 609): (401, 500)
}

no2_breakpoints = {
    (0, 53): (0, 50),
    (54, 100): (51, 100),
    (101, 360): (101, 150),
    (361, 649): (151, 200),
    (650, 1249): (201, 300),
    (1250, 1649): (301, 400),
    (1650, 2049): (401, 500)
}
ozone_breakpoints = {
    (0, 54): (0, 50),
    (55, 70): (51, 100),
    (71, 85): (101, 150),
    (86, 105): (151, 200),
    (106, 200): (201, 300)
}
carbon_breakpoints = {
    (0, 4.4): (0, 50),
    (4.5, 9.4): (51, 100),
    (9.5, 12.4): (101, 150),
    (12.5, 15.4): (151, 200),
    (15.5, 30.4): (201, 300)
}

sulphur_breakpoints = {
    (0, 35.9): (0, 50),
    (36, 76.9): (51, 100),
    (77, 185.9): (101, 150),
    (186, 304.9): (151, 200),
    (305, 604.9): (201, 300)
}

def calculate_aqi_from_concentration(concentration, breakpoints):
    for (bp_lo, bp_hi), (aqi_lo, aqi_hi) in breakpoints.items():
        if bp_lo <= concentration <= bp_hi:
            return round(((aqi_hi - aqi_lo) / (bp_hi - bp_lo)) * (concentration - bp_lo) + aqi_lo)
    # Return maximum AQI value otherwise
    return 500

# Round each concentration to match the breakpoints
df['Carbon Average'] = df['Carbon Average'].apply(lambda x: round(x, 1))
df['Nitrogen Average'] = df['Nitrogen Average'].apply(lambda x: round(x))
df['Ozone Average'] = df['Ozone Average'].apply(lambda x: round(x,100))
df['Sulfur Average'] = df['Sulfur Average'].apply(lambda x: round(x))
df['PM2.5 Average'] = df['PM2.5 Average'].apply(lambda x: round(x, 1))
df['PM10 Average'] = df['PM10 Average'].apply(lambda x: round(x, 1))


# Calculate the AQI for each pollutant
df['Ozone AQI'] = df['Ozone Average'].apply(lambda x: calculate_aqi_from_concentration(x, ozone_breakpoints))
df['Carbon AQI'] = df['Carbon Average'].apply(lambda x: calculate_aqi_from_concentration(x, carbon_breakpoints))
df['Nitrogen AQI'] = df['Nitrogen Average'].apply(lambda x: calculate_aqi_from_concentration(x, no2_breakpoints))
df['Sulfur AQI'] = df['Sulfur Average'].apply(lambda x: calculate_aqi_from_concentration(x, sulphur_breakpoints))
df['PM2.5 AQI'] = df['PM2.5 Average'].apply(lambda x: calculate_aqi_from_concentration(x, pm25_breakpoints))
df['PM10 AQI'] = df['PM10 Average'].apply(lambda x: calculate_aqi_from_concentration(x, pm10_breakpoints))


df 
# The overall AQI for the date would be the maximum AQI value across all pollutants
df['Overall AQI'] = df[['PM2.5 AQI', 'Nitrogen AQI', 'Ozone AQI', 'PM10 AQI', 'Sulfur AQI','Carbon AQI']].max(axis=1).round().astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Carbon Average'].fillna(0, inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Ozone Average'].fillna(0, inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

In [63]:
df

Unnamed: 0,State Name,Ozone Average,Carbon Average,Nitrogen Average,Sulfur Average,PM2.5 Average,PM10 Average,Ozone AQI,Carbon AQI,Nitrogen AQI,Sulfur AQI,PM2.5 AQI,PM10 AQI,Overall AQI
0,Alabama,0,0.2,15,2,8.4,19.9,0,2,14,3,47,18,47
1,Alaska,0,0.3,0,2,4.5,11.6,0,3,0,3,25,11,25
2,Arizona,0,0.3,20,2,7.3,32.0,0,3,19,3,41,29,41
3,Arkansas,0,0.4,6,1,11.0,0.0,0,5,6,1,55,0,55
4,California,0,0.3,13,1,7.7,25.2,0,3,12,1,43,23,43
5,Colorado,0,0.3,12,2,6.0,19.0,0,3,11,3,33,17,33
6,Connecticut,0,0.3,16,0,7.5,14.9,0,3,15,0,42,14,42
7,Delaware,0,0.0,14,0,8.9,0.0,0,0,13,0,49,0,49
8,District Of Columbia,0,0.3,16,1,8.7,17.7,0,3,15,1,48,16,48
9,Florida,0,0.3,12,2,7.1,18.5,0,3,11,3,39,17,39


In [52]:
#Define the years as strings
years = ["2004"]

In [53]:
#Clean all the data.
for year in years:
    cleanData(year)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Carbon Average'].fillna(0, inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Nitrogen Average'].fillna(0, inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values