# Data Wrangling

In [16]:
import pandas as pd
import pycountry_convert as pc
import pycountry
import numpy as np

In [17]:
data = pd.read_csv("../data/raw/world_air_quality.csv", sep=';')

data = data.drop(["City", "Location", "Source Name"], axis=1)

data = data.rename(columns={'Last Updated': 'time', 
                     'Country Label': 'countryname',
                     'Value': 'value',
                     'Pollutant': 'pollutant',
                     'Coordinates': 'coordinates',
                     'Unit': 'unit',
                     'Country Code': 'country_code'})

data['time'] = pd.to_datetime(data['time'])
data['time'] = data['time'].dt.to_period('M')
data = data.groupby(['time', 'countryname', 'pollutant', 'unit', 'country_code', 'coordinates']).agg(
    value=('value', 'sum') 
).reset_index()


data.head()

  data['time'] = data['time'].dt.to_period('M')


Unnamed: 0,time,countryname,pollutant,unit,country_code,coordinates,value
0,2014-03,United States,PM2.5,µg/m³,US,"33.688, -84.29",5.0
1,2014-08,United States,PM2.5,µg/m³,US,"37.132, -86.148",7.1
2,2015-08,Viet Nam,PM2.5,µg/m³,VN,"21.048, 105.8",36.9
3,2015-09,Singapore,PM2.5,µg/m³,SG,"1.298, 103.78",282.9
4,2016-02,Chile,CO,µg/m³,CL,"-32.833639, -70.99693",740.7


In [18]:
# Create continent column
def get_continent(country_code):
    try:
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        continent_name = pc.convert_continent_code_to_continent_name(continent_code)
        return continent_name
    except KeyError:
        return np.nan

data['continent'] = data.apply(lambda row: get_continent(row['country_code']), axis=1)
data


Unnamed: 0,time,countryname,pollutant,unit,country_code,coordinates,value,continent
0,2014-03,United States,PM2.5,µg/m³,US,"33.688, -84.29",5.000000,North America
1,2014-08,United States,PM2.5,µg/m³,US,"37.132, -86.148",7.100000,North America
2,2015-08,Viet Nam,PM2.5,µg/m³,VN,"21.048, 105.8",36.900000,Asia
3,2015-09,Singapore,PM2.5,µg/m³,SG,"1.298, 103.78",282.900000,Asia
4,2016-02,Chile,CO,µg/m³,CL,"-32.833639, -70.99693",740.700000,South America
...,...,...,...,...,...,...,...,...
52750,2024-03,United Kingdom,UM003,particles/cm³,GB,"51.27224121233743, 1.0687339793186899",2197.927063,Europe
52751,2024-03,United Kingdom,UM003,particles/cm³,GB,"51.44276, -2.60125",5092.022298,Europe
52752,2024-03,United Kingdom,UM003,particles/cm³,GB,"51.4644277673234, -2.567887786679186",4467.214315,Europe
52753,2024-03,United Kingdom,UM003,particles/cm³,GB,"51.47230908118687, -0.08600623929255025",7021.323934,Europe


In [19]:
data[data['countryname'].isnull()]['country_code'].unique()


array([], dtype=object)

In [20]:
# Impute missing country names from country code
data.loc[(data['country_code'] == 'XK') & (data['countryname'].isnull()), 'countryname'] = 'Kosovo'
data.loc[(data['country_code'] == 'AJ') & (data['countryname'].isnull()), 'countryname'] = 'Azerbaijan'
data.loc[(data['country_code'] == 'CE') & (data['countryname'].isnull()), 'countryname'] = 'Sri Lanka'
data.loc[(data['country_code'] == 'UC') & (data['countryname'].isnull()), 'countryname'] = 'Curacao'
data.loc[(data['country_code'] == 'TI') & (data['countryname'].isnull()), 'countryname'] = 'Tajikistan'
data.loc[(data['country_code'] == 'IZ') & (data['countryname'].isnull()), 'countryname'] = 'Iraq'
data.loc[(data['country_code'] == 'KU') & (data['countryname'].isnull()), 'countryname'] = 'Kuwait'
data.loc[(data['country_code'] == 'BK') & (data['countryname'].isnull()), 'countryname'] = 'Bosnia-Herzegovina'
data.loc[(data['country_code'] == 'VM') & (data['countryname'].isnull()), 'countryname'] = 'Vietnam'
data.loc[(data['country_code'] == 'TX') & (data['countryname'].isnull()), 'countryname'] = 'Turkmenistan'
data.loc[(data['country_code'] == 'KV') & (data['countryname'].isnull()), 'countryname'] = 'Kosovo'
data.loc[(data['country_code'] == 'SU') & (data['countryname'].isnull()), 'countryname'] = 'Sudan'
data.loc[(data['country_code'] == 'CS') & (data['countryname'].isnull()), 'countryname'] = 'Costa Rica'
data.loc[(data['country_code'] == 'CW') & (data['countryname'].isnull()), 'countryname'] = 'Cook Islands'
data.loc[(data['country_code'] == 'AQ') & (data['countryname'].isnull()), 'countryname'] = 'Antartica'

# Check for missing names
data[data['countryname'].isnull()]['country_code'].unique()


array([], dtype=object)

In [21]:
correction_mapping = {
    "Czech Republic": "Czechia",
    "Russian Federation": "Russia",
    "Macedonia, The former Yugoslav Rep. of": "North Macedonia",
    "Taiwan, China": "Taiwan",
    "United States": "United States of America",
    "Lao People's Dem. Rep.": "Laos",
    "Moldova, Republic of": "Moldova",
    "Serbia": "Republic of Serbia",
    "Hong Kong, China": "China",  # Assuming you want to map Hong Kong to China 
    "West Bank and Gaza Strip": "Palestine",  # Assuming mapping to Palestine
    "Viet Nam": "Vietnam"
}

data['countryname'] = data['countryname'].apply(lambda x: correction_mapping.get(x, x))

countries_to_drop = ['Andorra', 'Gibraltar', 'Malta']
filtered_data = data[~data['countryname'].isin(countries_to_drop)]

data = filtered_data
data

Unnamed: 0,time,countryname,pollutant,unit,country_code,coordinates,value,continent
0,2014-03,United States of America,PM2.5,µg/m³,US,"33.688, -84.29",5.000000,North America
1,2014-08,United States of America,PM2.5,µg/m³,US,"37.132, -86.148",7.100000,North America
2,2015-08,Vietnam,PM2.5,µg/m³,VN,"21.048, 105.8",36.900000,Asia
3,2015-09,Singapore,PM2.5,µg/m³,SG,"1.298, 103.78",282.900000,Asia
4,2016-02,Chile,CO,µg/m³,CL,"-32.833639, -70.99693",740.700000,South America
...,...,...,...,...,...,...,...,...
52750,2024-03,United Kingdom,UM003,particles/cm³,GB,"51.27224121233743, 1.0687339793186899",2197.927063,Europe
52751,2024-03,United Kingdom,UM003,particles/cm³,GB,"51.44276, -2.60125",5092.022298,Europe
52752,2024-03,United Kingdom,UM003,particles/cm³,GB,"51.4644277673234, -2.567887786679186",4467.214315,Europe
52753,2024-03,United Kingdom,UM003,particles/cm³,GB,"51.47230908118687, -0.08600623929255025",7021.323934,Europe


In [22]:
# Check for missing names
data[data['continent'].isnull()]['countryname'].unique()

array(['Serbia and Montenegro', 'USSR'], dtype=object)

In [23]:
data.loc[data['countryname'] == 'Antartica', 'continent'] = 'Antartica'
data.loc[data['countryname'] == 'Azerbaijan', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Bosnia-Herzegovina', 'continent'] = 'Europe'
data.loc[data['countryname'] == 'Curacao', 'continent'] = 'South America'
data.loc[data['countryname'] == 'Iraq', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Kosovo', 'continent'] = 'Europe'
data.loc[data['countryname'] == 'Kuwait', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Serbia and Montenegro', 'continent'] = 'Europe'
data.loc[data['countryname'] == 'Sri Lanka', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Turkmenistan', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Tajikistan', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'USSR', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Vietnam', 'continent'] = 'Asia'

data[data['continent'].isnull()]['countryname'].unique()

array([], dtype=object)

In [24]:
data = data.sort_values(by='countryname')
data

Unnamed: 0,time,countryname,pollutant,unit,country_code,coordinates,value,continent
6856,2021-08,Afghanistan,PM2.5,µg/m³,AF,"34.554384, 69.204597",136.000,Asia
6855,2021-08,Afghanistan,PM2.5,µg/m³,AF,"34.535812, 69.190514",-999.000,Asia
33185,2024-03,Algeria,PM2.5,µg/m³,DZ,"36.755798, 3.039114",78.000,Africa
21550,2023-05,Antigua and Barbuda,PM2.5,µg/m³,AG,"36.755798, 3.039114",10.000,North America
31640,2024-02,Argentina,SO2,µg/m³,AR,"-34.883175, -58.682542000000005",0.420,South America
...,...,...,...,...,...,...,...,...
27543,2023-05,Uzbekistan,PM2.5,µg/m³,UZ,"41.3672, 69.2725",14.000,Asia
27542,2023-05,Uzbekistan,O3,ppm,UZ,"41.3672, 69.2725",0.011,Asia
291,2016-11,Vietnam,PM2.5,µg/m³,VN,"21.02177, 105.819002",12.900,Asia
2,2015-08,Vietnam,PM2.5,µg/m³,VN,"21.048, 105.8",36.900,Asia


In [25]:
# Check for pollutant names
data['pollutant'].unique()

array(['PM2.5', 'SO2', 'TEMPERATURE', 'CO', 'NO', 'NO2', 'NOX', 'O3',
       'PM10', 'RELATIVEHUMIDITY', 'UM003', 'PM1', 'BC'], dtype=object)

In [26]:
# Drop observations not pertaining to pollutants
data = data[data['pollutant'] != 'TEMPERATURE']
data = data[data['pollutant'] != 'RELATIVEHUMIDITY']
data = data[data['pollutant'] != 'UM003']
data = data[data['pollutant'] != 'BC']

data['pollutant'].unique()


array(['PM2.5', 'SO2', 'CO', 'NO', 'NO2', 'NOX', 'O3', 'PM10', 'PM1'],
      dtype=object)

In [27]:
# First need to convert all pollutant values into the unit of measurement needed for AQI calculations

def convert_values(pollutant, value, unit):
    # Conversion factors from https://uk-air.defra.gov.uk/assets/documents/reports/cat06/0502160851_Conversion_Factors_Between_ppb_and.pdf
    if pollutant == "O3":
        # Ozone needs to be in ppm
        if unit == 'µg/m³':
            return value / 1.9957 / 1000
        else:
            return value
    elif pollutant == "CO":
        # CO needs to be in ppm
        if unit == 'µg/m³':
            return value / 1.1642
        else:
            return value
    elif pollutant == "SO2":
        # SO2 needs to be in ppb
        if unit == 'µg/m³':
            return value / 2.6609 
        else:
            return value
    elif pollutant == "NO2":
        # NO2 needs to be in ppb
        if unit == 'µg/m³':
            return value / 1.9125 
        else:
            return value
    else:
        return value

data['converted_value'] = data.apply(lambda x: convert_values(x['pollutant'], x['value'], x['unit']), axis=1)

def convert_units(pollutant, unit):
    if pollutant == "O3":
        return "ppm"
    elif pollutant == "CO":
        return "ppm"
    elif pollutant == "SO2":
        return "ppb"
    elif pollutant == "NO2":
        return "ppb"
    else:
        return unit

data['converted_units'] = data.apply(lambda x: convert_units(x['pollutant'], x['unit']), axis=1)




In [28]:
data

Unnamed: 0,time,countryname,pollutant,unit,country_code,coordinates,value,continent,converted_value,converted_units
6856,2021-08,Afghanistan,PM2.5,µg/m³,AF,"34.554384, 69.204597",136.000,Asia,136.000000,µg/m³
6855,2021-08,Afghanistan,PM2.5,µg/m³,AF,"34.535812, 69.190514",-999.000,Asia,-999.000000,µg/m³
33185,2024-03,Algeria,PM2.5,µg/m³,DZ,"36.755798, 3.039114",78.000,Africa,78.000000,µg/m³
21550,2023-05,Antigua and Barbuda,PM2.5,µg/m³,AG,"36.755798, 3.039114",10.000,North America,10.000000,µg/m³
31640,2024-02,Argentina,SO2,µg/m³,AR,"-34.883175, -58.682542000000005",0.420,South America,0.157841,ppb
...,...,...,...,...,...,...,...,...,...,...
27543,2023-05,Uzbekistan,PM2.5,µg/m³,UZ,"41.3672, 69.2725",14.000,Asia,14.000000,µg/m³
27542,2023-05,Uzbekistan,O3,ppm,UZ,"41.3672, 69.2725",0.011,Asia,0.011000,ppm
291,2016-11,Vietnam,PM2.5,µg/m³,VN,"21.02177, 105.819002",12.900,Asia,12.900000,µg/m³
2,2015-08,Vietnam,PM2.5,µg/m³,VN,"21.048, 105.8",36.900,Asia,36.900000,µg/m³


In [29]:
# # Sanity Check for range of value
# data[data['pollutant']== "O3"]['converted_value'].describe()

# PM needs to be in mg/m3

# Only 6 observations that aren't in mg/m3 so we'll drop them
condition = (data['pollutant'] == "PM2.5") & (data['unit'] == 'ppm')

# Drop observations where the condition is met
data = data.drop(data[condition].index)

data[data['pollutant']=="PM2.5"]['unit'].value_counts()

unit
µg/m³    9075
Name: count, dtype: int64

In [30]:
pollutants = ['NO', 'NO2', 'NOX', 'SO2', 'PM2.5', 'CO', 'O3', 'PM10', 'PM1']

#Engineer the Overall AQI Feature 
def aqi_calculation(C, breakpoints):
    for C_low, C_high, I_low, I_high in breakpoints:
        if C_low <= C <= C_high:
            return ((I_high - I_low) / (C_high - C_low)) * (C - C_low) + I_low
    return np.nan
#AQI breakpoints, based on the US EPA System
    #https://aqs.epa.gov/aqsweb/documents/codetables/aqi_breakpoints.html
breakpoints = {
    'PM2.5': [(0.0, 12.0, 0, 50), (12.1, 35.4, 51, 100), (35.5, 55.4, 101, 150), (55.5, 150.4, 151, 200), (150.5, 250.4, 201, 300), (250.5, 350.4, 301, 400), (350.5, 500.4, 401, 500)],
    'PM10': [(0, 54, 0, 50), (55, 154, 51, 100), (155, 254, 101, 150), (255, 354, 151, 200), (355, 424, 201, 300), (425, 504, 301, 400), (505, 604, 401, 500)],
    'CO': [(0.0, 4.4, 0, 50), (4.5, 9.4, 51, 100), (9.5, 12.4, 101, 150), (12.5, 15.4, 151, 200), (15.5, 30.4, 201, 300), (30.5, 40.4, 301, 400), (40.5, 50.4, 401, 500)],
    'SO2': [(0, 35, 0, 50), (36, 75, 51, 100), (76, 185, 101, 150), (186, 304, 151, 200), (305, 604, 201, 300), (605, 804, 301, 400), (805, 1004, 401, 500)],
    'NO2': [(0, 53, 0, 50), (54, 100, 51, 100), (101, 360, 101, 150), (361, 649, 151, 200), (650, 1249, 201, 300), (1250, 1649, 301, 400), (1650, 2049, 401, 500)],
    'O3': [(0.125, 0.164, 101, 150), (0.165, 0.204, 151, 200), (0.205, 0.404, 201, 300), (0.405, 0.504, 301, 400), (0.505, 0.604, 401, 500)],
    'O3_8hr': [(0.0, 0.054, 0, 50), (0.055, 0.07, 51, 100), (0.071, 0.085, 101, 150), (0.086, 0.105, 151, 200)]
}

cat_breakpoints = {
    'PM2.5': [(0.0, 12.0,"Good"), (12.1, 35.4,"Moderate"), (35.5, 55.4,"Unhealthy for Sensitive Groups"), (55.5, 150.4,"Unhealthy"), (150.5, 250.4, "Very Unhealthy"), (250.5, 99999.9,"Hazardous")],
    'PM10': [(0, 54,"Good"), (55, 154,"Moderate"), (155, 254,"Unhealthy for Sensitive Groups"), (255, 354,"Unhealthy"), (355, 424,"Very Unhealthy"), (425, 99999.9, "Hazardous")],
    'CO': [(0.0, 4.4,"Good"), (4.5, 9.4,"Moderate"), (9.5, 12.4,"Unhealthy for Sensitive Groups"), (12.5, 15.4,"Unhealthy"), (15.5, 30.4,"Very Unhealthy"), (30.5, 99999.9,"Hazardous")],
    'SO2': [(0, 35,"Good"), (36, 75,"Moderate"), (76, 185, "Unhealthy for Sensitive Groups"), (186, 304, "Unhealthy"), (305, 604,"Very Unhealthy"), (605, 99999.9, "Hazardous")],
    'NO2': [(0, 53,"Good"), (54, 100,"Moderate"), (101, 360,"Unhealthy for Sensitive Groups"), (361, 649,"Unhealthy"), (650, 1249,"Very Unhealthy"), (1250, 99999.9,"Hazardous")],
    'O3': [(0, 0.124,"Good"), (0.125, 0.164,"Unhealthy for Sensitive Groups"), (0.165, 0.204,"Unhealthy"), (0.205, 0.404, "Very Unhealthy"), (0.405, 99999.9,"Hazardous")],
}

#Engineer the categorical AQI Feature 
def aqi_cat_calculation(C, breakpoints):
    for C_low, C_high, cat in breakpoints:
        if C_low <= C <= C_high:
            return cat


#Calculate AQI based on pollutation present in each row
def calculate_row_aqi(row):
    pollutant = row['pollutant']
    concentration = row['value']
    if pollutant in breakpoints:
        return aqi_calculation(concentration, breakpoints[pollutant])
    else:
        return np.nan

#Calculate AQI category based on pollutation present in each row
def calculate_row_aqi_cat(row):
    pollutant = row['pollutant']
    concentration = row['value']
    if pollutant in breakpoints:
        return aqi_cat_calculation(concentration, cat_breakpoints[pollutant])
    else:
        return np.nan

data['AQI'] = data.apply(calculate_row_aqi, axis=1)
data['AQI_cat'] = data.apply(calculate_row_aqi_cat, axis=1)

data

Unnamed: 0,time,countryname,pollutant,unit,country_code,coordinates,value,continent,converted_value,converted_units,AQI,AQI_cat
6856,2021-08,Afghanistan,PM2.5,µg/m³,AF,"34.554384, 69.204597",136.000,Asia,136.000000,µg/m³,192.564805,Unhealthy
6855,2021-08,Afghanistan,PM2.5,µg/m³,AF,"34.535812, 69.190514",-999.000,Asia,-999.000000,µg/m³,,
33185,2024-03,Algeria,PM2.5,µg/m³,DZ,"36.755798, 3.039114",78.000,Africa,78.000000,µg/m³,162.617492,Unhealthy
21550,2023-05,Antigua and Barbuda,PM2.5,µg/m³,AG,"36.755798, 3.039114",10.000,North America,10.000000,µg/m³,41.666667,Good
31640,2024-02,Argentina,SO2,µg/m³,AR,"-34.883175, -58.682542000000005",0.420,South America,0.157841,ppb,0.600000,Good
...,...,...,...,...,...,...,...,...,...,...,...,...
27543,2023-05,Uzbekistan,PM2.5,µg/m³,UZ,"41.3672, 69.2725",14.000,Asia,14.000000,µg/m³,54.995708,Moderate
27542,2023-05,Uzbekistan,O3,ppm,UZ,"41.3672, 69.2725",0.011,Asia,0.011000,ppm,,Good
291,2016-11,Vietnam,PM2.5,µg/m³,VN,"21.02177, 105.819002",12.900,Asia,12.900000,µg/m³,52.682403,Moderate
2,2015-08,Vietnam,PM2.5,µg/m³,VN,"21.048, 105.8",36.900,Asia,36.900000,µg/m³,104.447236,Unhealthy for Sensitive Groups


In [31]:
# Drop negative values as they are outliers or errors in measurements 
data = data[data['value'] >= 0]
data

Unnamed: 0,time,countryname,pollutant,unit,country_code,coordinates,value,continent,converted_value,converted_units,AQI,AQI_cat
6856,2021-08,Afghanistan,PM2.5,µg/m³,AF,"34.554384, 69.204597",136.000000,Asia,136.000000,µg/m³,192.564805,Unhealthy
33185,2024-03,Algeria,PM2.5,µg/m³,DZ,"36.755798, 3.039114",78.000000,Africa,78.000000,µg/m³,162.617492,Unhealthy
21550,2023-05,Antigua and Barbuda,PM2.5,µg/m³,AG,"36.755798, 3.039114",10.000000,North America,10.000000,µg/m³,41.666667,Good
31640,2024-02,Argentina,SO2,µg/m³,AR,"-34.883175, -58.682542000000005",0.420000,South America,0.157841,ppb,0.600000,Good
28273,2023-08,Argentina,PM2.5,µg/m³,AR,"-35.64627885582698, -59.78680330642245",13.558333,South America,13.558333,µg/m³,54.066882,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...
27543,2023-05,Uzbekistan,PM2.5,µg/m³,UZ,"41.3672, 69.2725",14.000000,Asia,14.000000,µg/m³,54.995708,Moderate
27542,2023-05,Uzbekistan,O3,ppm,UZ,"41.3672, 69.2725",0.011000,Asia,0.011000,ppm,,Good
291,2016-11,Vietnam,PM2.5,µg/m³,VN,"21.02177, 105.819002",12.900000,Asia,12.900000,µg/m³,52.682403,Moderate
2,2015-08,Vietnam,PM2.5,µg/m³,VN,"21.048, 105.8",36.900000,Asia,36.900000,µg/m³,104.447236,Unhealthy for Sensitive Groups


In [32]:
data[data['AQI'].isnull()]


Unnamed: 0,time,countryname,pollutant,unit,country_code,coordinates,value,continent,converted_value,converted_units,AQI,AQI_cat
31619,2024-02,Argentina,CO,µg/m³,AR,"-34.667375, -58.329231",180.000,South America,154.612610,ppm,,Hazardous
31620,2024-02,Argentina,CO,µg/m³,AR,"-34.883175, -58.682542000000005",1000.000,South America,858.958942,ppm,,Hazardous
31621,2024-02,Argentina,NO,µg/m³,AR,"-34.667375, -58.329231",3.000,South America,3.000000,µg/m³,,
31622,2024-02,Argentina,NO,µg/m³,AR,"-34.883175, -58.682542000000005",1.000,South America,1.000000,µg/m³,,
31628,2024-02,Argentina,NOX,µg/m³,AR,"-34.667375, -58.329231",10.000,South America,10.000000,µg/m³,,
...,...,...,...,...,...,...,...,...,...,...,...,...
25737,2023-05,United States of America,O3,ppm,US,"43.02421, -108.3637",0.039,North America,0.039000,ppm,,Good
25738,2023-05,United States of America,O3,ppm,US,"43.0289, -83.669403",0.071,North America,0.071000,ppm,,Good
25732,2023-05,United States of America,O3,ppm,US,"42.9925, -78.7764",0.062,North America,0.062000,ppm,,Good
25722,2023-05,United States of America,O3,ppm,US,"42.86183, -71.878626",0.057,North America,0.057000,ppm,,Good


In [33]:
# Create final data frame of converted columns and rename them as original features

converted_data = data.drop(['country_code', 'unit', 'value'], axis=1)
converted_data = converted_data.rename(columns={
    'converted_value': 'value',
    'converted_units': 'unit' 
    })
converted_data

Unnamed: 0,time,countryname,pollutant,coordinates,continent,value,unit,AQI,AQI_cat
6856,2021-08,Afghanistan,PM2.5,"34.554384, 69.204597",Asia,136.000000,µg/m³,192.564805,Unhealthy
33185,2024-03,Algeria,PM2.5,"36.755798, 3.039114",Africa,78.000000,µg/m³,162.617492,Unhealthy
21550,2023-05,Antigua and Barbuda,PM2.5,"36.755798, 3.039114",North America,10.000000,µg/m³,41.666667,Good
31640,2024-02,Argentina,SO2,"-34.883175, -58.682542000000005",South America,0.157841,ppb,0.600000,Good
28273,2023-08,Argentina,PM2.5,"-35.64627885582698, -59.78680330642245",South America,13.558333,µg/m³,54.066882,Moderate
...,...,...,...,...,...,...,...,...,...
27543,2023-05,Uzbekistan,PM2.5,"41.3672, 69.2725",Asia,14.000000,µg/m³,54.995708,Moderate
27542,2023-05,Uzbekistan,O3,"41.3672, 69.2725",Asia,0.011000,ppm,,Good
291,2016-11,Vietnam,PM2.5,"21.02177, 105.819002",Asia,12.900000,µg/m³,52.682403,Moderate
2,2015-08,Vietnam,PM2.5,"21.048, 105.8",Asia,36.900000,µg/m³,104.447236,Unhealthy for Sensitive Groups


In [34]:
# Save processed data to file
converted_data.to_csv("../data/processed/world_air_quality.csv")

In [35]:
converted_data

Unnamed: 0,time,countryname,pollutant,coordinates,continent,value,unit,AQI,AQI_cat
6856,2021-08,Afghanistan,PM2.5,"34.554384, 69.204597",Asia,136.000000,µg/m³,192.564805,Unhealthy
33185,2024-03,Algeria,PM2.5,"36.755798, 3.039114",Africa,78.000000,µg/m³,162.617492,Unhealthy
21550,2023-05,Antigua and Barbuda,PM2.5,"36.755798, 3.039114",North America,10.000000,µg/m³,41.666667,Good
31640,2024-02,Argentina,SO2,"-34.883175, -58.682542000000005",South America,0.157841,ppb,0.600000,Good
28273,2023-08,Argentina,PM2.5,"-35.64627885582698, -59.78680330642245",South America,13.558333,µg/m³,54.066882,Moderate
...,...,...,...,...,...,...,...,...,...
27543,2023-05,Uzbekistan,PM2.5,"41.3672, 69.2725",Asia,14.000000,µg/m³,54.995708,Moderate
27542,2023-05,Uzbekistan,O3,"41.3672, 69.2725",Asia,0.011000,ppm,,Good
291,2016-11,Vietnam,PM2.5,"21.02177, 105.819002",Asia,12.900000,µg/m³,52.682403,Moderate
2,2015-08,Vietnam,PM2.5,"21.048, 105.8",Asia,36.900000,µg/m³,104.447236,Unhealthy for Sensitive Groups


In [36]:
missing_AQI = converted_data[data['AQI'].isnull()]
missing_AQI

Unnamed: 0,time,countryname,pollutant,coordinates,continent,value,unit,AQI,AQI_cat
31619,2024-02,Argentina,CO,"-34.667375, -58.329231",South America,154.612610,ppm,,Hazardous
31620,2024-02,Argentina,CO,"-34.883175, -58.682542000000005",South America,858.958942,ppm,,Hazardous
31621,2024-02,Argentina,NO,"-34.667375, -58.329231",South America,3.000000,µg/m³,,
31622,2024-02,Argentina,NO,"-34.883175, -58.682542000000005",South America,1.000000,µg/m³,,
31628,2024-02,Argentina,NOX,"-34.667375, -58.329231",South America,10.000000,µg/m³,,
...,...,...,...,...,...,...,...,...,...
25737,2023-05,United States of America,O3,"43.02421, -108.3637",North America,0.039000,ppm,,Good
25738,2023-05,United States of America,O3,"43.0289, -83.669403",North America,0.071000,ppm,,Good
25732,2023-05,United States of America,O3,"42.9925, -78.7764",North America,0.062000,ppm,,Good
25722,2023-05,United States of America,O3,"42.86183, -71.878626",North America,0.057000,ppm,,Good


In [37]:
missing_AQI = converted_data[data['AQI_cat'].isnull()]
missing_AQI

Unnamed: 0,time,countryname,pollutant,coordinates,continent,value,unit,AQI,AQI_cat
31621,2024-02,Argentina,NO,"-34.667375, -58.329231",South America,3.000000,µg/m³,,
31622,2024-02,Argentina,NO,"-34.883175, -58.682542000000005",South America,1.000000,µg/m³,,
31628,2024-02,Argentina,NOX,"-34.667375, -58.329231",South America,10.000000,µg/m³,,
31629,2024-02,Argentina,NOX,"-34.883175, -58.682542000000005",South America,19.000000,µg/m³,,
32337,2024-02,Austria,PM2.5,"48.17435299946658, 16.361417",Europe,12.048000,µg/m³,,
...,...,...,...,...,...,...,...,...,...
24208,2023-05,United States of America,NO,"43.87483, -106.50974",North America,0.000300,ppm,,
24209,2023-05,United States of America,NO,"44.373056, -110.830833",North America,0.000900,ppm,,
24210,2023-05,United States of America,NO,"44.377048, -68.260902",North America,0.000000,ppm,,
24211,2023-05,United States of America,NO,"44.528389, -72.868973",North America,0.000000,ppm,,


I think the NaNs could be due to rounding errors? Once the values are passed through the formulas they become so small that they are essentially zero aka NaN? Maybe a better way to categorize these values is with 'good' 'moderate' 'unhealthy for sensitive groups' etc.? 

https://www.airnow.gov/sites/default/files/2020-05/aqi-technical-assistance-document-sept2018.pdf

https://uk-air.defra.gov.uk/assets/documents/reports/cat06/0502160851_Conversion_Factors_Between_ppb_and.pdf

In [38]:
converted_data[data['AQI'].isnull()]['pollutant'].value_counts()

pollutant
O3       8078
NO       3720
CO       3654
NOX      1969
PM1        74
PM10       31
PM2.5      15
NO2        10
SO2         6
Name: count, dtype: int64