# Data Wrangling

In [1]:
import pandas as pd
import aqi
import pycountry_convert as pc
import pycountry
import numpy as np

In [2]:
data = pd.read_csv("../data/raw/world_air_quality.csv", sep=';')

data = data.drop(["City", "Location", "Source Name"], axis=1)

data = data.rename(columns={'Last Updated': 'time', 
                     'Country Label': 'countryname',
                     'Value': 'value',
                     'Pollutant': 'pollutant',
                     'Coordinates': 'coordinates',
                     'Unit': 'unit',
                     'Country Code': 'country_code'})

data['time'] = pd.to_datetime(data['time']).dt.date
data.head()

Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname
0,JP,"33.880833, 130.873056",NO,ppm,0.002,2024-03-10,Japan
1,JP,"33.898056, 130.81",NO2,ppm,0.005,2024-03-10,Japan
2,JP,"33.895833, 130.935833",NOX,ppm,0.013,2024-03-10,Japan
3,JP,"42.786944, 141.605",NO2,ppm,0.004,2024-03-10,Japan
4,JP,"35.653889, 140.097778",NOX,ppm,0.003,2024-03-10,Japan


In [3]:
# Create continent column
def get_continent(country_code):
    try:
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        continent_name = pc.convert_continent_code_to_continent_name(continent_code)
        return continent_name
    except KeyError:
        return np.nan

data['continent'] = data.apply(lambda row: get_continent(row['country_code']), axis=1)
data


Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent
0,JP,"33.880833, 130.873056",NO,ppm,0.002,2024-03-10,Japan,Asia
1,JP,"33.898056, 130.81",NO2,ppm,0.005,2024-03-10,Japan,Asia
2,JP,"33.895833, 130.935833",NOX,ppm,0.013,2024-03-10,Japan,Asia
3,JP,"42.786944, 141.605",NO2,ppm,0.004,2024-03-10,Japan,Asia
4,JP,"35.653889, 140.097778",NOX,ppm,0.003,2024-03-10,Japan,Asia
...,...,...,...,...,...,...,...,...
54250,IT,"40.64389299999999, 15.872893000000001",CO,µg/m³,295.000,2024-03-11,Italy,Europe
54251,IT,"46.030833, 11.905833",O3,µg/m³,35.000,2024-03-11,Italy,Europe
54252,IT,"43.55472, 10.32972",CO,µg/m³,600.000,2024-03-11,Italy,Europe
54253,IT,"43.91611, 11.006939999999998",PM2.5,µg/m³,15.000,2024-03-11,Italy,Europe


In [4]:
data[data['countryname'].isnull()]['country_code'].unique()


array(['XK', 'AJ', 'CE', 'UC', 'CW', 'TI', 'IZ', 'KU', 'BK', 'VM', 'AQ',
       'TX', 'KV'], dtype=object)

In [5]:
# Impute missing country names from country code
data.loc[(data['country_code'] == 'XK') & (data['countryname'].isnull()), 'countryname'] = 'Kosovo'
data.loc[(data['country_code'] == 'AJ') & (data['countryname'].isnull()), 'countryname'] = 'Azerbaijan'
data.loc[(data['country_code'] == 'CE') & (data['countryname'].isnull()), 'countryname'] = 'Sri Lanka'
data.loc[(data['country_code'] == 'UC') & (data['countryname'].isnull()), 'countryname'] = 'Curacao'
data.loc[(data['country_code'] == 'TI') & (data['countryname'].isnull()), 'countryname'] = 'Tajikistan'
data.loc[(data['country_code'] == 'IZ') & (data['countryname'].isnull()), 'countryname'] = 'Iraq'
data.loc[(data['country_code'] == 'KU') & (data['countryname'].isnull()), 'countryname'] = 'Kuwait'
data.loc[(data['country_code'] == 'BK') & (data['countryname'].isnull()), 'countryname'] = 'Bosnia-Herzegovina'
data.loc[(data['country_code'] == 'VM') & (data['countryname'].isnull()), 'countryname'] = 'Vietnam'
data.loc[(data['country_code'] == 'TX') & (data['countryname'].isnull()), 'countryname'] = 'Turkmenistan'
data.loc[(data['country_code'] == 'KV') & (data['countryname'].isnull()), 'countryname'] = 'Kosovo'
data.loc[(data['country_code'] == 'SU') & (data['countryname'].isnull()), 'countryname'] = 'Sudan'
data.loc[(data['country_code'] == 'CS') & (data['countryname'].isnull()), 'countryname'] = 'Costa Rica'
data.loc[(data['country_code'] == 'CW') & (data['countryname'].isnull()), 'countryname'] = 'Cook Islands'
data.loc[(data['country_code'] == 'AQ') & (data['countryname'].isnull()), 'countryname'] = 'Antartica'

# Check for missing names
data[data['countryname'].isnull()]['country_code'].unique()


array([], dtype=object)

In [6]:
correction_mapping = {
    "Czech Republic": "Czechia",
    "Russian Federation": "Russia",
    "Macedonia, The former Yugoslav Rep. of": "North Macedonia",
    "Taiwan, China": "Taiwan",
    "United States": "United States of America",
    "Lao People's Dem. Rep.": "Laos",
    "Moldova, Republic of": "Moldova",
    "Serbia": "Republic of Serbia",
    "Hong Kong, China": "China",  # Assuming you want to map Hong Kong to China 
    "West Bank and Gaza Strip": "Palestine",  # Assuming mapping to Palestine
    "Viet Nam": "Vietnam"
}

data['countryname'] = data['countryname'].apply(lambda x: correction_mapping.get(x, x))

countries_to_drop = ['Andorra', 'Gibraltar', 'Malta']
filtered_data = data[~data['countryname'].isin(countries_to_drop)]

data = filtered_data
data

Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent
0,JP,"33.880833, 130.873056",NO,ppm,0.002,2024-03-10,Japan,Asia
1,JP,"33.898056, 130.81",NO2,ppm,0.005,2024-03-10,Japan,Asia
2,JP,"33.895833, 130.935833",NOX,ppm,0.013,2024-03-10,Japan,Asia
3,JP,"42.786944, 141.605",NO2,ppm,0.004,2024-03-10,Japan,Asia
4,JP,"35.653889, 140.097778",NOX,ppm,0.003,2024-03-10,Japan,Asia
...,...,...,...,...,...,...,...,...
54250,IT,"40.64389299999999, 15.872893000000001",CO,µg/m³,295.000,2024-03-11,Italy,Europe
54251,IT,"46.030833, 11.905833",O3,µg/m³,35.000,2024-03-11,Italy,Europe
54252,IT,"43.55472, 10.32972",CO,µg/m³,600.000,2024-03-11,Italy,Europe
54253,IT,"43.91611, 11.006939999999998",PM2.5,µg/m³,15.000,2024-03-11,Italy,Europe


In [7]:
# Check for missing names
data[data['continent'].isnull()]['countryname'].unique()

array(['Azerbaijan', 'Sri Lanka', 'Curacao', 'Tajikistan', 'Iraq',
       'Kuwait', 'Bosnia-Herzegovina', 'Vietnam', 'Antartica',
       'Turkmenistan', 'Kosovo', 'USSR', 'Serbia and Montenegro'],
      dtype=object)

In [8]:
data.loc[data['countryname'] == 'Antartica', 'continent'] = 'Antartica'
data.loc[data['countryname'] == 'Azerbaijan', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Bosnia-Herzegovina', 'continent'] = 'Europe'
data.loc[data['countryname'] == 'Curacao', 'continent'] = 'South America'
data.loc[data['countryname'] == 'Iraq', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Kosovo', 'continent'] = 'Europe'
data.loc[data['countryname'] == 'Kuwait', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Serbia and Montenegro', 'continent'] = 'Europe'
data.loc[data['countryname'] == 'Sri Lanka', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Turkmenistan', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Tajikistan', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'USSR', 'continent'] = 'Asia'
data.loc[data['countryname'] == 'Vietnam', 'continent'] = 'Asia'

data[data['continent'].isnull()]['countryname'].unique()

array([], dtype=object)

In [9]:
data = data.sort_values(by='countryname')
data

Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent
46339,AF,"34.554384, 69.204597",PM2.5,µg/m³,136.000000,2021-08-15,Afghanistan,Asia
20436,AF,"34.535812, 69.190514",PM2.5,µg/m³,-999.000000,2021-08-16,Afghanistan,Asia
41831,DZ,"36.755798, 3.039114",PM2.5,µg/m³,78.000000,2024-03-25,Algeria,Africa
49813,AQ,"-74.16627777777778, 4.577805599999999",NO2,ppm,0.051045,2021-02-04,Antartica,Antartica
49812,AQ,"-74.16627777777778, 4.577805599999999",CO,ppm,2.105187,2021-02-04,Antartica,Antartica
...,...,...,...,...,...,...,...,...
6286,VN,"21.02177, 105.819002",PM2.5,µg/m³,12.900000,2016-11-09,Vietnam,Asia
20327,VM,"10.782773, 106.700035",PM2.5,µg/m³,-999.000000,2023-05-31,Vietnam,Asia
18437,VN,"21.048, 105.8",PM2.5,µg/m³,36.900000,2015-08-21,Vietnam,Asia
34381,VM,"21.021939, 105.818806",PM2.5,µg/m³,36.000000,2023-05-31,Vietnam,Asia


In [10]:
# Check for pollutant names
data['pollutant'].unique()

array(['PM2.5', 'NO2', 'CO', 'SO2', 'O3', 'PM10', 'TEMPERATURE', 'NO',
       'NOX', 'RELATIVEHUMIDITY', 'PM1', 'UM003', 'BC'], dtype=object)

In [11]:
# Drop observations not pertaining to pollutants
data = data[data['pollutant'] != 'TEMPERATURE']
data = data[data['pollutant'] != 'RELATIVEHUMIDITY']
data = data[data['pollutant'] != 'UM003']
data = data[data['pollutant'] != 'BC']

data['pollutant'].unique()


array(['PM2.5', 'NO2', 'CO', 'SO2', 'O3', 'PM10', 'NO', 'NOX', 'PM1'],
      dtype=object)

In [12]:
# First need to convert all pollutant values into the unit of measurement needed for AQI calculations

def convert_values(pollutant, value, unit):
    # Conversion factors from https://uk-air.defra.gov.uk/assets/documents/reports/cat06/0502160851_Conversion_Factors_Between_ppb_and.pdf
    if pollutant == "O3":
        # Ozone needs to be in ppm
        if unit == 'µg/m³':
            return value / 1.9957 / 1000
        else:
            return value
    elif pollutant == "CO":
        # CO needs to be in ppm
        if unit == 'µg/m³':
            return value / 1.1642
        else:
            return value
    elif pollutant == "SO2":
        # SO2 needs to be in ppb
        if unit == 'µg/m³':
            return value / 2.6609 
        else:
            return value
    elif pollutant == "NO2":
        # NO2 needs to be in ppb
        if unit == 'µg/m³':
            return value / 1.9125 
        else:
            return value
    else:
        return value

data['converted_value'] = data.apply(lambda x: convert_values(x['pollutant'], x['value'], x['unit']), axis=1)

def convert_units(pollutant, unit):
    if pollutant == "O3":
        return "ppm"
    elif pollutant == "CO":
        return "ppm"
    elif pollutant == "SO2":
        return "ppb"
    elif pollutant == "NO2":
        return "ppb"
    else:
        return unit

data['converted_units'] = data.apply(lambda x: convert_units(x['pollutant'], x['unit']), axis=1)




In [13]:
data

Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent,converted_value,converted_units
46339,AF,"34.554384, 69.204597",PM2.5,µg/m³,136.000000,2021-08-15,Afghanistan,Asia,136.000000,µg/m³
20436,AF,"34.535812, 69.190514",PM2.5,µg/m³,-999.000000,2021-08-16,Afghanistan,Asia,-999.000000,µg/m³
41831,DZ,"36.755798, 3.039114",PM2.5,µg/m³,78.000000,2024-03-25,Algeria,Africa,78.000000,µg/m³
49813,AQ,"-74.16627777777778, 4.577805599999999",NO2,ppm,0.051045,2021-02-04,Antartica,Antartica,0.051045,ppb
49812,AQ,"-74.16627777777778, 4.577805599999999",CO,ppm,2.105187,2021-02-04,Antartica,Antartica,2.105187,ppm
...,...,...,...,...,...,...,...,...,...,...
6286,VN,"21.02177, 105.819002",PM2.5,µg/m³,12.900000,2016-11-09,Vietnam,Asia,12.900000,µg/m³
20327,VM,"10.782773, 106.700035",PM2.5,µg/m³,-999.000000,2023-05-31,Vietnam,Asia,-999.000000,µg/m³
18437,VN,"21.048, 105.8",PM2.5,µg/m³,36.900000,2015-08-21,Vietnam,Asia,36.900000,µg/m³
34381,VM,"21.021939, 105.818806",PM2.5,µg/m³,36.000000,2023-05-31,Vietnam,Asia,36.000000,µg/m³


In [14]:
# # Sanity Check for range of value
# data[data['pollutant']== "O3"]['converted_value'].describe()

# PM needs to be in mg/m3

# Only 6 observations that aren't in mg/m3 so we'll drop them
condition = (data['pollutant'] == "PM2.5") & (data['unit'] == 'ppm')

# Drop observations where the condition is met
data = data.drop(data[condition].index)

data[data['pollutant']=="PM2.5"]['unit'].value_counts()

unit
µg/m³    9536
Name: count, dtype: int64

In [15]:
pollutants = ['NO', 'NO2', 'NOX', 'SO2', 'PM2.5', 'CO', 'O3', 'PM10', 'PM1']

#Engineer the Overall AQI Feature 
def aqi_calculation(C, breakpoints):
    for C_low, C_high, I_low, I_high in breakpoints:
        if C_low <= C <= C_high:
            return ((I_high - I_low) / (C_high - C_low)) * (C - C_low) + I_low
    return np.nan
#AQI breakpoints, based on the US EPA System
    #https://aqs.epa.gov/aqsweb/documents/codetables/aqi_breakpoints.html
breakpoints = {
    'PM2.5': [(0.0, 12.0, 0, 50), (12.1, 35.4, 51, 100), (35.5, 55.4, 101, 150), (55.5, 150.4, 151, 200), (150.5, 250.4, 201, 300), (250.5, 350.4, 301, 400), (350.5, 500.4, 401, 500)],
    'PM10': [(0, 54, 0, 50), (55, 154, 51, 100), (155, 254, 101, 150), (255, 354, 151, 200), (355, 424, 201, 300), (425, 504, 301, 400), (505, 604, 401, 500)],
    'CO': [(0.0, 4.4, 0, 50), (4.5, 9.4, 51, 100), (9.5, 12.4, 101, 150), (12.5, 15.4, 151, 200), (15.5, 30.4, 201, 300), (30.5, 40.4, 301, 400), (40.5, 50.4, 401, 500)],
    'SO2': [(0, 35, 0, 50), (36, 75, 51, 100), (76, 185, 101, 150), (186, 304, 151, 200), (305, 604, 201, 300), (605, 804, 301, 400), (805, 1004, 401, 500)],
    'NO2': [(0, 53, 0, 50), (54, 100, 51, 100), (101, 360, 101, 150), (361, 649, 151, 200), (650, 1249, 201, 300), (1250, 1649, 301, 400), (1650, 2049, 401, 500)],
    'O3': [(0.125, 0.164, 101, 150), (0.165, 0.204, 151, 200), (0.205, 0.404, 201, 300), (0.405, 0.504, 301, 400), (0.505, 0.604, 401, 500)],
    'O3_8hr': [(0.0, 0.054, 0, 50), (0.055, 0.07, 51, 100), (0.071, 0.085, 101, 150), (0.086, 0.105, 151, 200)]
}

#Calculate AQI based on pollutation present in each row
def calculate_row_aqi(row):
    pollutant = row['pollutant']
    concentration = row['value']
    if pollutant in breakpoints:
        return aqi_calculation(concentration, breakpoints[pollutant])
    else:
        return np.nan

data['AQI'] = data.apply(calculate_row_aqi, axis=1)

data

Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent,converted_value,converted_units,AQI
46339,AF,"34.554384, 69.204597",PM2.5,µg/m³,136.000000,2021-08-15,Afghanistan,Asia,136.000000,µg/m³,192.564805
20436,AF,"34.535812, 69.190514",PM2.5,µg/m³,-999.000000,2021-08-16,Afghanistan,Asia,-999.000000,µg/m³,
41831,DZ,"36.755798, 3.039114",PM2.5,µg/m³,78.000000,2024-03-25,Algeria,Africa,78.000000,µg/m³,162.617492
49813,AQ,"-74.16627777777778, 4.577805599999999",NO2,ppm,0.051045,2021-02-04,Antartica,Antartica,0.051045,ppb,0.048155
49812,AQ,"-74.16627777777778, 4.577805599999999",CO,ppm,2.105187,2021-02-04,Antartica,Antartica,2.105187,ppm,23.922580
...,...,...,...,...,...,...,...,...,...,...,...
6286,VN,"21.02177, 105.819002",PM2.5,µg/m³,12.900000,2016-11-09,Vietnam,Asia,12.900000,µg/m³,52.682403
20327,VM,"10.782773, 106.700035",PM2.5,µg/m³,-999.000000,2023-05-31,Vietnam,Asia,-999.000000,µg/m³,
18437,VN,"21.048, 105.8",PM2.5,µg/m³,36.900000,2015-08-21,Vietnam,Asia,36.900000,µg/m³,104.447236
34381,VM,"21.021939, 105.818806",PM2.5,µg/m³,36.000000,2023-05-31,Vietnam,Asia,36.000000,µg/m³,102.231156


In [16]:
# Drop negative values as they are outliers or errors in measurements 
data = data[data['value'] >= 0]
data

Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent,converted_value,converted_units,AQI
46339,AF,"34.554384, 69.204597",PM2.5,µg/m³,136.000000,2021-08-15,Afghanistan,Asia,136.000000,µg/m³,192.564805
41831,DZ,"36.755798, 3.039114",PM2.5,µg/m³,78.000000,2024-03-25,Algeria,Africa,78.000000,µg/m³,162.617492
49813,AQ,"-74.16627777777778, 4.577805599999999",NO2,ppm,0.051045,2021-02-04,Antartica,Antartica,0.051045,ppb,0.048155
49812,AQ,"-74.16627777777778, 4.577805599999999",CO,ppm,2.105187,2021-02-04,Antartica,Antartica,2.105187,ppm,23.922580
43251,AQ,"-74.16627777777778, 4.577805599999999",SO2,ppm,0.005255,2021-02-04,Antartica,Antartica,0.005255,ppb,0.007506
...,...,...,...,...,...,...,...,...,...,...,...
32086,UZ,"41.3672, 69.2725",PM2.5,µg/m³,14.000000,2023-05-31,Uzbekistan,Asia,14.000000,µg/m³,54.995708
6286,VN,"21.02177, 105.819002",PM2.5,µg/m³,12.900000,2016-11-09,Vietnam,Asia,12.900000,µg/m³,52.682403
18437,VN,"21.048, 105.8",PM2.5,µg/m³,36.900000,2015-08-21,Vietnam,Asia,36.900000,µg/m³,104.447236
34381,VM,"21.021939, 105.818806",PM2.5,µg/m³,36.000000,2023-05-31,Vietnam,Asia,36.000000,µg/m³,102.231156


In [17]:
data[data['AQI'].isnull()]


Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent,converted_value,converted_units,AQI
20437,AQ,"-74.16627777777778, 4.577805599999999",O3,ppm,0.005127,2021-02-04,Antartica,Antartica,0.005127,ppm,
51495,AQ,"-72.0117, 2.5351",O3,µg/m³,43.200000,2020-01-08,Antartica,Antartica,0.021647,ppm,
25731,AR,"-34.883175, -58.682542000000005",NO,µg/m³,1.000000,2024-02-22,Argentina,South America,1.000000,µg/m³,
49815,AR,"-34.883175, -58.682542000000005",NOX,µg/m³,19.000000,2024-02-22,Argentina,South America,19.000000,µg/m³,
47732,AR,"-34.667375, -58.329231",O3,µg/m³,28.000000,2024-02-22,Argentina,South America,0.014030,ppm,
...,...,...,...,...,...,...,...,...,...,...,...
20272,US,"37.771861, -111.61541",O3,ppm,0.043000,2022-05-26,United States of America,North America,0.043000,ppm,
20269,US,"34.91263, -80.874283",O3,ppm,0.016000,2022-08-01,United States of America,North America,0.016000,ppm,
20263,US,"33.999449, -117.415831",NOX,ppm,0.012600,2023-04-22,United States of America,North America,0.012600,ppm,
5493,US,"29.8342, -95.4892",O3,ppm,0.045000,2023-05-31,United States of America,North America,0.045000,ppm,


In [18]:
# Create final data frame of converted columns and rename them as original features

converted_data = data.drop(['country_code', 'unit', 'value'], axis=1)
converted_data = converted_data.rename(columns={
    'converted_value': 'value',
    'converted_units': 'unit' 
    })
converted_data

Unnamed: 0,coordinates,pollutant,time,countryname,continent,value,unit,AQI
46339,"34.554384, 69.204597",PM2.5,2021-08-15,Afghanistan,Asia,136.000000,µg/m³,192.564805
41831,"36.755798, 3.039114",PM2.5,2024-03-25,Algeria,Africa,78.000000,µg/m³,162.617492
49813,"-74.16627777777778, 4.577805599999999",NO2,2021-02-04,Antartica,Antartica,0.051045,ppb,0.048155
49812,"-74.16627777777778, 4.577805599999999",CO,2021-02-04,Antartica,Antartica,2.105187,ppm,23.922580
43251,"-74.16627777777778, 4.577805599999999",SO2,2021-02-04,Antartica,Antartica,0.005255,ppb,0.007506
...,...,...,...,...,...,...,...,...
32086,"41.3672, 69.2725",PM2.5,2023-05-31,Uzbekistan,Asia,14.000000,µg/m³,54.995708
6286,"21.02177, 105.819002",PM2.5,2016-11-09,Vietnam,Asia,12.900000,µg/m³,52.682403
18437,"21.048, 105.8",PM2.5,2015-08-21,Vietnam,Asia,36.900000,µg/m³,104.447236
34381,"21.021939, 105.818806",PM2.5,2023-05-31,Vietnam,Asia,36.000000,µg/m³,102.231156


In [19]:
# Save processed data to file
converted_data.to_csv("../data/processed/world_air_quality.csv")

In [20]:
converted_data

Unnamed: 0,coordinates,pollutant,time,countryname,continent,value,unit,AQI
46339,"34.554384, 69.204597",PM2.5,2021-08-15,Afghanistan,Asia,136.000000,µg/m³,192.564805
41831,"36.755798, 3.039114",PM2.5,2024-03-25,Algeria,Africa,78.000000,µg/m³,162.617492
49813,"-74.16627777777778, 4.577805599999999",NO2,2021-02-04,Antartica,Antartica,0.051045,ppb,0.048155
49812,"-74.16627777777778, 4.577805599999999",CO,2021-02-04,Antartica,Antartica,2.105187,ppm,23.922580
43251,"-74.16627777777778, 4.577805599999999",SO2,2021-02-04,Antartica,Antartica,0.005255,ppb,0.007506
...,...,...,...,...,...,...,...,...
32086,"41.3672, 69.2725",PM2.5,2023-05-31,Uzbekistan,Asia,14.000000,µg/m³,54.995708
6286,"21.02177, 105.819002",PM2.5,2016-11-09,Vietnam,Asia,12.900000,µg/m³,52.682403
18437,"21.048, 105.8",PM2.5,2015-08-21,Vietnam,Asia,36.900000,µg/m³,104.447236
34381,"21.021939, 105.818806",PM2.5,2023-05-31,Vietnam,Asia,36.000000,µg/m³,102.231156


In [21]:
missing_AQI = converted_data[data['AQI'].isnull()]
missing_AQI

Unnamed: 0,coordinates,pollutant,time,countryname,continent,value,unit,AQI
20437,"-74.16627777777778, 4.577805599999999",O3,2021-02-04,Antartica,Antartica,0.005127,ppm,
51495,"-72.0117, 2.5351",O3,2020-01-08,Antartica,Antartica,0.021647,ppm,
25731,"-34.883175, -58.682542000000005",NO,2024-02-22,Argentina,South America,1.000000,µg/m³,
49815,"-34.883175, -58.682542000000005",NOX,2024-02-22,Argentina,South America,19.000000,µg/m³,
47732,"-34.667375, -58.329231",O3,2024-02-22,Argentina,South America,0.014030,ppm,
...,...,...,...,...,...,...,...,...
20272,"37.771861, -111.61541",O3,2022-05-26,United States of America,North America,0.043000,ppm,
20269,"34.91263, -80.874283",O3,2022-08-01,United States of America,North America,0.016000,ppm,
20263,"33.999449, -117.415831",NOX,2023-04-22,United States of America,North America,0.012600,ppm,
5493,"29.8342, -95.4892",O3,2023-05-31,United States of America,North America,0.045000,ppm,


In [22]:
missing_AQI[missing_AQI['pollutant']=='O3']['value'].describe()

count    8294.000000
mean        0.039170
std         0.145734
min         0.000000
25%         0.019000
50%         0.034000
75%         0.049000
max        13.000000
Name: value, dtype: float64

I think the NaNs could be due to rounding errors? Once the values are passed through the formulas they become so small that they are essentially zero aka NaN? Maybe a better way to categorize these values is with 'good' 'moderate' 'unhealthy for sensitive groups' etc.? 

https://www.airnow.gov/sites/default/files/2020-05/aqi-technical-assistance-document-sept2018.pdf

https://uk-air.defra.gov.uk/assets/documents/reports/cat06/0502160851_Conversion_Factors_Between_ppb_and.pdf

In [23]:
converted_data[data['AQI'].isnull()]['pollutant'].value_counts()

pollutant
O3       8294
NO       3771
CO       3662
NOX      2009
PM1       124
PM10       30
PM2.5      15
NO2        10
SO2         6
Name: count, dtype: int64