# Data Wrangling

In [1]:
import pandas as pd
import aqi
import pycountry_convert as pc
import pycountry

In [2]:
# Read the semicolon-delimited raw export, drop the City / Location /
# Source Name columns, and shorten the remaining headers to the names
# used throughout the rest of the notebook.
data = (
    pd.read_csv("../data/raw/world_air_quality.csv", sep=';')
    .drop(columns=["City", "Location", "Source Name"])
    .rename(columns={
        'Country Code': 'country_code',
        'Coordinates': 'coordinates',
        'Pollutant': 'pollutant',
        'Unit': 'unit',
        'Value': 'value',
        'Last Updated': 'time',
        'Country Label': 'countryname',
    })
)

# Keep only the calendar date of each measurement timestamp.
data['time'] = pd.to_datetime(data['time']).dt.date
data.head()

Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname
0,JP,"33.880833, 130.873056",NO,ppm,0.002,2024-03-10,Japan
1,JP,"33.898056, 130.81",NO2,ppm,0.005,2024-03-10,Japan
2,JP,"33.895833, 130.935833",NOX,ppm,0.013,2024-03-10,Japan
3,JP,"42.786944, 141.605",NO2,ppm,0.004,2024-03-10,Japan
4,JP,"35.653889, 140.097778",NOX,ppm,0.003,2024-03-10,Japan


In [3]:
# Create continent column
def get_continent(country_code):
    """Return the continent name for a two-letter country code.

    The code is first translated to a continent code and then to the
    continent's name, both via ``pycountry_convert``.  If either lookup
    raises ``KeyError`` the string ``'Unknown'`` is returned instead.
    """
    try:
        code = pc.country_alpha2_to_continent_code(country_code)
        return pc.convert_continent_code_to_continent_name(code)
    except KeyError:
        return 'Unknown'

# Only the country code is needed, so map over that single column instead of
# building a row object for every record with DataFrame.apply(axis=1).
# Series.map also returns an (empty) Series when the frame has no rows.
data['continent'] = data['country_code'].map(get_continent)
data


Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent
0,JP,"33.880833, 130.873056",NO,ppm,0.002,2024-03-10,Japan,Asia
1,JP,"33.898056, 130.81",NO2,ppm,0.005,2024-03-10,Japan,Asia
2,JP,"33.895833, 130.935833",NOX,ppm,0.013,2024-03-10,Japan,Asia
3,JP,"42.786944, 141.605",NO2,ppm,0.004,2024-03-10,Japan,Asia
4,JP,"35.653889, 140.097778",NOX,ppm,0.003,2024-03-10,Japan,Asia
...,...,...,...,...,...,...,...,...
54250,IT,"40.64389299999999, 15.872893000000001",CO,µg/m³,295.000,2024-03-11,Italy,Europe
54251,IT,"46.030833, 11.905833",O3,µg/m³,35.000,2024-03-11,Italy,Europe
54252,IT,"43.55472, 10.32972",CO,µg/m³,600.000,2024-03-11,Italy,Europe
54253,IT,"43.91611, 11.006939999999998",PM2.5,µg/m³,15.000,2024-03-11,Italy,Europe


In [23]:
# Impute missing country names from country code

def get_country_name(country_code):
    """Look up a country's name from its two-letter (alpha-2) code.

    Parameters
    ----------
    country_code : str
        Alpha-2 country code, passed to ``pycountry.countries.get``.

    Returns
    -------
    str or None
        The matching country's ``name``, or ``None`` when no country is
        found or the lookup itself fails (for example on a non-string code).
    """
    try:
        country = pycountry.countries.get(alpha_2=country_code)
        return country.name if country else None
    except Exception:
        # Best-effort lookup: a failure means "no name available".  Never
        # return the exception text, which would end up stored in the data
        # as if it were a country name.
        return None

# Impute only the rows whose country label is missing.  Labels shipped with
# the dataset are kept as they are, so the label corrections applied later in
# the notebook (which are keyed on the dataset's own spellings) still match.
data['countryname'] = data['countryname'].fillna(data['country_code'].map(get_country_name))
data


Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent
46339,AF,"34.554384, 69.204597",PM2.5,µg/m³,136.0,2021-08-15,Afghanistan,Asia
20436,AF,"34.535812, 69.190514",PM2.5,µg/m³,-999.0,2021-08-16,Afghanistan,Asia
41831,DZ,"36.755798, 3.039114",PM2.5,µg/m³,78.0,2024-03-25,Algeria,Africa
11489,AD,"42.5096939994651, 1.539138",O3,µg/m³,50.5,2023-05-31,Andorra,Europe
20433,AD,"42.5096939994651, 1.539138",SO2,µg/m³,0.2,2023-05-31,Andorra,Europe
...,...,...,...,...,...,...,...,...
53893,XK,"42.661995, 21.15055",PM2.5,µg/m³,18.0,2023-05-31,Country code not found,Europe
53894,XK,"42.659656, 21.157309",PM2.5,µg/m³,13.1,2023-05-26,Country code not found,Europe
53895,XK,"42.659728, 21.083317",PM10,µg/m³,13.7,2023-05-26,Country code not found,Europe
53896,XK,"42.659728, 21.083317",SO2,µg/m³,0.0,2023-05-26,Country code not found,Europe


In [4]:
# Distinct pollutant codes present in the data, in order of first appearance.
pd.unique(data['pollutant'])

array(['NO', 'NO2', 'NOX', 'SO2', 'PM2.5', 'CO', 'O3', 'PM10', 'PM1',
       'TEMPERATURE', 'UM003', 'BC', 'RELATIVEHUMIDITY'], dtype=object)

In [9]:
# Create AQI score for each pollutant

# Function to calculate AQI score
def calculate_aqi(pollutant, value):
    """Placeholder AQI score for a single measurement.

    Returns the constant ``1`` for every pollutant in the recognised list
    and ``None`` for anything else.  ``value`` is accepted but not used yet.
    """
    # TODO: replace the constant with a real index, e.g. for PM2.5:
    #   float(aqi.to_iaqi(aqi.POLLUTANT_PM25, value, algo=aqi.ALGO_EPA))
    recognised = ('PM2.5', 'NO', 'NO2', 'NOX', 'SO2', 'CO', 'O3', 'PM10', 'PM1')
    if pollutant in recognised:
        return 1
    # Pollutants without an AQI calculation get no score.
    return None

# Apply the function to create AQI column
#data['AQI'] = data.apply(lambda row: calculate_aqi(row['pollutant'], row['value']), axis=1)

# myaqi = aqi.to_iaqi(aqi.POLLUTANT_PM25, '12', algo=aqi.ALGO_EPA)
# int(myaqi)



Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent
46339,AF,"34.554384, 69.204597",PM2.5,µg/m³,136.0,2021-08-15,Afghanistan,Asia
20436,AF,"34.535812, 69.190514",PM2.5,µg/m³,-999.0,2021-08-16,Afghanistan,Asia
41831,DZ,"36.755798, 3.039114",PM2.5,µg/m³,78.0,2024-03-25,Algeria,Africa
11489,AD,"42.5096939994651, 1.539138",O3,µg/m³,50.5,2023-05-31,Andorra,Europe
20433,AD,"42.5096939994651, 1.539138",SO2,µg/m³,0.2,2023-05-31,Andorra,Europe
...,...,...,...,...,...,...,...,...
53893,XK,"42.661995, 21.15055",PM2.5,µg/m³,18.0,2023-05-31,,Europe
53894,XK,"42.659656, 21.157309",PM2.5,µg/m³,13.1,2023-05-26,,Europe
53895,XK,"42.659728, 21.083317",PM10,µg/m³,13.7,2023-05-26,,Europe
53896,XK,"42.659728, 21.083317",SO2,µg/m³,0.0,2023-05-26,,Europe


In [25]:
# Country labels to rewrite: key = label as it appears in the data,
# value = the label to use instead.
correction_mapping = {
    "Czech Republic": "Czechia",
    "Russian Federation": "Russia",
    "Macedonia, The former Yugoslav Rep. of": "North Macedonia",
    "Taiwan, China": "Taiwan",
    "United States": "United States of America",
    "Lao People's Dem. Rep.": "Laos",
    "Moldova, Republic of": "Moldova",
    "Serbia": "Republic of Serbia",
    "Hong Kong, China": "China",  # NOTE(review): folds Hong Kong into China - confirm this is intended
    "West Bank and Gaza Strip": "Palestine",  # NOTE(review): assumed mapping - confirm
}

# Countries removed from the dataset entirely.
countries_to_drop = ['Andorra', 'Gibraltar', 'Malta']

# Relabel first (labels not in the mapping pass through unchanged), then
# filter on the corrected labels and order the result by country name.
data['countryname'] = data['countryname'].map(lambda name: correction_mapping.get(name, name))
filtered_data = data.loc[~data['countryname'].isin(countries_to_drop)]
data = filtered_data.sort_values(by='countryname')
data

In [26]:
# Persist the processed frame as CSV; the row index is written out as the
# first column of the file.
data.to_csv("../data/processed/world_air_quality.csv", index=True)

In [27]:
# Final look at the processed frame, sorted by country name.
data

Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent
46339,AF,"34.554384, 69.204597",PM2.5,µg/m³,136.0,2021-08-15,Afghanistan,Asia
20436,AF,"34.535812, 69.190514",PM2.5,µg/m³,-999.0,2021-08-16,Afghanistan,Asia
41831,DZ,"36.755798, 3.039114",PM2.5,µg/m³,78.0,2024-03-25,Algeria,Africa
36274,AG,"36.755798, 3.039114",PM2.5,µg/m³,10.0,2023-05-31,Antigua and Barbuda,North America
15655,AR,"-34.883175, -58.682542000000005",PM2.5,µg/m³,1.0,2023-12-16,Argentina,South America
...,...,...,...,...,...,...,...,...
53893,XK,"42.661995, 21.15055",PM2.5,µg/m³,18.0,2023-05-31,Country code not found,Europe
53894,XK,"42.659656, 21.157309",PM2.5,µg/m³,13.1,2023-05-26,Country code not found,Europe
53895,XK,"42.659728, 21.083317",PM10,µg/m³,13.7,2023-05-26,Country code not found,Europe
53896,XK,"42.659728, 21.083317",SO2,µg/m³,0.0,2023-05-26,Country code not found,Europe
