# Data Wrangling

In [1]:
from pathlib import Path

import aqi
import pandas as pd
import pycountry_convert as pc

In [2]:
# Load the raw, semicolon-delimited measurements, discard the columns that are
# not used downstream, and switch to short lowercase column names.
# NOTE(review): read_csv treats the literal string "NA" as missing by default,
# so a country code of "NA" (Namibia) would load as NaN -- confirm whether the
# raw file contains such rows.
data = (
    pd.read_csv("../data/raw/world_air_quality.csv", sep=';')
    .drop(columns=["City", "Location", "Source Name"])
    .rename(
        columns={
            'Country Code': 'country_code',
            'Country Label': 'countryname',
            'Coordinates': 'coordinates',
            'Pollutant': 'pollutant',
            'Unit': 'unit',
            'Value': 'value',
            'Last Updated': 'time',
        }
    )
)
# Keep only the calendar date of each measurement timestamp.
parsed_timestamps = pd.to_datetime(data['time'])
data['time'] = parsed_timestamps.dt.date
data.head()

Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname
0,JP,"33.880833, 130.873056",NO,ppm,0.002,2024-03-10,Japan
1,JP,"33.898056, 130.81",NO2,ppm,0.005,2024-03-10,Japan
2,JP,"33.895833, 130.935833",NOX,ppm,0.013,2024-03-10,Japan
3,JP,"42.786944, 141.605",NO2,ppm,0.004,2024-03-10,Japan
4,JP,"35.653889, 140.097778",NOX,ppm,0.003,2024-03-10,Japan


In [3]:
# Create continent column
def get_continent(country_code):
    """Translate a two-letter country code into a continent name.

    Args:
        country_code: Alpha-2 country code to look up.

    Returns:
        The continent name reported by ``pycountry_convert``, or the string
        'Unknown' when either lookup raises ``KeyError``.
    """
    try:
        # Two-step lookup: country code -> continent code -> continent name.
        return pc.convert_continent_code_to_continent_name(
            pc.country_alpha2_to_continent_code(country_code)
        )
    except KeyError:
        return 'Unknown'

# Only the country code is needed for the lookup, so map over that single
# column instead of applying a lambda to every row: Series.map calls
# get_continent on each value directly, avoids constructing a row object per
# record, and yields an empty Series (rather than an empty frame) when there
# are no rows.
data['continent'] = data['country_code'].map(get_continent)
data


Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent
0,JP,"33.880833, 130.873056",NO,ppm,0.002,2024-03-10,Japan,Asia
1,JP,"33.898056, 130.81",NO2,ppm,0.005,2024-03-10,Japan,Asia
2,JP,"33.895833, 130.935833",NOX,ppm,0.013,2024-03-10,Japan,Asia
3,JP,"42.786944, 141.605",NO2,ppm,0.004,2024-03-10,Japan,Asia
4,JP,"35.653889, 140.097778",NOX,ppm,0.003,2024-03-10,Japan,Asia
...,...,...,...,...,...,...,...,...
54250,IT,"40.64389299999999, 15.872893000000001",CO,µg/m³,295.000,2024-03-11,Italy,Europe
54251,IT,"46.030833, 11.905833",O3,µg/m³,35.000,2024-03-11,Italy,Europe
54252,IT,"43.55472, 10.32972",CO,µg/m³,600.000,2024-03-11,Italy,Europe
54253,IT,"43.91611, 11.006939999999998",PM2.5,µg/m³,15.000,2024-03-11,Italy,Europe


In [4]:
# List the distinct pollutant labels present in the data.
data.loc[:, 'pollutant'].unique()

array(['NO', 'NO2', 'NOX', 'SO2', 'PM2.5', 'CO', 'O3', 'PM10', 'PM1',
       'TEMPERATURE', 'UM003', 'BC', 'RELATIVEHUMIDITY'], dtype=object)

In [5]:
# Create AQI score for each pollutant

# Function to calculate AQI score
def calculate_aqi(pollutant, value):
    """Return a placeholder AQI score for a single measurement.

    The real index computation is not implemented yet: every pollutant in the
    supported list receives the constant score 1, and ``value`` is ignored.

    Args:
        pollutant: Pollutant label, e.g. 'PM2.5' or 'NO2'.
        value: Measured concentration (currently unused).

    Returns:
        1 for a supported pollutant, otherwise None.
    """
    # TODO: replace the constant with a real calculation, e.g. for PM2.5:
    # float(aqi.to_iaqi(aqi.POLLUTANT_PM25, value, algo=aqi.ALGO_EPA))
    scored_pollutants = (
        'PM2.5', 'NO', 'NO2', 'NOX', 'SO2', 'CO', 'O3', 'PM10', 'PM1',
    )
    if pollutant in scored_pollutants:
        return 1
    # Pollutants without an AQI calculation get no score.
    return None

# Apply the function to create AQI column.
# NOTE: the assignment below is commented out, so no AQI column is added and
# the frame is displayed unchanged.
#data['AQI'] = data.apply(lambda row: calculate_aqi(row['pollutant'], row['value']), axis=1)

data


Unnamed: 0,country_code,coordinates,pollutant,unit,value,time,countryname,continent
0,JP,"33.880833, 130.873056",NO,ppm,0.002,2024-03-10,Japan,Asia
1,JP,"33.898056, 130.81",NO2,ppm,0.005,2024-03-10,Japan,Asia
2,JP,"33.895833, 130.935833",NOX,ppm,0.013,2024-03-10,Japan,Asia
3,JP,"42.786944, 141.605",NO2,ppm,0.004,2024-03-10,Japan,Asia
4,JP,"35.653889, 140.097778",NOX,ppm,0.003,2024-03-10,Japan,Asia
...,...,...,...,...,...,...,...,...
54250,IT,"40.64389299999999, 15.872893000000001",CO,µg/m³,295.000,2024-03-11,Italy,Europe
54251,IT,"46.030833, 11.905833",O3,µg/m³,35.000,2024-03-11,Italy,Europe
54252,IT,"43.55472, 10.32972",CO,µg/m³,600.000,2024-03-11,Italy,Europe
54253,IT,"43.91611, 11.006939999999998",PM2.5,µg/m³,15.000,2024-03-11,Italy,Europe


In [6]:
# Sanity-check the aqi library on a single PM2.5 reading using the EPA
# algorithm, and show the resulting index as an integer.
sample_pm25 = '12'
myaqi = aqi.to_iaqi(aqi.POLLUTANT_PM25, sample_pm25, algo=aqi.ALGO_EPA)
int(myaqi)

50

In [7]:
# Save processed data to file. The output directory is created first so the
# notebook also runs on a fresh checkout where it does not exist yet
# (to_csv does not create missing directories).
processed_path = Path("../data/processed/world_air_quality.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
# The DataFrame index is written as the first, unnamed column (to_csv default).
data.to_csv(processed_path)