# Belgium's Air Quality EDA

In [None]:
import numpy as np
import pandas as pd
import os
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
kaggle_input_files = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for file_path in kaggle_input_files:
    print(file_path)

In [None]:
# Load the semicolon-separated measurements file and preview the first ten rows.
DATA_PATH = '/kaggle/input/air-quality-belgium/data.csv'
df = pd.read_csv(DATA_PATH, sep=';')
df.head(10)

# Mapping

In [None]:
# Split the "a,b" strings in `Coordinates` into two float columns in a single
# vectorized pass. Unlike a per-row `x.split(',')`, this leaves NaN in place
# for rows whose `Coordinates` cell is missing instead of raising.
# NOTE(review): the first component is stored as 'Longitude' and the second as
# 'Latitude'. The plots below put 'Latitude' on the x-axis together with the
# country border's x coordinates, which suggests the two names may be swapped
# relative to the data -- confirm against the dataset before renaming.
coordinate_parts = df['Coordinates'].str.split(',', expand=True)
df['Longitude'] = coordinate_parts[0].astype(float)
df['Latitude'] = coordinate_parts[1].astype(float)

In [None]:
# Quick sanity check of the station positions before drawing the real map.
fig, ax = plt.subplots()
ax.scatter(df['Latitude'], df['Longitude'])
ax.grid()
plt.show()

In [None]:
# Bounding box of the stations: [min, max] of 'Longitude' followed by
# [min, max] of 'Latitude'.
bbox_coords = [
    getattr(df[column], bound)()
    for column in ('Longitude', 'Latitude')
    for bound in ('min', 'max')
]
bbox_coords

In [None]:
from cartopy.io import shapereader
import geopandas
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# get country borders
resolution = '10m'
category = 'cultural'
name = 'admin_0_countries'

# Path to the Natural Earth shapefile for the requested dataset.
shpfilename = shapereader.natural_earth(resolution, category, name)
# read the shapefile using geopandas
dff = geopandas.read_file(shpfilename)

# read the belgian borders
belgium_geometries = dff.loc[dff['ADMIN'] == 'Belgium', 'geometry']
if belgium_geometries.empty:
    # Fail with a clear message instead of an opaque IndexError.
    raise ValueError("No row with ADMIN == 'Belgium' found in the Natural Earth shapefile")
poly = belgium_geometries.iloc[0]

# A country may be stored as a MultiPolygon (mainland plus detached parts),
# which has no `.exterior`; in that case keep the largest part as the outline.
if poly.geom_type == 'MultiPolygon':
    poly = max(poly.geoms, key=lambda part: part.area)

# Outer ring of the border as a list of (x, y) tuples, used by the map plots.
shell_coords = list(poly.exterior.coords)

In [None]:
def show_map(pollutant):
    """Plot the measurements of one pollutant on top of the Belgian border.

    Each measurement is drawn as a semi-transparent dot whose colour encodes
    the raw ``Value`` and whose size encodes the min-max normalized value
    (scaled by 250).

    Parameters
    ----------
    pollutant : str
        Value of the ``Pollutant`` column to select, e.g. ``'PM10'``.

    Notes
    -----
    Reads the module-level ``df`` and ``shell_coords``. As a side effect, the
    normalized values are written to ``df['normalized']`` for the selected
    rows.
    """
    selected = df.Pollutant == pollutant
    values = df.loc[selected, 'Value']

    # Min-max normalize the values for the marker size. When every value is
    # identical (or there is no data) the range is zero/NaN, so fall back to a
    # constant mid-size marker instead of dividing by zero.
    lowest, highest = values.min(), values.max()
    if highest > lowest:
        normalized = (values - lowest) / (highest - lowest)
    else:
        normalized = pd.Series(0.5, index=values.index)
    df.loc[selected, 'normalized'] = normalized

    df_pollutant = df[selected]

    fig, ax = plt.subplots(figsize=(12, 8))
    df_pollutant.plot(
        kind="scatter", x="Latitude", y="Longitude",
        s=df_pollutant['normalized'] * 250, label=f"{pollutant} (µg/m³)", c="Value",
        cmap=plt.get_cmap("jet"),
        colorbar=True, alpha=0.4, ax=ax
    )

    # Draw the country outline on the same axes. Using `ax` explicitly avoids
    # depending on which axes is "current" after the colorbar was added.
    xs, ys = zip(*shell_coords)
    ax.plot(xs, ys)

    # The border's x/y coordinates share the axes with the 'Latitude' /
    # 'Longitude' columns, so label the axes by what is actually drawn rather
    # than by the column names.
    # NOTE(review): assumes the border ring is (longitude, latitude) -- confirm.
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.set_title(f"{pollutant} measurements")

    ax.legend()
    plt.show()

# Pollutants

In [None]:
# Distinct pollutant names, in order of first appearance in the data.
pollutants = df['Pollutant'].drop_duplicates().tolist()
pollutants

## PM10

In [None]:
# Keep only the PM10 measurements and summarize them.
pm10_df = df.loc[df['Pollutant'] == 'PM10', 'Value']
pm10_df.describe()

In [None]:
# Distribution of the PM10 values: density plot (left), box plot (right),
# followed by the sample skewness.
print('Distribution of PM10')
pm10_values = pm10_df.dropna()  # drop missing readings once, reuse for both plots
fig, ax = plt.subplots(1, 2, figsize=(13, 4))
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot(..., kde=True, stat='density') once the seaborn version is confirmed.
sns.distplot(pm10_values, ax=ax[0])
# Pass the data as `x=` explicitly: a positional Series is read as `x` by older
# seaborn but as `data` by newer releases, which changes the box orientation.
sns.boxplot(x=pm10_values, ax=ax[1])
pm10_skew = pm10_df.skew()
plt.show()
print('Skewness = ', pm10_skew)

In [None]:
# Map of the PM10 measurements over the Belgian border.
show_map(pollutant='PM10')

## SO2

In [None]:
# Keep only the SO2 measurements and summarize them.
so2_df = df.loc[df['Pollutant'] == 'SO2', 'Value']
so2_df.describe()

In [None]:
# Distribution of the SO2 values: density plot (left), box plot (right),
# followed by the sample skewness.
print('Distribution of SO2')
so2_values = so2_df.dropna()  # drop missing readings once, reuse for both plots
fig, ax = plt.subplots(1, 2, figsize=(13, 4))
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot(..., kde=True, stat='density') once the seaborn version is confirmed.
sns.distplot(so2_values, ax=ax[0])
# Pass the data as `x=` explicitly: a positional Series is read as `x` by older
# seaborn but as `data` by newer releases, which changes the box orientation.
sns.boxplot(x=so2_values, ax=ax[1])
so2_skew = so2_df.skew()
plt.show()
print('Skewness = ', so2_skew)

In [None]:
# Map of the SO2 measurements over the Belgian border.
show_map(pollutant='SO2')

## NO2

In [None]:
# Keep only the NO2 measurements and summarize them.
no2_df = df.loc[df['Pollutant'] == 'NO2', 'Value']
no2_df.describe()

In [None]:
# Distribution of the NO2 values: density plot (left), box plot (right),
# followed by the sample skewness.
print('Distribution of NO2')
no2_values = no2_df.dropna()  # drop missing readings once, reuse for both plots
fig, ax = plt.subplots(1, 2, figsize=(13, 4))
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot(..., kde=True, stat='density') once the seaborn version is confirmed.
sns.distplot(no2_values, ax=ax[0])
# Pass the data as `x=` explicitly: a positional Series is read as `x` by older
# seaborn but as `data` by newer releases, which changes the box orientation.
sns.boxplot(x=no2_values, ax=ax[1])
no2_skew = no2_df.skew()
plt.show()
print('Skewness = ', no2_skew)

In [None]:
# Map of the NO2 measurements over the Belgian border.
show_map(pollutant='NO2')

## O3

In [None]:
# Keep only the O3 measurements and summarize them.
o3_df = df.loc[df['Pollutant'] == 'O3', 'Value']
o3_df.describe()

In [None]:
# Distribution of the O3 values: density plot (left), box plot (right),
# followed by the sample skewness.
print('Distribution of O3')
o3_values = o3_df.dropna()  # drop missing readings once, reuse for both plots
fig, ax = plt.subplots(1, 2, figsize=(13, 4))
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot(..., kde=True, stat='density') once the seaborn version is confirmed.
sns.distplot(o3_values, ax=ax[0])
# Pass the data as `x=` explicitly: a positional Series is read as `x` by older
# seaborn but as `data` by newer releases, which changes the box orientation.
sns.boxplot(x=o3_values, ax=ax[1])
o3_skew = o3_df.skew()
plt.show()
print('Skewness = ', o3_skew)

In [None]:
# Map of the O3 measurements over the Belgian border.
show_map(pollutant='O3')

## CO

In [None]:
# Keep only the CO measurements and summarize them.
co_df = df.loc[df['Pollutant'] == 'CO', 'Value']
co_df.describe()

In [None]:
# Distribution of the CO values: density plot (left), box plot (right),
# followed by the sample skewness.
print('Distribution of CO')
co_values = co_df.dropna()  # drop missing readings once, reuse for both plots
fig, ax = plt.subplots(1, 2, figsize=(13, 4))
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot(..., kde=True, stat='density') once the seaborn version is confirmed.
sns.distplot(co_values, ax=ax[0])
# Pass the data as `x=` explicitly: a positional Series is read as `x` by older
# seaborn but as `data` by newer releases, which changes the box orientation.
sns.boxplot(x=co_values, ax=ax[1])
co_skew = co_df.skew()
plt.show()
print('Skewness = ', co_skew)

In [None]:
# Map of the CO measurements over the Belgian border.
show_map(pollutant='CO')

## PM2.5

In [None]:
# Keep only the PM2.5 measurements and summarize them.
pm25_df = df.loc[df['Pollutant'] == 'PM2.5', 'Value']
pm25_df.describe()

In [None]:
# Distribution of the PM2.5 values: density plot (left), box plot (right),
# followed by the sample skewness.
print('Distribution of PM2.5')
pm25_values = pm25_df.dropna()  # drop missing readings once, reuse for both plots
fig, ax = plt.subplots(1, 2, figsize=(13, 4))
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot(..., kde=True, stat='density') once the seaborn version is confirmed.
sns.distplot(pm25_values, ax=ax[0])
# Pass the data as `x=` explicitly: a positional Series is read as `x` by older
# seaborn but as `data` by newer releases, which changes the box orientation.
sns.boxplot(x=pm25_values, ax=ax[1])
pm25_skew = pm25_df.skew()
plt.show()
print('Skewness = ', pm25_skew)

In [None]:
# Map of the PM2.5 measurements over the Belgian border.
show_map(pollutant='PM2.5')

# Locations

In [None]:
def show_top_locations(pollutant, n=10):
    """Return a styled table of the locations with the highest mean value.

    Parameters
    ----------
    pollutant : str
        Value of the ``Pollutant`` column to select, e.g. ``'PM10'``.
    n : int, default 10
        Number of locations to keep, counted from the highest mean downwards.

    Returns
    -------
    pandas.io.formats.style.Styler
        Mean ``Value`` per ``Location`` for the selected pollutant, sorted in
        descending order, limited to ``n`` rows and shaded with the
        ``viridis`` colormap.

    Notes
    -----
    Reads the module-level ``df``.
    """
    mean_by_location = (
        df.loc[df['Pollutant'] == pollutant, ['Location', 'Value']]
        .groupby(['Location'])
        .agg('mean')
        .sort_values('Value', ascending=False)
        .head(n)
    )
    return mean_by_location.style.background_gradient(cmap='viridis')

In [None]:
# Ten locations with the highest mean PM10 value.
show_top_locations(pollutant='PM10')

- Gent is a clear outlier for PM10 ('fine dust') pollution: its mean value is almost 3 times that of the second most polluted location!

In [None]:
# Ten locations with the highest mean SO2 value.
show_top_locations(pollutant='SO2')

In [None]:
# Ten locations with the highest mean NO2 value.
show_top_locations(pollutant='NO2')

In [None]:
# Ten locations with the highest mean O3 value.
show_top_locations(pollutant='O3')

In [None]:
# Ten locations with the highest mean CO value.
show_top_locations(pollutant='CO')

In [None]:
# Ten locations with the highest mean PM2.5 value.
show_top_locations(pollutant='PM2.5')

- For the other pollutants, no single location stands out significantly from the rest.

## Polluted Cities

In [None]:
# Flanders and Wallonia are two different parts of Belgium (upper and lower) and not cities. Brussels region is in between.
regions = ['Flanders', 'Wallonia', 'Brussels-Capital Region']
cities_df = df[~df['City'].isin(regions)]

# Fix the bar order explicitly. Without `order`, each facet derives its own
# category order from the pollutants present in that facet, so the same x
# position can stand for different pollutants in different panels.
pollutant_order = sorted(cities_df['Pollutant'].dropna().unique())

# One panel per city, one bar per pollutant showing the summed `Value`.
g = sns.FacetGrid(cities_df, col='City', col_wrap=6)
g.map(sns.barplot, "Pollutant", "Value", estimator=sum, order=pollutant_order)
plt.show()

- Brussels and Hainaut are doing really badly on `CO`. Brussels is the capital of Belgium, which likely explains it.
- Antwerpen is doing badly on all other pollutants, mainly because of the Port of Antwerp, which is a very big port and thus comes with a lot of industrialization.

In [None]:
# Flanders and Wallonia are two different parts of Belgium (upper and lower). Brussels region is in between.
regions = ['Flanders', 'Wallonia', 'Brussels-Capital Region']
non_cities_df = df[df['City'].isin(regions)]

# Fix the bar order explicitly. Without `order`, each facet derives its own
# category order from the pollutants present in that facet, so the same x
# position can stand for different pollutants in different panels.
pollutant_order = sorted(non_cities_df['Pollutant'].dropna().unique())

# One panel per region, one bar per pollutant showing the summed `Value`.
g = sns.FacetGrid(non_cities_df, col='City', col_wrap=7)
g.map(sns.barplot, "Pollutant", "Value", estimator=sum, order=pollutant_order)
plt.show()

- Flanders is more polluted than Wallonia. This is due to Flanders being much more densely populated, while Wallonia is mostly forest. The Brussels region is less polluted but is also a lot smaller than Flanders and Wallonia.

# Wrap up

- Flanders is more polluted than Wallonia. This is due to Flanders being much more densely populated, while Wallonia is mostly forest. The Brussels region is less polluted but is also a lot smaller than Flanders and Wallonia.
- Brussels and Hainaut are doing really badly on `CO`. Brussels is the capital of Belgium, which likely explains it.
- Antwerpen is doing badly on all other pollutants, mainly because of the Port of Antwerp, which is a very big port and thus comes with a lot of industrialization.
- Gent is a clear outlier for PM10 ('fine dust') pollution: its mean value is almost 3 times that of the second most polluted location! (why? 🤔)