In [29]:
# IMPORT
import time
from datetime import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.path import Path

# bokeh
import bokeh.palettes
from bokeh.io import output_notebook
from bokeh.models import HoverTool, Legend, ColumnDataSource, Title, Band, LegendItem, CustomJS, Span, Label
from bokeh.models import GeoJSONDataSource
from bokeh.plotting import figure, show, output_file, save
from bokeh.resources import CDN
from bokeh.tile_providers import get_provider, CARTODBPOSITRON

output_notebook()

# Data Preprocessing

**Clean and export `air_quality_data.csv`**

**Clean and export `air_quality_stations.csv`**

# Visualisations

Load and join tables

In [2]:
# reference tables: station metadata and the pollutant ("magnitud") dictionary
df_stations = pd.read_csv('shared_data/air_quality/air_quality_stations.csv')
df_magnitud = pd.read_csv('shared_data/air_quality/air_quality_magnitud.csv', sep=';')

# raw measurements, with the timestamp column parsed to datetime
df = pd.read_csv('data/air_quality_data.csv')
df["datetime"] = pd.to_datetime(df["datetime"])

# attach station and pollutant metadata; the upper-case join keys are dropped
# because the lower-case keys of the reference tables carry the same information
df = (
    df
    .merge(df_stations, left_on='PUNTO_MUESTREO', right_on='punto_muestreo', how='left')
    .drop('PUNTO_MUESTREO', axis=1)
    .merge(df_magnitud, left_on='MAGNITUD', right_on='magnitud_id', how='left')
    .drop('MAGNITUD', axis=1)
)

# keep measurements up to the end of February 2020 whose value is strictly
# positive (zero and negative readings are both discarded)
last_instant = pd.to_datetime('2020-02-29 23:59:59')
df = df[(df.datetime < last_instant) & (df.value > 0)]

df.head()

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,value,datetime,punto_muestreo,name,longitude,latitude,altitude,utm_x,utm_y,magnitud_id,formula,unit_per_m3,Unnamed: 16,Unnamed: 17
0,28,79,4,1,7.00,7.0,2016-04-01 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1.0,SO2,µg
1,28,79,4,1,8.00,8.0,2016-04-02 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1.0,SO2,µg
2,28,79,4,1,10.00,10.0,2016-04-03 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1.0,SO2,µg
3,28,79,4,1,7.00,7.0,2016-04-04 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1.0,SO2,µg
4,28,79,4,1,8.00,8.0,2016-04-05 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1.0,SO2,µg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,,,,
7764331,28,79,60,14,50.48,2021-09-27 00:00:00,28079060,Tres Olivos,-3.689731,40.500547,715,-410738.926609,4.938949e+06,,,,
7764332,28,79,60,14,18.84,2021-09-28 00:00:00,28079060,Tres Olivos,-3.689731,40.500547,715,-410738.926609,4.938949e+06,,,,
7764333,28,79,60,14,61.49,2021-09-29 00:00:00,28079060,Tres Olivos,-3.689731,40.500547,715,-410738.926609,4.938949e+06,,,,
7764334,28,79,60,14,60.72,2021-09-30 00:00:00,28079060,Tres Olivos,-3.689731,40.500547,715,-410738.926609,4.938949e+06,,,,


Get the names of air quality stations:

In [3]:
# number of distinct station names in the merged table, followed by the names themselves
print('AIR QUALITY STATION NAMES {}:'.format(df['name'].unique().size))
df['name'].unique()

AIR QUALITY STATION NAMES 24:


array(['Plaza de España', 'Escuelas Aguirre', 'Ramón y Cajal',
       'Arturo Soria', 'Villaverde', 'Farolillo', 'Casa de Campo',
       'Barajas Pueblo', 'Plaza del Carmen', 'Moratalaz',
       'Cuatro Caminos', 'Barrio del Pilar', 'Vallecas', 'Méndez Álvaro',
       'Castellana', 'Retiro', 'Plaza Castilla', 'Ensanche de Vallecas',
       'Urbanización Embajada', 'Plaza Elíptica', 'Sanchinarro',
       'El Pardo', 'Juan Carlos I', 'Tres Olivos'], dtype=object)

### Air Quality Stations (map)

In [4]:
# outline of the MC area, as a list of (utm_x, utm_y) vertices
cm_points = pd.read_csv('shared_data/districts/central_madrid_points.csv')

# point-in-polygon test of every station against that outline
path = Path(cm_points[["utm_x", "utm_y"]].values)
points = df_stations[["utm_x", "utm_y"]].values
points_in_path_mask = path.contains_points(points)

# blue = inside MC, red = outside MC
df_stations["color"] = np.where(points_in_path_mask, "blue", "red")
df_stations.head()

Unnamed: 0,punto_muestreo,name,longitude,latitude,altitude,utm_x,utm_y,color
0,28079035,Plaza del Carmen,-3.703167,40.419208,660,-412234.627656,4927049.0,blue
1,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,red
2,28079039,Barrio del Pilar,-3.711536,40.478233,676,-413166.30995,4935683.0,red
3,28079008,Escuelas Aguirre,-3.682283,40.421564,672,-409909.905623,4927393.0,red
4,28079038,Cuatro Caminos,-3.707122,40.445547,699,-412674.958086,4930901.0,red


In [5]:
# plot map
p = figure(title="Air quality stations in Madrid", x_axis_type="mercator", y_axis_type="mercator")

# one data source / renderer per group so that each legend entry can toggle its own group
source_in = ColumnDataSource(df_stations[df_stations.color=='blue'])
source_out = ColumnDataSource(df_stations[df_stations.color=='red'])
cr_in = p.circle(x="utm_x", y="utm_y",  size=10, color="color", source=source_in)
cr_out = p.circle(x="utm_x", y="utm_y",  size=10, color="color", source=source_out)

cartodb = get_provider(CARTODBPOSITRON)
p.add_tile(cartodb)

p.add_tools(HoverTool(tooltips=[('Name', '@name')], renderers=[cr_in, cr_out]))

# add interactive legend (clicking an entry hides the matching renderer)
legend = Legend(
    items=[('IN Madrid Central area', [cr_in]), ('OUT of Madrid Central area', [cr_out])],
    location='top_left',
)
legend.click_policy="hide"
p.add_layout(legend)

show(p)

# Export with an explicit filename instead of calling output_file(): output_file()
# switches the global Bokeh output state, after which every later show() in the
# notebook would also overwrite this HTML file and open a browser tab.
# The trailing ';' keeps the absolute local path returned by save() out of the cell output.
save(p, filename="html_plots/air_quality_stations.html", resources=CDN, title="Air quality stations in Madrid");

'C:\\Users\\Laurine\\Documents\\DTU Python\\S2\\Social Data Analysis and Visualisation\\final project\\socialdata_madridcentral\\html_plots\\air_quality_stations.html'

---

In [8]:
import json
import utm
import numpy as np
import seaborn as sns

# Seed NumPy's legacy global random generator so np.random-based steps are repeatable.
np.random.seed(42)

In [9]:
# GeoJSON is UTF-8 by specification; without an explicit encoding, open() falls back
# to the platform default, which can garble accented names where that is not UTF-8.
with open("shared_data/districts/districts.geojson", "r", encoding="utf-8") as geojson:
    geodata = json.load(geojson)

## Visualisation 1: evolution of TRAP in Centro over time

*Compute the monthly mean and standard deviation of the measured value for each recorded gas.*

In [54]:
# create annotations for time marks
def _make_time_mark(year, month, day):
    """Return ``(epoch_ms, span)`` for a dashed vertical marker at midnight of the given date.

    ``epoch_ms`` is the position on a Bokeh datetime axis, in milliseconds since the
    Unix epoch; ``span`` is the ``Span`` annotation located there.
    """
    # Bokeh positions naive datetimes as if they were UTC. time.mktime() would
    # interpret the date in the kernel's local time zone instead, shifting the
    # marker by the UTC offset and making the result machine-dependent.
    # pd.Timestamp(...).value is nanoseconds since the epoch for the naive wall time.
    epoch_ms = pd.Timestamp(year=year, month=month, day=day).value / 10**6
    span = Span(location=epoch_ms,
                dimension='height', line_color='black',
                line_dash='dashed', line_width=2, line_alpha=0.3)
    return epoch_ms, span


# NOTE(review): judging by the names, these mark the start of Madrid Central, the
# start of fines and the end of the scheme — confirm the dates against the project notes.
startMC, startMC_span = _make_time_mark(2018, 11, 30)
finesMC, finesMC_span = _make_time_mark(2019, 3, 15)
endMC, endMC_span = _make_time_mark(2019, 7, 1)

def get_month_year(aRow):
    """Format the row's label (a timestamp) as '<month name> <year>', e.g. 'March 2019'."""
    stamp = aRow.name
    return f'{stamp.month_name()} {stamp.year}'

def get_stats_dataframe(station):
    """Compute monthly mean/std statistics of every gas recorded at ``station``.

    Reads the module-level ``df`` (merged measurements) and ``df_magnitud``.

    Side effect: for every gas whose unit is rescaled below, the ``unit_per_m3``
    entry of the module-level ``df_magnitud`` is rewritten to '10μg', so that
    labels built from ``df_magnitud`` afterwards match the rescaled values.

    Parameters
    ----------
    station : str
        Station name, matched against ``df.name``.

    Returns
    -------
    tracking_gas : numpy.ndarray
        Formula names of the gases found for the station.
    df1_stats : pandas.DataFrame
        Indexed by the first day of each month, with columns ``<gas>_mean``,
        ``<gas>_std``, ``<gas>_upper`` (mean + std), ``<gas>_lower`` (mean - std)
        and a display column ``date``.
    """
    df1 = df[(df.name == station)][['formula','value','datetime']]
    # one row per timestamp, one column per gas
    df1 = df1.pivot(index='datetime', columns='formula', values='value').reset_index()
    # bucket every timestamp to midnight of the first day of its month (vectorised)
    df1['datetime'] = df1['datetime'].dt.to_period('M').dt.to_timestamp()
    tracking_gas = df1.columns.values[1:]
    # cut outliers!!!
    # NOTE(review): a comparison with NaN is False, so rows where SO2 or CO is
    # missing are dropped as well — confirm this is intended.
    if station == 'Plaza del Carmen':
        df1 = df1[df1.SO2 <500 ]
        df1 = df1[df1.CO < 10 ]
    # work on an independent frame so the column updates below do not target a slice
    df1 = df1.copy()
    # homogenization: gases reported in mg are rescaled to units of 10 µg (x100;
    # x1000 would give µg exactly)
    for aGas in tracking_gas:
        is_gas = df_magnitud.formula == aGas
        unit = df_magnitud.loc[is_gas, 'unit_per_m3'].values[0]
        # '10μg' is also accepted because an earlier call may already have relabelled
        # the unit, while the values read from `df` are still the original ones
        if unit in ('mg', '10μg'):
            df1[aGas] = df1[aGas] * 100
            df_magnitud.loc[is_gas, 'unit_per_m3'] = '10μg'
    # get all stats
    df1_stats = df1.groupby(['datetime']).agg(['mean','std'])
    # flatten the (gas, statistic) column MultiIndex into '<gas>_<statistic>'
    df1_stats.columns = pd.Index([f'{gas}_{stat}' for gas, stat in df1_stats.columns])
    df1_stats['date'] = df1_stats.apply(get_month_year, axis=1)
    # one-standard-deviation envelope around the mean, used for the shaded area
    for aGas in tracking_gas:
        mean_values = df1_stats[aGas+'_mean']
        std_values = df1_stats[aGas+'_std']
        df1_stats[aGas+'_upper'] = mean_values + std_values
        df1_stats[aGas+'_lower'] = mean_values - std_values
    return tracking_gas, df1_stats

def get_bokeh_viz_evolution_over_time(df1_stats, aText, tracking_gas):
    """Build the Bokeh figure of monthly mean concentration (with a +/- std area) per gas.

    Parameters
    ----------
    df1_stats : pandas.DataFrame
        Must provide, for every gas in ``tracking_gas``, the columns ``<gas>_mean``,
        ``<gas>_std``, ``<gas>_upper`` and ``<gas>_lower``, plus a ``date`` column and
        an index (or column) named ``datetime``.
    aText : str
        Text appended to the figure title.
    tracking_gas : sequence of str
        Gas formula names to draw; each must exist in the module-level ``df_magnitud``.

    Returns
    -------
    bokeh figure with one line, one set of hoverable dots and one shaded area per gas,
    an interactive legend, and the three module-level time-mark spans.
    """
    cds_stats = ColumnDataSource(data=df1_stats)

    p = figure(
        x_axis_type="datetime",
        width=950,
        height=450,
        title='Evolution of pollutant concentrations over time in '+aText, 
        y_axis_label='Gas Concentration', 
        x_axis_label='Date'
    )

    # create color palette
    # Brewer 'Dark2' is only defined for 3 to 8 colours: clamp the lookup and cycle
    # through the palette so that fewer than 3 or more than 8 gases do not raise KeyError.
    palette = bokeh.palettes.brewer['Dark2'][min(max(len(tracking_gas), 3), 8)]
    colors_gas = {aGas: palette[i % len(palette)] for i, aGas in enumerate(tracking_gas)}

    # add the data of each gas + interactive legend
    lines, circles, bands = {}, {}, {}
    items = [] 
    for aGas in tracking_gas:
        unit = df_magnitud[df_magnitud.formula==aGas].unit_per_m3.values[0]
        # add line of mean
        lines[aGas] = p.line('datetime', aGas+'_mean', source=cds_stats, color = colors_gas[aGas])
        # add dots of mean
        circles[aGas] = p.circle('datetime',aGas+'_mean', source=cds_stats, color=colors_gas[aGas], size=5, alpha=0.3)
        # field names are wrapped in braces (@{...}) so that column names containing
        # characters such as '.' are still resolved by the hover tool
        p.add_tools(HoverTool(tooltips=[
            ('Gas',aGas),
            ('Date', '@date'),
            ('Average value', f'@{{{aGas}_mean}} {unit}/m3'), 
            ('Standard Deviation', f'@{{{aGas}_std}} {unit}/m3')
        ], renderers=[circles[aGas]]))
        # add the mean +/- std area
        bands[aGas] = p.varea(x='datetime', y1=aGas+'_upper', y2=aGas+'_lower', source=cds_stats, fill_alpha=0.1, fill_color=colors_gas[aGas])
        # append legend list
        items.append((f'{aGas} ({unit}/m3)', [lines[aGas], circles[aGas], bands[aGas]]))

    # add legend (clicking an entry hides all renderers of that gas)
    legend = Legend(items=items, location='top_left')
    legend.click_policy="hide"
    p.add_layout(legend)

    # add annotations to plot
    p.add_layout(startMC_span)
    p.add_layout(finesMC_span)
    p.add_layout(endMC_span)
    
    return p

In [55]:
station = 'Plaza del Carmen'

# use `station` for both the statistics and the title so the two cannot diverge
all_tracking_gas, df_stats_plaza_del_carmen = get_stats_dataframe(station)
p_plaza_del_carmen = get_bokeh_viz_evolution_over_time(df_stats_plaza_del_carmen, station, all_tracking_gas)
show(p_plaza_del_carmen)

# output_file("html_plots/air_quality_evolution_centro.html")
# save(p)

In [76]:
station = 'El Pardo'

# NOTE(review): the variables below are named *_retiro but hold the results for
# `station` ('El Pardo'), not for the 'Retiro' station.
all_tracking_gas, df_stats_retiro = get_stats_dataframe(station)
p_retiro = get_bokeh_viz_evolution_over_time(df_stats_retiro, station, all_tracking_gas)
show(p_retiro)

# output_file("html_plots/air_quality_evolution_centro.html")
# save(p)

### Compare with the same month of the previous year

In [109]:
def get_df_comparison(station):
    """Percentage change of each gas's monthly mean versus the same month one year earlier.

    Reads the module-level ``df`` (merged measurements).

    Parameters
    ----------
    station : str
        Station name, matched against ``df.name``.

    Returns
    -------
    tracking_gas : numpy.ndarray
        Formula names of the gases found for the station.
    df2_ratios : pandas.DataFrame
        Indexed by ``datetime`` (first day of each month, years after 2016 only).
        One column per gas holding ``100 * (current - previous) / previous``, plus
        ``month``, ``year`` and a display column ``date``. Months without a
        monthly mean one year earlier are left out.
    """
    df1 = df[(df.name == station)][['formula','value','datetime']]
    # one row per timestamp, one column per gas
    df1 = df1.pivot(index='datetime', columns='formula', values='value').reset_index()
    # bucket every timestamp to midnight of the first day of its month (vectorised)
    df1['datetime'] = df1['datetime'].dt.to_period('M').dt.to_timestamp()
    tracking_gas = df1.columns.values[1:]
    
    # cut outliers!!!
    # NOTE(review): a comparison with NaN is False, so rows where SO2 or CO is
    # missing are dropped as well — confirm this is intended.
    if station == 'Plaza del Carmen':
        df1 = df1[df1.SO2 <500 ]
        df1 = df1[df1.CO < 10 ]

    # get mean
    df2_mean = df1.groupby(['datetime']).mean()

    # Baseline = the monthly means re-labelled one year later, then aligned on the
    # current months. This replaces a per-row lookup that returned None for months
    # without a baseline, which broke the shape of the apply() result.
    shifted_index = df2_mean.index + pd.DateOffset(years=1)
    previous_year = df2_mean.copy()
    previous_year.index = shifted_index
    previous_year = previous_year.reindex(df2_mean.index)

    # get percentage, only where a baseline month exists and the year is after 2016
    comparable = df2_mean.index.isin(shifted_index) & (df2_mean.index.year > 2016)
    df2_ratios = (100 * (df2_mean - previous_year) / previous_year).loc[comparable].copy()
    df2_ratios["month"] = df2_ratios.index.month
    df2_ratios["year"] = df2_ratios.index.year

    # get date display ('<month name> <year>'); built from the index so that an
    # empty result is handled as well
    df2_ratios['date'] = [f'{stamp.month_name()} {stamp.year}' for stamp in df2_ratios.index]
    
    return tracking_gas, df2_ratios

def get_bokeh_viz_ratios(df2_ratios, aText, tracking_gas):
    """Build the Bokeh figure of year-over-year percentage change per gas.

    Parameters
    ----------
    df2_ratios : pandas.DataFrame
        Must provide one column per gas in ``tracking_gas``, a ``date`` column and an
        index (or column) named ``datetime``.
    aText : str
        Text appended to the figure title.
    tracking_gas : sequence of str
        Gas formula names to draw; each must exist in the module-level ``df_magnitud``.

    Returns
    -------
    bokeh figure with one line and one set of hoverable dots per gas, a horizontal
    line at 0, the three module-level time-mark spans and an interactive legend.
    """
    cds_ratios = ColumnDataSource(data=df2_ratios)
    
    # create color palette
    # Brewer 'Dark2' is only defined for 3 to 8 colours: clamp the lookup and cycle
    # through the palette so that fewer than 3 or more than 8 gases do not raise KeyError.
    palette = bokeh.palettes.brewer['Dark2'][min(max(len(tracking_gas), 3), 8)]
    colors_gas = {aGas: palette[i % len(palette)] for i, aGas in enumerate(tracking_gas)}

    # the plotted values compare each month with the same month of the previous
    # year, so the axis label says so (it previously said "previous month")
    p = figure(
        width=950, 
        height=450, 
        x_axis_type="datetime",
        title='Comparisons of pollutant concentrations with the month of previous year in '+aText, 
        y_axis_label='+/- percentage based on the same month of the previous year', 
        x_axis_label='Date'
    )

    # add the data of each gas + interactive legend
    items = [] 
    lines, circles = {}, {}
    for aGas in tracking_gas:
        unit = df_magnitud[df_magnitud.formula==aGas].unit_per_m3.values[0]
        # add line
        lines[aGas] = p.line('datetime', aGas, source=cds_ratios, color = colors_gas[aGas])
        # add dots
        circles[aGas] = p.circle('datetime',aGas, source=cds_ratios, color=colors_gas[aGas], size=5, alpha=0.3)
        # field names are wrapped in braces (@{...}) so that column names containing
        # characters such as '.' are still resolved by the hover tool
        p.add_tools(HoverTool(tooltips=[
            ('Gas',aGas),
            ('Date', '@date'),
            ('Previous year comparison', f'@{{{aGas}}}%')
        ], renderers=[circles[aGas]]))
        # append legend item
        items.append((f'{aGas} ({unit}/m3)', [lines[aGas], circles[aGas]]))

    # add annotations to plot
    p.add_layout(startMC_span)
    p.add_layout(finesMC_span)
    p.add_layout(endMC_span)

    # add "zero" annotation (no change with respect to the previous year)
    zero_line_span = Span(location=0, dimension='width', line_color='red', line_width=1, line_alpha=0.3)
    p.add_layout(zero_line_span)

    legend = Legend(items=items, location='top_left')
    legend.click_policy="hide"
    p.add_layout(legend)
    
    return p

In [110]:
station = 'Plaza del Carmen'

# Percentage change versus the previous year for this station, then the matching figure.
all_tracking_gas, df_ratios_carmen = get_df_comparison(station)
p_ratios_carmen = get_bokeh_viz_ratios(df_ratios_carmen, station, all_tracking_gas)

show(p_ratios_carmen)

#output_file("html_plots/air_quality_evolution_with_previous_month_centro.html")
#save(p)

In [111]:
around_stations = ['Plaza de España', 'Castellana','Retiro','Méndez Álvaro']

# one ratio table per station, collected in a list and concatenated once
ratios_per_station = []
for aStation in around_stations:
    all_tracking_gas, df_ratios = get_df_comparison(aStation)
    ratios_per_station.append(df_ratios)

df_ratios_around = pd.concat(ratios_per_station)
# keep only the columns that have a value for every station and month
df_ratios_around = df_ratios_around.dropna(axis=1, how="any")

# average across stations per month; numeric_only skips the string 'date' column,
# which would otherwise make mean() raise a TypeError on pandas >= 2
df_ratios_around = df_ratios_around.groupby('datetime').mean(numeric_only=True)
df_ratios_around['date'] = df_ratios_around.apply(get_month_year, axis=1)

p_ratios_around = get_bokeh_viz_ratios(df_ratios_around, ', '.join(around_stations), ['NO','NO2','NOx'])
show(p_ratios_around)

In [112]:
far_stations = ['El Pardo', 'Barajas Pueblo', 'Villaverde', 'Ensanche de Vallecas']

# one ratio table per station, collected in a list and concatenated once
ratios_per_station = []
for aStation in far_stations:
    all_tracking_gas, df_ratios = get_df_comparison(aStation)
    ratios_per_station.append(df_ratios)

df_ratios_far = pd.concat(ratios_per_station)
# keep only the columns that have a value for every station and month
df_ratios_far = df_ratios_far.dropna(axis=1, how="any")

# average across stations per month; numeric_only skips the string 'date' column,
# which would otherwise make mean() raise a TypeError on pandas >= 2
df_ratios_far = df_ratios_far.groupby('datetime').mean(numeric_only=True)
df_ratios_far['date'] = df_ratios_far.apply(get_month_year, axis=1)

In [104]:
# Plot the station-averaged ratios of the "far" stations for the four listed gases.
# NOTE(review): each listed gas must still be a column of df_ratios_far after the
# dropna(axis=1, how="any") applied when the table was built.
p_ratios_far = get_bokeh_viz_ratios(df_ratios_far, ', '.join(far_stations), ['NO','NO2','NOx', 'O3'])
show(p_ratios_far)

## Animated Map

In [None]:
import folium
from folium import plugins

In [None]:
selected_air_formula = 'NO2'

# rows of the selected gas, restricted to the columns the map needs, with month/year
# columns added; then the mean per station, year and month, plus a constant day
# column so that a date can be assembled from (year, month, day)
df_map = (
    df.loc[df.formula == selected_air_formula, ['value', 'datetime', 'name', 'longitude', 'latitude']]
    .assign(month=lambda frame: frame.datetime.dt.month,
            year=lambda frame: frame.datetime.dt.year)
    .groupby(['name','year','month'])
    .mean()
    .reset_index()
    .assign(day=1)
)
df_map['date'] = pd.to_datetime(df_map[['month','year','day']])

# every circle uses the same fill colour
df_map['fillColor'] = '#53c688'

# circle radius: min-max scaling of 'value' onto [0, radius]
radius=20
min_value, max_value = min(df_map.value), max(df_map.value)
value_span = max_value - min_value
df_map['radius'] = radius*(df_map['value']-min_value)/value_span

In [None]:
def create_geojson_features(df):
    """Turn each row of ``df`` into a GeoJSON point feature carrying a timestamp and a circle style.

    Every row must provide 'longitude', 'latitude', 'date', 'fillColor' and 'radius'.
    Returns the list of feature dicts, in row order.

    source: https://www.linkedin.com/pulse/visualizing-nyc-bike-data-interactive-animated-maps-folium-toso/
    """
    def as_feature(row):
        # circle style: per-row fill colour and radius
        icon_style = {
            'fillColor': row['fillColor'],
            'fillOpacity': 0.8,
            'stroke': 'true',
            'radius': row['radius']
        }
        # GeoJSON coordinates are ordered (longitude, latitude)
        location = {
            'type':'Point', 
            'coordinates':[row['longitude'],row['latitude']]
        }
        return {
            'type': 'Feature',
            'geometry': location,
            'properties': {
                'time': str(row['date']),
                'style': {'color' : ''},
                'icon': 'circle',
                'iconstyle': icon_style
            }
        }

    return [as_feature(row) for _, row in df.iterrows()]

# Feature list built from the monthly per-station table, consumed by the animated map layer.
air_geojson = create_geojson_features(df_map)

In [None]:
from folium.plugins import TimestampedGeoJson

# centre of the base map
madrid_lat, madrid_long = 40.416775, -3.703790

madrid_map = folium.Map(
    location=[madrid_lat, madrid_long],
    tiles="CartoDB Positron",
    zoom_start=11,
)

# animated layer: one step per month, each feature shown for one month
timeline_layer = TimestampedGeoJson(
    air_geojson,
    period='P1M',
    duration='P1M',
    date_options='MM/YYYY',
    transition_time=400,
    loop_button=True,
    auto_play=True,
)
timeline_layer.add_to(madrid_map)

# one marker with an empty icon per station, so that only its popup
# (station name and whether it lies inside Madrid Central) is available
for i, aRow in df_stations.iterrows():
    popup_text = f"""
    <p><b>{aRow['name']}</b></p>
    <p>{'IN' if aRow['color']=='blue' else 'OUT of '} Madrid Central</p>
    """
    station_marker = folium.Marker(
        [aRow.latitude, aRow.longitude],
        icon=folium.DivIcon(html=""),
        popup=popup_text,
    )
    station_marker.add_to(madrid_map)

madrid_map

In [None]:
# Write the folium map to an HTML file (path relative to the notebook's working directory).
outfp = "html_plots/air_quality_map_v1.html"
madrid_map.save(outfp)

## Visualisation 2: evolution of a given gas with different