In [1]:
# IMPORT
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.path import Path
import numpy as np

# bokeh
import bokeh.palettes
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.models import HoverTool, Legend, ColumnDataSource, Title, Band, LegendItem, CustomJS, Span, Label
from bokeh.models import NumeralTickFormatter
from bokeh.resources import CDN
from bokeh.tile_providers import get_provider, CARTODBPOSITRON
from bokeh.models import GeoJSONDataSource

output_notebook()

# Data Preprocessing

**Clean and export `air_quality_data.csv`**

**Clean and export `air_quality_stations.csv`**

# Visualisations

Load and join tables

In [554]:
# Reference tables: station metadata and the pollutant ("magnitud") dictionary
df_stations = pd.read_csv('shared_data/air_quality/air_quality_stations.csv')
df_magnitud = pd.read_csv('shared_data/air_quality/air_quality_magnitud.csv', sep=';')

# Measurements, with the timestamp column parsed to a datetime dtype
df = pd.read_csv('data/air_quality_data.csv')
df["datetime"] = pd.to_datetime(df["datetime"])

# Left-join the station metadata, then the pollutant description;
# the original join keys are dropped once each merge is done
df = (
    df
    .merge(df_stations, left_on='PUNTO_MUESTREO', right_on='punto_muestreo', how='left')
    .drop('PUNTO_MUESTREO', axis=1)
    .merge(df_magnitud, left_on='MAGNITUD', right_on='magnitud_id', how='left')
    .drop('MAGNITUD', axis=1)
)

# Keep rows dated before 2020-02-29 23:59:59 (i.e. up to February 2020 included)
# whose value is strictly positive (zero and negative readings are discarded as errors)
cutoff = pd.to_datetime('2020-02-29 23:59:59')
df = df[(df.datetime < cutoff) & (df.value > 0)]

df.head()

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,value,datetime,punto_muestreo,name,longitude,latitude,altitude,utm_x,utm_y,magnitud_id,formula,unit_per_m3
0,28,79,4,7.0,2016-04-01 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1,SO2,µg
1,28,79,4,8.0,2016-04-02 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1,SO2,µg
2,28,79,4,10.0,2016-04-03 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1,SO2,µg
3,28,79,4,7.0,2016-04-04 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1,SO2,µg
4,28,79,4,8.0,2016-04-05 01:00:00,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,1,SO2,µg


Get the names of air quality stations:

In [555]:
# Distinct station names found in the merged measurements
station_names = df.name.unique()
print(f'AIR QUALITY STATION NAMES {len(station_names)}:')
station_names

AIR QUALITY STATION NAMES 24:


array(['Plaza de España', 'Escuelas Aguirre', 'Ramón y Cajal',
       'Arturo Soria', 'Villaverde', 'Farolillo', 'Casa de Campo',
       'Barajas Pueblo', 'Plaza del Carmen', 'Moratalaz',
       'Cuatro Caminos', 'Barrio del Pilar', 'Vallecas', 'Méndez Álvaro',
       'Castellana', 'Retiro', 'Plaza Castilla', 'Ensanche de Vallecas',
       'Urbanización Embajada', 'Plaza Elíptica', 'Sanchinarro',
       'El Pardo', 'Juan Carlos I', 'Tres Olivos'], dtype=object)

### Air Quality Stations (map)

In [556]:
# Outline of the Madrid Central (MC) area, read as a sequence of (utm_x, utm_y) vertices
cm_points = pd.read_csv('shared_data/districts/central_madrid_points.csv')
xy_columns = ["utm_x", "utm_y"]
path = Path(cm_points[xy_columns].values)

# Test each station position against that outline
points = df_stations[xy_columns].values
points_in_path_mask = path.contains_points(points)

# blue = inside MC, red = outside MC
df_stations["color"] = np.where(points_in_path_mask, "blue", "red")
df_stations.head()

Unnamed: 0,punto_muestreo,name,longitude,latitude,altitude,utm_x,utm_y,color
0,28079035,Plaza del Carmen,-3.703167,40.419208,660,-412234.627656,4927049.0,blue
1,28079004,Plaza de España,-3.712197,40.423883,637,-413239.904502,4927732.0,red
2,28079039,Barrio del Pilar,-3.711536,40.478233,676,-413166.30995,4935683.0,red
3,28079008,Escuelas Aguirre,-3.682283,40.421564,672,-409909.905623,4927393.0,red
4,28079038,Cuatro Caminos,-3.707122,40.445547,699,-412674.958086,4930901.0,red


In [557]:
# plot map: stations drawn over a tile background, coloured by their position
# relative to the Madrid Central area
p = figure(title="Air quality stations in Madrid", x_axis_type="mercator", y_axis_type="mercator")

# one data source and one renderer per group, so that each group gets its own legend entry
source_in = ColumnDataSource(df_stations[df_stations.color == 'blue'])
source_out = ColumnDataSource(df_stations[df_stations.color == 'red'])
cr_in = p.circle(x="utm_x", y="utm_y", size=10, color="color", source=source_in)
cr_out = p.circle(x="utm_x", y="utm_y", size=10, color="color", source=source_out)

cartodb = get_provider(CARTODBPOSITRON)
p.add_tile(cartodb)

p.add_tools(HoverTool(tooltips=[('Name', '@name')], renderers=[cr_in, cr_out]))

# add interactive legend (clicking an entry hides the matching group)
legend = Legend(
    items=[('IN Madrid Central area', [cr_in]), ('OUT of Madrid Central area', [cr_out])],
    location='top_left',
    click_policy='hide',
)
p.add_layout(legend)

show(p)

# Export with an explicit filename rather than output_file(): output_file() switches
# bokeh's global output state to file mode, so every later show() in the notebook
# would also write to (and overwrite) this HTML file with another figure.
save(
    p,
    filename="html_plots/air_quality_stations.html",
    resources=CDN,
    title="Air quality stations in Madrid",
)

'C:\\Users\\Laurine\\Documents\\DTU Python\\S2\\Social Data Analysis and Visualisation\\final project\\socialdata_madridcentral\\html_plots\\air_quality_stations.html'

### Compare two districts

## Visualisation 1: evolution of traffic-related air pollutants (TRAP) in Centro over time

*Compute the monthly mean and standard deviation of the measured value for each tracked gas.*

In [672]:
# station whose measurements are analysed in the cells below
station = 'Plaza del Carmen'
# formulas of the gases that are tracked
tracking_gas = ['CO', 'NO2', 'O3', 'SO2']

In [673]:
# Measurements of the tracked gases at the selected station, reshaped to one column per gas
is_selected = (df.name == station) & (df.formula.isin(tracking_gas))
df1 = (
    df.loc[is_selected, ['formula', 'value', 'datetime']]
    .pivot(index='datetime', columns='formula', values='value')
    .reset_index()
)
# Snap every timestamp to the first day of its month at 00:00, so rows can later be grouped per month
df1['datetime'] = df1['datetime'].dt.to_period('M').dt.to_timestamp()

In [674]:
# inspect rows whose SO2 reading exceeds 500 (outlier check)
df1.loc[df1['SO2'] > 500]

formula,datetime,CO,NO2,O3,SO2
15950,2017-11-01,0.2,75.0,23.72,84530.0


In [675]:
# inspect rows whose CO reading exceeds 10 (outlier check)
df1.loc[df1['CO'] > 10]

formula,datetime,CO,NO2,O3,SO2
15114,2017-10-01,20.8,120.0,2.15,18.0
15115,2017-10-01,28.5,63.0,23.52,18.0
15116,2017-10-01,30.2,45.0,34.09,17.0
15117,2017-10-01,33.9,58.0,25.13,17.0
15118,2017-10-01,35.6,43.0,30.85,17.0
15119,2017-10-01,33.5,47.0,25.31,17.0
15120,2017-10-01,20.6,39.0,34.17,17.0
15121,2017-10-01,11.7,42.0,31.08,17.0


In [676]:
# Cut the outliers: keep only rows with SO2 below 500 and CO below 10.
# Rows where SO2 or CO is missing are removed too, because NaN fails both comparisons.
df1 = df1[(df1.SO2 < 500) & (df1.CO < 10)]

In [677]:
# Unit homogenisation: a gas whose unit is 'mg' is multiplied by 100, i.e. expressed in
# multiples of 10 micrograms (1 mg = 100 x 10 micrograms), and the magnitude table is
# relabelled accordingly. Multiplying by 1000 instead would give plain micrograms.
# NOTE(review): the relabelled unit also matches the condition below, so running this cell
# again rescales the same column once more unless df1 has been rebuilt first - confirm
# this is the intended way to support re-runs.
for aGas in df1.columns.values[1:]:
    gas_rows = df_magnitud[df_magnitud.formula == aGas]
    unit = gas_rows.unit_per_m3.values[0]
    if unit in ('mg', '10μg'):
        df1[aGas] = df1[aGas] * 100
        idx = gas_rows.index
        df_magnitud.loc[idx, 'unit_per_m3'] = '10μg'

In [678]:
# Mean and standard deviation of every gas for each month
df1_stats = df1.groupby(['datetime']).agg(['mean', 'std'])
# Flatten the (gas, statistic) column pairs into single names such as 'CO_mean'
df1_stats.columns = pd.Index([f'{gas}_{stat}' for gas, stat in df1_stats.columns.to_flat_index()])

In [679]:
def get_month_year(aRow):
    """Return a label such as 'January 2016' built from the row's index label.

    ``aRow`` is a row as passed by ``DataFrame.apply(..., axis=1)``; its ``name``
    (the index label of that row) must be a timestamp.
    """
    stamp = aRow.name
    return f'{stamp.month_name()} {stamp.year}'

# human-readable "Month Year" label for each row
df1_stats = df1_stats.assign(date=df1_stats.apply(get_month_year, axis=1))

In [680]:
# Envelope of one standard deviation around the monthly mean of each tracked gas
for aGas in tracking_gas:
    meanColumn, stdColumn = f'{aGas}_mean', f'{aGas}_std'
    monthly_mean, monthly_std = df1_stats[meanColumn], df1_stats[stdColumn]
    df1_stats[aGas+'_upper'] = monthly_mean + monthly_std
    df1_stats[aGas+'_lower'] = monthly_mean - monthly_std

In [681]:
# preview the first rows of the monthly statistics
df1_stats.head(5)

Unnamed: 0_level_0,CO_mean,CO_std,NO2_mean,NO2_std,O3_mean,O3_std,SO2_mean,SO2_std,date,CO_upper,CO_lower,NO2_upper,NO2_lower,O3_upper,O3_lower,SO2_upper,SO2_lower
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2016-01-01,40.861373,21.769706,45.985195,20.628365,23.598197,17.309997,8.12786,3.751523,January 2016,62.631079,19.091667,66.61356,25.35683,40.908194,6.288199,11.879383,4.376337
2016-02-01,37.715517,20.576849,42.564655,22.30163,23.096273,12.555148,10.965517,11.855313,February 2016,58.292366,17.138668,64.866286,20.263025,35.651421,10.541126,22.82083,-0.889796
2016-03-01,35.222073,14.196273,45.866756,20.416413,31.79891,17.734705,10.61642,4.222849,March 2016,49.418346,21.025799,66.28317,25.450343,49.533614,14.064205,14.839269,6.393571
2016-04-01,29.5,9.726439,41.590278,17.17229,40.560946,18.886012,12.375,4.638046,April 2016,39.226439,19.773561,58.762567,24.417988,59.446958,21.674934,17.013046,7.736954
2016-05-01,29.771505,10.861985,38.678763,20.216933,44.898387,21.055534,13.033602,4.388312,May 2016,40.63349,18.909521,58.895697,18.46183,65.953921,23.842853,17.421914,8.64529


*Generate Bokeh figure*

In [682]:
# create annotations for time marks
# The locations are computed with pandas only: the `time` / `dt` names are imported in a
# later cell, so relying on them here fails on a fresh kernel. time.mktime() also reads the
# date in the machine's local timezone, which moves the marks by the local UTC offset;
# a naive pd.Timestamp is converted as UTC instead.
def _epoch_ms(year, month, day):
    """Milliseconds since the Unix epoch for midnight of the given date (naive, read as UTC)."""
    return pd.Timestamp(year=year, month=month, day=day).timestamp() * 1000


def _time_mark(location):
    """Dashed, semi-transparent vertical line placed at ``location`` on the x axis."""
    return Span(location=location,
                dimension='height', line_color='black',
                line_dash='dashed', line_width=2, line_alpha=0.3)


# NOTE(review): going by the variable names these dates are presumably the start of
# Madrid Central (MC), the start of fines and the end of MC - confirm.
startMC = _epoch_ms(2018, 11, 30)
startMC_span = _time_mark(startMC)

finesMC = _epoch_ms(2019, 3, 15)
finesMC_span = _time_mark(finesMC)

endMC = _epoch_ms(2019, 7, 1)
endMC_span = _time_mark(endMC)

In [683]:
cds_stats = ColumnDataSource(data=df1_stats)

p = figure(
    x_axis_type="datetime",
    width=950,
    height=450,
    title='Evolution of pollutant concentrations over time in Centro, Madrid',
    y_axis_label='Gas Concentration',
    x_axis_label='Date'
)

# create color palette: one 'Dark2' colour per tracked gas.
# It is built from `tracking_gas` (the previous `df1_mean` frame is not defined anywhere
# in the notebook); brewer palettes start at 3 colours, hence the max().
colors_gas = dict(zip(tracking_gas, bokeh.palettes.brewer['Dark2'][max(3, len(tracking_gas))]))

# add the data of each gas + interactive legend
lines, circles, bands = {}, {}, {}
items = []
for aGas in tracking_gas:
    unit = df_magnitud[df_magnitud.formula==aGas].unit_per_m3.values[0]
    # add line of mean
    lines[aGas] = p.line('datetime', aGas+'_mean', source=cds_stats, color=colors_gas[aGas])
    # add dots of mean (they carry the hover tool)
    circles[aGas] = p.circle('datetime', aGas+'_mean', source=cds_stats, color=colors_gas[aGas], size=5, alpha=0.3)
    p.add_tools(HoverTool(tooltips=[
        ('Gas',aGas),
        ('Date', '@date'),
        ('Average value', f'@{aGas}_mean {unit}/m3'),
        ('Standard Deviation', f'@{aGas}_std {unit}/m3')
    ], renderers=[circles[aGas]]))
    # add the mean +/- one standard deviation area
    bands[aGas] = p.varea(x='datetime', y1=aGas+'_upper', y2=aGas+'_lower', source=cds_stats, fill_alpha=0.1, fill_color=colors_gas[aGas])
    # append legend list
    items.append((f'{aGas} ({unit}/m3)', [lines[aGas], circles[aGas], bands[aGas]]))

# add legend (clicking an entry hides the line, dots and area of that gas)
legend = Legend(items=items, location='top_left', click_policy='hide')
p.add_layout(legend)

# add annotations to plot
p.add_layout(startMC_span)
p.add_layout(finesMC_span)
p.add_layout(endMC_span)

show(p)

# Export with an explicit filename rather than output_file(): output_file() switches
# bokeh's global output state to file mode, so every later show() in the notebook
# would also write to (and overwrite) this HTML file with another figure.
save(
    p,
    filename="html_plots/air_quality_evolution_centro.html",
    resources=CDN,
    title="Evolution of pollutant concentrations in Centro, Madrid",
)

'C:\\Users\\Laurine\\Documents\\DTU Python\\S2\\Social Data Analysis and Visualisation\\final project\\socialdata_madridcentral\\html_plots\\air_quality_evolution_centro.html'

### Compare with the same month of the previous year

In [684]:
from datetime import datetime as dt
import time

In [685]:
# Monthly mean of every gas, indexed by month, with the calendar month and year
# added as helper columns for the year-over-year lookup
df2_mean = df1.groupby(['datetime']).mean()
df2_mean = df2_mean.assign(month=df2_mean.index.month, year=df2_mean.index.year)

# get percentage
def get_progress_percent(aRow):
    """Relative change of a monthly row against the same month one year earlier.

    Looks up, in the module-level ``df2_mean``, the row with the same ``month`` and
    ``year - 1``. Returns ``(aRow - previous) / previous`` as a Series named after
    ``aRow``, with its ``month`` and ``year`` entries reset to those of ``aRow``'s
    timestamp. Returns ``None`` when ``df2_mean`` has no such earlier row.
    """
    same_month_last_year = df2_mean[(df2_mean.month == aRow.month) & (df2_mean.year == aRow.year - 1)]
    if same_month_last_year.empty:
        return None
    reference = same_month_last_year.iloc[0]
    ratios = (aRow - reference) / reference
    # the division also touched the month/year entries; restore them from the row's timestamp
    stamp = aRow.name
    ratios.name = stamp
    ratios['month'] = stamp.month
    ratios['year'] = stamp.year
    return ratios
# Year-over-year ratios for every month after 2016
rows_after_2016 = df2_mean.loc[df2_mean.year > 2016]
df2_ratios = rows_after_2016.apply(get_progress_percent, axis=1)

# get date display ("Month Year" label)
df2_ratios = df2_ratios.assign(date=df2_ratios.apply(get_month_year, axis=1))

# bokeh
cds_ratios = ColumnDataSource(data=df2_ratios)

# The ratios compare each month with the SAME MONTH OF THE PREVIOUS YEAR,
# so the title, axis label and tooltip say so (they used to read "previous month").
p = figure(
    width=950,
    height=450,
    x_axis_type="datetime",
    title='Comparison of pollutant concentrations with the same month of the previous year in Centro, Madrid',
    y_axis_label='+/- percentage based on the same month of the previous year',
    x_axis_label='Date'
)
# the plotted values are fractions (0.1 means +10 %): show the ticks as percentages
# so that they agree with the axis label
p.yaxis.formatter = NumeralTickFormatter(format='0%')

# add the data of each gas + interactive legend
items = []
lines, circles = {}, {}
for aGas in tracking_gas:
    # add line
    lines[aGas] = p.line('datetime', aGas, source=cds_ratios, color=colors_gas[aGas])
    # add dots (they carry the hover tool)
    circles[aGas] = p.circle('datetime', aGas, source=cds_ratios, color=colors_gas[aGas], size=5, alpha=0.3)
    p.add_tools(HoverTool(tooltips=[
        ('Gas',aGas),
        ('Date', '@date'),
        ('Change vs. same month of previous year', '@' + aGas + '{0.0%}')
    ], renderers=[circles[aGas]]))
    # append legend item; the ratios are dimensionless, so no concentration unit is shown
    items.append((aGas, [lines[aGas], circles[aGas]]))

# add annotations to plot
p.add_layout(startMC_span)
p.add_layout(finesMC_span)
p.add_layout(endMC_span)

# add "zero" annotation (no change compared with the previous year)
zero_line_span = Span(location=0, dimension='width', line_color='red', line_width=1, line_alpha=0.3)
p.add_layout(zero_line_span)

legend = Legend(items=items, location='top_left', click_policy='hide')
p.add_layout(legend)

show(p)

# Export with an explicit filename rather than output_file(): output_file() switches
# bokeh's global output state to file mode, so every later show() in the notebook
# would also write to (and overwrite) this HTML file with another figure.
save(
    p,
    filename="html_plots/air_quality_evolution_with_previous_month_centro.html",
    resources=CDN,
    title="Year-over-year comparison of pollutant concentrations in Centro, Madrid",
)

'C:\\Users\\Laurine\\Documents\\DTU Python\\S2\\Social Data Analysis and Visualisation\\final project\\socialdata_madridcentral\\html_plots\\air_quality_evolution_with_previous_month_centro.html'

## Animated Map

In [686]:
import folium
from folium import plugins

In [687]:
selected_air_formula = 'NO2'
# radius given to the station/month with the highest mean value
radius = 20

# Monthly mean value per station for the selected gas.
# Built as one chain on a fresh frame, so no column is assigned on a filtered slice of `df`
# (which triggers SettingWithCopyWarning). Only the numeric columns are averaged:
# what mean() does with the datetime column depends on the pandas version.
df_map = (
    df.loc[df.formula == selected_air_formula, ['value', 'datetime', 'name', 'longitude', 'latitude']]
    .assign(
        month=lambda frame: frame['datetime'].dt.month,
        year=lambda frame: frame['datetime'].dt.year,
    )
    .groupby(['name', 'year', 'month'])[['value', 'longitude', 'latitude']]
    .mean()
    .reset_index()
)
# create a date column (first day of the month) from year and month
df_map['day'] = 1
df_map['date'] = pd.to_datetime(df_map[['month', 'year', 'day']])
# fill in with the basic color
df_map['fillColor'] = '#53c688'
# circle radius: 'value' rescaled linearly from [min, max] to [0, radius]
min_value, max_value = df_map['value'].min(), df_map['value'].max()
value_range = max_value - min_value
if value_range == 0:
    # every mean is identical: use the full radius rather than dividing by zero
    df_map['radius'] = float(radius)
else:
    df_map['radius'] = radius * (df_map['value'] - min_value) / value_range

In [688]:
def create_geojson_features(df):
    """Build one GeoJSON point feature (a plain dict) per row of ``df``.

    Each row must provide ``longitude``, ``latitude``, ``date``, ``fillColor`` and
    ``radius``. The date is stored as a string under ``properties['time']`` and the
    colour / radius under ``properties['iconstyle']``.

    source: https://www.linkedin.com/pulse/visualizing-nyc-bike-data-interactive-animated-maps-folium-toso/
    """
    def to_feature(row):
        # circle style of this point
        icon_style = {
            'fillColor': row['fillColor'],
            'fillOpacity': 0.8,
            'stroke': 'true',
            'radius': row['radius']
        }
        return {
            'type': 'Feature',
            'geometry': {
                'type':'Point',
                'coordinates':[row['longitude'],row['latitude']]
            },
            'properties': {
                'time': str(row['date']),
                'style': {'color' : ''},
                'icon': 'circle',
                'iconstyle': icon_style
            }
        }

    return [to_feature(row) for _, row in df.iterrows()]

# one GeoJSON feature per station and month
air_geojson = create_geojson_features(df=df_map)

In [689]:
from folium.plugins import TimestampedGeoJson

# map centre
madrid_lat, madrid_long = 40.416775, -3.703790

madrid_map = folium.Map(location = [madrid_lat, madrid_long],
                    tiles = "CartoDB Positron",
                    zoom_start = 11)

# TimestampedGeoJson documents file, dict and str inputs only. The bare list of features
# is therefore wrapped in a GeoJSON FeatureCollection dict, which folium serialises to JSON
# before embedding it in the page.
air_feature_collection = {'type': 'FeatureCollection', 'features': air_geojson}

TimestampedGeoJson(air_feature_collection,
                  period = 'P1M',
                  duration = 'P1M',
                  date_options='MM/YYYY',
                  transition_time = 400,
                  loop_button = True,
                  auto_play = True).add_to(madrid_map)

# one marker per station with an empty DivIcon: it draws nothing by itself and only
# carries the popup with the station name and its position relative to Madrid Central
for _, aRow in df_stations.iterrows():
    popup_text = f"""
    <p><b>{aRow['name']}</b></p>
    <p>{'IN' if aRow['color']=='blue' else 'OUT of '} Madrid Central</p>
    """
    folium.Marker(
        [aRow.latitude, aRow.longitude],
        icon=folium.DivIcon(html=""),
        popup = popup_text
    ).add_to(madrid_map)

madrid_map

In [216]:
# write the animated map to a standalone HTML page
outfp = "html_plots/air_quality_map_v1.html"
madrid_map.save(outfile=outfp)

## Visualisation 2: evolution of a given gas with different