# Traffic analysis - Madrid Central

In this notebook we analyze whether traffic inside the Madrid Central area changed significantly when the measure was introduced. For that we analyze data from 2016 until 2021.

## Imports

In [None]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
from matplotlib.path import Path
import json
import zipfile
import io
import os
import utm
import seaborn as sns

from IPython.display import display

from bokeh.plotting import figure, show, output_file, save, reset_output
from bokeh.io import output_notebook
from bokeh.models import HoverTool, Legend, ColumnDataSource,\
                        Title, GeoJSONDataSource, DatetimeTickFormatter,\
                        Span, CheckboxGroup, CustomJS, Button
from bokeh.models.tickers import CategoricalTicker
from bokeh.tile_providers import get_provider, CARTODBPOSITRON
from bokeh.transform import linear_cmap, dodge
from bokeh.layouts import row, column, layout
from datetime import datetime as dt

from tqdm import tqdm

# Render bokeh figures inline in the notebook output cells
output_notebook()
# Seed numpy's global RNG so that np.random-based steps (e.g. the palette shuffle) are repeatable
np.random.seed(42)

## Data download

The data we want to work with is very large, thus we need to download it from the source as it is not possible to upload it to the version control system we use (GitHub). 

In [None]:
def download_data(last_month_id=81, data_path="data"):
    """ Download the monthly traffic archives that are not on disk yet.

        The open-data portal numbers the monthly archives sequentially: ID 32 is
        January 2016, so the default last ID (81) corresponds to February 2020.
        Each archive is a zip; its last member is extracted and stored as
        '{data_path}/{month:02d}-{year}.csv', regardless of the name it has
        inside the archive (some archives do not follow the usual naming).

        Arguments:
            last_month_id -> ID of the last month to download (inclusive)
            data_path     -> relative folder where the csv files are stored

        Raises:
            requests.HTTPError if the portal answers with an error status.
    """
    FIRST_MONTH_ID = 32
    FIRST_YEAR = 2016

    for month_id in tqdm(range(FIRST_MONTH_ID, last_month_id+1), desc="Downloading data", unit="file"):

        # Months elapsed since January 2016 -> (years elapsed, zero-based month)
        years_elapsed, month_index = divmod(month_id - FIRST_MONTH_ID, 12)
        current_month = month_index + 1
        current_year = FIRST_YEAR + years_elapsed

        file_path = f"{data_path}/{current_month:02d}-{current_year}.csv"

        # If it has been downloaded already, skip it
        if os.path.isfile(file_path):
            continue

        url = f"https://datos.madrid.es/egob/catalogo/208627-{month_id}-transporte-ptomedida-historico.zip"
        r = requests.get(url, timeout=60)
        # Fail loudly on 4xx/5xx instead of handing an HTML error page to zipfile
        r.raise_for_status()

        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            zipcsv = z.infolist()[-1]

            # Renaming the member makes extract() write it straight to file_path
            zipcsv.filename = file_path
            z.extract(zipcsv)

In [None]:
# Ask whether the raw files should be downloaded; any non-zero integer means "yes"
answer = input("WRITE '1' TO DOWNLOAD DATA OR '0' TO NOT ").strip()

try:
    do_download = int(answer)
except ValueError:
    # Blank or non-numeric input would otherwise crash the cell; treat it as "no"
    do_download = 0

if do_download:
    download_data()

## Display location of traffic measurement points

Before diving into the actual data, we need some context. Madrid is divided into *21* districts, and the area of **Madrid Central** is exactly the same as the area of the **Centro district** (hence the name).

We have a dataset with the locations of the traffic measurement points. As expected, they are not evenly distributed. Our first task is to find out in which district each traffic measurement point is located.

In [None]:
# Catalogue of measurement points; the file is ';'-separated
traffic_points = pd.read_csv("shared_data/traffic_points/pmed_trafico_03052016.csv", sep=";")
# Preview the first rows only
traffic_points.head()

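First we need to convert the coordinates of each point into the projected (Web Mercator) coordinates that `bokeh` maps expect; they are stored in the `utm_x` and `utm_y` columns.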

In [None]:
def utm_from_latlon(lat, lon):
    """ Project a latitude/longitude pair (in degrees) to the Web Mercator
        x/y coordinates (in metres) that `bokeh` uses for tiled maps.

        Despite the historical name, the result is NOT UTM: it is the
        spherical Mercator projection with the WGS84 semi-major axis as radius.

        Arguments:
            lat -> latitude in degrees (scalar or numpy array)
            lon -> longitude in degrees (scalar or numpy array)

        Returns:
            (x, y) tuple in metres
    """
    r_major = 6378137.000
    x = r_major * np.radians(lon)
    # Computed directly from the radius; deriving the scale as x/lon would
    # divide by zero (and give NaN) for points on the Greenwich meridian
    y = r_major * np.log(np.tan(np.pi/4.0 + np.radians(lat)/2.0))

    return x, y

def get_lat_lon_utm(row):
    """ Convert the 'st_x'/'st_y' UTM coordinates of a row into latitude and
        longitude, plus the projected x/y coordinates used to plot with `bokeh`.

        Returns a Series with, in order: latitude, longitude, projected x, projected y.
    """
    # Zone number 30 and zone letter 'T' are the UTM zone of Madrid
    latitude, longitude = utm.to_latlon(row["st_x"], row["st_y"], 30, "T")

    return pd.Series([latitude, longitude, *utm_from_latlon(latitude, longitude)])

In [None]:
# Row-wise conversion; get_lat_lon_utm returns the four values in this column order
traffic_points[["latitude", "longitude", "utm_x", "utm_y"]] = traffic_points.apply(get_lat_lon_utm, axis=1)
traffic_points.head()

Then load the districts information to display them in the map.

In [None]:
# GeoJSON is UTF-8 by specification; state it explicitly so accented district
# names are not decoded with the platform's default locale encoding
with open("shared_data/districts/districts.geojson", "r", encoding="utf-8") as geojson:
    geodata = json.load(geojson)

In [None]:
# One row per boundary vertex of every district
DISTRICT_COLUMNS = ["name", "latitude", "longitude", "utm_x", "utm_y"]

district_frames = []
for district in geodata["features"]:
    # First ring of the district geometry, one (x, y) pair per vertex
    df_district = pd.DataFrame(district["geometry"]["coordinates"][0], columns=["st_x", "st_y"])
    df_district["name"] = district["properties"]["NOMBRE"]

    # Calculate lat/lon and the projected coordinates used for plotting
    df_district[["latitude", "longitude", "utm_x", "utm_y"]] = df_district.apply(get_lat_lon_utm, axis=1)

    district_frames.append(df_district.drop(columns=["st_x", "st_y"]))

# Concatenate once at the end instead of growing the frame inside the loop
if district_frames:
    df_districts = pd.concat(district_frames, ignore_index=True)[DISTRICT_COLUMNS]
else:
    df_districts = pd.DataFrame([], columns=DISTRICT_COLUMNS)

# Array with the distinct district names, in order of appearance
district_name = df_districts["name"].unique()
df_districts

Save in which district is each traffic point.

In [None]:
# "None" marks the points that do not fall inside any district polygon
traffic_points["district"] = "None"
points = traffic_points[["utm_x", "utm_y"]]

for name in district_name:
    # Polygon built from the boundary vertices of this district
    path = Path(df_districts.loc[df_districts["name"] == name, ["utm_x", "utm_y"]])
    points_in_path_mask = path.contains_points(points)
    traffic_points.loc[points_in_path_mask, "district"] = name

# Keep only the traffic points inside some district of Madrid; the rest are outside the city
traffic_points = traffic_points[traffic_points["district"] != "None"].reset_index(drop=True)

traffic_points.head()

In [None]:
# One palette entry per district
district_colors = sns.color_palette('colorblind', len(district_name))
# In-place shuffle; it uses numpy's global RNG
np.random.shuffle(district_colors)
district_colors

In [None]:
def get_color_from_palette(color):
    """ Scale every channel of a palette colour by 255 and truncate it to an
        integer, returning the channels as a tuple ready for plotting.
        (Presumably the channels are floats in [0, 1], as seaborn palettes provide.)
    """
    return tuple(int(channel * 255) for channel in color)

def get_dark_color_from_palette(color):
    """ Same as the regular palette conversion but scaling every channel by 200
        instead of 255, which yields a darker shade of the colour.
    """
    return tuple(int(channel * 200) for channel in color)

## District map

In [None]:
# Map figure with mercator axes and no default tools
p = figure(title="Districts of Madrid", x_axis_type="mercator", y_axis_type="mercator",
           height=600, width=600, tools="")

# Hide axes and toolbar
p.axis.visible = False
p.toolbar.logo = None
p.toolbar_location = None


for name, color in zip(district_name, district_colors):
    # Districts
    # One patch per district: patches() expects a list of coordinate lists,
    # hence the nested lists with a single polygon each
    source_dict = dict(utm_x = [[x for x in df_districts[df_districts["name"] == name]["utm_x"]]],
                       utm_y = [[y for y in df_districts[df_districts["name"] == name]["utm_y"]]],
                       name = [name])

    source = ColumnDataSource(source_dict)
    p.patches(xs="utm_x", ys="utm_y", color=get_color_from_palette(color), line_width=3, alpha=0.4, 
            source=source, muted=False, muted_alpha=0.1)

# Background map tiles
cartodb = get_provider(CARTODBPOSITRON)
p.add_tile(cartodb)

# The tooltip only shows the district name
TOOLTIPS = [
    ("", "@name"),
]
p.add_tools(HoverTool(tooltips=TOOLTIPS))

p.background_fill_color = None
p.border_fill_color = None

# From here on `p` is the layout wrapping the figure, not the figure itself
p = column(p, sizing_mode='scale_both')

# Write the plot to a standalone HTML file, then switch bokeh's output back to the notebook
output_file("html_plots/districts.html", title="Districts Map")
save(p)

reset_output()

output_notebook()

# show(p)

In [None]:
# Counts are taken after the points outside every district were discarded
print(f"Number of traffic measurement stations: {len(traffic_points)}")
print(f"Number of traffic measurement stations inside Madrid Central: {len(traffic_points[traffic_points['district'] == 'Centro'])}")

In [None]:
# Inspect the measurement points together with their assigned district
traffic_points

In [None]:
# Two colours: index 0 is used for points outside Madrid Central, index 1 for points inside
colors = sns.color_palette('Set2', 2)
# np.random.shuffle(plot_colors)
colors

In [None]:
# Map of every measurement station, highlighting the ones inside Madrid Central
p = figure(title="Traffic measurement stations in Madrid", x_axis_type="mercator", y_axis_type="mercator",
           height=700, width=800)
p.axis.visible = False

# for name, color in zip(district_name, district_colors):
#     # Districts
#     source = ColumnDataSource(df_districts[df_districts["name"] == name])
#     p.patch(x="utm_x", y="utm_y", color=get_color_from_palette(color), line_width=3, alpha=0.4, 
#             source=source, legend_label=name, muted=True, muted_alpha=0.1)
#     # Traffic points
#     source = ColumnDataSource(traffic_points[traffic_points["district"] == name])
#     p.circle(x="utm_x", y="utm_y", color=get_color_from_palette(color), line_width=1,
#             source=source, legend_label=name, muted=True, muted_alpha=0.3, radius=30,
#             line_color=get_dark_color_from_palette(color))
    
# Madrid Central
# Boundary of the Centro district drawn as a line
source = ColumnDataSource(df_districts[df_districts["name"] == "Centro"])
p.line(x="utm_x", y="utm_y", color=get_color_from_palette(colors[1]), line_width=4, 
        source=source, legend_label="Madrid Central limit", muted=False, muted_alpha=0.3)

# Stations outside the Centro district
source = ColumnDataSource(traffic_points[traffic_points["district"] != "Centro"])
circles_out = p.circle(x="utm_x", y="utm_y", color=get_color_from_palette(colors[0]), line_width=1,
        source=source, muted_alpha=0.3, size=5,
        line_color=get_dark_color_from_palette(colors[0]), legend_label="OUT of Madrid Central Area")

# Stations inside the Centro district
source = ColumnDataSource(traffic_points[traffic_points["district"] == "Centro"])
circles_in = p.circle(x="utm_x", y="utm_y", color=get_color_from_palette(colors[1]), line_width=1,
        source=source, muted_alpha=0.3, size=5,
        line_color=get_dark_color_from_palette(colors[1]), legend_label="IN Madrid Central Area")


# Hover tooltip
# Restricted to the two circle renderers, so the boundary line does not trigger it
TOOLTIPS = [
    ("Name", "@nombre"),
    ("District", "@district")
]
p.add_tools(HoverTool(tooltips=TOOLTIPS, renderers=[circles_out, circles_in]))

cartodb = get_provider(CARTODBPOSITRON)
p.add_tile(cartodb)
# p.add_layout(p.legend[0], "right")
# Clicking a legend entry mutes the corresponding glyphs
p.legend.click_policy = "mute"

# From here on `p` is the layout wrapping the figure
p = column(p, sizing_mode='scale_both')

# output_file("html_plots/traffic_points.html", title="Traffic stations")
# save(p)

# reset_output()

# output_notebook()

show(p)

## Loading traffic information

The next step is to finally load the traffic datasets. These datasets have a lot of rows, as each of the more than 3000 measurement points records multiple parameters every 15 minutes, so a rough approximation of how many rows each monthly file has is:

$$ 30(days) \cdot 24(hours) \cdot 4(measures\_per\_hour) \cdot 3000(traffic\_points) = 8640000 $$

And once again, if we take into account that we are using data from 2016 until the end of 2021, a more accurate row count would be:

$$ 6(years) \cdot 365(days) \cdot 24(hours) \cdot 4(measures\_per\_hour) \cdot 3000(traffic\_points) = 630720000 $$

This amount of data (more than 630 million rows) is too large to handle efficiently in a notebook, and such a fine granularity is not needed to compare how traffic evolves in each district. To reduce the number of rows, we keep only the average traffic intensity (number of cars) per day in each district. That way, we will have:

$$ 6(years) \cdot 365(days) \cdot 21(number\_districts) = 45990 $$

which is a much more manageable number, from which we aim to extract the relevant information in the data: around 13714 times less data.

In [None]:
def process_traffic_data(filepath, traffic_points_df):
    """ Process one traffic data file. The objective of this preprocess is to reduce
        the size of the data, keeping only one value per district per day.

        The intensity is first averaged per measurement point and day, and those
        per-point averages are then averaged per district and day.

        Arguments:
            filepath          -> path to load the csv
            traffic_points_df -> traffic_points dataset (where they are located);
                                 must contain the columns 'idelem' and 'district'

        Returns:
            DataFrame with the columns 'district', 'day', 'month', 'year',
            'mean_intensity', 'date' and 'day_of_week'
    """

    # Load file
    traffic_df = pd.read_csv(filepath, sep=";")

    # Some files are separated by ',' instead of ';'. Those parse as a single
    # column, so we reread the file when that happens
    if len(traffic_df.columns) == 1:
        traffic_df = pd.read_csv(filepath, sep=",")

    # If the 'idelem' column does not exist, it is because it is called 'id', so rename it
    if "idelem" not in traffic_df.columns:
        traffic_df = traffic_df.rename(columns = {'id':'idelem'})

    # Use only the traffic points for which we have location information, and only
    # the columns needed below. The explicit copy makes the following column
    # assignments write to an independent frame instead of a slice of the raw one
    known_point = traffic_df["idelem"].isin(traffic_points_df["idelem"])
    traffic_df = traffic_df.loc[known_point, ["idelem", "fecha", "intensidad"]].copy()

    # Transform date to datetime type
    traffic_df["fecha"] = pd.to_datetime(traffic_df["fecha"])

    # Get date in separate columns
    traffic_df["day"] = traffic_df["fecha"].dt.day
    traffic_df["month"] = traffic_df["fecha"].dt.month
    traffic_df["year"] = traffic_df["fecha"].dt.year

    # Group by id and date, up to day, and get the average intensity per traffic point
    point_daily = traffic_df.groupby(["idelem",
                                      "day",
                                      "month",
                                      "year"]).agg(mean_intensity=("intensidad", "mean")).reset_index()

    # Merge with the traffic points to get the district for each point
    point_daily = point_daily.merge(traffic_points_df[["idelem", "district"]], on="idelem")

    # Group by again, to get only one value per district per day. The column is
    # selected before averaging so that no other column is aggregated needlessly
    district_daily = point_daily.groupby(["district", "day", "month", "year"])["mean_intensity"].mean().reset_index()

    # Get the date and day of the week for plotting purposes
    district_daily["date"] = pd.to_datetime(district_daily[["day", "month", "year"]])
    district_daily["day_of_week"] = district_daily["date"].dt.day_name()

    return district_daily

In [None]:
def load_all_trafic_data(traffic_points_df):
    """ Load and process every traffic csv found in the data folder and return
        them as a single DataFrame sorted by district and date.

        Arguments:
            traffic_points_df -> traffic_points dataset (where they are located)

        Returns:
            DataFrame with the columns 'district', 'date', 'day_of_week', 'day',
            'month', 'year' and 'mean_intensity' (empty if there are no csv files)
    """

    DATA_PATH = "data"
    COLUMNS = ["district", "date", "day_of_week",
               "day", "month", "year", "mean_intensity"]

    # Only csv files: the folder may also contain checkpoints or other hidden files
    csv_names = sorted(name for name in os.listdir(DATA_PATH) if name.lower().endswith(".csv"))

    # Collect the processed frames and concatenate once, instead of growing a
    # DataFrame inside the loop
    frames = [process_traffic_data(os.path.join(DATA_PATH, name), traffic_points_df)
              for name in tqdm(csv_names, desc="Processing files", unit="file")]

    if not frames:
        return pd.DataFrame([], columns=COLUMNS)

    traffic_data = pd.concat(frames)[COLUMNS]

    return traffic_data.sort_values(by=["district", "date"]).reset_index(drop=True)

In [None]:
df_path = "shared_data/traffic_intensity.csv"

# Build the aggregated table only when there is no cached copy on disk;
# otherwise read the cache and restore the datetime type of 'date'
if not os.path.isfile(df_path):
    total_traffic_df = load_all_trafic_data(traffic_points)
    total_traffic_df.to_csv(df_path, index=False)
else:
    total_traffic_df = pd.read_csv(df_path)
    total_traffic_df["date"] = pd.to_datetime(total_traffic_df["date"])

total_traffic_df

In [None]:
# Datetime axis so that ticks fall on calendar boundaries and the
# DatetimeTickFormatter set below has a matching ticker
p = figure(title="Traffic intensity through time by district", x_axis_label="Date",
           y_axis_label="Traffic intensity", width=800, x_axis_type="datetime")

# One line per district; the renderers are kept to toggle them from the button
fig_lines = []

for name, color in zip(district_name, district_colors):
    source = ColumnDataSource(total_traffic_df[total_traffic_df["district"] == name])
    l = p.line(x="date", y="mean_intensity", source=source,
               color=get_color_from_palette(color), legend_label=name, visible=True,
               line_width=3, alpha=0.8)
    fig_lines.append(l)

# NOTE: p.line() already adds each renderer to the figure, so they must not be
# appended to p.renderers again (that would draw every line twice)

p.add_layout(p.legend[0], "right")
p.legend.click_policy = "hide"

p.xaxis.formatter=DatetimeTickFormatter(
        days=['%a %d/%m/%Y'],
        months=['%b %Y'],
        years = ['%Y']
    )

# Hover tooltip
TOOLTIPS = [
    ("District", "@district"),
    ("Intensity", "@mean_intensity"),
    ("Day", "@day_of_week @day/@month/@year")
]
p.add_tools(HoverTool(tooltips=TOOLTIPS, mode="vline"))

# Button that flips the visibility of every line at once (runs in the browser)
button = Button(
    label="Switch all lines visibility", button_type="success", width=100
)
callback = CustomJS(args=dict(lines=fig_lines),
    code="""
    for(var i=0; i<lines.length; i++){
        lines[i].visible = !lines[i].visible;
    }
    """
)
button.js_on_click(callback)

# Named so that it does not shadow the `layout` function imported from bokeh.layouts
timeseries_layout = column(p, button)

show(timeseries_layout)

## Display exploratory analysis focusing on Madrid Central

Now we are going to focus on a more in depth analysis over the *Centro* district.

In [None]:
# Daily rows of the Centro district only, with a fresh 0..n-1 index
df_centro = total_traffic_df[total_traffic_df["district"] == "Centro"].reset_index(drop=True)

df_centro

### Year Analysis

First we focus on the evolution over the years, to see whether or not we detect a change in the traffic intensity. Take into account that the Covid-19 lockdown started in Spain on the **15th of March of 2020** and ended on the **21st of June of 2020**, which explains a huge decrease in the overall traffic in Madrid.

In [None]:
# Yearly average of the daily district values: Centro only ...
df_centro_year = (
    df_centro.groupby("year")["mean_intensity"]
    .mean()
    .rename("mean_intensity_year")
    .reset_index()
)

# ... and every district together
# NOTE(review): both names are reassigned to daily tables further down the
# notebook, so this cell must be re-run before re-running the yearly plot
df_year = (
    total_traffic_df.groupby("year")["mean_intensity"]
    .mean()
    .rename("mean_intensity_year")
    .reset_index()
)

display(df_centro_year)
df_year

In [None]:
# Bars: yearly average in Centro. Line: yearly average over every district
p = figure(title="Average traffic intensity per day through the years in Madrid Central and in the city of Madrid", x_axis_label="Year",
           y_axis_label="Traffic intensity average by day", width=800)

source = ColumnDataSource(df_centro_year)
p.vbar(x="year", top="mean_intensity_year", source=source, width=0.5, legend_label="Madrid Central", color=get_color_from_palette(district_colors[0]))

source = ColumnDataSource(df_year)
p.line(x="year", y="mean_intensity_year", source=source, line_width=3, legend_label="Madrid", color=get_color_from_palette(district_colors[1]))

# Hover tooltip
TOOLTIPS = [
    ("Average Intensity per day", "@mean_intensity_year"),
    ("Year", "@year")
    ]
p.add_tools(HoverTool(tooltips=TOOLTIPS, mode="vline"))

# Move the legend outside the plotting area
p.add_layout(p.legend[0], "right")


show(p)

### Month Analysis

Once we have done a brief analysis over the years, we want to investigate if we are able to detect any pattern in a year using monthly information.

In [None]:
months_name = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

# Average of the daily values per calendar month (all years pooled)
df_centro_month = (
    df_centro.groupby("month")["mean_intensity"]
    .mean()
    .rename("mean_intensity_month")
    .reset_index()
)
df_month = (
    total_traffic_df.groupby("month")["mean_intensity"]
    .mean()
    .rename("mean_intensity_month")
    .reset_index()
)

# Month numbers are 1-based, the list of names is 0-based
df_centro_month["month_name"] = df_centro_month["month"].map(lambda month: months_name[int(month - 1)])
df_month["month_name"] = df_month["month"].map(lambda month: months_name[int(month - 1)])


display(df_centro_month)
df_month

In [None]:
# Categorical x axis with the month names in calendar order
p = figure(title="Average traffic intensity per day in a month in Madrid Central and in the city of Madrid", x_axis_label="Month",
           y_axis_label="Traffic intensity average by day", width=1000, x_range=months_name)

# Bars: Centro. Line: every district together
source = ColumnDataSource(df_centro_month)
p.vbar(x="month_name", top="mean_intensity_month", source=source, width=0.5, legend_label="Madrid Central", color=get_color_from_palette(district_colors[0]))

source = ColumnDataSource(df_month)
p.line(x="month_name", y="mean_intensity_month", source=source, line_width=3, legend_label="Madrid", color=get_color_from_palette(district_colors[1]))

# Hover tooltip
TOOLTIPS = [
    ("Average Intensity per day", "@mean_intensity_month"),
    ("Month", "@month_name")
    ]
p.add_tools(HoverTool(tooltips=TOOLTIPS, mode="vline"))

p.add_layout(p.legend[0], "right")


show(p)

### Weekly Analysis

Same as with months, we will try now to analyze the weekly traffic change.

In [None]:
week_days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Average of the daily values per day of the week
df_centro_week = df_centro.groupby("day_of_week").agg(mean_intensity_week = ("mean_intensity", "mean")).reset_index()
df_week = total_traffic_df.groupby("day_of_week").agg(mean_intensity_week = ("mean_intensity", "mean")).reset_index()

# Categorical with the week order as categories, so that sorting follows
# Monday..Sunday instead of the alphabetical order
df_centro_week["day_of_week"] = pd.Categorical(df_centro_week["day_of_week"], categories=week_days)
df_centro_week = df_centro_week.sort_values("day_of_week")

df_week["day_of_week"] = pd.Categorical(df_week["day_of_week"], categories=week_days)
df_week = df_week.sort_values("day_of_week")

display(df_centro_week)
df_week

In [None]:
# Categorical x axis with the weekdays in Monday..Sunday order
p = figure(title="Average traffic intensity per day in a week in Madrid Central and in the city of Madrid", x_axis_label="Weekday",
           y_axis_label="Traffic intensity average by day", width=1000, x_range=week_days)

# Bars: Centro. Line: every district together
source = ColumnDataSource(df_centro_week)
p.vbar(x="day_of_week", top="mean_intensity_week", source=source, width=0.5, legend_label="Madrid Central", color=get_color_from_palette(district_colors[0]))

source = ColumnDataSource(df_week)
p.line(x="day_of_week", y="mean_intensity_week", source=source, line_width=3, legend_label="Madrid", color=get_color_from_palette(district_colors[1]))

# Hover tooltip
TOOLTIPS = [
    ("Average Intensity per day", "@mean_intensity_week"),
    ("Day of the Week", "@day_of_week")
    ]
p.add_tools(HoverTool(tooltips=TOOLTIPS, mode="vline"))

p.add_layout(p.legend[0], "left")


show(p)

In [None]:
# Difference between the city-wide weekday average and the Centro weekday average
# NOTE(review): df_week is computed from every district, Centro included, so
# "outside" here means "whole city" - confirm this is the intended comparison

week_diff = []

for day in week_days:
    inside_mc = df_centro_week.loc[df_centro_week["day_of_week"] == day, "mean_intensity_week"].iloc[0]
    outside_mc = df_week.loc[df_week["day_of_week"] == day, "mean_intensity_week"].iloc[0]

    # Positive value: less traffic in Centro than in the city-wide average
    week_diff.append([day, outside_mc-inside_mc])


df_week_diff = pd.DataFrame(week_diff, columns=["day_of_week", "difference_of_mean_intensity"])

# Magnitude of the smallest difference
abs(df_week_diff["difference_of_mean_intensity"].min())

In [None]:
# Eight colours: indices 0-6 are used for the weekdays and index 7 for the comparison line
week_colors = sns.color_palette('colorblind', 8)
week_colors

In [None]:
# WEEK BAR PLOT
p_week = figure(title="Average traffic intensity per day in a week IN and OUT of Madrid Central",
           x_axis_label="Weekday", y_axis_label="Traffic intensity average by day",
           width=700, height=300, x_range=week_days, tools="")

p_week.toolbar.logo = None
p_week.toolbar_location = None

# WEEK DIFF LOLLIPOP PLOT

# Symmetric x range around zero, with a margin of 5 beyond the largest absolute difference
x_lim = max(abs(df_week_diff["difference_of_mean_intensity"].min()),
            df_week_diff["difference_of_mean_intensity"].max()) + 5

# y_range is reversed so that Monday appears at the top
p_diff = figure(title="Difference of averages of traffic intensity IN and OUT of Madrid Central",
                y_axis_label="Weekday", x_axis_label="Difference in traffic",
                width=700, height=300, y_range=week_days[::-1], x_range=(-x_lim, x_lim), tools="")

p_diff.toolbar.logo = None
p_diff.toolbar_location = None

# One bar (upper plot) and one lollipop (lower plot) per weekday, sharing the colour
circles = []
for i, week_day in enumerate(week_days):

       source = ColumnDataSource(df_centro_week[df_centro_week["day_of_week"] == week_day])
       p_week.vbar(x="day_of_week", top="mean_intensity_week", source=source, width=0.5,
              legend_label="Madrid Central", alpha=0.2,
              line_alpha=1, line_color=get_dark_color_from_palette(week_colors[i]),
              color=get_color_from_palette(week_colors[i]))

       source = ColumnDataSource(df_week_diff[df_week_diff["day_of_week"] == week_day])

       # Thin horizontal bar: the "stick" of the lollipop
       p_diff.hbar(y="day_of_week", right="difference_of_mean_intensity", source=source,
              height=0.05, color=get_color_from_palette(week_colors[i]))

       # Circle at the end of the stick; kept to attach the hover tool to it
       circle = p_diff.circle(x="difference_of_mean_intensity", y="day_of_week", source=source, size=15, 
              color=get_color_from_palette(week_colors[i]))

       circles.append(circle)

# Comparison line drawn from df_week
source = ColumnDataSource(df_week)
p_week.line(x="day_of_week", y="mean_intensity_week", source=source,
       line_width=3, legend_label="OUT of Madrid Central", color=get_color_from_palette(week_colors[7]))

# Hover tooltip
TOOLTIPS = [
    ("Average Intensity per day", "@mean_intensity_week"),
    ("Day of the Week", "@day_of_week")
    ]
p_week.add_tools(HoverTool(tooltips=TOOLTIPS, mode="vline"))

p_week.add_layout(p_week.legend[0], "right")



# Tooltip of the lollipop plot, restricted to the circles
TOOLTIPS = [
    ("Difference of intensity IN and OUT of Madrid Central", "@difference_of_mean_intensity{00.00}"),
    ("Day of the Week", "@day_of_week")
    ]
p_diff.add_tools(HoverTool(tooltips=TOOLTIPS, mode="hline", renderers=circles))


# Stack both plots vertically
p = column([p_week, p_diff], sizing_mode='stretch_both')

# Write the plot to a standalone HTML file, then switch bokeh's output back to the notebook
output_file("html_plots/week_analysis.html", title="Week Analysis")
save(p)
reset_output()
output_notebook()

# show(p)

## NEW PLOTS

In [None]:
# Festivities of Madrid

# Fixed-date festivities of Madrid as [month, day, name]
festive_days = [
    [1, 1, "New Year"],
    [1, 6, "Epiphany of the Lord"],
    [5, 1, "Labor Day"],
    [5, 2, "Day of the Community of Madrid"],
    [5, 15, "San Isidro"],
    [7, 25, "Santiago Apóstol"],
    [8, 6, "Fiestas de San Cayetano"],
    [8, 15, "Asunción de la Virgen"],
    [10, 12, "National Day of Spain"],
    [11, 1, "All Saints' Day"],
    [11, 9, "Días de la Almudena."],
    [12, 6, "Day of the Constitution"],
    [12, 8, "Inmaculada Conceptción"],
    [12, 25, "Christmas"]
]

months_name = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

festivities = pd.DataFrame(festive_days, columns=["month", "day", "festivity"])

# Month numbers are 1-based, the list of names is 0-based
festivities["month_name"] = festivities["month"].map(lambda month: months_name[int(month - 1)])

# Human-readable key such as "1 of January"
festivities["date"] = festivities["day"].astype(str).str.cat(festivities["month_name"].astype(str), sep=" of ")

festivities

In [None]:


def _mean_by_calendar_day(daily_df):
    """ Average 'mean_intensity' per (month, day) over all years and attach the
        festivity celebrated on that calendar day ("None" when there is none).
    """
    by_day = daily_df.groupby(["month", "day"]).agg(mean_intensity_day_month = ("mean_intensity", "mean")).reset_index()

    # Month numbers are 1-based, the list of names is 0-based
    by_day["month_name"] = by_day["month"].map(lambda month: months_name[int(month - 1)])

    # Same "<day> of <month name>" key that the festivities table uses
    by_day["date"] = by_day["day"].astype(str) + " of " + by_day["month_name"].astype(str)

    by_day = pd.merge(by_day, festivities, on=["date", "month", "day", "month_name"], how="left")
    by_day["festivity"] = by_day["festivity"].fillna("None")

    return by_day


df_centro_day_month = _mean_by_calendar_day(df_centro)
df_day_month = _mean_by_calendar_day(total_traffic_df)

# Calendar-day labels in order of appearance; used as the categorical x range of the plot
days_of_year = df_centro_day_month["date"].unique()

display(df_centro_day_month)
display(df_day_month)
days_of_year[:5]

In [None]:
# One category per calendar day on the x axis; the y axis is fixed to 0-500
p = figure(title="Average traffic intensity per day in a year in Madrid Central and in the city of Madrid",
           y_axis_label="Traffic intensity average by day", x_axis_label="Days of the year",
           height=400, width=1000, x_range=days_of_year, y_range=(0, 500))

# p.xaxis.visible = False
p.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
p.xaxis.major_label_text_font_size = '0pt'  # turn off x-axis tick labels

# Translucent bars for every calendar day in Centro
source = ColumnDataSource(df_centro_day_month)
p.vbar(x="date", top="mean_intensity_day_month", source=source, width=0.99, alpha=0.2,
       line_alpha=1, line_color=get_dark_color_from_palette(district_colors[0]),
       legend_label="Madrid Central", color=get_color_from_palette(district_colors[0]))


# Opaque bars drawn on top for the festive days only
source = ColumnDataSource(df_centro_day_month[df_centro_day_month["festivity"] != "None"])
p.vbar(x="date", top="mean_intensity_day_month", source=source, width=1,
       line_color=get_dark_color_from_palette(district_colors[2]),
       legend_label="Festive days in Madrid", color=get_color_from_palette(district_colors[2]))

# Line with the average over every district
source = ColumnDataSource(df_day_month)
p.line(x="date", y="mean_intensity_day_month", source=source, line_width=2,
       legend_label="Madrid", color=get_color_from_palette(district_colors[1]))

# Hover tooltip
TOOLTIPS = [
    ("Average Intensity per day", "@mean_intensity_day_month"),
    ("Day", "@date"),
    ("Festivity", "@festivity")
    ]
p.add_tools(HoverTool(tooltips=TOOLTIPS, mode="vline"))

p.add_layout(p.legend[0], "right")


# Write the plot to a standalone HTML file, then switch bokeh's output back to the notebook
output_file("html_plots/year_analysis.html", title="Year Analysis")
save(p)
reset_output()
output_notebook()

# show(p)

### Timeline

In [None]:
# Key dates
# Key dates of the measure as [year, month, day, description]
key_dates = [
    [2018, 11, 30, "Madrid Central inauguration "],
    [2019, 3, 15, "Fines begin to be issued"],
    [2019, 7, 1, "Cancellation of Madrid Central"],
    [2021, 12, 11, "New measure and fines"],
]

key_dates_df = pd.DataFrame(key_dates, columns=["year", "month", "day", "key_date"])

# Month numbers are 1-based, the list of names is 0-based
key_dates_df["month_name"] = key_dates_df["month"].map(lambda month: months_name[int(month - 1)])

# Proper datetime assembled from the three numeric columns
key_dates_df["date"] = pd.to_datetime(key_dates_df[["year", "month", "day"]])

key_dates_df

In [None]:
def _daily_timeline(daily_df):
    """ Average 'mean_intensity' per calendar date and tag every date with its
        festivity and key date ("None" when the date has none).
    """
    timeline = daily_df.groupby(["year", "month", "day"]).agg(mean_intensity_year = ("mean_intensity", "mean")).reset_index()
    timeline["date"] = pd.to_datetime(timeline[["year", "month", "day"]])

    # Festivities repeat every year, so they are matched on month and day only
    timeline = pd.merge(timeline, festivities[["month", "day", "festivity"]], on=["month", "day"], how="left")
    timeline["festivity"] = timeline["festivity"].fillna("None")

    # Key dates are one-off events, so they are matched on the full date
    timeline = pd.merge(timeline, key_dates_df[["year", "month", "day", "key_date"]], on=["year", "month", "day"], how="left")
    timeline["key_date"] = timeline["key_date"].fillna("None")

    return timeline


# NOTE: these names were used above for the yearly averages; from here on they hold daily values
df_centro_year = _daily_timeline(df_centro)
df_year = _daily_timeline(total_traffic_df)

display(df_centro_year)
df_year

In [None]:
# Dates are plotted as milliseconds since the epoch, so bar widths must be
# given in milliseconds too: a width of 1 would be a 1 ms wide (invisible) bar
ONE_DAY_MS = 24 * 60 * 60 * 1000

p = figure(title="Average traffic intensity per day through the years in Madrid Central and in the city of Madrid", x_axis_label="Year",
           y_axis_label="Traffic intensity average by day", width=1300, x_axis_type="datetime")

# Translucent bar for every day in Centro
source = ColumnDataSource(df_centro_year)
p.vbar(x="date", top="mean_intensity_year", source=source,  width=0.99 * ONE_DAY_MS, alpha=0.2,
       line_alpha=1, line_color=get_dark_color_from_palette(district_colors[0]),
       legend_label="Madrid Central", color=get_color_from_palette(district_colors[0]))


# Opaque bars drawn on top for the festive days
source = ColumnDataSource(df_centro_year[df_centro_year["festivity"] != "None"])
p.vbar(x="date", top="mean_intensity_year", source=source, width=ONE_DAY_MS,
       line_color=get_dark_color_from_palette(district_colors[2]),
       legend_label="Festive days in Madrid", color=get_color_from_palette(district_colors[2]))

# Key dates are drawn as fixed-height bars (1200) so they stand out as vertical markers
source = ColumnDataSource(df_centro_year[df_centro_year["key_date"] != "None"])
p.vbar(x="date", top=1200, source=source, width=ONE_DAY_MS,
       line_color=get_dark_color_from_palette(district_colors[3]),
       legend_label="Key dates", color=get_color_from_palette(district_colors[3]))

# Line with the daily average over every district
source = ColumnDataSource(df_year)
p.line(x="date", y="mean_intensity_year", source=source, line_width=3, alpha=0.6,
       legend_label="Madrid", color=get_color_from_palette(district_colors[1]))


p.xaxis.formatter=DatetimeTickFormatter(
        days=['%a %d/%m/%Y'],
        months=['%b %Y'],
        years = ['%Y']
    )

# Hover tooltip
TOOLTIPS = [
    ("Average Intensity per day", "@mean_intensity_year"),
    ("Date", "@day/@month/@year"),
    ("Festivity", "@festivity"),
    ("Key date", "@key_date")
    ]
p.add_tools(HoverTool(tooltips=TOOLTIPS, mode="vline"))

p.add_layout(p.legend[0], "right")


show(p)

## Traffic analysis per month in all years

In [None]:
# Adds a "<day>/<month>" text column to the shared frame (in place)
total_traffic_df["month_and_day"] = total_traffic_df["day"].astype(str) + "/" + total_traffic_df["month"].astype(str) 

total_traffic_df

In [None]:
# Monthly average of the daily values for every district and year
df_month_districts = (
    total_traffic_df
    .groupby(["district", "year", "month"])["mean_intensity"]
    .mean()
    .reset_index()
)

# Month numbers are 1-based, the list of names is 0-based
df_month_districts["month_name"] = df_month_districts["month"].map(lambda month: months_name[month - 1])

# Parse "<month name>-<year>" into a datetime
month_year_label = df_month_districts["month_name"].str.cat(df_month_districts["year"].astype(str), sep="-")
df_month_districts["date"] = pd.to_datetime(month_year_label)

df_month_districts

In [None]:
# One colour per distinct year present in the monthly table
plot_colors = sns.color_palette('colorblind', len(df_month_districts["year"].unique()))
# np.random.shuffle(plot_colors)
plot_colors

In [None]:
# One line (plus markers) per year over a categorical month axis, Centro only
p = figure(x_range=months_name, title="Average traffic intensity per month in a year in Madrid Central, from 2016 to February 2020", x_axis_label="Month", 
            y_axis_label="Traffic intensity by month")

for i, year in enumerate(df_month_districts["year"].unique()):
    source = ColumnDataSource(df_month_districts[(df_month_districts["district"] == "Centro") & (df_month_districts["year"] == year)])

    # Line and circles share the legend label, so one legend entry toggles both
    p.line(x="month_name", y="mean_intensity", source=source, legend_label=str(year), color=get_color_from_palette(plot_colors[i]), line_width=2)
    p.circle(x="month_name", y="mean_intensity", source=source, legend_label=str(year), color=get_color_from_palette(plot_colors[i]), size=10, alpha=0.5)


TOOLTIPS = [
    ("Average Intensity per month", "@mean_intensity"),
    ("Month", "@month_name"),
    ("Year", "@year")
    ]
p.add_tools(HoverTool(tooltips=TOOLTIPS))
p.add_layout(p.legend[0], "right")

# Clicking a legend entry hides that year
p.legend.click_policy = "hide"

# From here on `p` is the layout wrapping the figure
p = column(p, sizing_mode='stretch_both')

# Write the plot to a standalone HTML file, then switch bokeh's output back to the notebook
output_file("html_plots/month_analysis.html", title="Month Analysis")
save(p)
reset_output()
output_notebook()

# show(p)

In [None]:
# Monthly values of the Centro district
df_month_districts_centro = df_month_districts[df_month_districts["district"] == "Centro"].reset_index(drop=True)

# Monthly average over every other district. The column is selected explicitly:
# the first positional argument of GroupBy.mean() is `numeric_only`, not a column name
df_month_districts_not_centro = (
    df_month_districts[df_month_districts["district"] != "Centro"]
    .groupby(["year", "month"])["mean_intensity"]
    .mean()
    .reset_index()
)

# Pair the rows by (year, month) rather than by position, so that a month
# missing on either side cannot shift the subtraction onto the wrong month
outside_means = df_month_districts_not_centro.rename(columns={"mean_intensity": "mean_intensity_outside"})
df_month_districts_centro = df_month_districts_centro.merge(outside_means, on=["year", "month"], how="left")

# From here on 'mean_intensity' holds the difference IN minus OUT of Madrid Central
df_month_districts_centro["mean_intensity"] = (
    df_month_districts_centro["mean_intensity"] - df_month_districts_centro["mean_intensity_outside"]
)
df_month_districts_centro = df_month_districts_centro.drop(columns="mean_intensity_outside")


In [None]:
# One line (plus markers) per year with the monthly IN minus OUT difference
p = figure(x_range=months_name, title="Difference of averages of traffic intensity IN and OUT of Madrid Central, from 2016 to February 2020", x_axis_label="Month", 
            y_axis_label="Difference in traffic")

# Dashed grey reference ray starting at the origin (y = 0)
p.ray(x=[0], y=[0], line_color='grey', line_width=3, line_dash="dashed", line_alpha=0.7)

for i, year in enumerate(df_month_districts_centro["year"].unique()):
    source = ColumnDataSource(df_month_districts_centro[df_month_districts_centro["year"] == year])

    p.line(x="month_name", y="mean_intensity", source=source, legend_label=str(year), color=get_color_from_palette(plot_colors[i]), line_width=2)
    p.circle(x="month_name", y="mean_intensity", source=source, legend_label=str(year), color=get_color_from_palette(plot_colors[i]), size=10)

TOOLTIPS = [
    ("Difference of intensity IN and OUT of Madrid Central", "@mean_intensity"),
    ("Month", "@month_name"),
    ("Year", "@year")
    ]
p.add_tools(HoverTool(tooltips=TOOLTIPS))
p.add_layout(p.legend[0], "right")

# Clicking a legend entry hides that year
p.legend.click_policy = "hide"

# From here on `p` is the layout wrapping the figure
p = column(p, sizing_mode='stretch_both')

# Write the plot to a standalone HTML file, then switch bokeh's output back to the notebook
output_file("html_plots/month_difference_analysis.html", title="Month Difference Analysis")
save(p)
reset_output()
output_notebook()

# show(p)

In [None]:
# Inspect the monthly per-district table
df_month_districts

In [None]:
import time

In [None]:
# Create the vertical time-mark annotations used on the datetime plots.

def _epoch_ms(moment):
    """Return a naive datetime as milliseconds since 1970-01-01 00:00.

    The value is computed without any timezone conversion, so it does not
    depend on the timezone of the machine running the notebook (unlike
    ``time.mktime``, which interprets its argument as local time) and it lines
    up with naive dates plotted on a Bokeh datetime axis.
    """
    return (moment - dt(1970, 1, 1)).total_seconds() * 1000


def _time_mark_span(location_ms):
    """Build the dashed, semi-transparent vertical line used for every time mark."""
    return Span(location=location_ms,
                dimension='height', line_color='black',
                line_dash='dashed', line_width=2, line_alpha=0.3)


# Going by the variable names, the three marks are the start of Madrid Central,
# the start of fines, and the end of the measure -- TODO confirm the dates.
startMC = _epoch_ms(dt(2018, 11, 30, 0, 0, 0))
startMC_span = _time_mark_span(startMC)

finesMC = _epoch_ms(dt(2019, 3, 15, 0, 0, 0))
finesMC_span = _time_mark_span(finesMC)

endMC = _epoch_ms(dt(2019, 7, 1, 0, 0, 0))
endMC_span = _time_mark_span(endMC)

In [None]:
# Average all districts other than "Centro" for every date.
# `numeric_only` is spelled out on purpose: the first positional parameter of
# GroupBy.mean is `numeric_only`, not a column name, so passing a column name
# positionally would merely be interpreted as a truthy flag.
df_month_districts_outside = (
    df_month_districts[df_month_districts["district"] != "Centro"]
    .groupby(["date", "month_name"])
    .mean(numeric_only=True)
    .reset_index()
)

p = figure(x_axis_type='datetime', title="Average traffic intensity per month IN and OUT of Madrid Central, from 2016 to February 2020", x_axis_label="Year", 
            y_axis_label="Average traffic intensity by month")

# Series 1: the "Centro" district on its own.
source = ColumnDataSource(df_month_districts[df_month_districts["district"] == "Centro"])

p.line(x="date", y="mean_intensity", source=source, legend_label="IN Madrid Central", color=get_color_from_palette(plot_colors[0]), line_width=2)
p.circle(x="date", y="mean_intensity", source=source, legend_label="IN Madrid Central", color=get_color_from_palette(plot_colors[0]), size=10)

# Series 2: the average over every other district.
source = ColumnDataSource(df_month_districts_outside)

p.line(x="date", y="mean_intensity", source=source, legend_label="OUT of Madrid Central", color=get_color_from_palette(plot_colors[1]), line_width=2)
p.circle(x="date", y="mean_intensity", source=source, legend_label="OUT of Madrid Central", color=get_color_from_palette(plot_colors[1]), size=10)

# Hover fields are read from the columns of each series' data source.
TOOLTIPS = [
    ("Average intensity per month", "@mean_intensity"),
    ("Month", "@month_name"),
    ("Year", "@year")
    ]

p.add_tools(HoverTool(tooltips=TOOLTIPS))

# Clicking a legend entry hides the corresponding series.
p.legend.click_policy = "hide"

# Add the vertical time-mark annotations (Span objects created in an earlier cell).
p.add_layout(startMC_span)
p.add_layout(finesMC_span)
p.add_layout(endMC_span)

# From here on `p` is the stretching column layout that wraps the figure.
p = column(p, sizing_mode='stretch_both')

# Save as standalone HTML, then hand Bokeh output back to the notebook.
output_file("html_plots/year_in_out_analysis.html", title="Year in out Analysis")
save(p)
reset_output()
output_notebook()

# show(p)