# Tasks for data preparation

- ✅ Aggregate movement data by hour
- ✅ Join together all datasets
- ❌ Aggregate data by day

## Table of contents
### 1. Load the data
### 2. Transform data
### 3. Join the data
### 4. Export the data

## 1. Load the data

In [1]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

In [2]:
# Movement counts: read the raw file and replace its header with shorter column names
new_column_names = ['Standort', 'Datum', 'VELO_IN', 'VELO_OUT', 'FUSS_IN', 'FUSS_OUT', 'Ost', 'Nord']
movements2023 = (
    pd.read_csv('../data/2023_verkehrszaehlungen_werte_fussgaenger_velo.csv')
    .set_axis(new_column_names, axis=1)
)

# Metadata of the counting stations
meta = pd.read_csv('../data/GeoData_StadtZurich/data/taz.view_eco_standorte.csv')

# Hourly weather measurements
weather2023 = pd.read_csv('../data/ugz_ogd_meteo_h1_2023.csv')

# Yearly population figures
population = pd.read_csv('../data/bev324od3243.csv')

## 1.1 Overview of the data

### Display movements data

In [3]:
# Parse the timestamps and truncate them to the full hour (drops the minutes)
movement_ts = pd.to_datetime(movements2023['Datum']).dt.floor('1h')
movements2023['Datum'] = movement_ts

# Derive formatted date, time and date-time string columns from the hourly timestamp
for column, pattern in (('Date', '%Y-%m-%d'),
                        ('Time', '%H:%M'),
                        ('Datetime', '%Y-%m-%d %H:%M')):
    movements2023[column] = movement_ts.dt.strftime(pattern)

# Overview movements2023
movements2023.head()

Unnamed: 0,Standort,Datum,VELO_IN,VELO_OUT,FUSS_IN,FUSS_OUT,Ost,Nord,Date,Time,Datetime
0,5003,2023-01-01,,,1.0,0.0,2682978,1248744,2023-01-01,00:00,2023-01-01 00:00
1,4257,2023-01-01,0.0,0.0,,,2681857,1251991,2023-01-01,00:00,2023-01-01 00:00
2,394,2023-01-01,,,1.0,1.0,2683573,1251687,2023-01-01,00:00,2023-01-01 00:00
3,2986,2023-01-01,0.0,0.0,,,2684578,1251966,2023-01-01,00:00,2023-01-01 00:00
4,3598,2023-01-01,0.0,0.0,,,2684006,1246566,2023-01-01,00:00,2023-01-01 00:00


### Display meta data

In [19]:
# Keep only the records that have no end date in 'bis'
# (taken here to be the most recent record for each 'id1').
# No columns are dropped in this cell: 'bis' is required by the filter itself,
# and the join step removes 'bis' and 'von' when the metadata is merged.
meta = meta[meta['bis'].isnull()]

# Show
meta.head()

Unnamed: 0,abkuerzung,bezeichnung,bis,fk_zaehler,id1,richtung_in,richtung_out,von,objectid,korrekturfaktor,geometry
31,VZS_BASL,Baslerstrasse,,Y2H20063173,3003,Bhf. Altstetten,Letzipark,20200716000000,32,1.0,POINT (2679767.2 1248986.1)
43,VZS_SCHE,Scheuchzerstrasse,,Y2H19111477,2993,Innenstadt,Irchel,20200110000000,44,1.05,POINT (2683573.2 1248544.9)
58,VZS_BINZ,Binzmühlestrasse,,Y2H21015036,4257,Glaubtenstrasse,Oerlikon,20221208000000,59,1.22,POINT (2681857 1251990.9)
59,FZS_LANS,Langstrasse (Unterführung Süd),,U15G3063867,4260,Gleisanlagen,Limmat,20230211000000,60,1.0,POINT (2682347.8 1248427.2)
66,FZS_MILI,Militärbrücke,,U15G3063864,20,Löwenplatz,Langstrasse,20130711000000,67,0.58,POINT (2682689 1247734.9)


### Display weather data
The weather data is available for three different locations `['Zch_Stampfenbachstrasse', 'Zch_Schimmelstrasse', 'Zch_Rosengartenstrasse']`. We will need only one of these.

In [5]:
# Filter only Stampfenbachstrasse
weather2023 = weather2023[weather2023['Standort'] == 'Zch_Stampfenbachstrasse']

# Remove coumns
weather2023.pop('Standort')
weather2023.pop('Status')

# Overview
weather2023.head()

Unnamed: 0,Datum,Parameter,Intervall,Einheit,Wert
0,2023-01-01T00:00+0100,T,h1,°C,11.57
1,2023-01-01T00:00+0100,Hr,h1,%Hr,72.29
2,2023-01-01T00:00+0100,p,h1,hPa,971.62
3,2023-01-01T00:00+0100,RainDur,h1,min,0.0
4,2023-01-01T00:00+0100,StrGlo,h1,W/m2,0.01


### Display population data

In [6]:
# Show the last rows of the population table
population.tail()

Unnamed: 0,StichtagDatJahr,AnzBestWir
118,2019,434008
119,2020,434736
120,2021,436332
121,2022,443037
122,2023,447082


## 2. Transform data

### Long to wide transformation for weather data

In [7]:
# Build one label per measurement from parameter and unit, e.g. 'T [°C]';
# these labels become the column names of the wide table
param_labels = weather2023['Parameter'] + ' [' + weather2023['Einheit'] + ']'
weather2023['Param_Unit'] = param_labels

# Long -> wide: one row per 'Datum', one column per parameter label.
# pivot_table combines duplicate (Datum, Param_Unit) entries with its default aggregation.
wide_weather2023 = (
    weather2023
    .pivot_table(values='Wert', index=['Datum'], columns='Param_Unit')
    .reset_index()
)

# Display the wide format DataFrame
wide_weather2023.head()

Param_Unit,Datum,Hr [%Hr],RainDur [min],StrGlo [W/m2],T [°C],WD [°],WVs [m/s],WVv [m/s],p [hPa]
0,2023-01-01T00:00+0100,72.29,0.0,0.01,11.57,169.49,1.95,1.69,971.62
1,2023-01-01T01:00+0100,63.66,0.0,0.02,13.47,205.89,3.4,2.77,971.86
2,2023-01-01T02:00+0100,68.85,0.0,0.02,12.39,149.11,1.98,1.49,971.76
3,2023-01-01T03:00+0100,70.72,0.0,0.02,11.69,157.08,1.79,1.46,972.01
4,2023-01-01T04:00+0100,70.45,0.0,0.02,11.55,178.54,2.98,2.74,972.1


In [8]:
# Parse the timestamps and truncate them to the full hour (drops the minutes)
weather_ts = pd.to_datetime(wide_weather2023['Datum']).dt.floor('1h')
wide_weather2023['Datum'] = weather_ts

# Year is stored as a number; date, time and date-time are stored as formatted strings.
# NOTE(review): the weather timestamps carry a UTC offset (+01:00) and the strings
# below use that offset's wall-clock time — confirm this matches the clock used
# by the movement data, since 'Datetime' is the join key.
wide_weather2023['Year'] = weather_ts.dt.year
for column, pattern in (('Date', '%Y-%m-%d'),
                        ('Time', '%H:%M'),
                        ('Datetime', '%Y-%m-%d %H:%M')):
    wide_weather2023[column] = weather_ts.dt.strftime(pattern)

# Overview
wide_weather2023.head()

Param_Unit,Datum,Hr [%Hr],RainDur [min],StrGlo [W/m2],T [°C],WD [°],WVs [m/s],WVv [m/s],p [hPa],Year,Date,Time,Datetime
0,2023-01-01 00:00:00+01:00,72.29,0.0,0.01,11.57,169.49,1.95,1.69,971.62,2023,2023-01-01,00:00,2023-01-01 00:00
1,2023-01-01 01:00:00+01:00,63.66,0.0,0.02,13.47,205.89,3.4,2.77,971.86,2023,2023-01-01,01:00,2023-01-01 01:00
2,2023-01-01 02:00:00+01:00,68.85,0.0,0.02,12.39,149.11,1.98,1.49,971.76,2023,2023-01-01,02:00,2023-01-01 02:00
3,2023-01-01 03:00:00+01:00,70.72,0.0,0.02,11.69,157.08,1.79,1.46,972.01,2023,2023-01-01,03:00,2023-01-01 03:00
4,2023-01-01 04:00:00+01:00,70.45,0.0,0.02,11.55,178.54,2.98,2.74,972.1,2023,2023-01-01,04:00,2023-01-01 04:00


### Aggregate movement data per hour

In [9]:
# Aggregate to one row per ('Standort', 'Date', 'Time', 'Datetime') group:
# the four counters are summed, and the coordinates 'Ost' / 'Nord' are carried
# over by taking their maximum within the group.
# NOTE(review): 'sum' skips NaN, so a group whose counts are all missing is
# reported as 0.0 rather than NaN — confirm that this is intended.
hourly_aggregations = {
    'VELO_IN': 'sum',
    'VELO_OUT': 'sum',
    'FUSS_IN': 'sum',
    'FUSS_OUT': 'sum',
    'Ost': 'max',
    'Nord': 'max',
}
movements2023_hourly = (
    movements2023
    .groupby(['Standort', 'Date', 'Time', 'Datetime'])
    .agg(hourly_aggregations)
)

In [12]:
# Inspect the first 25 rows of the hourly aggregation
movements2023_hourly.head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,VELO_IN,VELO_OUT,FUSS_IN,FUSS_OUT,Ost,Nord
Standort,Date,Time,Datetime,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20,2023-01-01,00:00,2023-01-01 00:00,0.0,0.0,46.0,31.0,2682689,1247735
20,2023-01-01,01:00,2023-01-01 01:00,0.0,0.0,43.0,94.0,2682689,1247735
20,2023-01-01,02:00,2023-01-01 02:00,0.0,0.0,36.0,27.0,2682689,1247735
20,2023-01-01,03:00,2023-01-01 03:00,0.0,0.0,22.0,27.0,2682689,1247735
20,2023-01-01,04:00,2023-01-01 04:00,0.0,0.0,11.0,33.0,2682689,1247735
20,2023-01-01,05:00,2023-01-01 05:00,0.0,0.0,18.0,14.0,2682689,1247735
20,2023-01-01,06:00,2023-01-01 06:00,0.0,0.0,19.0,6.0,2682689,1247735
20,2023-01-01,07:00,2023-01-01 07:00,0.0,0.0,7.0,0.0,2682689,1247735
20,2023-01-01,08:00,2023-01-01 08:00,0.0,0.0,4.0,6.0,2682689,1247735
20,2023-01-01,09:00,2023-01-01 09:00,0.0,0.0,14.0,1.0,2682689,1247735


## 3. Join the data

In [22]:
# Join movements and weather on the hourly 'Datetime' string.
# Both inputs also contain 'Date' and 'Time' columns, so pandas keeps both
# copies and suffixes them with _x (movements) and _y (weather).
movements_flat = movements2023_hourly.reset_index()
weather_features = wide_weather2023.drop(columns='Datum')
df_agg_hourly = movements_flat.merge(weather_features, on='Datetime', how='left')

# Join population data via the 'Year' column that came in with the weather data,
# then remove the redundant join key 'StichtagDatJahr'
df_agg_hourly = (
    df_agg_hourly
    .merge(population, left_on='Year', right_on='StichtagDatJahr', how='left')
    .drop(columns='StichtagDatJahr')
)

# Join meta data of the counting stations ('Standort' corresponds to 'id1')
station_meta = meta.drop(columns=['bis', 'von'])
df_agg_hourly = df_agg_hourly.merge(
    station_meta,
    left_on='Standort',
    right_on='id1',
    how='left',
)

# Display
df_agg_hourly.head()

Unnamed: 0,Standort,Date_x,Time_x,Datetime,VELO_IN,VELO_OUT,FUSS_IN,FUSS_OUT,Ost,Nord,...,AnzBestWir,abkuerzung,bezeichnung,fk_zaehler,id1,richtung_in,richtung_out,objectid,korrekturfaktor,geometry
0,20,2023-01-01,00:00,2023-01-01 00:00,0.0,0.0,46.0,31.0,2682689,1247735,...,447082.0,FZS_MILI,Militärbrücke,U15G3063864,20.0,Löwenplatz,Langstrasse,67.0,0.58,POINT (2682689 1247734.9)
1,20,2023-01-01,01:00,2023-01-01 01:00,0.0,0.0,43.0,94.0,2682689,1247735,...,447082.0,FZS_MILI,Militärbrücke,U15G3063864,20.0,Löwenplatz,Langstrasse,67.0,0.58,POINT (2682689 1247734.9)
2,20,2023-01-01,02:00,2023-01-01 02:00,0.0,0.0,36.0,27.0,2682689,1247735,...,447082.0,FZS_MILI,Militärbrücke,U15G3063864,20.0,Löwenplatz,Langstrasse,67.0,0.58,POINT (2682689 1247734.9)
3,20,2023-01-01,03:00,2023-01-01 03:00,0.0,0.0,22.0,27.0,2682689,1247735,...,447082.0,FZS_MILI,Militärbrücke,U15G3063864,20.0,Löwenplatz,Langstrasse,67.0,0.58,POINT (2682689 1247734.9)
4,20,2023-01-01,04:00,2023-01-01 04:00,0.0,0.0,11.0,33.0,2682689,1247735,...,447082.0,FZS_MILI,Militärbrücke,U15G3063864,20.0,Löwenplatz,Langstrasse,67.0,0.58,POINT (2682689 1247734.9)


## 4. Export the data

In [23]:
# Export as csv.
# The target directory is created first, because `to_csv` does not create
# missing parent directories and would fail on a fresh checkout.
output_path = Path('../results/df_agg_hourly.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_agg_hourly.to_csv(output_path, index=False)