In [1]:
import numpy as np
import pandas as pd

# Raise pandas' display limits so wide or long frames are shown with up to
# 100 columns, 100 rows and 100 characters per cell in notebook output.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 100

## GVB

In [10]:
# Load the hourly GVB check-in file; it is semicolon-delimited.
gvb_checkin_hourly = pd.read_csv(
    "../data/gvb/check_ins_hourly.csv",
    sep=";",
)

In [12]:
# Load the hourly GVB check-out file; it is semicolon-delimited.
gvb_checkout_hourly = pd.read_csv(
    "../data/gvb/check_outs_hourly.csv",
    sep=";",
)

In [15]:
# List the distinct stop names to find the exact spelling to filter on.
gvb_checkin_hourly["HalteNaam"].unique()

array(['Nieuwmarkt', 'Laan v.Vlaanderen', 'Lambertus Zijlplein',
       'Leidseplein', 'Linnaeusstraat', 'Louis Davidsstraat', 'Louwesweg',
       'Lumi?restraat', 'Lutmastraat', 'Maasstraat', 'La Guardiaweg',
       'Marco Polostraat', 'Marnixplein', 'Marnixstraat', 'Matterhorn',
       'Meer en Vaart', 'Mercatorplein', 'Meteorenweg',
       'Michel de Klerkhof', 'Molenwerf', 'Molenwijk',
       'Marie Heinekenplein', 'Molukkenstraat', 'Kruislaan',
       'Kraaienneststation', 'Inaristraat', 'Insulindeweg', 'Isolatorweg',
       'J.P. Heijestraat', 'Jan Tooropstraat', 'Jan Voermanstraat',
       'Jan Zwanenburghof', 'Jan v.Galenstraat', 'Javaplein',
       'Kronenburg', 'Johan Huizingalaan', 'Kabelweg', 'Kadijksplein',
       'Kasterleepark', 'Kattenburgerstraat', 'Keizersgracht',
       'Kinkerstraat', 'Koivistokade', 'Koningsplein',
       'Kostverlorenstraat', "K. 's-Gravesandestr.", 'IJburg',
       'Mr. Visserplein', 'Muntplein', 'Poortwachter', 'Postjesweg',
       'Prins Hendri

In [64]:
# Keep only the check-in rows of the two stops under study.
gvb_nieuwmarkt_checkin = gvb_checkin_hourly[gvb_checkin_hourly["HalteNaam"].eq("Nieuwmarkt")]
gvb_dam_checkin = gvb_checkin_hourly[gvb_checkin_hourly["HalteNaam"].eq("Dam")]

In [65]:
# Keep only the check-out rows of the two stops under study.
gvb_nieuwmarkt_checkout = gvb_checkout_hourly[gvb_checkout_hourly["HalteNaam"].eq("Nieuwmarkt")]
gvb_dam_checkout = gvb_checkout_hourly[gvb_checkout_hourly["HalteNaam"].eq("Dam")]

In [18]:
# Inspect the raw column dtypes. In the saved output the `datetime` column is
# object (not a datetime dtype) and every other non-name column is int64.
gvb_nieuwmarkt_checkin.dtypes

HalteNaam       object
AantalReizen     int64
datetime        object
hour             int64
week             int64
month            int64
year             int64
weekday          int64
dtype: object

In [21]:
# Trial run of the clean-up steps on the Nieuwmarkt check-ins. The result is a
# new frame, so the source frame keeps its original columns.
gvb_nieuwmarkt_checkin_copy = (
    gvb_nieuwmarkt_checkin
    .rename(columns={'HalteNaam': 'stop_name', 'AantalReizen': 'count', 'datetime': 'date'})
    .drop(columns=['week', 'month', 'year', 'weekday'])
)

# Glue the date string and the integer hour into "<date> <hour>:00:00" and
# parse that text into a real timestamp column.
gvb_nieuwmarkt_checkin_copy['datetime'] = pd.to_datetime(
    gvb_nieuwmarkt_checkin_copy['date']
    + " "
    + gvb_nieuwmarkt_checkin_copy['hour'].astype(str)
    + ":00:00"
)

Unnamed: 0,stop_name,count,date,hour
0,Nieuwmarkt,20,2019-01-01,0
78,Nieuwmarkt,409,2019-01-01,17
245,Nieuwmarkt,533,2019-01-01,18
572,Nieuwmarkt,296,2019-01-01,15
620,Nieuwmarkt,370,2019-01-01,16


In [33]:
# NOTE(review): the saved output lists `datetime` as object even though the
# preceding cell converts that column with pd.to_datetime. The output looks
# stale (execution counts in this notebook are out of order) -- re-run to confirm.
gvb_nieuwmarkt_checkin_copy.dtypes

stop_name    object
count         int64
date         object
hour          int64
datetime     object
dtype: object

In [66]:
def preprocessing(df, start_date="2020-09-01"):
    """Clean one raw hourly GVB frame and keep rows from ``start_date`` onward.

    The input frame is never modified: every step works on a new frame, so the
    function can be called repeatedly on the same input and is safe to use on
    a filtered slice of a larger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the columns ``HalteNaam``, ``AantalReizen``,
        ``datetime`` (a date string such as ``"2020-09-01"``), ``hour``,
        ``week``, ``month``, ``year`` and ``weekday``.
    start_date : str, default "2020-09-01"
        Inclusive lower bound. It is compared against the date strings as
        text, which orders correctly for ISO ``YYYY-MM-DD`` values.

    Returns
    -------
    pandas.DataFrame
        Columns ``stop_name``, ``count``, ``date``, ``hour`` and ``datetime``
        (date plus hour as a timestamp), sorted by date and hour, with the
        original row labels preserved.
    """
    cleaned = (
        df.rename(columns={'HalteNaam': 'stop_name', 'AantalReizen': 'count', 'datetime': 'date'})
        .drop(columns=['week', 'month', 'year', 'weekday'])
        .sort_values(by=['date', 'hour'])
    )

    # Filter first, then take an explicit copy so the column assignment below
    # acts on an independent frame rather than on a view of the input.
    cleaned = cleaned.loc[cleaned['date'] >= start_date].copy()

    # Date + hour offset instead of assembling and re-parsing a string per row.
    cleaned['datetime'] = pd.to_datetime(cleaned['date']) + pd.to_timedelta(cleaned['hour'], unit='h')
    return cleaned

In [67]:
# Run the shared clean-up on each stop/direction frame and rebind each name
# to its cleaned result.
gvb_nieuwmarkt_checkin = preprocessing(df=gvb_nieuwmarkt_checkin)
gvb_nieuwmarkt_checkout = preprocessing(df=gvb_nieuwmarkt_checkout)
gvb_dam_checkin = preprocessing(df=gvb_dam_checkin)
gvb_dam_checkout = preprocessing(df=gvb_dam_checkout)

In [92]:
# Spot-check the cleaned check-in frame: renamed columns plus the derived
# `datetime` column, starting at 2020-09-01 in the saved output.
gvb_nieuwmarkt_checkin.head()

Unnamed: 0,stop_name,count,date,hour,datetime
2305877,Nieuwmarkt,58,2020-09-01,0,2020-09-01 00:00:00
2305959,Nieuwmarkt,22,2020-09-01,6,2020-09-01 06:00:00
2305813,Nieuwmarkt,65,2020-09-01,7,2020-09-01 07:00:00
2305996,Nieuwmarkt,102,2020-09-01,8,2020-09-01 08:00:00
2305273,Nieuwmarkt,61,2020-09-01,9,2020-09-01 09:00:00


In [100]:
# Pair Nieuwmarkt check-ins with check-outs per hour. The outer join keeps
# hours that appear on only one side; both inputs have a `count` column, which
# the merge suffixes with _x / _y before they are renamed.
gvb_nieuwmarkt = (
    gvb_nieuwmarkt_checkin
    .merge(gvb_nieuwmarkt_checkout, how='outer', on=['stop_name', 'date', 'hour', 'datetime'])
    .rename(columns={'count_x': 'checkin', 'count_y': 'checkout'})
    .sort_values(by=['date', 'hour'])
    [['stop_name', 'date', 'hour', 'datetime', 'checkin', 'checkout']]
)

In [101]:
# Preview the merged frame: one row per hour with the check-in and check-out
# counts side by side.
gvb_nieuwmarkt.head()

Unnamed: 0,stop_name,date,hour,datetime,checkin,checkout
0,Nieuwmarkt,2020-09-01,0,2020-09-01 00:00:00,58.0,17.0
1,Nieuwmarkt,2020-09-01,6,2020-09-01 06:00:00,22.0,36.0
2,Nieuwmarkt,2020-09-01,7,2020-09-01 07:00:00,65.0,70.0
3,Nieuwmarkt,2020-09-01,8,2020-09-01 08:00:00,102.0,135.0
4,Nieuwmarkt,2020-09-01,9,2020-09-01 09:00:00,61.0,168.0


In [102]:
# Pair Dam check-ins with check-outs per hour. The outer join keeps hours that
# appear on only one side; both inputs have a `count` column, which the merge
# suffixes with _x / _y before they are renamed.
gvb_dam = (
    gvb_dam_checkin
    .merge(gvb_dam_checkout, how='outer', on=['stop_name', 'date', 'hour', 'datetime'])
    .rename(columns={'count_x': 'checkin', 'count_y': 'checkout'})
    .sort_values(by=['date', 'hour'])
    [['stop_name', 'date', 'hour', 'datetime', 'checkin', 'checkout']]
)

In [103]:
# NOTE(review): in the saved output every previewed row has only one of
# checkin / checkout filled in, and the row labels jump from 2 to 7039. Those
# hours exist on just one side of the outer merge -- confirm whether the
# source files really lack the counterpart rows for Dam.
gvb_dam.head()

Unnamed: 0,stop_name,date,hour,datetime,checkin,checkout
0,Dam,2020-09-01,0,2020-09-01 00:00:00,48.0,
1,Dam,2020-09-01,1,2020-09-01 01:00:00,12.0,
2,Dam,2020-09-01,2,2020-09-01 02:00:00,23.0,
7039,Dam,2020-09-01,6,2020-09-01 06:00:00,,32.0
7040,Dam,2020-09-01,7,2020-09-01 07:00:00,,61.0


In [104]:
# Persist the merged Dam frame. Default to_csv settings are used, so the row
# index is written as the first column.
gvb_dam.to_csv(path_or_buf="../data/gvb/gvb_dam.csv")

In [105]:
# Persist the merged Nieuwmarkt frame. Default to_csv settings are used, so
# the row index is written as the first column.
gvb_nieuwmarkt.to_csv(path_or_buf="../data/gvb/gvb_nieuwmarkt.csv")

## COVID stringency

In [110]:
# Load only the date and stringency columns of the Netherlands stringency file.
covid = pd.read_csv(
    "../data/netherlands-covid-stringency-index.csv",
    usecols=["Day", "stringency_index"],
)
covid.head()

Unnamed: 0,Day,stringency_index
0,2020-01-21,0.0
1,2020-01-22,0.0
2,2020-01-23,0.0
3,2020-01-24,0.0
4,2020-01-25,0.0


In [111]:
# Keep the stringency series from 2020-09-01 onward, the same cut-off applied
# to the GVB frames in `preprocessing`.
# NOTE(review): the earlier cut-off '2020-01-09' removed nothing, because the
# series starts on 2020-01-21; it looks like a day/month swap of 2020-09-01.
# Confirm the intended window before relying on downstream results.
covid = covid[covid.Day >= '2020-09-01']

In [112]:
# Check where the series ends; the saved output runs through 2021-12-31.
covid.tail()

Unnamed: 0,Day,stringency_index
706,2021-12-27,63.89
707,2021-12-28,63.89
708,2021-12-29,63.89
709,2021-12-30,63.89
710,2021-12-31,63.89
