In [38]:
import pandas as pd
import statsmodels.api as sm
from pathlib import Path
import numpy as np
import src.utils as utils
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize
import src.data_processing.data as data
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Get total number of facilities per station:
facil_col = ['ticket_vending_machine', 'luggage_lockers', 'free_parking', 'taxi', 'bicycle_spots', 'blue-bike',
             'bus', 'tram', 'metro', 'wheelchair_available', 'ramp', 'disabled_parking_spots', 'elevated_platform',
             'escalator_up', 'escalator_down', 'elevator_platform', 'audio_induction_loop']
# PROBLEM: disabled_parking_spots is the number of spots instead of a dummy indicator
# so let's create a dummy variable
data.facilities['disabled_parking_spots_indicator'] = np.where(data.facilities['disabled_parking_spots']==0,0,1)
# drop old variable and add new name to our list
facil_col.remove('disabled_parking_spots')
facil_col = facil_col + ['disabled_parking_spots_indicator']
# compute total number of facilities per station
data.facilities['number_facilities'] = data.facilities[facil_col].sum(axis = 1)
data.facilities['number_facilities']

2      12.0
3       1.0
4       7.0
5       9.0
6       1.0
       ... 
668     3.0
669     3.0
670     8.0
672     3.0
673     4.0
Name: number_facilities, Length: 557, dtype: float64

In [13]:
## Number of daily travelers
# create total
data.travelers["week_total"] = 5 * data.travelers["week"] + data.travelers["saturday"] + data.travelers["sunday"]
data.travelers["avg_day"] = data.travelers["week_total"] / float(7)
merge = pd.merge(data.facilities, data.travelers, left_on='name', right_on='Station')
data.travelers
merge


Unnamed: 0.1,URI,name,street,zip,city,ticket_vending_machine,luggage_lockers,free_parking,taxi,bicycle_spots,...,disabled_parking_spots_indicator,number_facilities,Unnamed: 0,Station,week,saturday,sunday,sum,week_total,avg_day
0,008895000,aalst,Stationsplein 9,9300,Aalst,1.0,0.0,1.0,1.0,1.0,...,1,12.0,0,aalst,6444.0,1768.0,1592.0,9804.0,35580.0,5082.857143
1,008895125,aalst-kerrebroek,Ledebaan,9300,Aalst,0.0,0.0,1.0,0.0,0.0,...,0,1.0,1,aalst-kerrebroek,27.0,0.0,0.0,27.0,135.0,19.285714
2,008891140,aalter,Stationsplein 2,9880,Aalter,1.0,0.0,1.0,0.0,1.0,...,1,7.0,2,aalter,2288.0,1055.0,855.0,4198.0,13350.0,1907.142857
3,008833209,aarschot,Statieplein,3200,Aarschot,1.0,0.0,1.0,0.0,1.0,...,1,9.0,3,aarschot,6270.0,1954.0,1395.0,9619.0,34699.0,4957.000000
4,008892288,aarsele,Hogenhovenstraat Zn,8700,Aarsele,1.0,0.0,0.0,0.0,0.0,...,0,1.0,4,aarsele,34.0,0.0,0.0,34.0,170.0,24.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,008832250,zolder,Stationsstraat Z/n,3550,Heusden - Zolder,1.0,0.0,1.0,0.0,0.0,...,1,3.0,549,zolder,123.0,38.0,37.0,198.0,690.0,98.571429
546,008832334,zonhoven,Engstegenseweg,3520,Zonhoven,1.0,0.0,1.0,0.0,0.0,...,1,3.0,550,zonhoven,67.0,29.0,19.0,115.0,383.0,54.714286
547,008895208,zottegem,Stationsplein 12,9620,Zottegem,1.0,0.0,1.0,0.0,1.0,...,1,8.0,551,zottegem,4936.0,986.0,931.0,6853.0,26597.0,3799.571429
548,008894821,zwijndrecht,Fortlaan 1,2070,Zwijndrecht,1.0,0.0,1.0,0.0,1.0,...,0,3.0,552,zwijndrecht,401.0,43.0,44.0,488.0,2092.0,298.857143


In [41]:
#First we need to do scaling in order to be able to compare daily travelers with daily trains:
scaler = MinMaxScaler()
scaler.fit(merge[['avg_day', 'number_facilities']])
merge[['avg_day', 'number_facilities']] = scaler.transform(merge[['avg_day', 'number_facilities']])
#univariate regression for daily travelers:
X = merge['avg_day']
Y = merge['number_facilities']
X = sm.add_constant(X) # adding a constant: Y = beta0 + beta1*X1 + beta2*X2 + espilon instead of Y = beta1*X1 + beta2*X2 + epsilon

model = sm.OLS(Y, X).fit()
print_model = model.summary()
print_model
merge

Unnamed: 0.1,URI,name,street,zip,city,ticket_vending_machine,luggage_lockers,free_parking,taxi,bicycle_spots,...,disabled_parking_spots_indicator,number_facilities,Unnamed: 0,Station,week,saturday,sunday,sum,week_total,avg_day
0,008895000,aalst,Stationsplein 9,9300,Aalst,1.0,0.0,1.0,1.0,1.0,...,1,0.705882,0,aalst,6444.0,1768.0,1592.0,9804.0,35580.0,0.097048
1,008895125,aalst-kerrebroek,Ledebaan,9300,Aalst,0.0,0.0,1.0,0.0,0.0,...,0,0.058824,1,aalst-kerrebroek,27.0,0.0,0.0,27.0,135.0,0.000284
2,008891140,aalter,Stationsplein 2,9880,Aalter,1.0,0.0,1.0,0.0,1.0,...,1,0.411765,2,aalter,2288.0,1055.0,855.0,4198.0,13350.0,0.036360
3,008833209,aarschot,Statieplein,3200,Aarschot,1.0,0.0,1.0,0.0,1.0,...,1,0.529412,3,aarschot,6270.0,1954.0,1395.0,9619.0,34699.0,0.094642
4,008892288,aarsele,Hogenhovenstraat Zn,8700,Aarsele,1.0,0.0,0.0,0.0,0.0,...,0,0.058824,4,aarsele,34.0,0.0,0.0,34.0,170.0,0.000379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,008832250,zolder,Stationsstraat Z/n,3550,Heusden - Zolder,1.0,0.0,1.0,0.0,0.0,...,1,0.176471,549,zolder,123.0,38.0,37.0,198.0,690.0,0.001799
546,008832334,zonhoven,Engstegenseweg,3520,Zonhoven,1.0,0.0,1.0,0.0,0.0,...,1,0.176471,550,zonhoven,67.0,29.0,19.0,115.0,383.0,0.000961
547,008895208,zottegem,Stationsplein 12,9620,Zottegem,1.0,0.0,1.0,0.0,1.0,...,1,0.470588,551,zottegem,4936.0,986.0,931.0,6853.0,26597.0,0.072524
548,008894821,zwijndrecht,Fortlaan 1,2070,Zwijndrecht,1.0,0.0,1.0,0.0,1.0,...,0,0.176471,552,zwijndrecht,401.0,43.0,44.0,488.0,2092.0,0.005626


In [10]:
## Number of daily trains
trips = data.full_trips
daily_trains = trips["Name of the stop"].value_counts().reset_index().rename(columns={"index": "Name of the stop", "Name of the stop": "count"}).astype({"count": int})
daily_trains['count_perDay'] = daily_trains["count"]/7
daily_trains

Unnamed: 0,Name of the stop,count,count_perDay
0,brussel-zuid/bruxelles-midi,7421,1060.142857
1,brussel-noord/bruxelles-nord,6899,985.571429
2,brussel-kapellekerk/bruxelles-chapelle,6892,984.571429
3,brussel-centraal/bruxelles-central,6892,984.571429
4,brussel-congres/bruxelles-congrès,6892,984.571429
...,...,...,...
622,oostende-zeehaven,3,0.428571
623,leuven-bundel m,2,0.285714
624,hasselt-aflos l.35/1,1,0.142857
625,gent-sint-pieters-bundels,1,0.142857


In [34]:
## merge: when we want to merge, we see that there are 4 stations in facilities that are not in full_trips:
## antwerpen-haven, bastogne-nord, bastogne-sud and florée. These are 4 stations that are permanently closed now, so it is not interesting to take these stations into consideration
## Hence, we will merge the same stations in full trips with facilities as we did for travelers
len(list(set(data.facilities['name']).intersection(set(daily_trains['Name of the stop']))))
intersection = list(set(data.facilities['name']).intersection(set(daily_trains['Name of the stop'])))
still_needed = set(data.facilities['name']).difference(intersection)
still_needed

{'antwerpen-haven', 'bastogne-nord', 'bastogne-sud', 'florée'}

In [35]:
merge_trains = pd.merge(data.facilities, daily_trains, left_on= 'name', right_on= 'Name of the stop')
merge_trains

Unnamed: 0,URI,name,street,zip,city,ticket_vending_machine,luggage_lockers,free_parking,taxi,bicycle_spots,...,sales_close_friday,sales_open_saturday,sales_close_saturday,sales_open_sunday,sales_close_sunday,disabled_parking_spots_indicator,number_facilities,Name of the stop,count,count_perDay
0,008895000,aalst,Stationsplein 9,9300,Aalst,1.0,0.0,1.0,1.0,1.0,...,0 days 20:00:00,0 days 06:00:00,0 days 20:00:00,0 days 06:00:00,0 days 20:00:00,1,12.0,aalst,1262,180.285714
1,008895125,aalst-kerrebroek,Ledebaan,9300,Aalst,0.0,0.0,1.0,0.0,0.0,...,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1.0,aalst-kerrebroek,70,10.000000
2,008891140,aalter,Stationsplein 2,9880,Aalter,1.0,0.0,1.0,0.0,1.0,...,0 days 14:15:00,0 days 07:45:00,0 days 15:00:00,0 days 07:45:00,0 days 15:00:00,1,7.0,aalter,1285,183.571429
3,008833209,aarschot,Statieplein,3200,Aarschot,1.0,0.0,1.0,0.0,1.0,...,0 days 20:00:00,0 days 06:00:00,0 days 20:00:00,0 days 07:00:00,0 days 14:15:00,1,9.0,aarschot,1250,178.571429
4,008892288,aarsele,Hogenhovenstraat Zn,8700,Aarsele,1.0,0.0,0.0,0.0,0.0,...,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1.0,aarsele,268,38.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548,008832250,zolder,Stationsstraat Z/n,3550,Heusden - Zolder,1.0,0.0,1.0,0.0,0.0,...,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,1,3.0,zolder,217,31.000000
549,008832334,zonhoven,Engstegenseweg,3520,Zonhoven,1.0,0.0,1.0,0.0,0.0,...,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,1,3.0,zonhoven,217,31.000000
550,008895208,zottegem,Stationsplein 12,9620,Zottegem,1.0,0.0,1.0,0.0,1.0,...,0 days 20:00:00,0 days 07:15:00,0 days 14:30:00,0 days 07:15:00,0 days 14:30:00,1,8.0,zottegem,974,139.142857
551,008894821,zwijndrecht,Fortlaan 1,2070,Zwijndrecht,1.0,0.0,1.0,0.0,1.0,...,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,3.0,zwijndrecht,827,118.142857


In [42]:
#First we need to do scaling in order to be able to compare daily travelers with daily trains:
scaler = MinMaxScaler()
scaler.fit(merge_trains[['count_perDay', 'number_facilities']])
merge_trains[['count_perDay', 'number_facilities']] = scaler.transform(merge_trains[['count_perDay', 'number_facilities']])
#univariate regression for daily travelers:
X = merge_trains['count_perDay']
Y = merge_trains['number_facilities']
X = sm.add_constant(X) # adding a constant: Y = beta0 + beta1*X1 + beta2*X2 + espilon instead of Y = beta1*X1 + beta2*X2 + epsilon

model = sm.OLS(Y, X).fit()
print_model = model.summary()
print_model

0,1,2,3
Dep. Variable:,number_facilities,R-squared:,0.178
Model:,OLS,Adj. R-squared:,0.177
Method:,Least Squares,F-statistic:,119.6
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,2.47e-25
Time:,12:38:39,Log-Likelihood:,206.85
No. Observations:,553,AIC:,-409.7
Df Residuals:,551,BIC:,-401.1
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1732,0.009,18.555,0.000,0.155,0.192
count_perDay,0.6676,0.061,10.938,0.000,0.548,0.788

0,1,2,3
Omnibus:,39.538,Durbin-Watson:,1.886
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49.182
Skew:,0.613,Prob(JB):,2.09e-11
Kurtosis:,3.794,Cond. No.,8.69


In [None]:
##As we see the daily travelers Adj R² = 0.336 while the Adj R² for daily trains is only 0.177. Based on this goodness-of-fit measure we can say that daily travelers will be more a primary driver than daily trains. This is quite logical because the number of facilities is important for travelers. If you have a lot of trains but each time with a low amount of travelers, than the number of facilities will be less important than if you have a lot of travelers travelling in less trains.