In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from matplotlib import pyplot

In [None]:
# Load the combined CMSA dataset. The CSV's "Unnamed: 0" column becomes the
# frame's index; parse_dates=True asks pandas to try parsing that index as dates.
cmsa = pd.read_csv(
    "../data/cmsa_combined.csv",
    index_col="Unnamed: 0",
    parse_dates=True,
)

## Prediction on sensor GAWW-11

In [None]:
# Build a frame for a single sensor: keep GAWW-11 as the prediction target
# (renamed to `crowd_count`) and discard the other sensors' columns.
sensors = ['GAWW-11', 'GAWW-12', 'GAWW-14']
target_sensor = 'GAWW-11'
other_sensors = [name for name in sensors if name != target_sensor]

gaww11 = (
    cmsa
    .drop(columns=other_sensors)
    .rename(columns={target_sensor: 'crowd_count'})
)
gaww11.columns

In [None]:
# Ensure the timestamp column has a datetime dtype so the `.dt` accessor works.
gaww11['datetime'] = pd.to_datetime(gaww11['datetime'])

# Derive one calendar/clock feature column per `.dt` attribute; each new column
# is named after the attribute it is read from.
for part in ('year', 'month', 'day', 'dayofweek', 'hour', 'minute'):
    gaww11[part] = getattr(gaww11['datetime'].dt, part)

In [None]:
# Boundaries of the initial train/test split.
# Train is the half-open window [start_time_train, start_time_test);
# test is the closed window [start_time_test, end_time_test].
start_time_train = '2020-09-01 00:00:00'
start_time_test = '2021-12-01 12:00:00'
end_time_test = '2021-12-08 11:45:00'

timestamps = gaww11['datetime']
in_train = (timestamps >= start_time_train) & (timestamps < start_time_test)
in_test = (timestamps >= start_time_test) & (timestamps <= end_time_test)

train = gaww11[in_train]
test = gaww11[in_test]

In [None]:
# Lagged crowd counts.
# NOTE(review): `shift` moves values by row count, not by elapsed time, so these
# lags assume one row per 15 minutes with no gaps - confirm against the data.
first_lag = 4*24*7  # the same time point of the week before
lags = [first_lag + extra for extra in (0, 1, 2, 4, 4*2, 4*3, 4*4, 4*5)]
for lag in lags:
    gaww11[f'crowd_lag_{lag}'] = gaww11['crowd_count'].shift(lag)

# GVB check-in / check-out counts, shifted by the same number of rows as the
# first crowd lag; each new column is the source column name plus "_lag".
gvb_lag = 4*24*7
for gvb_col in ('checkin_dam', 'checkout_dam', 'checkin_nieuwmarkt', 'checkout_nieuwmarkt'):
    gaww11[gvb_col + '_lag'] = gaww11[gvb_col].shift(gvb_lag)

In [None]:
# Average crowd count per hour, per day of week and per month.
#
# The averages are computed on the training rows only, so no information from
# the test window leaks into the features. They are then mapped onto EVERY row
# of `gaww11` through the grouping key. Assigning
# `train.groupby(key)[...].transform('mean')` directly would align on train's
# index and leave all rows outside `train` (including the whole test window)
# as NaN for these features.
#
# A key value that never occurs in `train` (e.g. an unseen month) maps to NaN.
for group_key, feature_name in (
    ('hour', 'avg_hour_crowd'),          # mean crowd grouped by hour
    ('dayofweek', 'avg_dow_crowd'),      # mean crowd grouped by day of week
    ('month', 'avg_month_crowd'),        # mean crowd grouped by month
):
    train_group_means = train.groupby(group_key)['crowd_count'].mean()
    gaww11[feature_name] = gaww11[group_key].map(train_group_means)

In [None]:
# Re-split now that the engineered columns exist on `gaww11`.
# Training starts one week later than the initial split because the lag
# features of the first week are null.
# NOTE(review): the longest crowd lag is one week plus 20 rows, so the first
# few training rows may still hold NaN in the longer lags - confirm.
start_time_train = '2020-09-08 00:00:00'
start_time_test = '2021-12-01 12:00:00'
end_time_test = '2021-12-08 11:45:00'

stamp = gaww11['datetime']
train_mask = (stamp >= start_time_train) & (stamp < start_time_test)
# `between` is inclusive on both ends by default, i.e. >= start and <= end.
test_mask = stamp.between(start_time_test, end_time_test)

train = gaww11[train_mask]
test = gaww11[test_mask]

In [None]:
xgb = XGBRegressor()

# Columns excluded from the feature matrix: the target itself, the raw
# timestamp, and the un-lagged GVB check-in / check-out counts.
non_feature_cols = [
    "crowd_count",
    "datetime",
    "checkin_dam",
    "checkout_dam",
    "checkin_nieuwmarkt",
    "checkout_nieuwmarkt",
]

X_train = train.drop(columns=non_feature_cols)
y_train = train["crowd_count"]
X_test = test.drop(columns=non_feature_cols)
y_test = test['crowd_count']

xgb.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out week and report the root-mean-squared error.
eval_pred = xgb.predict(X_test)

# Take the square root explicitly instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises a TypeError. This form works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, eval_pred))
mse = rmse  # legacy name (it always held the RMSE); kept in case later cells use it
print("RMSE of XGBoost: ", rmse)

In [None]:
import matplotlib.pyplot as plt

# Default figure size for this and every subsequent matplotlib figure.
plt.rcParams['figure.figsize'] = [15, 5]

# Pair each importance score from the fitted model with its feature name,
# then draw the 15 highest-scoring features as horizontal bars.
ft_importances = pd.Series(xgb.feature_importances_, index=X_train.columns)
top_importances = ft_importances.nlargest(15)
top_importances.plot.barh()
plt.show()

In [None]:
# Attach the model predictions next to the actual counts.
# `test` is a boolean-filtered slice of `gaww11`; writing a column into it in
# place triggers pandas' SettingWithCopyWarning on pandas < 3. `assign` returns
# a new frame with the extra column, which avoids the warning.
test = test.assign(crowd_pred=eval_pred)

In [None]:
# Interactive line chart comparing the actual crowd count with the model's
# prediction over the test window.

import plotly
import plotly.graph_objects as go
import plotly.io as pio

# Both traces share the same x-axis (timestamps) and are drawn as lines.
shared_trace_args = dict(x=test['datetime'], mode='lines')
trace1 = go.Scatter(y=test['crowd_count'], name='Actual', **shared_trace_args)
trace2 = go.Scatter(y=test['crowd_pred'], name='Predicted', **shared_trace_args)

fig = go.Figure(data=[trace1, trace2])
fig.show()