# NeuralProphet
This notebook produces the NeuralProphet predictions of station crowdedness.

## Functions
This section loads the data and declares the functions that are then used to make the predictions.


In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from neuralprophet import NeuralProphet
from neuralprophet import set_random_seed 
import time
from datetime import datetime, timedelta

#%centraal station first try
# Load the two input tables from CSV files in the working directory.
# Data is the per-station table that is later filtered on 'Date_time' and 'Station'.
Data = pd.read_csv('M_GVB_weather_events_all_covidindex_holidays_filled (6).csv' )
# The events file is semicolon-separated, hence the explicit sep.
Events = pd.read_csv('events_all (1).csv', sep = ';')


In [2]:
# For this example we only keep data up to late November 2021, because there
# were no later events we could test for (last event on the 18th, so cut off there).
# NOTE: Date_time is compared as text here (the CSV is read without date
# parsing), which orders chronologically for 'YYYY-MM-DD HH:MM:SS' strings.
Data = Data[Data['Date_time'] < '2021-11-22']
# Drop everything before 2020: only the covid-era data (2020 onwards) is used.
# A real calendar date with >= selects the same rows as the former
# "> '2019-12-32'" trick, but does not depend on an invalid date literal.
Data = Data[Data['Date_time'] >= '2020-01-01']
# Show the filtered frame with a fresh index. The result is not assigned,
# so Data itself keeps its original index labels.
Data.reset_index()


Unnamed: 0.1,index,Unnamed: 0,Station,Date_time,Checked_in_passengers,Checked_out_passengers,Passengers_total,BezoekersVerwacht,StringencyIndex,Event starting,...,Holiday_Liberation Day,"Holiday_Liberation Day, May holiday",Holiday_May holiday,Holiday_Not a holiday,Holiday_Summer holiday,Holiday_Whit Monday,Holiday_Whit Sunday,Is_Holiday_Binary,HOLIDAY_BINARY,VACATION_BINARY
0,0,131400,Centraal Station,2020-01-01 00:00:00,389.0,210.0,599.0,0.0,0.00,0,...,0,0,0,0,0,0,0,1,0,1
1,1,131401,Station Bijlmer ArenA,2020-01-01 00:00:00,81.0,110.0,191.0,0.0,0.00,0,...,0,0,0,0,0,0,0,1,0,1
2,2,131402,Station Zuid,2020-01-01 00:00:00,61.0,97.0,158.0,0.0,0.00,0,...,0,0,0,0,0,0,0,1,0,1
3,3,131403,"Joined Stations Line 51,53,54",2020-01-01 00:00:00,50.0,20.0,70.0,0.0,0.00,0,...,0,0,0,0,0,0,0,1,0,1
4,4,131404,Station RAI,2020-01-01 00:00:00,15.0,31.0,46.0,0.0,0.00,0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248755,248755,380155,Station Zuid,2021-11-21 23:00:00,108.0,92.0,200.0,0.0,52.78,0,...,0,0,0,1,0,0,0,0,0,0
248756,248756,380156,Station Duivendrecht,2021-11-21 23:00:00,17.0,18.0,35.0,0.0,52.78,0,...,0,0,0,1,0,0,0,0,0,0
248757,248757,380157,"Joined Stations Line 50,51",2021-11-21 23:00:00,23.0,31.0,54.0,0.0,52.78,0,...,0,0,0,1,0,0,0,0,0,0
248758,248758,380158,Overamstel,2021-11-21 23:00:00,10.0,13.0,23.0,0.0,52.78,0,...,0,0,0,1,0,0,0,0,0,0


In [3]:
def station_selector(data, station, option = 'all'):
    '''Select one station's rows and shape them the way NeuralProphet expects.

    The rows whose ``Station`` equals ``station`` are kept, reduced to the
    timestamp, the chosen target and the regressor columns, and renamed so
    that the timestamp is called ``ds`` and the target ``y``. No time-frame
    slicing happens here; restrict ``data`` to the wanted period beforehand.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table containing ``Station``, ``Date_time``, the three passenger
        count columns and the regressor columns listed below.
    station : str
        Station name exactly as it appears in the ``Station`` column.
    option : str
        Which passenger count becomes the target ``y``:
        ``'check-in'``, ``'check-out'`` or ``'all'`` (total passengers).

    Returns
    -------
    pandas.DataFrame
        A new frame (``data`` is left untouched) with the columns ``ds``,
        ``y``, ``StringencyIndex``, ``Is_Holiday_Binary``, ``Temp(F)``,
        ``Wind(MpH)``, ``sum_rain_knmi``, ``duration_rain_knmi``,
        ``event_measure_1`` and ``event_measure_2``.

    Raises
    ------
    ValueError
        If ``option`` is not one of the three supported values.
    '''
    target_by_option = {
        'check-in': 'Checked_in_passengers',
        'check-out': 'Checked_out_passengers',
        'all': 'Passengers_total',
    }
    if option not in target_by_option:
        # Fail loudly instead of handing back a frame without 'ds'/'y',
        # which would only break later inside the model code.
        raise ValueError(
            'Error, not sure which data to use: option must be one of '
            + ', '.join(repr(key) for key in target_by_option)
            + ', got ' + repr(option)
        )
    target = target_by_option[option]

    # only keep var of interest (still need to extend)
    columns = ['Date_time', target, 'StringencyIndex', 'Is_Holiday_Binary',
               'Temp(F)', 'Wind(MpH)', 'sum_rain_knmi', 'duration_rain_knmi',
               'C3_Cancel public events_1.0', 'C3_Cancel public events_2.0']

    # A single .loc selection yields an independent frame, so the rename
    # below never operates on a view of the caller's data.
    df = data.loc[data['Station'] == station, columns]

    # rename columns into what NeuralProphet expects
    return df.rename(columns={'Date_time': 'ds',
                              target: 'y',
                              'C3_Cancel public events_1.0': 'event_measure_1',
                              'C3_Cancel public events_2.0': 'event_measure_2'})


In [4]:
def event_preparer(events):
    '''Reduce the events table to the two columns NeuralProphet expects.

    Only the event start time and the venue are kept; they are renamed to
    ``ds`` and ``event`` respectively, and duplicate rows are dropped.

    Parameters
    ----------
    events : pandas.DataFrame
        Table with at least the columns ``Starting_datetime`` and
        ``Evenementlocatie``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``ds`` and ``event``; ``events`` itself
        is not modified.
    '''
    # only use starting time; rename without inplace so we never write
    # into a slice of the caller's frame
    prepared = events[['Starting_datetime', 'Evenementlocatie']].rename(
        columns={'Starting_datetime': 'ds', 'Evenementlocatie': 'event'})
    # drop duplicates, because start and stop are not tracked separately here;
    # the span around an event is handled through the event window instead
    return prepared.drop_duplicates()

In [5]:
def model_preparer(df, events):
    '''Build the NeuralProphet model and the event-augmented data frame.

    The model specification is identical for every station, so it is hard
    coded here. After construction the extra variables are registered as
    lagged regressors (NeuralProphet requires them to be added explicitly),
    the event venues are registered with their window, and finally the events
    are merged into ``df`` through the model's own helper.

    Parameters: ``df`` is the station frame (``ds``, ``y`` and the regressor
    columns); ``events`` is the prepared events frame (``ds``, ``event``).

    Returns a tuple ``(df_with_events, model)``.
    '''
    # specify the model to be fitted
    model = NeuralProphet(
        n_lags=24 * 7,              # amount of lags for the AR net to take into account
        # n_forecasts is left at its default; if it is set (e.g. 168) the
        # forecasting function has to be changed as well
        daily_seasonality=True,     # expecting daily seasonality
        yearly_seasonality=False,   # not enough data for this
        weekly_seasonality=False,   # was worse when left on auto, so forced off
        learning_rate=0.01,         # suggested to be between .01 and .001 if unsure
        num_hidden_layers=2,        # amount of hidden layers
    )

    # one lagged regressor per additional variable, registered in this order;
    # the settings dict holds exactly the keyword arguments for that variable
    regressor_settings = [
        ('StringencyIndex', {'normalize': 'standardize', 'only_last_value': True}),
        ('Is_Holiday_Binary', {'only_last_value': True}),
        ('Temp(F)', {'normalize': 'standardize'}),
        ('Wind(MpH)', {'normalize': 'standardize'}),
        ('sum_rain_knmi', {'normalize': 'standardize', 'only_last_value': True}),
        ('duration_rain_knmi', {'normalize': 'standardize', 'only_last_value': True}),
        ('event_measure_1', {'only_last_value': True}),
        ('event_measure_2', {'only_last_value': True}),
    ]
    for regressor_name, settings in regressor_settings:
        model.add_lagged_regressor(regressor_name, **settings)

    # specify which events you will add
    venues = ['AFAS Live', 'De Toekomst', 'Johan Cruijff ArenA', 'Ziggo Dome']
    model = model.add_events(venues, lower_window=-3, upper_window=5)

    # add the events to the df
    frame_with_events = model.create_df_with_events(df, events)

    return (frame_with_events, model)


#df_events, npr = model_preparer(df_2, events_2)

In [6]:
def get_train_val_test_split(df, end_year=2021, end_month=11, end_day=21, end_hour=23):
    """
    Create train, validation, and test split for 1-week ahead models.

    The last 168 hours up to and including the end timestamp form the test
    set, the 168 hours before that form the validation set, and everything
    earlier is training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``ds`` column holding timestamps (strings are converted
        with ``pd.to_datetime``). The caller's frame is not modified.
    end_year, end_month, end_day, end_hour : int
        The last timestamp of the test week.

    Returns
    -------
    list of pandas.DataFrame
        ``[train1, train2, validation, test]`` where ``train1`` is strictly
        before the validation week (use it when validating) and ``train2`` is
        everything before the test week, i.e. ``train1`` plus validation (use
        it when testing).
    """
    test_end = datetime(end_year, end_month, end_day, end_hour)
    test_start = test_end - timedelta(hours=167)
    validation_end = test_end - timedelta(hours=168)
    validation_start = test_end - timedelta(hours=335)

    # Work on a copy with a parsed 'ds' column instead of writing the
    # conversion back into the caller's frame.
    df = df.assign(ds=pd.to_datetime(df['ds']))
    ds = df['ds']

    # train1 must stop strictly before the validation week; the former bound
    # (validation start + 1 hour) leaked the first validation hour into it.
    train1 = df[ds < validation_start]
    train2 = df[ds < test_start]
    validation = df[(ds >= validation_start) & (ds <= validation_end)]
    test = df[(ds >= test_start) & (ds <= test_end)]

    return [train1, train2, validation, test]


In [8]:
def forcaster(npr, train, test, periods, option):
    '''Forecast ``periods`` steps ahead, one step at a time.

    Since the model relies on previous values, only one step is forecast per
    iteration; the forecast value is then appended to the history as if it
    were ground truth and used for the next step. During the hours in which
    the metro is closed no model call is made and the value is set to 0.
    Negative forecasts are clipped to 0.

    Parameters
    ----------
    npr : fitted model exposing ``predict(df)``; the returned frame must have
        a ``yhat1`` column (it is not touched for closed hours).
    train : pandas.DataFrame
        History the forecast starts from, with ``ds`` (datetime) and ``y``.
    test : pandas.DataFrame
        The rows to forecast, in chronological order; their ``y`` is hidden
        from the model. Must have the same columns as ``train``.
    periods : int
        Number of leading ``test`` rows to forecast.
    option : str
        ``'check-in'`` (closed hours 1-4) or ``'check-out'`` (closed hours
        2-5). Any other value prints ``'error'`` and treats no hour as closed.

    Returns
    -------
    pandas.DataFrame or None
        The last ``periods`` rows of the extended history, i.e. the forecast
        rows with the predictions in ``y``. ``None`` (after printing a
        message) when ``periods`` exceeds the number of rows in ``test``.
        Neither ``train`` nor ``test`` is modified.
    '''
    #first, check if there is enough data to predict the amount of periods you want
    if periods > len(test):
        print('Error, you are trying to predict further than you have data for.')
        return(None)

    #the closed hours depend only on the option, so determine them once
    if option == 'check-in': #metro closes at 1 so no sense to check in afterwards
        metro_closed = (1, 2, 3, 4)
    elif option == 'check-out': #metro opens at 5 so makes no sense to check out before
        metro_closed = (2, 3, 4, 5)
    else:
        metro_closed = ()
        print('error')

    history = train
    for i in range(periods):
        print(i)
        #build a one-row frame for the step to forecast and hide its y value
        future = test.iloc[[i]].copy()
        future['y'] = float('NaN')
        #hour of the step, needed to decide whether the metro is closed
        current_time = int(future['ds'].dt.hour.iloc[0])
        #history plus the new row whose y is still unknown
        future = pd.concat([history, future])

        #2 options, either metro is closed and passengers are 0, or its open and passengers have to be predicted
        if current_time in metro_closed:
            next_value = 0
        else:
            forecast = npr.predict(future)
            yhat = float(forecast['yhat1'].iloc[-1])
            #a value below 0 is impossible, so predict 0 instead of the negative value
            next_value = yhat if yhat > 0 else 0

        #write the value with one positional assignment on the frame itself;
        #chained assignment (future.y.iloc[...] = ...) is not reliable in pandas
        future.iloc[-1, future.columns.get_loc('y')] = next_value

        #the extended frame is the history for the next step
        history = future

    #keep only the forecast rows; computing the start explicitly avoids
    #iloc[-0:], which would return the whole history when periods is 0
    first_prediction = max(len(history) - periods, 0)
    predictions = history.iloc[first_prediction:, :]
    return(predictions)

The last function combines the previous functions so that the whole pipeline can be run quickly for every station. If you want to change the prediction pipeline, make the change here.

In [25]:

def _direction_forecast(Station, Data, Events, option):
    '''Run the full pipeline for one passenger direction of one station.

    Selects the station data for ``option``, prepares events and model, fits
    the model on everything before the test week and forecasts the 168 hours
    of the test week step by step. Returns the frame produced by ``forcaster``.
    '''
    df = station_selector(Data, Station, option = option)
    #prepare the events
    events = event_preparer(Events)
    #get model ready
    df_events, npr = model_preparer(df, events)
    #split data; only the large train set and the test week are needed here
    _, train_large, _, test = get_train_val_test_split(df_events)
    #fit model
    npr.fit(train_large, freq="H", minimal=True)
    #forecaster (also takes care of the closed metro hours)
    forecast = forcaster(npr = npr, train = train_large, test = test, periods = 168, option = option)
    if forecast is None:
        #forcaster signals "not enough test rows" by returning None
        raise ValueError('Not enough test data to forecast 168 hours for '
                         + repr(Station) + ' (' + option + ')')
    return forecast


def pred_getter(Station, Data = Data, Events = Events):
    '''Return the one-week check-in and check-out predictions for a station.

    A separate model is fitted and forecast for each direction, and the two
    forecasts are joined on their timestamp.

    Parameters
    ----------
    Station : str
        Station name as it appears in the ``Station`` column of ``Data``.
    Data : pandas.DataFrame
        Raw ridership table. The default is the module-level ``Data`` as it
        was when this function was defined.
    Events : pandas.DataFrame
        Raw events table. The default is the module-level ``Events`` as it
        was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds``, ``check_in_prediction``, ``check_out_prediction`` and
        ``Station``. The frame is also printed.

    Raises
    ------
    ValueError
        If the test week holds fewer than 168 rows for the station.
    '''
    #checkin
    data_pred_in = _direction_forecast(Station, Data, Events, 'check-in')
    #checkout
    data_pred_out = _direction_forecast(Station, Data, Events, 'check-out')

    #combine in and out; rename on the selection's result rather than inplace
    #so no write ever targets a slice of the forecast frames
    pred_in = data_pred_in[['ds', 'y']].rename(columns={'y' : 'check_in_prediction'})
    pred_out = data_pred_out[['ds', 'y']].rename(columns={'y' : 'check_out_prediction'})

    #combining both dfs
    predictions = pred_in.merge(pred_out, how = 'inner', on = 'ds')
    predictions['Station'] = Station
    print(predictions)

    return(predictions)

Lastly, we use the previously defined functions to obtain the predictions for the final week of the data (the held-out test week) for every station.

In [None]:
#get predictions from stations
# Each call fits two models (check-in and check-out) for the named station and
# returns its prediction frame; the per-station results are kept in their own
# variables so they can be inspected individually afterwards.
JS_5051 = pred_getter(Station = 'Joined Stations Line 50,51')
JS_515354 = pred_getter(Station = 'Joined Stations Line 51,53,54')
JS_5054 = pred_getter(Station =  'Joined Stations Line 50,54 South')
JS_52N = pred_getter(Station =  'Joined Stations Line 52 North')
JS_52S = pred_getter(Station = 'Joined Stations Line 52 South')
JS_53 = pred_getter(Station =  'Joined Stations Line 53 South')
OverA = pred_getter(Station =  'Overamstel')
Spak = pred_getter(Station = 'Spaklerweg')
Duiv = pred_getter(Station =  'Station Duivendrecht')
Rai = pred_getter(Station =  'Station RAI')
Stra = pred_getter(Station =  'Strandvliet')
Vdm = pred_getter(Station =  'Van der Madeweg')
Cs = pred_getter(Station = 'Centraal Station')
Sz = pred_getter(Station = 'Station Zuid')
Sba = pred_getter(Station = 'Station Bijlmer ArenA')


#combining the predictions
# Stack the per-station frames underneath each other; the 'Station' column
# added by pred_getter tells the rows apart.
predictions = pd.concat([JS_5051, JS_515354, JS_5054, JS_52N, JS_52S, JS_53, 
                         OverA, Spak, Duiv, Rai, Stra, Vdm, Cs, Sz, Sba])

#Save predictions
# NOTE(review): the index is written as the first CSV column; after the concat
# above it is presumably each station's own row numbering repeated, so it does
# not identify rows - confirm whether index=False is wanted.
predictions.to_csv('Predictions_2.csv')