In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Normalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten, LSTM
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Lambda

# Data & Targets

In [73]:
df = pd.read_csv('/Users/frederickjohannson/code/Sweetpatata/bike-or-nei/data_all_cleaned.csv')
df = df.drop(columns=['Trips_in','Trips_out', 'Unnamed: 0'])

In [74]:
print(df.shape)
df.columns

(293762, 27)


Index(['Station_Id', 'Date', 'In_Out', 'temp_min', 'temp_max',
       'wind_speed_avg', 'rainfall_total', 'snow_total', 'day_of_week_0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6', 'Month_1', 'Month_2', 'Month_3',
       'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9',
       'Month_10', 'Month_11', 'Month_12'],
      dtype='object')

In [4]:
TARGET = 'In_Out'
N_TARGETS = 1
N_FEATURES = 24

# Extracting Data for one station

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Station_Id,Date,Trips_in,Trips_out,In_Out,temp_min,temp_max,wind_speed_avg,rainfall_total,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
0,183879,556,2019-04-02,1,1,0,-2.04,11.21,4.633333,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,377,2019-04-03,12,15,-3,-2.79,6.61,4.620833,1.71,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,177199,550,2019-04-03,6,11,-5,-2.79,6.61,4.620833,1.71,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,178251,551,2019-04-03,68,40,28,-2.79,6.61,4.620833,1.71,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,179453,552,2019-04-03,22,17,5,-2.79,6.61,4.620833,1.71,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
len(df.groupby('Date'))

1269

In [83]:
df_list = []
station_list = []
for station in df['Station_Id'].unique():
    df_station = df[df['Station_Id'] == station]
    df_station.set_index('Date', inplace=True)
    df_list.append(df_station)
    station_list.append(station)

In [84]:
len(df_list)

279

In [85]:
df_list[0]

Unnamed: 0_level_0,Station_Id,In_Out,temp_min,temp_max,wind_speed_avg,rainfall_total,snow_total,day_of_week_0,day_of_week_1,day_of_week_2,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-04-02,556,0,-2.04,11.21,4.633333,0.00,0.00,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-04-03,556,-5,-2.79,6.61,4.620833,1.71,0.60,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-04-04,556,0,-1.76,15.38,4.389583,0.00,0.00,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-04-05,556,-7,-1.41,12.05,6.741667,0.39,0.50,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-04-06,556,-2,-0.49,13.98,3.275000,1.00,0.88,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-27,556,-1,5.21,13.72,3.071250,0.10,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2022-10-28,556,3,7.69,14.30,2.264583,3.01,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2022-10-29,556,-3,4.06,16.58,3.171667,0.00,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2022-10-30,556,0,0.21,9.83,1.096667,0.19,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
df_station_377 = df[df['Station_Id'] == 377]

In [8]:
df_station_377.shape

(1203, 30)

In [9]:
df_station_377 = df_station_377.drop(columns=['Trips_in','Trips_out', 'Unnamed: 0', 'Station_Id'])

In [10]:
df_station_377.set_index('Date', inplace=True)

In [49]:
df_station_377.dtypes

In_Out              int64
temp_min          float64
temp_max          float64
wind_speed_avg    float64
rainfall_total    float64
snow_total        float64
day_of_week_0     float64
day_of_week_1     float64
day_of_week_2     float64
day_of_week_3     float64
day_of_week_4     float64
day_of_week_5     float64
day_of_week_6     float64
Month_1           float64
Month_2           float64
Month_3           float64
Month_4           float64
Month_5           float64
Month_6           float64
Month_7           float64
Month_8           float64
Month_9           float64
Month_10          float64
Month_11          float64
Month_12          float64
dtype: object

# Folds for cross-validation

In [12]:
FOLD_LENGTH = 30 # total days 1269 / total months 43
FOLD_STRIDE = 14
TRAIN_TEST_RATIO = 0.66

In [52]:
def get_folds(
    df,
    fold_length,
    fold_stride):
    '''
    This function slides through the Time Series dataframe of shape (n_timesteps, n_features) to create folds
    - of equal `fold_length`
    - using `fold_stride` between each fold
    
    Returns a list of folds, each as a DataFrame
    '''
    
    folds = []
    for idx in range(0, len(df), fold_stride):
        # Exits the loop as soon as the last fold index would exceed the last index
        if (idx + fold_length) > len(df):
            break
        fold = df.iloc[idx:idx + fold_length, :]
        folds.append(fold)#(np.asarray(fold).astype('float32'))
    return folds

In [53]:
folds = get_folds(df_station_377, FOLD_LENGTH, FOLD_STRIDE)

print(f'The function generated {len(folds)} folds.')
print(f'Each fold has a shape equal to {folds[0].shape}.')

The function generated 84 folds.
Each fold has a shape equal to (30, 25).


In [15]:
input_length = 5
output_length = 1

# Test-train split (one fold)

In [16]:
def train_test_split(fold,
                     train_test_ratio,
                     input_length):
    '''
    Returns a train dataframe and a test dataframe (fold_train, fold_test)
    from which one can sample (X,y) sequences.
    df_train should contain all the timesteps until round(train_test_ratio * len(fold))   
    '''
    # TRAIN SET
    # ======================
    last_train_idx = round(train_test_ratio * len(fold))
    fold_train = fold.iloc[0:last_train_idx, :]

    # TEST SET
    # ======================    
    first_test_idx = last_train_idx - input_length
    fold_test = fold.iloc[first_test_idx:, :]

    return (fold_train, fold_test)

In [17]:
(fold_train, fold_test) = train_test_split(folds[0], TRAIN_TEST_RATIO, input_length)

In [18]:
def get_Xi_yi(
    fold, 
    input_length, 
    output_length):
    '''
    - given a fold, it returns one sequence (X_i, y_i)
    - with the starting point of the sequence being chosen at random
    '''
    first_possible_start = 0
    last_possible_start = len(fold) - (input_length + output_length) + 1
    random_start = np.random.randint(first_possible_start, last_possible_start)
    X_i = fold.iloc[random_start:random_start+input_length]    
    y_i = fold.iloc[random_start+input_length:
                  random_start+input_length+output_length][[TARGET]]
    
    return (X_i, y_i)

In [59]:
X_train_i, y_train_i = get_Xi_yi(fold_train, input_length, output_length)
X_test_i, y_test_i = get_Xi_yi(fold_test, input_length, output_length)

In [20]:
def get_X_y(
    fold,
    number_of_sequences,
    input_length,
    output_length
    ):
    
    X, y = [], []

    for i in range(number_of_sequences):
        (Xi, yi) = get_Xi_yi(fold, input_length, output_length)
        X.append(Xi)
        y.append(yi)
        
    return np.array(X), np.array(y)

In [21]:
N_TRAIN = 60 # number_of_sequences_train
N_TEST =  40 # number_of_sequences_test

X_train, y_train = get_X_y(fold_train, N_TRAIN, input_length, output_length)
X_test, y_test = get_X_y(fold_test, N_TEST, input_length, output_length)

In [36]:
X_test.shape

(40, 5, 25)

In [22]:
#for fold in folds:
#    fold_train, fold_test = train_test_split(fold, TRAIN_TEST_RATIO, input_length)
#    return fold 

# RNN Model (one fold)

In [23]:
def init_model(X_train, y_train):
    
    # $CHALLENGIFY_BEGIN    
    
    # 0 - Normalization
    # ======================    
    normalizer = Normalization()
    normalizer.adapt(X_train)
    
    # 1 - RNN architecture
    # ======================    
    model = Sequential()
    ## 1.0 - All the rows will be standardized through the already adapted normalization layer
    model.add(normalizer)
    ## 1.1 - Recurrent Layer
    model.add(LSTM(64, 
                          activation='tanh', 
                          return_sequences = False,
                          recurrent_dropout = 0.3))
    ## 1.2 - Predictive Dense Layers
    output_length = y_train.shape[1]
    model.add(Dense(output_length, activation='linear'))

    # 2 - Compiler
    # ======================    
    adam = optimizers.Adam(learning_rate=0.02)    
    model.compile(loss='mse', optimizer=adam, metrics=["mae"])
    
    return model

In [25]:
def fit_model(model, verbose=1):

    # $CHALLENGIFY_BEGIN
    es = EarlyStopping(monitor = "val_loss",
                      patience = 3,
                      mode = "min",
                      restore_best_weights = True)


    history = model.fit(X_train, y_train,
                        validation_split = 0.3,
                        shuffle = False,
                        batch_size = 32,
                        epochs = 50,
                        callbacks = [es],
                        verbose = verbose)

    return model, history

In [26]:
# 1 - Initialising the RNN model
# ====================================

model = init_model(X_train, y_train)
model.summary()

# 2 - Training
# ====================================
model, history = fit_model(model)

2022-12-01 16:21:03.572387: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  (None, None, 25)         51        
 n)                                                              
                                                                 
 lstm (LSTM)                 (None, 64)                23040     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 23,156
Trainable params: 23,105
Non-trainable params: 51
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


In [27]:
res = model.evaluate(X_test, y_test)



# Baseline Model (for one fold)

In [28]:
def init_baseline():

    model = Sequential()
    model.add(Lambda(lambda x: x[:,-1,1,None]))

    adam = optimizers.Adam(learning_rate=0.02)
    model.compile(loss='mse', optimizer=adam, metrics=["mae"])

    return model

In [29]:
baseline_model = init_baseline()
baseline_score = baseline_model.evaluate(X_test, y_test)



# Cross-validation (all folds)

In [54]:
def cross_validate_baseline_and_lstm():
    '''
    This function cross-validates 
    - the "last seen value" baseline model
    - the RNN model
    '''
    
    list_of_mae_baseline_model = []
    list_of_mae_recurrent_model = []
    
    # 0 - Creating folds
    # =========================================    
    folds = get_folds(df_station_377, FOLD_LENGTH, FOLD_STRIDE)
    
    for fold_id, fold in enumerate(folds):
        
        # 1 - Train/Test split the current fold
        # =========================================
        (fold_train, fold_test) = train_test_split(fold, TRAIN_TEST_RATIO, input_length)                   

        X_train, y_train = get_X_y(fold_train, N_TRAIN, input_length, output_length)
        X_test, y_test = get_X_y(fold_test, N_TEST, input_length, output_length)
        
        # 2 - Modelling
        # =========================================
        
        ##### Baseline Model
        baseline_model = init_baseline()
        
        print(type(X_test[0,0,0]))
        
        mae_baseline = baseline_model.evaluate(X_test, y_test, verbose=0)[1]
        list_of_mae_baseline_model.append(mae_baseline)
        print("-"*50)
        print(f"MAE baseline fold n°{fold_id} = {round(mae_baseline, 2)}")

        ##### LSTM Model
        model = init_model(X_train, y_train)
        es = EarlyStopping(monitor = "val_mae",
                           mode = "min",
                           patience = 2, 
                           restore_best_weights = True)
        history = model.fit(X_train, y_train,
                            validation_split = 0.3,
                            shuffle = False,
                            batch_size = 32,
                            epochs = 50,
                            callbacks = [es],
                            verbose = 0)
        res = model.evaluate(X_test, y_test, verbose=0)
        mae_lstm = res[1]
        list_of_mae_recurrent_model.append(mae_lstm)
        print(f"MAE LSTM fold n°{fold_id} = {round(mae_lstm, 2)}")
        
        ##### Comparison LSTM vs Baseline for the current fold
        print(f"🏋🏽‍♂️ improvement over baseline: {round((1 - (mae_lstm/mae_baseline))*100,2)} % \n")

    return list_of_mae_baseline_model, list_of_mae_recurrent_model

In [86]:
mae_list = []
for i in df_list:
                      

    X_train, y_train = get_X_y(i, N_TRAIN, input_length, output_length)
    X_test, y_test = get_X_y(i, N_TEST, input_length, output_length)

    # 2 - Modelling
    # =========================================

    ##### LSTM Model
    model = init_model(X_train, y_train)
    es = EarlyStopping(monitor = "val_mae",
                       mode = "min",
                       patience = 2, 
                       restore_best_weights = True)
    history = model.fit(X_train, y_train,
                        validation_split = 0.3,
                        shuffle = False,
                        batch_size = 32,
                        epochs = 50,
                        callbacks = [es],
                        verbose = 0)
    res = model.evaluate(X_test, y_test, verbose=0)
    mae_lstm = res[1]
    mae_list.append(mae_lstm)
    

    

In [87]:
mae_list

[2.079939365386963,
 2.7373292446136475,
 2.183718204498291,
 13.436391830444336,
 4.472980976104736,
 7.5922532081604,
 6.412288665771484,
 2.9414238929748535,
 13.4185209274292,
 4.35196590423584,
 3.3075268268585205,
 7.084357261657715,
 4.481198787689209,
 3.043718099594116,
 2.8073012828826904,
 4.4678802490234375,
 3.1924681663513184,
 3.1045684814453125,
 7.727598667144775,
 4.149720668792725,
 3.5571722984313965,
 2.620628833770752,
 3.82080340385437,
 6.596154689788818,
 3.2003378868103027,
 4.790660381317139,
 9.98023509979248,
 4.216279029846191,
 6.139786243438721,
 7.0306549072265625,
 8.084578514099121,
 2.001192569732666,
 1.7682855129241943,
 12.33507251739502,
 2.7202868461608887,
 3.6841368675231934,
 2.7075271606445312,
 3.4725775718688965,
 8.158315658569336,
 2.88555908203125,
 2.2327446937561035,
 2.9615254402160645,
 2.284156322479248,
 3.839937686920166,
 13.681694030761719,
 4.059967994689941,
 2.563697338104248,
 3.753520965576172,
 8.57215690612793,
 7.209758

In [92]:
pred_list = []
for i in df_list:
                      

    X_train, y_train = get_X_y(i, N_TRAIN, input_length, output_length)
    X_test, y_test = get_X_y(i, N_TEST, input_length, output_length)

    prediction_test = model.predict(X_test[10])
    
    pred_list.append(int(prediction_test))





In [93]:
pred_list

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,


In [55]:
mae_baselines, mae_lstms = cross_validate_baseline_and_lstm()

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°0 = 8.86
MAE LSTM fold n°0 = 7.29
🏋🏽‍♂️ improvement over baseline: 17.69 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°1 = 7.95
MAE LSTM fold n°1 = 6.1
🏋🏽‍♂️ improvement over baseline: 23.3 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°2 = 14.84
MAE LSTM fold n°2 = 9.35
🏋🏽‍♂️ improvement over baseline: 37.02 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°3 = 14.97
MAE LSTM fold n°3 = 17.76
🏋🏽‍♂️ improvement over baseline: -18.68 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°4 = 19.9
MAE LSTM fold n°4 = 12.59
🏋🏽‍♂️ improvement over baseline: 36.74 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°5 = 12.79
MAE LSTM fold n°5 = 5.35
🏋🏽‍♂️ i

MAE LSTM fold n°39 = 3.09
🏋🏽‍♂️ improvement over baseline: -24.2 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°40 = 1.84
MAE LSTM fold n°40 = 2.4
🏋🏽‍♂️ improvement over baseline: -30.26 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°41 = 11.02
MAE LSTM fold n°41 = 1.67
🏋🏽‍♂️ improvement over baseline: 84.83 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°42 = 10.12
MAE LSTM fold n°42 = 1.45
🏋🏽‍♂️ improvement over baseline: 85.68 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°43 = 17.25
MAE LSTM fold n°43 = 4.62
🏋🏽‍♂️ improvement over baseline: 73.22 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°44 = 1.97
MAE LSTM fold n°44 = 2.48
🏋🏽‍♂️ improvement over baseline: -25.96 % 

<class 'numpy.float64'>
---------------------------------

MAE LSTM fold n°78 = 1.53
🏋🏽‍♂️ improvement over baseline: 85.55 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°79 = 11.36
MAE LSTM fold n°79 = 1.93
🏋🏽‍♂️ improvement over baseline: 83.05 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°80 = 7.37
MAE LSTM fold n°80 = 2.24
🏋🏽‍♂️ improvement over baseline: 69.63 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°81 = 6.98
MAE LSTM fold n°81 = 3.89
🏋🏽‍♂️ improvement over baseline: 44.35 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°82 = 6.18
MAE LSTM fold n°82 = 2.77
🏋🏽‍♂️ improvement over baseline: 55.17 % 

<class 'numpy.float64'>
--------------------------------------------------
MAE baseline fold n°83 = 3.55
MAE LSTM fold n°83 = 3.39
🏋🏽‍♂️ improvement over baseline: 4.57 % 



In [56]:
print(f"average percentage improvement over baseline = {round(np.mean(1 - (np.array(mae_lstms)/np.array(mae_baselines))),2)*100}%")

average percentage improvement over baseline = 31.0%


# Prediction for one station on date x

In [62]:
X_test.shape

(40, 5, 25)

In [63]:
prediction_test = model.predict(X_test[0])



In [68]:
int(prediction_test)

-4

In [66]:
y_test[0]

array([[-15]])

# 9 Prediction for all stations

In [None]:
#df_station_377.set_index('Date', inplace=True)
#df_station_377 = df_station_377.drop(columns=['Trips_in','Trips_out', 'Unnamed: 0', 'Station_Id'])

In [None]:
for station in range(len())