In [35]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.spatial.distance import cdist
from sklearn.neighbors import NearestNeighbors
from geopy.distance import geodesic
from datetime import date
import holidays

## Data Exploration

In [165]:
data = pd.read_parquet("train.parquet")
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585


In [6]:
data.info()
data.describe()

# The dataset has no null values
# It goes from jan 2020 to december 2021
# We have 496 827 values
# dtypes: category(5), datetime64[ns](2), float64(4), int64(1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 48321 to 929187
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[ns]
 6   counter_installation_date  496827 non-null  datetime64[ns]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  log_bike_count             496827 non-null  floa

Unnamed: 0,site_id,bike_count,latitude,longitude,log_bike_count
count,496827.0,496827.0,496827.0,496827.0,496827.0
mean,105345000.0,60.191475,48.854343,2.345479,3.079917
std,32103460.0,87.590566,0.018554,0.038026,1.659899
min,100007000.0,0.0,48.82636,2.26542,0.0
25%,100047500.0,5.0,48.840801,2.31444,1.791759
50%,100056200.0,29.0,48.85209,2.35387,3.401197
75%,100056300.0,79.0,48.86461,2.37587,4.382027
max,300014700.0,1302.0,48.89172,2.40969,7.172425


In [7]:
data['counter_id'].value_counts()

# The dataset has similar number of values for each station
# It is a balanced dataset

100007049-101007049    8974
100056335-103056335    8974
100056327-104056327    8974
100056329-103056329    8974
100056329-104056329    8974
100056330-103056330    8974
100056330-104056330    8974
100056331-103056331    8974
100056331-104056331    8974
100056332-103056332    8974
100056332-104056332    8974
100056334-103056334    8974
100056334-104056334    8974
100056335-104056335    8974
100056226-104056226    8974
100056336-105056336    8974
100056336-106056336    8974
100057329-103057329    8974
100057329-104057329    8974
100057380-103057380    8974
100057445-103057445    8974
100057445-104057445    8974
100060178-101060178    8974
100060178-102060178    8974
100063175-353277233    8974
100063175-353277235    8974
100007049-102007049    8974
100056327-103056327    8974
100056226-103056226    8974
100047546-104047546    8974
100036718-103036718    8974
100036718-104036718    8974
100036719-103036719    8974
100036719-104036719    8974
100044493-SC           8974
100047542-103047542 

Test data

In [166]:
test_data = pd.read_parquet("final_test.parquet")
test_data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429


In [3]:
test_data.info()
test_data.describe()

# The test data is from september and october 2021 and doesn't have any missing values
# We have 51440 test points
# dtypes: category(5), datetime64[ns](2), float64(2), int64(1)
# It doesn't have the count_bikes and log_count_bikes columns
# log_count_bikes is our target

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51440 entries, 0 to 51439
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   counter_id                 51440 non-null  category      
 1   counter_name               51440 non-null  category      
 2   site_id                    51440 non-null  int64         
 3   site_name                  51440 non-null  category      
 4   date                       51440 non-null  datetime64[ns]
 5   counter_installation_date  51440 non-null  datetime64[ns]
 6   coordinates                51440 non-null  category      
 7   counter_technical_id       51440 non-null  category      
 8   latitude                   51440 non-null  float64       
 9   longitude                  51440 non-null  float64       
dtypes: category(5), datetime64[ns](2), float64(2), int64(1)
memory usage: 2.2 MB


Unnamed: 0,site_id,latitude,longitude
count,51440.0,51440.0,51440.0
mean,107305000.0,48.854275,2.344642
std,37388390.0,0.018607,0.038257
min,100007000.0,48.82636,2.26542
25%,100047500.0,48.83977,2.31179
50%,100056300.0,48.85209,2.35387
75%,100056300.0,48.86461,2.37587
max,300014700.0,48.89172,2.40969


In [8]:
#Checking that the counters in the test dataset are in the train dataset

same_counters = test_data['counter_id'].isin(data['counter_id']).all()
print(same_counters)

True


Additional data

In [167]:
#External data
ext_data = pd.read_csv("external_data.csv")
ext_data.head()

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,hnuage1,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4
0,7149,2021-01-01 00:00:00,100810,80,1,270,1.8,272.75,272.15,96,...,600.0,,,,,,,,,
1,7149,2021-01-01 03:00:00,100920,110,3,300,1.7,271.25,270.95,98,...,1500.0,2.0,3.0,3000.0,,,,,,
2,7149,2021-01-01 06:00:00,100950,30,3,290,2.6,271.95,271.65,98,...,480.0,4.0,6.0,2000.0,6.0,3.0,3000.0,,,
3,7149,2021-01-01 09:00:00,101100,150,2,280,1.7,272.45,272.05,97,...,1740.0,3.0,3.0,2800.0,,,,,,
4,7149,2021-01-01 12:00:00,101110,30,0,50,1.0,276.95,274.15,82,...,330.0,4.0,6.0,570.0,7.0,6.0,810.0,,,


In [19]:
ext_data.info()
ext_data.describe()

# Some of the features of this dataset have a lot of null values
# It covers the same time period than our data
# excepted its by 3 hours slots.
# 3322 entries
# 58 columns, all int or float (and a date_time)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3322 entries, 0 to 3321
Data columns (total 59 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   numer_sta  3322 non-null   int64  
 1   date       3322 non-null   object 
 2   pmer       3322 non-null   int64  
 3   tend       3322 non-null   int64  
 4   cod_tend   3322 non-null   int64  
 5   dd         3322 non-null   int64  
 6   ff         3322 non-null   float64
 7   t          3322 non-null   float64
 8   td         3322 non-null   float64
 9   u          3322 non-null   int64  
 10  vv         3322 non-null   int64  
 11  ww         3322 non-null   int64  
 12  w1         3315 non-null   float64
 13  w2         3312 non-null   float64
 14  n          3166 non-null   float64
 15  nbas       3317 non-null   float64
 16  hbas       2869 non-null   float64
 17  cl         2909 non-null   float64
 18  cm         1941 non-null   float64
 19  ch         1678 non-null   float64
 20  pres    

Unnamed: 0,numer_sta,pmer,tend,cod_tend,dd,ff,t,td,u,vv,...,hnuage1,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4
count,3322.0,3322.0,3322.0,3322.0,3322.0,3322.0,3322.0,3322.0,3322.0,3322.0,...,2867.0,1695.0,1443.0,1695.0,618.0,470.0,618.0,42.0,87.0,42.0
mean,7149.0,101682.886815,-0.237809,4.273028,184.379892,3.654154,285.877905,281.006442,74.775135,20142.338952,...,2028.674573,5.030678,4.515593,2692.943953,5.441748,3.961702,3310.695793,3.47619,6.735632,3205.0
std,0.0,923.88817,122.191314,2.713199,103.708084,2.000353,6.8517,5.634409,17.022427,10283.556918,...,2377.241533,2.003304,2.463098,2309.340104,2.017108,3.042454,2423.703985,2.360633,3.196873,2595.553656
min,7149.0,97260.0,-750.0,0.0,0.0,0.0,267.65,260.75,24.0,120.0,...,0.0,1.0,0.0,60.0,1.0,0.0,300.0,1.0,0.0,360.0
25%,7149.0,101200.0,-70.0,2.0,80.0,2.2,280.95,276.95,63.0,12710.0,...,500.0,3.0,3.0,1080.0,4.25,0.0,1400.0,1.0,6.0,1112.5
50%,7149.0,101820.0,0.0,3.0,200.0,3.4,285.85,281.45,79.0,20000.0,...,1080.0,5.0,6.0,1800.0,6.0,3.0,2430.0,2.5,8.0,2150.0
75%,7149.0,102287.5,70.0,7.0,260.0,4.9,290.65,285.55,89.0,25000.0,...,2100.0,7.0,6.0,3400.0,7.0,6.0,5000.0,5.75,9.0,5800.0
max,7149.0,103920.0,720.0,8.0,360.0,14.6,307.45,293.15,100.0,60000.0,...,9000.0,8.0,9.0,9000.0,8.0,9.0,9000.0,7.0,9.0,8000.0


In [32]:
# Construction sites that hinder traffic
# https://opendata.paris.fr/explore/dataset/chantiers-perturbants/information/?disjunctive.cp_arrondissement&disjunctive.maitre_ouvrage&disjunctive.objet&disjunctive.impact_circulation&disjunctive.niveau_perturbation&disjunctive.statut&sort=-date_debut

cons_sites = pd.read_csv(Path("data") / "chantiers-perturbants.csv", sep=';')
cons_sites.head()

Unnamed: 0,Identifiant,Identifiant CTV,Code postal de l'arrondissement,Numéro de STV,Typologie,Maitre d'ouvrage,Objet,Description,Voie(s),Précisions de localisation,Date de début,Date de fin,Impact sur la circulation,Détail de l'impact sur la circulation,Niveau de perturbation,Statut,URL LettreInfoChantier,geo_shape,geo_point_2d
0,CP001735,,75018.0,9.0,1.0,SG Mission des JO 2024,CONSTRUCTION_IMMEUBLE,Arena 2,avenue de la Porte de la Chapelle,n°6,2020-09-07,2023-07-31,BARRAGE_TOTAL,entre le bd Ney et l'avenue de la pte de la Ch...,2.0,2,,"{""coordinates"": [[[2.3603260350936983, 48.8987...","48.89941650246351, 2.3610672014134044"
1,CP002368,654245.0,75005.0,12.0,2.0,Eau de Paris,ENTRETIEN_RESEAU,Renouvellement conduits EP,"Gobelins, Monge, Censier et Mirbel",13-17 av Gobelins + 111-115 rue Monge + 30 rue...,2022-08-08,2023-01-20,RESTREINTE,Déviation circulation générale (Bus + vélos),2.0,2,,"{""coordinates"": [[[2.352170678013787, 48.84077...","48.84036171643268, 2.3520135239093847"
2,CP002390,689822.0,75014.0,12.0,3.0,Congrégation St Joseph,REHABILITATION_IMMEUBLE,,Rue Méchain,du 19b au 21,2022-10-05,2023-12-15,RESTREINTE,"Maintien de 4,00m de circulation",2.0,2,,"{""coordinates"": [[[[2.3384636559592034, 48.835...","48.83573836601194, 2.3385684617326348"
3,CP002429,662837.0,75015.0,13.0,3.0,SCCV Porte de Brancion,CONSTRUCTION_IMMEUBLE,"Construction neuve, résidence sociale et comme...",Avenue de la Porte de Brancion,Carrefour au-dessus du périphérique,2022-11-14,2024-03-15,RESTREINTE,"restriction à é files de circulation, maintien...",2.0,2,,"{""coordinates"": [[[2.299950868215911, 48.82590...","48.82574599992239, 2.300005595088703"
4,CP002276,,75017.0,9.0,3.0,ICF HABITAT,CONSTRUCTION_IMMEUBLE,Création de 111 logements sociaux.\nEmprise pr...,Voie Bus,Entre le 188 bis et 188 ter avenue de Clichy,2022-05-02,2024-06-30,RESTREINTE,Déviation bus dans circulation générale\nCréat...,2.0,2,http://xdir-CGPub-prd.ressources.paris.mdp/Pdf...,"{""coordinates"": [[[2.315884598767695, 48.89320...","48.89326507701689, 2.3156964416358314"


In [12]:
cons_sites.info()
cons_sites.describe()

# This dataset gives the time and emplacemment of construction sites that hinder
# traffic (potentially bike traffic) and a level of pertubation (1 or 2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Identifiant                            192 non-null    object 
 1   Identifiant CTV                        128 non-null    object 
 2   Code postal de l'arrondissement        190 non-null    float64
 3   Numéro de STV                          191 non-null    float64
 4   Typologie                              189 non-null    float64
 5   Maitre d'ouvrage                       188 non-null    object 
 6   Objet                                  191 non-null    object 
 7   Description                            157 non-null    object 
 8   Voie(s)                                190 non-null    object 
 9   Précisions de localisation             174 non-null    object 
 10  Date de début                          191 non-null    object 
 11  Date d

Unnamed: 0,Code postal de l'arrondissement,Numéro de STV,Typologie,Niveau de perturbation,Statut
count,190.0,191.0,189.0,188.0,192.0
mean,75011.605263,11.308901,1.931217,1.755319,2.057292
std,4.985373,1.696397,0.805915,0.431046,0.32657
min,75001.0,9.0,1.0,1.0,2.0
25%,75008.0,10.0,1.0,2.0,2.0
50%,75013.0,11.0,2.0,2.0,2.0
75%,75016.0,13.0,3.0,2.0,2.0
max,75020.0,15.0,3.0,2.0,4.0


In [33]:
# Public bike stations and their capacities
# https://opendata.paris.fr/explore/dataset/velib-emplacement-des-stations/table/

velib = pd.read_csv(Path("data") / "velib-emplacement-des-stations.csv", sep=';')
velib.head()

Unnamed: 0,Identifiant station,Nom de la station,Capacité de la station,Coordonnées géographiques
0,9020,Toudouze - Clauzel,21,"48.87929591733507, 2.3373600840568547"
1,30002,Jean Rostand - Paul Vaillant Couturier,40,"48.908168131015, 2.4530601033354"
2,9002,Abbeville - Faubourg Poissonnière,14,"48.879223, 2.349147"
3,21323,Charlot - Stade Gabriel Voisin,31,"48.827228120536, 2.268122527474"
4,19003,Quai de la Seine,24,"48.884492238407525, 2.3703941702842717"


In [14]:
velib.info()
velib.describe()
# We have 1469 stations and their capacities, no null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1469 entries, 0 to 1468
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Identifiant station        1469 non-null   object
 1   Nom de la station          1469 non-null   object
 2   Capacité de la station     1469 non-null   int64 
 3   Coordonnées géographiques  1469 non-null   object
dtypes: int64(1), object(3)
memory usage: 46.0+ KB


Unnamed: 0,Capacité de la station
count,1469.0
mean,31.038802
std,12.03177
min,0.0
25%,23.0
50%,29.0
75%,37.0
max,74.0


In [168]:
# Holidays
# Creating a DataFrame with French national holidays with three categories :
# - no holiday - national holiday - school holiday

# This function will help defining our three categories
def classify_holiday(row):
    if row['date'].date() in national_holidays:
        return 'National Holiday'
    elif row['is_school_holiday']:
        return 'School Holiday'
    else:
        return 'No Holiday'

# Creating a dictionnary with national holidays for the concerned time period
national_holidays = {**holidays.France(years=2020), **holidays.France(years=2021)}

# Reading and processing the CSV file containing school holidays for Paris
school_holidays = pd.read_csv('fr-en-calendrier-scolaire.csv',
                              parse_dates=['start_date', 'end_date'], sep=';')
school_holidays = school_holidays[school_holidays['location'] == 'Paris']

school_holidays['start_date'] = pd.to_datetime(school_holidays['start_date'], utc=True)
school_holidays['end_date'] = pd.to_datetime(school_holidays['end_date'], utc=True)

school_holidays_ = []
for start_date, end_date in school_holidays[['start_date', 'end_date']].itertuples(index=False):
    holiday_ = pd.date_range(start_date, end_date)
    for day in holiday_ :
      school_holidays_.append(day.to_pydatetime())
school_holidays_ = [date.strftime('%Y-%m-%d') for date in school_holidays_]

#Creating the holiday dataframe
df_holidays = pd.DataFrame({'date': pd.date_range(start='2020-01-01',
                                                  end='2021-12-31')})
df_holidays['is_school_holiday'] = df_holidays['date'].astype(str).isin(school_holidays_)
df_holidays['holiday'] = df_holidays.apply(classify_holiday, axis=1)
df_holidays = df_holidays.drop('is_school_holiday', axis=1)

# Display the DataFrame
df_holidays

Unnamed: 0,date,holiday
0,2020-01-01,National Holiday
1,2020-01-02,School Holiday
2,2020-01-03,School Holiday
3,2020-01-04,School Holiday
4,2020-01-05,School Holiday
...,...,...
726,2021-12-27,School Holiday
727,2021-12-28,School Holiday
728,2021-12-29,School Holiday
729,2021-12-30,School Holiday


In [169]:
#Lockdown
#Creating a dataframe containing lockdown periods in Paris du to Covid.
#partial lockdown corresponds to progressive deconfinement or curfews periods
#Source wikipedia

#Initiating dataframe
df_lock = pd.DataFrame(pd.date_range(
    start='2020-01-01', end='2021-12-31', freq='D'), columns=['date'])
df_lock['lockdown'] = 'no_lockdown'


#Defining periods
lockdown_periods = [
    {'start': '2020-03-17', 'end': '2020-05-11', 'type': 'lockdown'},
    {'start': '2020-10-30', 'end': '2020-12-15', 'type': 'lockdown'},
    {'start': '2021-04-03', 'end': '2021-05-02', 'type': 'lockdown'}]

partial_lockdown_periods = [
    {'start': '2020-05-12', 'end': '2020-06-11', 'type': 'partial'},
    {'start': '2020-10-17', 'end': '2020-10-29', 'type': 'partial'},
    {'start': '2020-12-16', 'end': '2020-12-31', 'type': 'partial'},
    {'start': '2021-05-03', 'end': '2021-05-31', 'type': 'partial'}]

# Updating lockdown column
for period in lockdown_periods:
    df_lock.loc[(df_lock['date'] >= period['start']) & (
        df_lock['date'] <= period['end']), 'lockdown'] = period['type']

for period in partial_lockdown_periods:
    df_lock.loc[(df_lock['date'] >= period['start']) & (
        df_lock['date'] <= period['end']), 'lockdown'] = period['type']

df_lock

Unnamed: 0,date,lockdown
0,2020-01-01,no_lockdown
1,2020-01-02,no_lockdown
2,2020-01-03,no_lockdown
3,2020-01-04,no_lockdown
4,2020-01-05,no_lockdown
...,...,...
726,2021-12-27,no_lockdown
727,2021-12-28,no_lockdown
728,2021-12-29,no_lockdown
729,2021-12-30,no_lockdown


## Feature engineering

Creating useful functions

In [180]:
# Defining a fonction that changes the date information to :
# day, month, year, weekday and hour

def encode_dates(X):
    X = X.copy()  # modify a copy of X

    # Encode the date information from the date columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Create Categorical Features for Daytime and Seasons
    X["daytime"] = pd.cut(
        X["hour"],
        bins=[-1, 5, 12, 17, 21, 24],
        labels=["night", "morning", "afternoon", "evening", "night"],
        ordered=False)

    X["season"] = pd.cut(
        X["month"],
        bins=[0, 3, 6, 9, 12],
        labels=["winter", "spring", "summer", "fall"],
        ordered=False)

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

In [171]:
# Defining a function that encodes categorical features and
# standardize numerical features.
# This function returns the scaler and the encoder fitted on
# the training dataset to be able to use them (through the same function)
# for the testing dataset.

def encode_and_standardize(df, label_encoder=None, scaler=None):

    df_modified = df.copy()

    # Separate columns into categorical and numerical
    categorical_columns = df.select_dtypes(include=['category', 'object']).columns
    numerical_columns = df.select_dtypes(include=['number']).columns

    # Encode categorical columns
    if label_encoder is None:
      label_encoder = []
      for col in categorical_columns :
        encoder = LabelEncoder()
        df_modified[col] = encoder.fit_transform(df[col])
        label_encoder.append(encoder)
    else:
      i = 0
      for col in categorical_columns:
        df_modified[col] = label_encoder[i].fit_transform(df[col])
        i =+ 1

    # Standardize numerical columns
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(df[numerical_columns])
    df_modified[numerical_columns] = scaler.transform(df[numerical_columns])

    return df_modified, label_encoder, scaler

Defining X_train, Y_train and X_test (and encoding dates). X_train and X_test are called "v1" as we will merge more data into them.

In [172]:
data = data.sort_values('date')
X_train_v1 = data.drop(columns=['log_bike_count']) # Features
Y_train = data['log_bike_count'].to_numpy() # Target
X_test_v1 = test_data.sort_values('date')

#Sorting values by date will help merging additional data

Encoding dates and selecting features from our additional datasets

In [173]:
# External data

selected_columns = ['date', 't', 'ht_neige', 'pres', 'ff', 'vv', 'rr3']
ext_data_ = ext_data[selected_columns].sort_values('date')
ext_data_['date'] = pd.to_datetime(ext_data_['date'])
#dataset has a few null values that will replace by the means
for column in ext_data_:
    ext_data_[column] = ext_data_[column].fillna(ext_data_[column].mean())

Merging the additional datasets to the training and testing one

In [174]:
# Merging External Data
X_train = pd.merge_asof(X_train_v1, ext_data_, on='date', direction='backward')
X_test = pd.merge_asof(X_test_v1, ext_data_, on='date', direction='backward')
# NB: we decided to triple each ext_data observation to match with the
# hourly aspect of X_train and X_test (ext_data has data for every three hour).

X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 0 to 496826
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[ns]
 6   counter_installation_date  496827 non-null  datetime64[ns]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  t                          496827 non-null  float64 

In [175]:
#Merging Holiday data

X_train = pd.merge_asof(X_train, df_holidays, on='date', direction='backward')
X_train['holiday'] = X_train['holiday'].astype('category')
X_test = pd.merge_asof(X_test, df_holidays, on='date', direction='backward')
X_test['holiday'] = X_test['holiday'].astype('category')

X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 0 to 496826
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[ns]
 6   counter_installation_date  496827 non-null  datetime64[ns]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  t                          496827 non-null  float64 

In [176]:
#Merging lockdown data

X_train = pd.merge_asof(X_train, df_lock, on='date', direction='backward')
X_train['lockdown'] = X_train['lockdown'].astype('category')
X_test = pd.merge_asof(X_test, df_lock, on='date', direction='backward')
X_test['lockdown'] = X_test['lockdown'].astype('category')

X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 0 to 496826
Data columns (total 19 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[ns]
 6   counter_installation_date  496827 non-null  datetime64[ns]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  t                          496827 non-null  float64 

In [177]:
#adding week-end feature

X_train['is_weekend'] = X_train['date'].dt.dayofweek // 5 == 1
X_test['is_weekend'] = X_train['date'].dt.dayofweek // 5 == 1

X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 0 to 496826
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[ns]
 6   counter_installation_date  496827 non-null  datetime64[ns]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  t                          496827 non-null  float64 

In [178]:
X_train

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,t,ht_neige,pres,ff,vv,rr3,holiday,lockdown,is_weekend
0,100007049-101007049,28 boulevard Diderot O-E,100007049,28 boulevard Diderot,1.0,2020-09-01 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,285.75,0.0,100960,1.6,30000,0.0,No Holiday,no_lockdown,False
1,100056226-103056226,Face au 8 avenue de la porte de Charenton SE-NO,100056226,Face au 8 avenue de la porte de Charenton,1.0,2020-09-01 01:00:00,2019-11-01,"48.830331,2.400551",Y2H19070370,48.830331,2.400551,285.75,0.0,100960,1.6,30000,0.0,No Holiday,no_lockdown,False
2,100047545-104047545,Face 104 rue d'Aubervilliers S-N,100047545,Face 104 rue d'Aubervilliers,1.0,2020-09-01 01:00:00,2018-11-29,"48.890457,2.368852",Y2H18086321,48.890457,2.368852,285.75,0.0,100960,1.6,30000,0.0,No Holiday,no_lockdown,False
3,100060178-102060178,90 Rue De Sèvres NE-SO,100060178,90 Rue De Sèvres,21.0,2020-09-01 01:00:00,2020-07-22,"48.84638,2.31529",Y2H20052705,48.846380,2.315290,285.75,0.0,100960,1.6,30000,0.0,No Holiday,no_lockdown,False
4,100056327-103056327,Face au 4 avenue de la porte de Bagnolet E-O,100056327,Face au 4 avenue de la porte de Bagnolet,2.0,2020-09-01 01:00:00,2019-11-06,"48.86461,2.40969",Y2H19070372,48.864610,2.409690,285.75,0.0,100960,1.6,30000,0.0,No Holiday,no_lockdown,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496822,100057380-103057380,Totem Cours la Reine O-E,100057380,Totem Cours la Reine,32.0,2021-09-09 23:00:00,2020-02-11,"48.86462,2.31444",YTH19111509,48.864620,2.314440,292.25,0.0,100190,1.6,42230,0.0,No Holiday,no_lockdown,False
496823,100047547-103047547,6 rue Julia Bartet SO-NE,100047547,6 rue Julia Bartet,2.0,2021-09-09 23:00:00,2018-11-28,"48.82636,2.30303",Y2H18086323,48.826360,2.303030,292.25,0.0,100190,1.6,42230,0.0,No Holiday,no_lockdown,False
496824,100047548-103047548,Face au 25 quai de l'Oise NE-SO,100047548,Face au 25 quai de l'Oise,4.0,2021-09-09 23:00:00,2018-11-28,"48.89141,2.38482",Y2H18086324,48.891410,2.384820,292.25,0.0,100190,1.6,42230,0.0,No Holiday,no_lockdown,False
496825,100042374-110042374,Voie Georges Pompidou NE-SO,100042374,Voie Georges Pompidou,9.0,2021-09-09 23:00:00,2017-12-15,"48.8484,2.27586",Y2H21025335,48.848400,2.275860,292.25,0.0,100190,1.6,42230,0.0,No Holiday,no_lockdown,False


Pre-processing and selecting features from training data

In [183]:
# Feature selection
columns_to_drop = ['coordinates', 'counter_technical_id', 'counter_installation_date', 'counter_name',
                   'site_id', 'site_name', 'bike_count']
X_train = X_train.drop(columns=columns_to_drop)

# Encoding and scaling
X_train = encode_dates(X_train)
X_train, fitted_encoder, fitted_scaler = encode_and_standardize(X_train)

In [191]:
X_train["is_weekend"] = X_train["is_weekend"].astype(float)

Pre-processing and selecting features from testing data

In [184]:
# Feature selection
X_test = X_test.drop(columns=columns_to_drop[:-1])

# Encoding and scaling
X_test = encode_dates(X_test)
X_test, _ , _ = encode_and_standardize(X_test, fitted_encoder, fitted_scaler)

In [None]:
X_test["is_weekend"] = X_test["is_weekend"].astype(float)

Saving the X_test and X_train datasets

In [None]:
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)

## Failed attempts for feature engineering

**The following cells** were attempts to create features from public construction sites and public bike stations data. However, we ended up concluding that the creation of these features would be impossible considering our computations capacities and time constraints. Changing our initial approach (calculating all distances) to a KNN helped a lot to reduce the potential computation time needed, but it's not working well at all, returning absurd distances and absurdly long neighbor vectors. Nevertheless, this optimised method using KNN required a lot of simplifications and still needed few hours to calculate. We also wanted to try similar approaches with data about public events that my disturb traffic (ex: a protest), about public transportation perturbation, and about strikes (which could cause both traffic perturbation and transportation perturbation). However for these we lacked public data.

In [36]:
# Defining a function calculating distances from coordinates
def haversine_distance(lat1, lon1, lat2, lon2):
  coords_1 = (lat1, lon1)
  coords_2 = (lat2, lon2)
  return geodesic(coords_1, coords_2).meters

In [53]:
#DO NOT RUN IT TAKES HOURS

# Construction sites

# This one is special as our aim will be to create a new feature.
# We will create a pertubation score feature that will depend on date, location
# and level of pertubation. The idea is that is there is some traffic
# pertubation due to construction sites nearby, the bike count might be affected

# transform date and geo data of construction sites dataset
cons_sites[['Date de début', 'Date de fin']] = cons_sites[['Date de début',
                                          'Date de fin']].apply(pd.to_datetime)

cons_sites[['latitude', 'longitude']] = cons_sites['geo_point_2d'].str.split(
                                                ', ', expand=True).astype(float)

#Managing null values in the fields we're interested in
cons_sites['Niveau de perturbation'] = cons_sites['Niveau de perturbation'].fillna(0)
cons_sites = cons_sites[cons_sites['Date de début'].notnull()]

#Scaling latitudes and longitudes of the geographical data
scaler = StandardScaler()
cons_sites[['latitude', 'longitude']] = scaler.fit_transform(cons_sites[[
                                                      'latitude', 'longitude']])

#Filtering to our concerned time period to not be working with unusefull data
cons_sites = cons_sites[(cons_sites ['Date de début'] <= pd.to_datetime(
  '2021-12-31')) & (cons_sites['Date de fin'] >= pd.to_datetime('2020-01-01'))]

# Defining the a max number of sites to consider
n_counters = 2

#Fit a KNN model
earth_radius_meters = 6371000.0
knn_model = NearestNeighbors(n_neighbors=n_counters, metric='haversine',
                             radius = earth_radius_meters)
knn_model.fit(cons_sites[['latitude', 'longitude']])

start = time.time()

# Iterate through the counters
for category in X_train['counter_id'].unique():
    cat_rows = X_train[X_train['counter_id'] == category]

    # Get coordinates of the category
    cat_coor = cat_rows[['latitude', 'longitude']]

    # Get the closest construction sites
    dist, ind = knn_model.kneighbors(cat_coor, n_neighbors=n_counters)
    closest_points = cons_sites.iloc[ind.flatten()].reset_index(drop=True)
    closest_points = pd.DataFrame({
        'Score': closest_points['Niveau de perturbation'].values,
        'Distance': dist.flatten(),
        'Start_Date': closest_points['Date de début'],
        'End_Date': closest_points['Date de fin'] })
#    closest_points = closest_points.sort_values(by='Score', ascending=False).head(5)

    print(len(closest_points))
    print(closest_points)

    #filter by date
 #   closest_points = closest_points[(cat_rows['date'].between(closest_points['Start_Date'],
 #                                 closest_points['End_Date']))]
 # This filtering by date doesn't work has it should be for each data point
 #

    # Get the sum ofscore values from the sites closer then a certain distance
    X_train.loc[X_train['counter_id'] == category,
                   'perturbation_score'] = closest_points['Score'].sum()
    break

end = time.time()

print((end-start)*len(X_train['counter_id'].unique()))

X_train.head()

17948
       Score  Distance Start_Date   End_Date
0        2.0  0.267756 2020-11-02 2023-09-30
1        1.0  0.354179 2019-10-12 2023-03-31
2        2.0  0.267756 2020-11-02 2023-09-30
3        1.0  0.354179 2019-10-12 2023-03-31
4        2.0  0.267756 2020-11-02 2023-09-30
...      ...       ...        ...        ...
17943    1.0  0.354179 2019-10-12 2023-03-31
17944    2.0  0.267756 2020-11-02 2023-09-30
17945    1.0  0.354179 2019-10-12 2023-03-31
17946    2.0  0.267756 2020-11-02 2023-09-30
17947    1.0  0.354179 2019-10-12 2023-03-31

[17948 rows x 4 columns]
2.65789794921875


Unnamed: 0,counter_id,latitude,longitude,t,ht_neige,pres,ff,vv,rr3,year,month,day,weekday,hour,perturbation_score
0,0,-0.448167,0.787625,0.018014,0.04428,0.418383,-1.069403,0.96928,-0.226585,-1.457223,0.713556,-1.633425,-0.998576,-1.517532,26922.0
1,26,-1.294163,1.448287,0.018014,0.04428,0.418383,-1.069403,0.96928,-0.226585,-1.457223,0.713556,-1.633425,-0.998576,-1.517532,
2,12,1.946354,0.614663,0.018014,0.04428,0.418383,-1.069403,0.96928,-0.226585,-1.457223,0.713556,-1.633425,-0.998576,-1.517532,
3,51,-0.429195,-0.793918,0.018014,0.04428,0.418383,-1.069403,0.96928,-0.226585,-1.457223,0.713556,-1.633425,-0.998576,-1.517532,
4,28,0.553319,1.688626,0.018014,0.04428,0.418383,-1.069403,0.96928,-0.226585,-1.457223,0.713556,-1.633425,-0.998576,-1.517532,


In [60]:
# Public bike stations

# This one is special too as our aim will aslo be to create a new feature.
# We will create a bike avaibility feature that will depend on location and
# capacity of public bike stations in Paris. The idea is that is a counter has
# a bike station around, it might get a higher traffic we should modelize

# transform geo data of bike stations

velib[['latitude', 'longitude']] = velib['Coordonnées géographiques'].str.split(
                                                ', ', expand=True).astype(float)

# Defining the a max number of stations to consider
n_stations = 2

#Fit a KNN model
knn_model = NearestNeighbors(n_neighbors=n_stations, metric='haversine',
                             radius = earth_radius_meters)
knn_model.fit(velib[['latitude', 'longitude']])

start = time.time()

# Iterate through the counters
for category in X_train_v1['counter_id'].unique():
    cat_rows = X_train_v1[X_train_v1['counter_id'] == category]

    # Get coordinates of the category
    cat_coor = cat_rows[['latitude', 'longitude']]

    # Get the closest bike stations
    dist, ind = knn_model.kneighbors(cat_coor, n_neighbors=n_counters)
    closest_stations = velib.iloc[ind.flatten()].reset_index(drop=True)
    closest_stations = pd.DataFrame({
        'bikes': closest_stations['Capacité de la station'].values,
        'Distance': dist.flatten() })

    # Get the sum ofscore values from the sites closer then a certain distance
    X_train_v1.loc[X_train_v1['counter_id'] == category,
                   'bikes_available'] = closest_stations['bikes'].sum()
    break

end = time.time()

print((end-start)*len(X_train_v1['counter_id'].unique()))

#X_train_v1.head()

31.029401779174805


Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,perturbation_score,bikes_available
57884,100007049-101007049,28 boulevard Diderot O-E,100007049,28 boulevard Diderot,1.0,2020-09-01 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,26922.0,565362.0
607941,100056226-103056226,Face au 8 avenue de la porte de Charenton SE-NO,100056226,Face au 8 avenue de la porte de Charenton,1.0,2020-09-01 01:00:00,2019-11-01,"48.830331,2.400551",Y2H19070370,48.830331,2.400551,,
305866,100047545-104047545,Face 104 rue d'Aubervilliers S-N,100047545,Face 104 rue d'Aubervilliers,1.0,2020-09-01 01:00:00,2018-11-29,"48.890457,2.368852",Y2H18086321,48.890457,2.368852,,
855447,100060178-102060178,90 Rue De Sèvres NE-SO,100060178,90 Rue De Sèvres,21.0,2020-09-01 01:00:00,2020-07-22,"48.84638,2.31529",Y2H20052705,48.84638,2.31529,,
617970,100056327-103056327,Face au 4 avenue de la porte de Bagnolet E-O,100056327,Face au 4 avenue de la porte de Bagnolet,2.0,2020-09-01 01:00:00,2019-11-06,"48.86461,2.40969",Y2H19070372,48.86461,2.40969,,


##Training

In [159]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from math import sqrt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [193]:
def LSTM_pipeline(X_train, Y_train, units=50, e=10, b=32, v=0, opti='adam'):

  tscv = TimeSeriesSplit(n_splits=5) # Time serie specific cross validation
  rmse_values = []

  for train_index, test_index in tscv.split(X_train):

      X_train_, X_test_ = X_train.iloc[train_index], X_train.iloc[test_index]
      y_train_, y_test_ = Y_train[train_index], Y_train[test_index]

      # Reshape data for LSTM input
      X_train_reshaped = X_train_.values.reshape((X_train_.shape[0], 1, X_train_.shape[1]))
      X_test_reshaped = X_test_.values.reshape((X_test_.shape[0], 1, X_test_.shape[1]))

      # Create LSTM model
      model = Sequential()
      model.add(LSTM(units, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
      model.add(Dense(1))
      model.compile(optimizer=opti, loss='mean_squared_error')

      # Train the model
      model.fit(X_train_reshaped,
                y_train_,
                epochs=e, batch_size=b, verbose=v)

      # Make predictions on the test set
      y_pred = model.predict(X_test_reshaped)

      # Evaluate the model using RMSE
      rmse = sqrt(mean_squared_error(y_test_, y_pred))
      rmse_values.append(rmse)

  average_rmse = np.mean(rmse_values)
  std_dev_rmse = np.std(rmse_values)

  return average_rmse, std_dev_rmse

RMSE for this fold: 1.2142852695444553
RMSE for this fold: 1.1147741885427491
RMSE for this fold: 1.0447641911178054
RMSE for this fold: 0.8620615980483445
RMSE for this fold: 0.9360527957835243
Average RMSE: 1.0343876086073758
Standard Deviation of RMSE: 0.12513898947748267


In [194]:
pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.0-py3-none-any.whl (230 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.0 colorlog-6.8.0 optuna-3.4.0


In [None]:
import optuna
from optuna import Trial

def objective(trial: Trial, X_train, Y_train, tscv):
    units = trial.suggest_int('units', 50, 100)
    e = trial.suggest_int('e', 10, 20)
    b = trial.suggest_int('b', 32, 128)

    print(f"Trying parameters: units={units}, e={e}, b={b}")
    print(time.time())

    rmse_values = []
    for train_index, test_index in tscv.split(X_train):
        X_train_, X_test_ = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_, y_test_ = Y_train[train_index], Y_train[test_index]

        # Reshape data for LSTM input
        X_train_reshaped = X_train_.values.reshape((X_train_.shape[0], 1, X_train_.shape[1]))
        X_test_reshaped = X_test_.values.reshape((X_test_.shape[0], 1, X_test_.shape[1]))

        # Create LSTM model
        model = Sequential()
        model.add(LSTM(units, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mean_squared_error')

        # Train the model
        model.fit(X_train_reshaped, y_train_, epochs=e, batch_size=b, verbose=0)

        # Make predictions on the test set
        y_pred = model.predict(X_test_reshaped)

        # Evaluate the model using RMSE
        rmse = sqrt(mean_squared_error(y_test_, y_pred))
        rmse_values.append(rmse)

    average_rmse = np.mean(rmse_values)
    print(f"Average RMSE for this set of parameters: {average_rmse}")
    return average_rmse

def optimize_parameters(X_train, Y_train):
    tscv = TimeSeriesSplit(n_splits=5)  # Time series-specific cross-validation

    study = optuna.create_study(direction='minimize')
    objective_func = lambda trial: objective(trial, X_train, Y_train, tscv)
    study.optimize(objective_func, n_trials=50)

    # Get the best parameters
    best_params = study.best_params

    # Use the best parameters to get the final RMSE
    final_rmse = objective_func(study.best_trial)

    return best_params, final_rmse

best_params, final_rmse = optimize_parameters(X_train, Y_train)
print(f"Best Hyperparameters: {best_params}")
print(f"Final RMSE with Best Parameters: {final_rmse}")

[I 2023-12-10 08:42:28,259] A new study created in memory with name: no-name-8a11a6fe-03c7-471f-b7c0-fa4a6a9e6acf


Trying parameters: units=68, e=12, b=111
1702197748.2813668


[I 2023-12-10 08:52:48,761] Trial 0 finished with value: 1.0919082798191861 and parameters: {'units': 68, 'e': 12, 'b': 111}. Best is trial 0 with value: 1.0919082798191861.


Average RMSE for this set of parameters: 1.0919082798191861
Trying parameters: units=93, e=17, b=100
1702198368.7719483


[I 2023-12-10 09:09:01,845] Trial 1 finished with value: 1.0859012054220614 and parameters: {'units': 93, 'e': 17, 'b': 100}. Best is trial 1 with value: 1.0859012054220614.


Average RMSE for this set of parameters: 1.0859012054220614
Trying parameters: units=79, e=20, b=81
1702199341.8491676


[I 2023-12-10 09:29:11,067] Trial 2 finished with value: 1.0957243622470971 and parameters: {'units': 79, 'e': 20, 'b': 81}. Best is trial 1 with value: 1.0859012054220614.


Average RMSE for this set of parameters: 1.0957243622470971
Trying parameters: units=64, e=17, b=48
1702200551.0711381


[I 2023-12-10 09:54:53,106] Trial 3 finished with value: 1.0517314870105843 and parameters: {'units': 64, 'e': 17, 'b': 48}. Best is trial 3 with value: 1.0517314870105843.


Average RMSE for this set of parameters: 1.0517314870105843
Trying parameters: units=89, e=16, b=50
1702202093.1129687


[I 2023-12-10 10:21:10,634] Trial 4 finished with value: 1.055419995452017 and parameters: {'units': 89, 'e': 16, 'b': 50}. Best is trial 3 with value: 1.0517314870105843.


Average RMSE for this set of parameters: 1.055419995452017
Trying parameters: units=88, e=10, b=106
1702203670.639238


[I 2023-12-10 10:31:21,505] Trial 5 finished with value: 1.0364816170365978 and parameters: {'units': 88, 'e': 10, 'b': 106}. Best is trial 5 with value: 1.0364816170365978.


Average RMSE for this set of parameters: 1.0364816170365978
Trying parameters: units=54, e=10, b=97
1702204281.5102181


[I 2023-12-10 10:39:25,360] Trial 6 finished with value: 1.0821875106289005 and parameters: {'units': 54, 'e': 10, 'b': 97}. Best is trial 5 with value: 1.0364816170365978.


Average RMSE for this set of parameters: 1.0821875106289005
Trying parameters: units=75, e=12, b=39
1702204765.3639114
