In [54]:
# we import the necessary modules
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

In [55]:
# first we import the data
data = pd.read_parquet(Path("data") / "train.parquet")

In [56]:
# let's visualize the first few rows of the dataset
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585


In [57]:
# let's visualize somme main information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 48321 to 929187
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[ns]
 6   counter_installation_date  496827 non-null  datetime64[ns]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  log_bike_count             496827 non-null  floa

In [58]:
# let's see if there are any missing values in the data
missing_values_count = data.isnull().any(axis=1).sum()
print(f"Number of lines with missing values: {missing_values_count}")

Number of lines with missing values: 0


In [59]:
def _encode_dates(X):
    X = X.copy()
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour
    X["just_date"] = pd.to_datetime(X["date"].dt.date)
    return X

# let's separate the year, the month, the day of the month, the day of the week and the hour
data = _encode_dates(data)
min_date = data['date'].min().strftime('%Y-%m-%d')
max_date = data['date'].max().strftime('%Y-%m-%d')
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count,year,month,day,weekday,hour,just_date
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0,2020,9,1,1,2,2020-09-01
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147,2020,9,1,1,3,2020-09-01
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0,2020,9,1,1,4,2020-09-01
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438,2020,9,1,1,15,2020-09-01
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585,2020,9,1,1,18,2020-09-01


In [60]:
import holidays
vacances = holidays.CountryHoliday('France', years=[2020, 2021])
vacances_dates = pd.to_datetime(list(vacances.keys())).date

# let's add information about if a given day is durong a holidays or during the weekend
data["IsHolidays"] = data["just_date"].isin(vacances_dates).astype(int)
data['IsWeekend'] = data['weekday'].apply(lambda x: 1 if x >= 5 else 0)

In [61]:
# let's look at how the data changed
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 48321 to 929187
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[ns]
 6   counter_installation_date  496827 non-null  datetime64[ns]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  log_bike_count             496827 non-null  floa

In [None]:
# let's get rid of some useless data
new_data = data.drop(columns=["just_date", "day", "counter_name", "site_name", "bike_count", "date", "counter_installation_date", "coordinates", "counter_technical_id", "latitude", "longitude"])
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 48321 to 929187
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   counter_id      496827 non-null  category
 1   site_id         496827 non-null  int64   
 2   log_bike_count  496827 non-null  float64 
 3   year            496827 non-null  int64   
 4   month           496827 non-null  int64   
 5   weekday         496827 non-null  int64   
 6   hour            496827 non-null  int64   
 7   IsHolidays      496827 non-null  int64   
 8   IsWeekend       496827 non-null  int64   
dtypes: category(1), float64(1), int64(7)
memory usage: 34.6 MB


56

In [None]:
weather_data = pd.read_csv(Path("data") / "external_data.csv")
weather_data = weather_data.filter(items=['numer_sta', 'date', 'pmer', 'pres', 'dd', 'ff', 'ww', 't', 'u', 'vv', 'n'])
weather_data.head()
weather_data.info()

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,hnuage1,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4
0,7149,2021-01-01 00:00:00,100810,80,1,270,1.8,272.75,272.15,96,...,600.0,,,,,,,,,
1,7149,2021-01-01 03:00:00,100920,110,3,300,1.7,271.25,270.95,98,...,1500.0,2.0,3.0,3000.0,,,,,,
2,7149,2021-01-01 06:00:00,100950,30,3,290,2.6,271.95,271.65,98,...,480.0,4.0,6.0,2000.0,6.0,3.0,3000.0,,,
3,7149,2021-01-01 09:00:00,101100,150,2,280,1.7,272.45,272.05,97,...,1740.0,3.0,3.0,2800.0,,,,,,
4,7149,2021-01-01 12:00:00,101110,30,0,50,1.0,276.95,274.15,82,...,330.0,4.0,6.0,570.0,7.0,6.0,810.0,,,


In [63]:
numerical_columns = ['year', 'month', 'weekday', 'hour', 'IsHolidays', 'IsWeekend']
categorical_columns = ['counter_id', 'site_id']

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Preprocess numerical features (scaling)
scaler = MinMaxScaler()
new_data[numerical_columns] = scaler.fit_transform(new_data[numerical_columns])

# Preprocess categorical features (encoding)
le = LabelEncoder()
new_data[categorical_columns] = new_data[categorical_columns].apply(le.fit_transform)

In [64]:
X = new_data.drop('log_bike_count', axis=1)
y = new_data['log_bike_count']

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

models = {
    "Linear Regression": LinearRegression(),
    #"Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    #"Support Vector Regression": SVR(),
    #"Decision Tree": DecisionTreeRegressor(random_state=42)
}

results = {}

In [None]:
for model_name, model in models.items():
    # Perform cross-validation and calculate the mean and standard deviation of the score
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # Using MSE as the evaluation metric
    
    # Compute the average of the cross-validation scores
    mean_mse = np.mean(cv_scores)
    mean_rmse = np.sqrt(-mean_mse)  # Convert MSE to RMSE
    mean_r2 = np.mean(cross_val_score(model, X, y, cv=5, scoring='r2'))  # R² score
    
    # Store the results
    results[model_name] = {
        "Mean R²": mean_r2,
        "Mean MSE": mean_mse,
        "Mean RMSE": mean_rmse,
        "MSE Std Dev": np.std(cv_scores),
        "RMSE Std Dev": np.std(np.sqrt(-cv_scores))  # Calculate standard deviation of RMSE
    }



In [66]:
import pandas as pd
results_df = pd.DataFrame(results).T
print(results_df)

                      Mean R²  Mean MSE  Mean RMSE  MSE Std Dev  RMSE Std Dev
Linear Regression    0.085811 -2.491091   1.578319     0.285835      0.091178
Gradient Boosting    0.579149 -1.149079   1.071951     0.160534      0.075601
K-Nearest Neighbors  0.500581 -1.348491   1.161246     0.122251      0.052833


Quelques secondes pour la régression linéaire

3 min pour gradient boosting

1 min pour K nearest neighbors


On nva regarder les significations des données de external_data.

In [None]:
dict_external_data = {
    'pmer' : 'pression atmosphérique au niveau de la mer',
    'dd' : 'direction du vent',
    'ff' : 'vitesse du vent',
    't' : 'température en Kelvin',
    'u' : 'humidité relative',
    'numer_sta' : 'numéro de la station',
    'date' : 'date',
    'vv' : 'visibilité en km',
    'ww' : 'puissance du vent',
    'n' : 'cloud coverage',
    'pres' : 'pression atmosphérique',
}