In [22]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.linear_model import LinearRegression
from datetime import datetime
import holidays
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder


In [23]:
def clean(df, df_name):
    """Report how many rows contain missing values, then drop those rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified; a new frame is returned.
    df_name : str
        Label inserted in the printed report so the frame can be identified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows that hold at least one missing value.
    """
    # A row counts as incomplete as soon as any of its cells is missing.
    incomplete_rows = df.isna().any(axis="columns")
    n_incomplete = incomplete_rows.sum()
    print(f"({df_name}) : Number of lines with missing values: {n_incomplete}")
    return df.dropna()

def _encode_dates_bike(df):
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["hour"] = df["date"].dt.hour
    df["weekday"] = df["date"].dt.weekday
    df['IsWeekend'] = df['weekday'].apply(lambda x: 1 if x >= 5 else 0)

    vacances = holidays.CountryHoliday('France', years=[i for i in range(2009, 2025)])
    vacances_dates = pd.to_datetime(list(vacances.keys())).date
    df["IsHolidays"] = df["date"].isin(vacances_dates).astype(int)

    return df

def _encode_dates_meteo(df):   
    df['DATE'] = pd.to_datetime(df['DATE'])
    df["year"] = df["DATE"].dt.year
    df["month"] = df["DATE"].dt.month
    df["day"] = df["DATE"].dt.day
    return df

In [24]:
def _scaler_is_fitted(scaler):
    """Return True when ``scaler`` has already been fitted."""
    # Imported locally so this cell does not depend on another import cell;
    # scikit-learn is already a dependency of the notebook.
    from sklearn.exceptions import NotFittedError
    from sklearn.utils.validation import check_is_fitted

    try:
        check_is_fitted(scaler)
    except NotFittedError:
        return False
    return True


def transform_data(raw_bike_counter_data, raw_data_meteo, scaler, fit=None, label_encoders=None):
    """Build the model-ready frame from raw bike-counter and weather data.

    Steps: drop incomplete rows, derive calendar features, join the weather
    data on (year, month, day), drop identifier/metadata columns,
    label-encode the categorical columns and scale the feature columns.

    Parameters
    ----------
    raw_bike_counter_data : pandas.DataFrame
        Raw counter data; copied, so the caller's frame is left untouched.
    raw_data_meteo : pandas.DataFrame
        Raw daily weather data; copied as well.
    scaler : scikit-learn style transformer
        Object exposing ``fit_transform`` / ``transform``.
    fit : bool or None, default None
        ``True`` always re-fits ``scaler`` on this frame, ``False`` only
        applies an already fitted scaler. ``None`` fits the scaler the first
        time it is used and reuses it afterwards, so a training frame followed
        by a test frame are scaled with the *training* statistics.
    label_encoders : dict or None, default None
        When a dict is supplied, the ``LabelEncoder`` fitted for each encoded
        column is stored in it (keyed by column name) so the caller can use
        ``inverse_transform`` later.

    Returns
    -------
    pandas.DataFrame
        The merged, encoded and scaled frame. The target column
        ``log_bike_count``, when present, is returned unscaled.
    """
    target_column = 'log_bike_count'
    merge_keys = ['year', 'month', 'day']

    bike_counter_data = raw_bike_counter_data.copy()
    data_meteo = raw_data_meteo.copy()

    bike_counter_data = clean(bike_counter_data, 'bike_counter_data')
    bike_counter_data = _encode_dates_bike(bike_counter_data)

    data_meteo = data_meteo.drop(columns=['TEMPERATURE_NIGHT_C', 'SUNRISE', 'SUNSET'])
    data_meteo = clean(data_meteo, 'data_meteo')
    data_meteo = _encode_dates_meteo(data_meteo)

    # Inner join: bike rows whose day has no weather record are dropped here.
    merged_data = pd.merge(bike_counter_data, data_meteo, on=merge_keys)

    # Report the days that have bike data but no weather data. Only the
    # distinct days are compared; joining every hourly row is unnecessary.
    bike_days = bike_counter_data[merge_keys].drop_duplicates()
    meteo_days = data_meteo[merge_keys].drop_duplicates()
    all_dates = pd.merge(bike_days, meteo_days, on=merge_keys, how='left', indicator=True)
    missing_dates = all_dates[all_dates['_merge'] == 'left_only']

    print("Missing dates in data_meteo:")
    print(missing_dates[merge_keys])

    columns_to_drop = ["day", "counter_name", "site_name", "DATE", "counter_installation_date", "coordinates", "counter_technical_id", "latitude", "longitude"]
    # 'bike_count' is only dropped when present; the other columns are required.
    merged_data = merged_data.drop(columns='bike_count', errors='ignore')
    merged_data = merged_data.drop(columns=columns_to_drop)

    columns_to_encode = ['WEATHER_CODE_MORNING', 'WEATHER_CODE_NOON', 'WEATHER_CODE_EVENING', 'OPINION', "counter_id", "date"]
    # NOTE(review): encoders are re-fitted on every call, so the integer codes
    # of two frames only agree when both frames hold the same set of values.
    for col in columns_to_encode:
        le = LabelEncoder()
        merged_data[col] = le.fit_transform(merged_data[col])
        if label_encoders is not None:
            # Keep a reference for a possible inverse_transform.
            label_encoders[col] = le

    # Scale the features only. The target must stay in its original units,
    # otherwise a model trained on it predicts standardised values.
    feature_columns = [col for col in merged_data.columns if col != target_column]
    if fit is None:
        fit = not _scaler_is_fitted(scaler)
    if fit:
        scaled = scaler.fit_transform(merged_data[feature_columns])
    else:
        scaled = scaler.transform(merged_data[feature_columns])
    merged_data[feature_columns] = scaled

    # Row counts before and after the join, to spot rows lost in the merge.
    print(len(bike_counter_data))
    print(len(merged_data))

    return merged_data



In [25]:
# Load the raw inputs: train/test counter data (parquet) and the weather export (CSV).
data_dir = Path("data")
raw_bike_counter_data = pd.read_parquet(data_dir / "train.parquet")
raw_bike_counter_test_data = pd.read_parquet(data_dir / "final_test.parquet")
raw_meteo_data = pd.read_csv('external_data/export-paris0.csv')

# The same scaler instance is handed to both transform_data calls.
scaler = StandardScaler()
train_data = transform_data(
    raw_bike_counter_data=raw_bike_counter_data,
    raw_data_meteo=raw_meteo_data,
    scaler=scaler,
)
public_test_data = transform_data(
    raw_bike_counter_data=raw_bike_counter_test_data,
    raw_data_meteo=raw_meteo_data,
    scaler=scaler,
)

(bike_counter_data) : Number of lines with missing values: 0
(data_meteo) : Number of lines with missing values: 0
Missing dates in data_meteo:
Empty DataFrame
Columns: [year, month, day]
Index: []
496827
496827
(bike_counter_data) : Number of lines with missing values: 0
(data_meteo) : Number of lines with missing values: 0
Missing dates in data_meteo:
Empty DataFrame
Columns: [year, month, day]
Index: []
51440
51440


In [26]:
# Print the column/dtype summary of the train frame, then of the test frame.
for frame in (train_data, public_test_data):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 0 to 496826
Data columns (total 31 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   counter_id              496827 non-null  float64
 1   site_id                 496827 non-null  float64
 2   date                    496827 non-null  float64
 3   log_bike_count          496827 non-null  float64
 4   year                    496827 non-null  float64
 5   month                   496827 non-null  float64
 6   hour                    496827 non-null  float64
 7   weekday                 496827 non-null  float64
 8   IsWeekend               496827 non-null  float64
 9   IsHolidays              496827 non-null  float64
 10  MAX_TEMPERATURE_C       496827 non-null  float64
 11  MIN_TEMPERATURE_C       496827 non-null  float64
 12  WINDSPEED_MAX_KMH       496827 non-null  float64
 13  TEMPERATURE_MORNING_C   496827 non-null  float64
 14  TEMPERATURE_NOON_C  

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor

In [28]:
# Separate the feature matrix from the regression target.
data = train_data.drop(columns="log_bike_count")
target = train_data["log_bike_count"]

# Work on a reproducible 1% row sample (presumably to keep the search below
# affordable); the target is selected by index label so X and y stay aligned.
X_train_sample = data.sample(frac=0.01, random_state=42)
y_train_sample = target.loc[X_train_sample.index]

In [29]:
# Base estimator; the grid below varies its hyperparameters.
reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)

# Single-step pipeline: grid keys address the step as "regressor__<param>".
pipeline = Pipeline(steps=[('regressor', reg)])

# 3 * 3 * 2 * 2 * 3 * 2 = 216 candidate combinations.
param_grid = dict(
    regressor__n_estimators=[50, 100, 200],
    regressor__max_depth=[3, 5, 7],
    regressor__min_samples_split=[2, 5],
    regressor__min_samples_leaf=[1, 2],
    regressor__max_features=['sqrt', 'log2', None],
    regressor__bootstrap=[True, False],
)

# Exhaustive 5-fold cross-validated search, fitted on the sampled training rows.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search_result = grid_search.fit(X_train_sample, y_train_sample)

# Report the winning configuration and its mean cross-validation score.
print("Best hyperparameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best hyperparameters found:  {'regressor__bootstrap': False, 'regressor__max_depth': 7, 'regressor__max_features': None, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 50}
Best cross-validation score:  0.6835236694640136


In [30]:
def final_output(grid_search_result, test_data, output_path="submission.csv"):
    """Predict on ``test_data`` and write the predictions to a CSV file.

    Parameters
    ----------
    grid_search_result : object with a ``predict`` method
        Fitted model or search object used to produce the predictions.
    test_data : array-like or pandas.DataFrame
        Features passed unchanged to ``grid_search_result.predict``.
    output_path : str or path-like, default "submission.csv"
        Destination of the CSV file.

    Returns
    -------
    None
        The result is the file written to ``output_path``, with the columns
        ``Id`` (0-based row number) and ``log_bike_count`` (prediction).
    """
    y_pred = grid_search_result.predict(test_data)
    results = pd.DataFrame(
        {
            # len() rather than .shape so list-like predictions work as well.
            "Id": np.arange(len(y_pred)),
            "log_bike_count": y_pred,
        }
    )
    results.to_csv(output_path, index=False)

In [31]:
# Write the predictions for the public test frame to the submission file.
final_output(grid_search_result=grid_search_result, test_data=public_test_data)

In [32]:
# Bare expression: the notebook renders the fitted search object's repr.
grid_search_result