In [51]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.linear_model import LinearRegression
from datetime import datetime
import holidays
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder


In [52]:
def clean(df, df_name):
    missing_values_count = df.isnull().any(axis=1).sum()
    print(f"({df_name}) : Number of lines with missing values: {missing_values_count}")
    df = df.dropna()
    return df

def _encode_dates_bike(df):
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["hour"] = df["date"].dt.hour
    df["weekday"] = df["date"].dt.weekday
    df['IsWeekend'] = df['weekday'].apply(lambda x: 1 if x >= 5 else 0)

    vacances = holidays.CountryHoliday('France', years=[2020, 2021])
    vacances_dates = pd.to_datetime(list(vacances.keys())).date
    df["IsHolidays"] = df["date"].isin(vacances_dates).astype(int)

    return df

def _encode_dates_meteo(df, min_date, max_date):   
    df = df[(df['DATE'] > min_date) & (df['DATE'] < max_date)]
    df['DATE'] = pd.to_datetime(df['DATE'])
    df["year"] = df["DATE"].dt.year
    df["month"] = df["DATE"].dt.month
    df["day"] = df["DATE"].dt.day
    return df

In [53]:
def transform_data(raw_bike_counter_data, raw_data_meteo, scaler):

    bike_counter_data = raw_bike_counter_data.copy()
    data_meteo = raw_data_meteo.copy()
    
    bike_counter_data = clean(bike_counter_data, 'bike_counter_data')
    bike_counter_data = _encode_dates_bike(bike_counter_data)
    #print(bike_counter_data.info())

    min_date_bike = bike_counter_data['date'].min().strftime('%Y-%m-%d')
    max_date_bike = bike_counter_data['date'].max().strftime('%Y-%m-%d')

    data_meteo = data_meteo.drop(columns = ['TEMPERATURE_NIGHT_C', 'SUNRISE', 'SUNSET'] )
    data_meteo = clean(data_meteo, 'data_meteo')
    data_meteo = _encode_dates_meteo(data_meteo, min_date_bike, max_date_bike)
    #print(data_meteo.info())    

    merged_data = pd.merge(bike_counter_data, data_meteo, on=['year', 'month', 'day'])

    columns_to_drop = ["day", "counter_name", "site_name", "DATE", "counter_installation_date", "coordinates", "counter_technical_id", "latitude", "longitude"]
    if 'bike_count' in merged_data.columns:
        merged_data = merged_data.drop(columns = 'bike_count')
    merged_data = merged_data.drop(columns = columns_to_drop)

    columns_to_encode = ['WEATHER_CODE_MORNING', 'WEATHER_CODE_NOON', 'WEATHER_CODE_EVENING', 'OPINION', "counter_id", "date"]
    label_encoders = {}

    for col in columns_to_encode:
        le = LabelEncoder()
        merged_data[col] = le.fit_transform(merged_data[col])
        label_encoders[col] = le     # Garder une référence pour un éventuel inverse_transform

    # Scale the data
    merged_data[merged_data.columns] = scaler.fit_transform(merged_data[merged_data.columns])
    
    return merged_data



In [54]:
raw_bike_counter_data = pd.read_parquet(Path("data") / "train.parquet")
raw_meteo_data = pd.read_csv('external_data/export-paris0.csv')
raw_bike_counter_test_data = pd.read_parquet(Path("data") / "final_test.parquet")

scaler = StandardScaler()
train_data = transform_data(raw_bike_counter_data, raw_meteo_data, scaler)
public_test_data = transform_data(raw_bike_counter_test_data, raw_meteo_data, scaler)

(bike_counter_data) : Number of lines with missing values: 0
(data_meteo) : Number of lines with missing values: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DATE'] = pd.to_datetime(df['DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["year"] = df["DATE"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = df["DATE"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

(bike_counter_data) : Number of lines with missing values: 0
(data_meteo) : Number of lines with missing values: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DATE'] = pd.to_datetime(df['DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["year"] = df["DATE"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = df["DATE"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

In [None]:
train_data.info()
public_test_data.info()

        counter_id   site_id      date  log_bike_count      year     month  \
0        -1.641252 -0.166405 -1.746569       -1.853594 -1.460033  0.716376   
1        -1.641252 -0.166405 -1.743846       -1.018237 -1.460033  0.716376   
2        -1.641252 -0.166405 -1.743068       -0.773911 -1.460033  0.716376   
3        -1.641252 -0.166405 -1.742679       -0.883775 -1.460033  0.716376   
4        -1.641252 -0.166405 -1.741512       -0.529585 -1.460033  0.716376   
...            ...       ...       ...             ...       ...       ...   
494236    1.668802  6.058964  1.723762       -0.308000  0.684916  0.716376   
494237    1.668802  6.058964  1.717538        1.815558  0.684916  0.716376   
494238    1.668802  6.058964  1.717927        1.573278  0.684916  0.716376   
494239    1.668802  6.058964  1.720650        1.074844  0.684916  0.716376   
494240    1.668802  6.058964  1.721428        1.640838  0.684916  0.716376   

            hour   weekday  IsWeekend  IsHolidays  ...  HEATIND

In [56]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor

In [57]:
target = train_data["log_bike_count"]
data = train_data.drop(columns = "log_bike_count")

X_train_sample = data.sample(frac=0.01, random_state=42)  # x % des données
y_train_sample = target.loc[X_train_sample.index]

In [58]:
# Define the regressor
reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)

# Create the pipeline
pipeline = Pipeline(steps=[
    #('preprocessor', preprocessor),
    ('regressor', reg)
])

# Update param_grid to toggle scalers within the 'num' pipeline
param_grid = {
    #'preprocessor__scaled_num__scaler': [StandardScaler(), MinMaxScaler(), 'passthrough'],  # Apply scalers or skip
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2],
    'regressor__max_features': ['sqrt', 'log2', None],
    'regressor__bootstrap': [True, False]
}

# Set up GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the model
grid_search_result = grid_search.fit(X_train_sample, y_train_sample)

# Output the best results
print("Best hyperparameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best hyperparameters found:  {'regressor__bootstrap': False, 'regressor__max_depth': 7, 'regressor__max_features': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 50}
Best cross-validation score:  0.6700642342464229


In [59]:
def final_output(grid_search_result, test_data):
    y_pred = grid_search_result.predict(test_data)
    results = pd.DataFrame(
        dict(
            Id=np.arange(y_pred.shape[0]),
            log_bike_count=y_pred,
        )
    )
    results.to_csv("submission.csv", index=False)

In [60]:
final_output(grid_search_result, public_test_data)