# Modelling

The goal is to find the best model for the prediction. To do this, we start by importing the different data sets, preprocessing them and merging them together. Afterwards we do some feature engineering and encoding before using FLAML's AutoML to find the best model.

## Imports

In [1]:
import pandas as pd
import numpy as np
import holidays
from sklearn.preprocessing import OneHotEncoder
from flaml import AutoML
from sklearn.model_selection import TimeSeriesSplit

### Importing data

In [2]:
# Input locations, relative to the notebook's working directory.
TRAIN_PATH = "data/train.parquet"
SCHOOL_HOLIDAYS_PATH = "external_data/holidays.csv"
WEATHER_PATH = "external_data/H_75_previous-2020-2022.csv.gz"

# Training observations and the school-holiday calendar.
train_df = pd.read_parquet(TRAIN_PATH)
school_hols_df = pd.read_csv(SCHOOL_HOLIDAYS_PATH)

# Lockdown periods as (start, end) date strings.
# NOTE: prepare_and_merge_data does not read this variable.
lockdown_periods = [
    ('2020-03-18', '2020-05-10'),
    ('2020-10-31', '2020-12-14'),
    ('2021-04-04', '2021-05-02'),
]

# Weather file: gzip-compressed, ';'-separated, with the timestamp stored as
# YYYYMMDDHH in the "AAAAMMJJHH" column, which is exposed as "date".
weather_df = pd.read_csv(
    WEATHER_PATH,
    sep=";",
    compression="gzip",
    parse_dates=["AAAAMMJJHH"],
    date_format="%Y%m%d%H",
)
weather_df = weather_df.rename(columns={"AAAAMMJJHH": "date"})

## Preprocessing and merging 

In [3]:
# Weather columns to keep (station id, timestamp and the selected measurements).
selected_features = ['NUM_POSTE', 'date', 'RR1', 'DRR1', 'FF', 'T', 'TCHAUSSEE', 'U', 'GLO']

# Only rows recorded by this station are kept.
WEATHER_STATION_ID = 75114001

# Select the columns and the station, drop the now-constant station id, then
# fill gaps by time-weighted interpolation (which needs the timestamp as index)
# and restore "date" as a regular column.
is_selected_station = weather_df['NUM_POSTE'] == WEATHER_STATION_ID
weather_df = (
    weather_df.loc[is_selected_station, selected_features]
    .drop(columns='NUM_POSTE')
    .set_index('date')
    .interpolate(method='time')
    .reset_index()
)

In [4]:
#Merge function

def prepare_and_merge_data(train_df, weather_df, school_hols_df, lockdown_periods=None):
    """Add calendar flags, school holidays and weather to the hourly training data.

    None of the three input frames is modified; all work is done on derived
    copies, so the function can be called repeatedly on the same inputs.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Observations with a ``date`` column (anything ``pd.to_datetime`` accepts).
    weather_df : pandas.DataFrame
        Weather records with a ``date`` column; joined on exact timestamp equality.
    school_hols_df : pandas.DataFrame
        Daily table with ``date`` and ``vacances_zone_c`` (castable to int) columns.
    lockdown_periods : iterable of (start, end) pairs, optional
        Inclusive calendar-date ranges flagged in ``is_lockdown``. Defaults to
        the three 2020-2021 periods listed at the top of the body.

    Returns
    -------
    pandas.DataFrame
        ``train_df`` left-joined with the school-holiday and weather tables (no
        training row is dropped), plus the ``is_bank_holiday``, ``is_lockdown``
        and ``school_holidays`` columns.
    """
    if lockdown_periods is None:
        lockdown_periods = [
            ('2020-03-18', '2020-05-10'),
            ('2020-10-31', '2020-12-14'),
            ('2021-04-04', '2021-05-02'),
        ]

    # `assign` returns new frames, so the caller's DataFrames stay untouched.
    train_df = train_df.assign(date=pd.to_datetime(train_df['date']))
    weather_df = weather_df.assign(date=pd.to_datetime(weather_df['date']))
    school_hols = pd.DataFrame({
        'date_only': pd.to_datetime(school_hols_df['date']),
        # True/False -> 1/0
        'school_holidays': school_hols_df['vacances_zone_c'].astype(int),
    })

    # Calendar day (midnight timestamp) of every observation.
    day = train_df['date'].dt.normalize()

    # Bank holidays: query the holiday calendar once per distinct day instead
    # of once per row, then flag rows by membership.
    fr_holidays = holidays.France()
    distinct_days = pd.DatetimeIndex(day.dropna().unique())
    bank_holiday_days = [ts for ts in distinct_days if ts.date() in fr_holidays]
    is_bank_holiday = day.isin(bank_holiday_days).astype(int)

    # Lockdowns: a row is flagged when its day lies in any period (bounds inclusive).
    in_lockdown = pd.Series(False, index=train_df.index)
    for start, end in lockdown_periods:
        in_lockdown |= day.between(
            pd.Timestamp(start).normalize(), pd.Timestamp(end).normalize()
        )

    train_df = train_df.assign(
        is_bank_holiday=is_bank_holiday,
        is_lockdown=in_lockdown.astype(int),
        date_only=day,
    )

    # Daily school holidays are matched on the calendar day; the helper key is
    # dropped again afterwards.
    # NOTE(review): duplicate dates in either lookup table would duplicate
    # training rows; consider `validate='many_to_one'` once the inputs are checked.
    train_df = train_df.merge(school_hols, on='date_only', how='left').drop(
        columns='date_only'
    )

    # Weather is matched on the exact timestamp.
    return train_df.merge(weather_df, on='date', how='left')

In [5]:
# Build the modelling table from the training data, the preprocessed weather
# data and the school-holiday calendar.
final_train_df = prepare_and_merge_data(
    train_df=train_df,
    weather_df=weather_df,
    school_hols_df=school_hols_df,
)

## Encoding 

In [6]:
# Function to encode date features

def encode_time_features(df, date_col='date'):
    """Replace a datetime column with calendar and cyclical features.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_col`` with a datetime dtype (the ``.dt``
        accessor is used on it).
    date_col : str, default 'date'
        Name of the datetime column to expand; it is absent from the result.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``date_col`` and with the added columns ``year``,
        ``quarter``, ``month``, ``day``, ``weekday``, ``hour``, ``month_cos``,
        ``month_sin``, ``hour_cos``, ``hour_sin`` and ``is_weekend``.
    """
    dates = df[date_col].dt
    month = dates.month
    hour = dates.hour
    weekday = dates.weekday  # Monday=0 ... Sunday=6

    # drop() and assign() both return new frames, so `df` itself never changes.
    return df.drop(columns=[date_col]).assign(
        year=dates.year,
        quarter=dates.quarter,
        month=month,
        day=dates.day,
        weekday=weekday,
        hour=hour,
        # Cyclical encodings: December sits next to January, 23h next to 0h.
        month_cos=np.cos(2 * np.pi * month / 12),
        month_sin=np.sin(2 * np.pi * month / 12),
        hour_cos=np.cos(2 * np.pi * hour / 24),
        hour_sin=np.sin(2 * np.pi * hour / 24),
        # Saturday (5) and Sunday (6)
        is_weekend=(weekday >= 5).astype(int),
    )

In [7]:
# Swap the raw timestamp for calendar / cyclical features. The 'date' column is
# removed by this step, so re-running this cell on its own raises a KeyError.
final_train_df = encode_time_features(df=final_train_df, date_col='date')

In [8]:
# Distance (in km) of each counter from the centre of Paris.
ref_lat, ref_lon = 48.8566, 2.3522
EARTH_RADIUS_KM = 6371.0

# One degree of longitude is shorter than one degree of latitude (by a factor
# cos(latitude), about 0.66 here), so a plain Euclidean norm on raw degrees
# overweights east-west offsets. Over city-scale distances the equirectangular
# approximation below is accurate and yields a distance in kilometres.
lat_offset_rad = np.radians(final_train_df['latitude'] - ref_lat)
lon_offset_rad = np.radians(final_train_df['longitude'] - ref_lon) * np.cos(np.radians(ref_lat))
final_train_df['dist_center'] = EARTH_RADIUS_KM * np.sqrt(lat_offset_rad**2 + lon_offset_rad**2)

In [9]:
# Drop redundant columns. counter_id and site_id are kept for now because they
# are one-hot encoded in the next step; bike_count is presumably the un-logged
# version of the log_bike_count target (TODO confirm), so it must not remain
# among the features.
cols_to_drop = [
    'counter_name',
    'counter_technical_id',
    'site_name',
    'latitude',
    'longitude',
    'coordinates',
    'bike_count',
    'counter_installation_date',
]
final_train_df = final_train_df.drop(columns=cols_to_drop)

In [10]:
# One-hot encode counter_id and site_id.
id_columns = ['counter_id', 'site_id']

# One fitted encoder per column is kept so that the same mapping can later be
# applied to unseen data with id_encoders[col].transform(...). Re-fitting a
# single encoder for the second column would discard the first column's fit.
id_encoders = {}
encoded_frames = []
for col in id_columns:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded = encoder.fit_transform(final_train_df[[col]])
    encoded_frames.append(
        pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out([col]),
            # Reuse the source index: pd.concat(axis=1) aligns on index labels,
            # so a fresh RangeIndex would misalign rows (and introduce NaNs)
            # whenever final_train_df does not carry a default index.
            index=final_train_df.index,
        )
    )
    id_encoders[col] = encoder

# Replace the original id columns with their encoded versions in one concat.
final_train_df = pd.concat(
    [final_train_df.drop(columns=id_columns), *encoded_frames],
    axis=1,
)


In [11]:
# Number of cross-validation folds, shared by FLAML and the splitter so the two
# settings cannot disagree (a bare TimeSeriesSplit() would use 5 folds).
N_CV_SPLITS = 3

# TimeSeriesSplit cuts folds by row position, so rows must be in chronological
# order for "train on the past, validate on the future" to hold. Nothing
# earlier in the notebook sorts by time and the raw timestamp has already been
# dropped, so the order is rebuilt from the calendar features.
time_order_cols = ['year', 'month', 'day', 'hour']
train_sorted_df = (
    final_train_df
    .sort_values(time_order_cols, kind='stable')
    .reset_index(drop=True)
)

# Define target and features
y = train_sorted_df['log_bike_count']
X = train_sorted_df.drop(columns=['log_bike_count'])

# Initialize FLAML AutoML. The candidate estimators are given once, in fit();
# a second list passed to the constructor would simply be overridden there.
automl = AutoML()

# Fit AutoML model using time-ordered cross-validation
automl.fit(
    X,
    y,
    task="regression",
    time_budget=1500,  # seconds
    eval_method="cv",
    metric="rmse",
    n_splits=N_CV_SPLITS,
    split_type=TimeSeriesSplit(n_splits=N_CV_SPLITS),
    verbose=2,
    estimator_list=[
        "extra_tree",
        "histgb",
        "lgbm",
        "rf",
        "xgboost",
        "xgb_limitdepth",
    ],
    force_cancel=True,
)




In [12]:
# Report which estimator the FLAML search selected.
print("Best Model:", automl.best_estimator)

Best Model: extra_tree
