In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from vacances_scolaires_france import SchoolHolidayDates
from jours_feries_france import JoursFeries

from functions import *

#### Load data

In [None]:
# Resolve the data directory once instead of repeating the absolute path in
# every read call. Set the BIKE_COUNTERS_DATA_DIR environment variable to run
# the notebook on another machine; the default is the original location.
DATA_DIR = Path(
    os.environ.get(
        "BIKE_COUNTERS_DATA_DIR",
        "/Users/solalzana/Desktop/X/Python for Data Science/Final Project/bike_counters/data",
    )
)

# Training data, the Kaggle test data, and the external data that is later
# passed to prepare_data together with each of the two frames.
df_train = pd.read_parquet(DATA_DIR / "train.parquet")
df_test_kaggle = pd.read_parquet(DATA_DIR / "final_test.parquet")
df_ext = pd.read_csv(DATA_DIR / "external_data.csv")

#### Prepare data

In [None]:
# Run both raw frames (train first, then the Kaggle test set) through the same
# prepare_data step, each combined with the external data frame.
df_train_cleaned, df_test_kaggle_cleaned = [
    prepare_data(raw_frame, df_ext) for raw_frame in (df_train, df_test_kaggle)
]

#### Train/test split

In [None]:
# Columns that must not be fed to the model: the target, the raw count it is
# derived from, and the date column.
non_feature_columns = ["log_bike_count", "bike_count", "date"]

# Feature matrix and target built from the prepared training frame.
X_train = df_train_cleaned.drop(columns=non_feature_columns)
y_train = df_train_cleaned["log_bike_count"]

# Hold out 20% of the rows for local evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Kaggle inference set. Use the *prepared* test frame (not the raw
# df_test_kaggle) so the features go through the same prepare_data step as the
# training data, and drop the same non-feature columns as for X_train.
# errors="ignore" because the target columns are presumably absent from the
# Kaggle test file.
# NOTE(review): confirm that test_model_kaggle does not itself call
# prepare_data or need the "date" column.
X_test_kaggle = df_test_kaggle_cleaned.drop(
    columns=non_feature_columns, errors="ignore"
)

When submitting to Kaggle, predict on the provided test dataset (`final_test.parquet`) instead of the local hold-out split.

#### Build Pipeline

Candidate models to train and compare:

In [None]:
# Candidate regressors, keyed by the label used in the training log and in
# `results`. Insertion order (xgboost, ridge, catboost) is the training order.
# NOTE(review): xgb, Ridge and CatBoostRegressor are not imported in the
# notebook's import cell; presumably they are provided by
# `from functions import *` -- confirm.
# A 'random_forest' entry, RandomForestRegressor(n_estimators=100,
# random_state=42), is currently disabled.
xgboost_model = xgb.XGBRegressor(
    n_estimators=100,
    random_state=42,
    enable_categorical=True,  # Add this if you have categorical features
)
ridge_model = Ridge(random_state=42)
catboost_model = CatBoostRegressor(iterations=100, random_state=42, verbose=False)

models = dict(
    xgboost=xgboost_model,
    ridge=ridge_model,
    catboost=catboost_model,
)

In [None]:
# Train every candidate and record, per model label, the pipeline and RMSE
# returned by train_and_evaluate_model.
results = {}
for name in models:
    model = models[name]
    print(f"\nTraining {name}...")
    outcome = train_and_evaluate_model(X_train, X_test, y_train, y_test, model)
    pipeline, rmse = outcome
    results[name] = dict(pipeline=pipeline, rmse=rmse)
    print(f"{name} RMSE: {rmse}")

#### Find the best model (lowest RMSE)

In [None]:
# best_model is the (label, result-dict) pair with the smallest RMSE; on a tie
# the model that was trained first wins.
best_name = min(results, key=lambda label: results[label]['rmse'])
best_model = (best_name, results[best_name])
print(f"\nBest model: {best_model[0]} with RMSE: {best_model[1]['rmse']}")

In [None]:
# Disabled experiment: standalone XGBoost training followed by a Kaggle submission.
# NOTE(review): before re-enabling, check the train_model signature -- it is
# called here as train_model(X_train, y_train, model) but as
# train_model(pipeline, model, X_train, y_train) in the other disabled cells.
# Also move the imports to the notebook's import cell.

# from sklearn.pipeline import make_pipeline
# import xgboost as xgb

# fit_encoder(X_train)
# # X_train = encoder(X_train)
# # X_test = encoder(X_test)

# model = xgb.XGBRegressor(objective='reg:squarederror')
# trained_model = train_model(X_train, y_train, model)

# test_model_kaggle(trained_model, X_test, "xgb") # results is a df storing y_pred(s)
# # check submission folder now
# # X_test.drop(columns=['date'], inplace=True)
# # evaluate_model(trained_model, X_test, y_test)

In [None]:
# Disabled experiment: standalone CatBoost pipeline.
# NOTE(review): the last call passes (model, X_test, y_test) to
# test_model_kaggle, whereas every other call passes a pipeline, the test
# features and a model-name string -- fix the arguments before re-enabling.

# from catboost import CatBoostRegressor

# model = CatBoostRegressor()
# pipeline_cb = build_pipeline(X_train, y_train, model)
# trained_model_cb = train_model(pipeline_cb, model, X_train, y_train)

# # test_model_kaggle(pipeline_cb, X_test, "cb") # results is a df storing y_pred(s)
# # # check submission folder now
# test_model_kaggle(model, X_test, y_test)

In [None]:
# Disabled experiment: standalone LightGBM pipeline with a Kaggle submission.
# NOTE(review): if re-enabled, install with a pinned `%pip install lightgbm==<version>`
# in a dedicated top cell (not `!pip`), and keep only one of the two lightgbm
# import styles below.

# # lightgbm
# !pip install lightgbm
# from lightgbm import LGBMRegressor
# import lightgbm as lgb

# model = lgb.LGBMRegressor()
# pipeline_lgb = build_pipeline(X_train, y_train, model)
# trained_model_lgb = train_model(pipeline_lgb, model, X_train, y_train)

# test_model_kaggle(pipeline_lgb, X_test, "lgb") # results is a df storing y_pred(s)
# # check submission folder

### Random forest (RF)

In [None]:
# Disabled experiment: standalone random-forest pipeline with a Kaggle submission.
# NOTE(review): copy-paste residue -- the random-forest pipeline is stored in
# variables named *_cb, and the submission call uses pipeline_xgb, which is not
# defined in any cell of this notebook. Rename/fix before re-enabling.

# # random forest
# from sklearn.ensemble import RandomForestRegressor

# model = RandomForestRegressor(n_jobs=-1)
# pipeline_cb = build_pipeline(X_train, y_train, model)
# trained_model_cb = train_model(pipeline_cb, model, X_train, y_train)

# test_model_kaggle(pipeline_xgb, X_test, "rf") # results is a df storing y_pred(s)
# # check submission folder now

####  Tune Hyperparameters

In [None]:
# Tune the winning model on the training split and keep the best estimator
# exposed by the returned search object.
# NOTE(review): best_model is the (label, {'pipeline', 'rmse'}) tuple selected
# earlier in the notebook -- confirm that tune_hyperparameters expects this
# tuple rather than the fitted pipeline itself.
search = tune_hyperparameters(best_model, X_train, y_train)
pipeline_best_model = search.best_estimator_

#### Submit predictions using the best model on Kaggle

In [None]:
# Unpack the selected (label, result-dict) pair.
best_name, best_result = best_model

# Untuned pipeline of the winning model; not used for the submission below,
# which relies on the tuned pipeline_best_model.
best_pipeline = best_result['pipeline']

# Build the Kaggle submission with the tuned pipeline; the second returned
# value is discarded.
submission, _ = test_model_kaggle(pipeline_best_model, X_test_kaggle, best_name)