In [1]:
from sklearn.dummy import DummyRegressor
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Base directory of the processed Bike Sharing CSVs. Kept in one place so the
# notebook can be pointed at another machine/location by editing a single line.
DATA_DIR = "D:/VS Code Projects/Datasets/Bike Sharing/data_processed"

# Feature-engineered train/validation splits at daily and hourly granularity.
train_day_df = pd.read_csv(f"{DATA_DIR}/feature_engineered_train_day.csv")
valid_day_df = pd.read_csv(f"{DATA_DIR}/feature_engineered_valid_day.csv")
train_hour_df = pd.read_csv(f"{DATA_DIR}/feature_engineered_train_hour.csv")
valid_hour_df = pd.read_csv(f"{DATA_DIR}/feature_engineered_valid_hour.csv")

In [3]:
# Parse 'date' as datetime64 in every split and discard the stray 'Unnamed: 0'
# column that a CSV round-trip can leave behind (ignored when it is absent).
# Unparseable dates become NaT because of errors='coerce'.
datasets_list = [
    train_day_df,
    train_hour_df,
    valid_day_df,
    valid_hour_df,
]
for frame in datasets_list:
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
    frame.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

print('ISSUE FIXED')
train_hour_df.dtypes

ISSUE FIXED


date                       datetime64[ns]
month                               int64
hour                                int64
weather_situation                   int64
feels_like_temp_norm              float64
temp_feel_diff                    float64
humidity_norm                     float64
temp_x_humidity                   float64
temp_x_wind_speed                 float64
casual                              int64
registered                          int64
hour_category_afternoon             int64
hour_category_evening               int64
hour_category_night                 int64
num_rentals                         int64
dtype: object

## Persistence (Last-Value) Baseline Explanation

The persistence baseline is a simple time-series model that uses no explanatory features and has no parameters to learn. It predicts each value as the most recently observed target value:

y_pred(t) = y_true(t-1)

**Key points:**
1. Does not use X_train or any explanatory variables.
2. Does not require `.fit()` because there are no parameters to learn.
3. Exploits temporal continuity (demand today ≈ yesterday).
4. Provides a strong lower-bound benchmark for evaluating time-series forecasting models.

This baseline is simple, interpretable, and helps verify that any ML model provides meaningful improvement over a naive time-series guess.


In [4]:
# Last-value (persistence) regressor used as the baseline for the time-series data.
# The series must stay in chronological order (no shuffling) so that every
# prediction is built only from a value observed before it.
def persistence_baseline(y_train, y_valid):
    """Predict each validation step with the previously observed target value.

    The first prediction is the last value of ``y_train``; every later
    prediction is the true value of the preceding validation step, i.e.
    ``y_pred[t] = y_true[t - 1]``.

    Parameters
    ----------
    y_train : 1-D array-like (pandas Series, list, ndarray)
        Training targets in chronological order. Only the last value is used.
    y_valid : 1-D array-like (pandas Series, list, ndarray)
        Validation targets in chronological order.

    Returns
    -------
    numpy.ndarray
        Predictions with the same length as ``y_valid``.

    Raises
    ------
    IndexError
        If ``y_train`` is empty, since there is no last value to start from.
    """
    train_values = np.asarray(y_train)
    valid_values = np.asarray(y_valid)

    if train_values.size == 0:
        raise IndexError("y_train must contain at least one observation")
    if valid_values.size == 0:
        # Nothing to predict; without this guard the shift below would
        # wrongly emit one prediction.
        return np.array([])

    # Shift the validation targets one step forward and seed the first slot
    # with the last training value.
    return np.concatenate((train_values[-1:], valid_values[:-1]))


## Define Target and features

In [5]:
# Persistence predictions for the daily series
y_pred_day = persistence_baseline(
    y_train=train_day_df["num_rentals"],
    y_valid=valid_day_df["num_rentals"],
)

# Persistence predictions for the hourly series
y_pred_hour = persistence_baseline(
    y_train=train_hour_df['num_rentals'],
    y_valid=valid_hour_df['num_rentals'],
)



## Prediction Metrics

In [15]:
# Metric report shared by the daily and hourly baselines
def evaluate(y_true, y_pred, label):
    """Print the RMSE and R² of ``y_pred`` against ``y_true``.

    ``label`` is prepended to the printed line to identify the evaluated
    series. Nothing is returned; the metrics are only printed.
    """
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print(f"{label} → RMSE: {rmse:.2f}, R²: {r2:.3f}")

In [16]:
evaluate(
    y_true=valid_day_df["num_rentals"],
    y_pred=y_pred_day,
    label='Daily persistence',
)


Daily persistence → RMSE: 1198.10, R²: 0.512


In [17]:
evaluate(
    y_true=valid_hour_df["num_rentals"],
    y_pred=y_pred_hour,
    label='hourly persistence',
)

hourly persistence → RMSE: 122.13, R²: 0.665


The share of variance explained (R² ≈ 0.51 for the daily data and ≈ 0.67 for the hourly data) is not bad for a naive model.

However, the RMSE is quite high, indicating poor predictive accuracy. The purpose of this baseline is to set the bar that our ML model must beat by a clear margin; if it does not, we may need to adjust its hyperparameters or consider a different model.
