In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV




In [5]:
# 1. Loading the data
FILE_PATH = "data/dataset/day.csv"
df_ = pd.read_csv(FILE_PATH, parse_dates=['dteday'])
df_ = df_.sort_values(by='dteday').reset_index(drop=True)

# split dataset
df_last30 = df_.tail(30)
df = df_.iloc[:-30, :]

In [20]:

# Python 3.x
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV

# -----------------------------
# 1) Load and sort your data
# -----------------------------
FILE_PATH = "data/dataset/day.csv"
df_ = pd.read_csv(FILE_PATH, parse_dates=['dteday'])
df_ = df_.sort_values(by='dteday').reset_index(drop=True)

# Split dataset: last 30 days for future prediction
df_last30 = df_.tail(30)
df = df_.iloc[:-30, :]

target_col = 'cnt'
time_col = 'dteday'

# Drop unnecessary columns
cols_to_drop = ['casual', 'registered']
df = df.drop(columns=cols_to_drop)

# -----------------------------
# 2) Feature Engineering
# -----------------------------
def add_prophet_features(df, time_col='dteday'):
    df['time_index'] = np.arange(len(df))
    df['dayofyear'] = df[time_col].dt.dayofyear
    df['hour'] = 0  # daily data, so hour = 0
    df['dayofweek'] = df[time_col].dt.dayofweek
    df['month'] = df[time_col].dt.month
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    # Fourier terms for seasonality
    for k in range(1, 5):  # daily harmonics
        df[f'daily_sin_{k}'] = np.sin(2 * np.pi * k * df['hour'] / 24)
        df[f'daily_cos_{k}'] = np.cos(2 * np.pi * k * df['hour'] / 24)

    for k in range(1, 3):  # weekly harmonics
        df[f'weekly_sin_{k}'] = np.sin(2 * np.pi * k * df['dayofweek'] / 7)
        df[f'weekly_cos_{k}'] = np.cos(2 * np.pi * k * df['dayofweek'] / 7)

    for k in range(1, 3):  # yearly harmonics
        df[f'yearly_sin_{k}'] = np.sin(2 * np.pi * k * df['dayofyear'] / 365)
        df[f'yearly_cos_{k}'] = np.cos(2 * np.pi * k * df['dayofyear'] / 365)

    return df

def add_lags_and_rolls(df, lags=(1, 7, 14), roll_windows=(3, 7, 14)):
    for L in lags:
        df[f'lag_{L}'] = df[target_col].shift(L)
    for w in roll_windows:
        df[f'roll_mean_{w}'] = df[target_col].shift(1).rolling(w).mean()
        df[f'roll_std_{w}'] = df[target_col].shift(1).rolling(w).std()
    return df

# Apply feature engineering
df = add_prophet_features(df.copy())
df = add_lags_and_rolls(df)

# Interaction features
df['temp_x_is_weekend'] = df['temp'] * df['is_weekend']
df['atemp_x_is_weekend'] = df['atemp'] * df['is_weekend']

# -----------------------------
# 3) Handle NaNs for lag/rolling
# -----------------------------
lag_roll_cols = [col for col in df.columns if col.startswith('lag_') or col.startswith('roll_')]
df[lag_roll_cols] = df[lag_roll_cols].fillna(method='bfill').fillna(0)

# -----------------------------
# 4) Dynamic categorical detection + One-hot encoding
# -----------------------------
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
for col in df.columns:
    if df[col].nunique() < 20 and df[col].dtype in ['int64', 'float64']:
        categorical_cols.append(col)
categorical_cols = list(set(categorical_cols))

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# -----------------------------
# 5) Remove unwanted columns
# -----------------------------
cols_to_drop = [col for col in df.columns if col.startswith('roll_std_')]
df = df.drop(columns=cols_to_drop)

# -----------------------------
# 6) Prepare features
# -----------------------------
y = df[target_col].values
X = df.drop(columns=[target_col, time_col]).values

# -----------------------------
# 7) Hyperparameter tuning
# -----------------------------
param_grid = {
    'n_estimators': [300, 500, 800, 1000],
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.7]
}

tscv = TimeSeriesSplit(n_splits=5)
rf = RandomForestRegressor(random_state=42)

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=tscv,
    n_jobs=-1,
    random_state=42
)

print("Tuning hyperparameters...")
search.fit(X, y)
best_rf = search.best_estimator_
print("Best parameters:", search.best_params_)

# -----------------------------
# 8) Evaluate with TimeSeriesSplit
# -----------------------------
oof_preds = np.zeros(len(df))
fold_rmses = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    best_rf.fit(X_train, y_train)
    preds = best_rf.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, preds))
    fold_rmses.append(rmse)
    oof_preds[val_idx] = preds
    print(f"Fold {fold}: RMSE = {rmse:.3f}")

overall_rmse = np.sqrt(mean_squared_error(y, oof_preds))
print(f"Overall OOF RMSE = {overall_rmse:.3f}")
print("Per-fold RMSEs:", np.round(fold_rmses, 3))

# -----------------------------
# 9) Final training on full data
# -----------------------------
best_rf.fit(X, y)
print("Final model trained on full data.")


  df[lag_roll_cols] = df[lag_roll_cols].fillna(method='bfill').fillna(0)


Tuning hyperparameters...
Best parameters: {'n_estimators': 300, 'min_samples_leaf': 1, 'max_features': 0.7, 'max_depth': 30}
Fold 1: RMSE = 723.916
Fold 2: RMSE = 1102.279
Fold 3: RMSE = 1489.192
Fold 4: RMSE = 1136.414
Fold 5: RMSE = 1026.759
Overall OOF RMSE = 1389.668
Per-fold RMSEs: [ 723.916 1102.279 1489.192 1136.414 1026.759]
Final model trained on full data.


In [14]:
pd.set_option('display.max_columns', 500)
df.head()

Unnamed: 0,instant,dteday,temp,atemp,hum,windspeed,casual,registered,cnt,lag_1,lag_24,lag_168,roll_mean_3,roll_std_3,roll_mean_6,roll_std_6,roll_mean_24,roll_std_24,roll_mean_168,roll_std_168,temp_x_is_weekend,atemp_x_is_weekend,holiday_1,weathersit_2,weathersit_3,workingday_1,season_2,season_3,season_4,sin_dow_-0.7818314824680299,sin_dow_-0.433883739117558,sin_dow_0.0,sin_dow_0.43388373911755823,sin_dow_0.7818314824680298,sin_dow_0.9749279121818236,yr_1,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,mnth_7,mnth_8,mnth_9,mnth_10,mnth_11,mnth_12,cos_dow_-0.900968867902419,cos_dow_-0.2225209339563146,cos_dow_-0.22252093395631434,cos_dow_0.6234898018587334,cos_dow_0.6234898018587336,cos_dow_1.0,is_weekend_1
0,1,2011-01-01,0.344167,0.363625,0.805833,0.160446,331,654,985,985.0,985.0,985.0,1045.0,278.883488,1317.166667,346.738759,1266.875,315.075465,2744.363095,1406.355882,0.344167,0.363625,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True
1,2,2011-01-02,0.363478,0.353739,0.696087,0.248539,131,670,801,985.0,985.0,985.0,1045.0,278.883488,1317.166667,346.738759,1266.875,315.075465,2744.363095,1406.355882,0.0,0.0,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,3,2011-01-03,0.196364,0.189405,0.437273,0.248309,120,1229,1349,801.0,985.0,985.0,1045.0,278.883488,1317.166667,346.738759,1266.875,315.075465,2744.363095,1406.355882,0.0,0.0,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
3,4,2011-01-04,0.2,0.212122,0.590435,0.160296,108,1454,1562,1349.0,985.0,985.0,1045.0,278.883488,1317.166667,346.738759,1266.875,315.075465,2744.363095,1406.355882,0.0,0.0,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
4,5,2011-01-05,0.226957,0.22927,0.436957,0.1869,82,1518,1600,1562.0,985.0,985.0,1237.333333,392.596909,1317.166667,346.738759,1266.875,315.075465,2744.363095,1406.355882,0.0,0.0,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False


In [21]:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

# -----------------------------
# 1) Load and sort your data
# -----------------------------
FILE_PATH = "data/dataset/day.csv"
df_ = pd.read_csv(FILE_PATH, parse_dates=['dteday'])
df_ = df_.sort_values(by='dteday').reset_index(drop=True)

# Split dataset: last 30 days for future prediction
df_last30 = df_.tail(30)
df = df_.iloc[:-30, :]

target_col = 'cnt'
time_col = 'dteday'

# Drop unnecessary columns
df = df.drop(columns=['casual', 'registered'])

# -----------------------------
# 2) Feature Engineering
# -----------------------------
def add_prophet_features(df, time_col='dteday'):
    df['time_index'] = np.arange(len(df))
    df['dayofyear'] = df[time_col].dt.dayofyear
    df['hour'] = 0  # daily data
    df['dayofweek'] = df[time_col].dt.dayofweek
    df['month'] = df[time_col].dt.month
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    # Fourier terms for seasonality
    for k in range(1, 5):
        df[f'daily_sin_{k}'] = np.sin(2 * np.pi * k * df['hour'] / 24)
        df[f'daily_cos_{k}'] = np.cos(2 * np.pi * k * df['hour'] / 24)

    for k in range(1, 3):
        df[f'weekly_sin_{k}'] = np.sin(2 * np.pi * k * df['dayofweek'] / 7)
        df[f'weekly_cos_{k}'] = np.cos(2 * np.pi * k * df['dayofweek'] / 7)

    for k in range(1, 3):
        df[f'yearly_sin_{k}'] = np.sin(2 * np.pi * k * df['dayofyear'] / 365)
        df[f'yearly_cos_{k}'] = np.cos(2 * np.pi * k * df['dayofyear'] / 365)

    return df

def add_lags_and_rolls(df, lags=(1, 7, 14), roll_windows=(3, 7, 14)):
    for L in lags:
        df[f'lag_{L}'] = df[target_col].shift(L)
    for w in roll_windows:
        df[f'roll_mean_{w}'] = df[target_col].shift(1).rolling(w).mean()
        df[f'roll_std_{w}'] = df[target_col].shift(1).rolling(w).std()
    return df

df = add_prophet_features(df.copy())
df = add_lags_and_rolls(df)

# Interaction features
df['temp_x_is_weekend'] = df['temp'] * df['is_weekend']
df['atemp_x_is_weekend'] = df['atemp'] * df['is_weekend']

# Handle NaNs
lag_roll_cols = [col for col in df.columns if col.startswith('lag_') or col.startswith('roll_')]
df[lag_roll_cols] = df[lag_roll_cols].fillna(method='bfill').fillna(0)

# Dynamic categorical detection + One-hot encoding
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
for col in df.columns:
    if df[col].nunique() < 20 and df[col].dtype in ['int64', 'float64']:
        categorical_cols.append(col)
categorical_cols = list(set(categorical_cols))
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Remove unwanted columns
cols_to_drop = [col for col in df.columns if col.startswith('roll_std_')]
df = df.drop(columns=cols_to_drop)

# Prepare features
y = df[target_col].values
X = df.drop(columns=[target_col, time_col]).values

# -----------------------------
# 3) Hyperparameter tuning
# -----------------------------
param_grid = {
    'n_estimators': [300, 500, 800, 1000],
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.7]
}

rf = RandomForestRegressor(random_state=42)
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=3,  # quick internal CV for tuning
    n_jobs=-1,
    random_state=42
)

print("Tuning hyperparameters...")
search.fit(X, y)
best_rf = search.best_estimator_
print("Best parameters:", search.best_params_)

# -----------------------------
# 4) Expanding Window Cross-Validation
# -----------------------------
def expanding_window_split(n_samples, n_splits=5, min_train_size=30):
    """Generate expanding window train/validation indices."""
    split_size = (n_samples - min_train_size) // n_splits
    for i in range(n_splits):
        train_end = min_train_size + i * split_size
        val_end = train_end + split_size
        yield np.arange(train_end), np.arange(train_end, val_end)

oof_preds = np.zeros(len(df))
fold_rmses = []

for fold, (train_idx, val_idx) in enumerate(expanding_window_split(len(df), n_splits=5), 1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    best_rf.fit(X_train, y_train)
    preds = best_rf.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, preds))
    fold_rmses.append(rmse)
    oof_preds[val_idx] = preds
    print(f"Fold {fold}: RMSE = {rmse:.3f}")

overall_rmse = np.sqrt(mean_squared_error(y, oof_preds))
print(f"Overall OOF RMSE = {overall_rmse:.3f}")
print("Per-fold RMSEs:", np.round(fold_rmses, 3))

# -----------------------------
# 5) Final training on full data
# -----------------------------
best_rf.fit(X, y)
print("Final model trained on full data.")


  df[lag_roll_cols] = df[lag_roll_cols].fillna(method='bfill').fillna(0)


Tuning hyperparameters...
Best parameters: {'n_estimators': 300, 'min_samples_leaf': 1, 'max_features': 0.7, 'max_depth': 30}
Fold 1: RMSE = 2355.023
Fold 2: RMSE = 801.114
Fold 3: RMSE = 942.731
Fold 4: RMSE = 1816.474
Fold 5: RMSE = 1012.321
Overall OOF RMSE = 1512.496
Per-fold RMSEs: [2355.023  801.114  942.731 1816.474 1012.321]
Final model trained on full data.


In [None]:
FEATURE_COLS = [
    # 'dteday',
    'season',
    # 'yr',
    'mnth',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
    'temp',
    'atemp',
    'hum',
    'windspeed',
    # 'casual',
    # 'registered',
    # 'bikes_cnt',
    # 'day'
    ]

: 

In [None]:
# 1. Loading the data

df = pd.read_csv("data/dataset/day.csv", parse_dates=['dteday'])
df = df.sort_values(by='dteday').reset_index(drop=True)

print(f"Data loaded with {len(df)} rows and {len(df.columns)} columns.")
print("\n", df.info())


Data loaded with 731 rows and 16 columns.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     731 non-null    int64         
 1   dteday      731 non-null    datetime64[ns]
 2   season      731 non-null    int64         
 3   yr          731 non-null    int64         
 4   mnth        731 non-null    int64         
 5   holiday     731 non-null    int64         
 6   weekday     731 non-null    int64         
 7   workingday  731 non-null    int64         
 8   weathersit  731 non-null    int64         
 9   temp        731 non-null    float64       
 10  atemp       731 non-null    float64       
 11  hum         731 non-null    float64       
 12  windspeed   731 non-null    float64       
 13  casual      731 non-null    int64         
 14  registered  731 non-null    int64         
 15  cnt         731 non-null    int6

: 

In [22]:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

# -----------------------------
# 1) Load and sort data
# -----------------------------
FILE_PATH = "data/dataset/day.csv"
df_ = pd.read_csv(FILE_PATH, parse_dates=['dteday'])
df_ = df_.sort_values(by='dteday').reset_index(drop=True)

# Define target and time columns
target_col = 'cnt'
time_col = 'dteday'

# -----------------------------
# 2) Split for hold-out (last 30 rows)
# -----------------------------
df_holdout_raw = df_.tail(30).copy()     # untouched copy for later comparison if needed
df_train_raw   = df_.iloc[:-30, :].copy()

# -----------------------------
# 3) Feature Engineering functions
# -----------------------------
def add_prophet_features(df, time_col='dteday'):
    df['time_index'] = np.arange(len(df))
    df['dayofyear'] = df[time_col].dt.dayofyear
    df['hour'] = 0  # daily data
    df['dayofweek'] = df[time_col].dt.dayofweek
    df['month'] = df[time_col].dt.month
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    # Fourier terms for seasonality
    for k in range(1, 5):
        df[f'daily_sin_{k}'] = np.sin(2 * np.pi * k * df['hour'] / 24)
        df[f'daily_cos_{k}'] = np.cos(2 * np.pi * k * df['hour'] / 24)
    for k in range(1, 3):
        df[f'weekly_sin_{k}'] = np.sin(2 * np.pi * k * df['dayofweek'] / 7)
        df[f'weekly_cos_{k}'] = np.cos(2 * np.pi * k * df['dayofweek'] / 7)
    for k in range(1, 3):
        df[f'yearly_sin_{k}'] = np.sin(2 * np.pi * k * df['dayofyear'] / 365)
        df[f'yearly_cos_{k}'] = np.cos(2 * np.pi * k * df['dayofyear'] / 365)
    return df

def add_lags_and_rolls(df, lags=(1, 7, 14), roll_windows=(3, 7, 14), target_col='cnt'):
    # IMPORTANT: build lags/rolls on the FULL timeline before splitting,
    # so the hold-out rows can use past values from train.
    for L in lags:
        df[f'lag_{L}'] = df[target_col].shift(L)
    for w in roll_windows:
        df[f'roll_mean_{w}'] = df[target_col].shift(1).rolling(w).mean()
        df[f'roll_std_{w}'] = df[target_col].shift(1).rolling(w).std()
    return df

def add_interactions(df):
    if 'temp' in df.columns and 'is_weekend' in df.columns:
        df['temp_x_is_weekend']  = df['temp']  * df['is_weekend']
    if 'atemp' in df.columns and 'is_weekend' in df.columns:
        df['atemp_x_is_weekend'] = df['atemp'] * df['is_weekend']
    return df

def fill_lag_roll_nans(df):
    lag_roll_cols = [c for c in df.columns if c.startswith('lag_') or c.startswith('roll_')]
    # Backward fill then zero for any remaining NaNs
    df[lag_roll_cols] = df[lag_roll_cols].bfill().fillna(0)
    return df

def detect_categoricals(df):
    cats = df.select_dtypes(include=['object', 'category']).columns.tolist()
    for col in df.columns:
        if df[col].nunique() < 20 and df[col].dtype in ['int64', 'float64']:
            cats.append(col)
    return list(set(cats))

def one_hot_fit_transform(df_train, df_holdout, categorical_cols, drop_first=True):
    """Fit categories on train, transform both, and align columns."""
    train_enc = pd.get_dummies(df_train, columns=categorical_cols, drop_first=drop_first)
    hold_enc  = pd.get_dummies(df_holdout, columns=categorical_cols, drop_first=drop_first)
    # Align hold-out columns to train columns (add missing with 0, drop extras)
    hold_enc = hold_enc.reindex(columns=train_enc.columns, fill_value=0)
    return train_enc, hold_enc

# -----------------------------
# 4) Build features on the full df_ (to avoid leakage for hold-out lags)
# -----------------------------
df_feat = df_.copy()
# Drop columns not used (optional)
for col in ['casual', 'registered']:
    if col in df_feat.columns:
        df_feat = df_feat.drop(columns=[col])

df_feat = add_prophet_features(df_feat, time_col=time_col)
df_feat = add_lags_and_rolls(df_feat, target_col=target_col)
df_feat = add_interactions(df_feat)
df_feat = fill_lag_roll_nans(df_feat)

# Split features into train/hold-out after feature engineering
df_train = df_feat.iloc[:-30, :].copy()
df_holdout = df_feat.tail(30).copy()

# Remove noisy columns (optional)
cols_to_drop = [c for c in df_train.columns if c.startswith('roll_std_')]
df_train = df_train.drop(columns=cols_to_drop)
df_holdout = df_holdout.drop(columns=[c for c in cols_to_drop if c in df_holdout.columns])

# -----------------------------
# 5) One-hot encoding (fit on train, align hold-out)
# -----------------------------
categorical_cols = detect_categoricals(df_train)
df_train_enc, df_holdout_enc = one_hot_fit_transform(df_train, df_holdout, categorical_cols, drop_first=True)

# -----------------------------
# 6) Prepare X/y for train and hold-out
# -----------------------------
feature_cols = [c for c in df_train_enc.columns if c not in [target_col, time_col]]
X_train = df_train_enc[feature_cols].values
y_train = df_train_enc[target_col].values

X_hold  = df_holdout_enc[feature_cols].values
y_hold  = df_holdout_enc[target_col].values

# -----------------------------
# 7) Hyperparameter tuning on TRAIN ONLY
# -----------------------------
param_grid = {
    'n_estimators': [300, 500, 800, 1000],
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.7]
}

rf = RandomForestRegressor(random_state=42)
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=3,       # quick internal CV on training data
    n_jobs=-1,
    random_state=42
)

print("Tuning hyperparameters on training data...")
search.fit(X_train, y_train)
best_rf = search.best_estimator_
print("Best parameters:", search.best_params_)

# -----------------------------
# 8) Fit on training and evaluate on HOLD-OUT (last 30 days)
# -----------------------------
best_rf.fit(X_train, y_train)
pred_hold = best_rf.predict(X_hold)

rmse_hold = np.sqrt(mean_squared_error(y_hold, pred_hold))
print(f"Hold-out (last 30 days) RMSE = {rmse_hold:.3f}")

# Optional: attach predictions to the hold-out dataframe for inspection
df_eval = df_holdout_raw.copy()
df_eval['y_true'] = y_hold
df_eval['y_pred'] = pred_hold
print(df_eval[['dteday', 'y_true', 'y_pred']].head())


Tuning hyperparameters on training data...
Best parameters: {'n_estimators': 300, 'min_samples_leaf': 1, 'max_features': 0.7, 'max_depth': 30}
Hold-out (last 30 days) RMSE = 1094.751
        dteday  y_true       y_pred
701 2012-12-02    4649  4117.143333
702 2012-12-03    6234  5454.686667
703 2012-12-04    6606  5704.856667
704 2012-12-05    5729  5840.530000
705 2012-12-06    5375  4322.830000


In [None]:
# 2. Feature engineering

# Lag Feature Engineering (CRITICAL FOR TIME SERIES)
LAG_PERIOD = 1 
df['lag_demand_1h'] = df['cnt'].shift(LAG_PERIOD)

# Fill NaN created by shifting (The first few rows will be missing the lag value)
# Using 0 for simplicity, or you could drop these rows later if preferred
df['lag_demand_1h'] = df['lag_demand_1h'].fillna(0) 

print(f"Added lag feature: 'lag_demand_1h'")

Added lag feature: 'lag_demand_1h'


: 

In [None]:
# CONFIG

FILE_PATH = "data/dataset/day.csv" 
# NOTE: Set the date to split your data (e.g., use all data before '2012-01-01' for training)
TEST_SPLIT_DATE = '2012-10-31' 

param_dist = {
    'regressor__n_estimators': [100, 200, 300, 400], # Number of trees
    'regressor__max_depth': [10, 20, 30, None],      # Max depth of the trees (None means nodes are expanded until all leaves are pure)
    'regressor__min_samples_split': [2, 5, 10],      # Minimum number of samples required to split an internal node
    'regressor__min_samples_leaf': [1, 2, 4],        # Minimum number of samples required to be at a leaf node
    }

N_ITER_SEARCH = 15


TARGET_COL = 'cnt'

: 

: 

In [None]:
NUMERICAL_FEATURES = []
CATEGORICAL_FEATURES = []

for col in FEATURE_COLS:
    col_dtype = df[col].dtype
    num_unique = df[col].nunique()
    
    # Rule 1: If dtype is float (e.g., 'temp'), it's numerical.
    if np.issubdtype(col_dtype, np.number) and 'float' in str(col_dtype):
        NUMERICAL_FEATURES.append(col)
    
    # Rule 2: If it's a number (int) but has a low number of unique values,
    # treat it as categorical for the OneHotEncoder.
    elif np.issubdtype(col_dtype, np.number) and num_unique <= 50:
        CATEGORICAL_FEATURES.append(col)
        
    # Rule 3: If it's a high-cardinality integer or a true continuous variable.
    elif np.issubdtype(col_dtype, np.number):
            NUMERICAL_FEATURES.append(col)
    
    # Rule 4: If it's an object/string (shouldn't happen much here)
    elif col_dtype == 'object':
        CATEGORICAL_FEATURES.append(col)
        
print("\n--- Identified Feature Types ---")
print(f"Numerical Features: {NUMERICAL_FEATURES}")
print(f"Categorical Features: {CATEGORICAL_FEATURES}")


--- Identified Feature Types ---
Numerical Features: ['temp', 'atemp', 'hum', 'windspeed']
Categorical Features: ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']


: 

In [None]:
X = df[FEATURE_COLS]
y = df[TARGET_COL]

: 

In [None]:
# 2a. Create pre-processing pipeline

# Define steps for categorical features (One-Hot Encoding)
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        # 'passthrough' fixes the error encountered previously and works well 
        # for numerical features when using Random Forest (no scaling needed)
        ('num', 'passthrough', NUMERICAL_FEATURES), 
        ('cat', categorical_transformer, CATEGORICAL_FEATURES)
    ],
    remainder='drop' # Drop any columns not explicitly listed in FEATURE_COLS
)

: 

In [None]:
n_splits = 5
n_iter_search = 10
MIN_TEST_SAMPLES = None


tscv = TimeSeriesSplit(
        n_splits=n_splits, 
        max_train_size=None, 
        test_size= MIN_TEST_SAMPLES
    )
    
cv_metrics = []

print("\n--- Starting Expanding Window Cross-Validation with Tuning ---")

param_dist = {
        'regressor__n_estimators': [100, 200, 300, 400],
        'regressor__max_depth': [10, 20, 30, None],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4],
    }

# Inner split for tuning (used inside RandomizedSearchCV)
inner_cv = TimeSeriesSplit(n_splits=3,
                           test_size=MIN_TEST_SAMPLES)

for fold, (train_index, test_index) in enumerate(tscv.split(X)):
     
    # Prepare Data for Current Fold
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    print(f"\n[Fold {fold + 1}/{n_splits}] Training size: {len(X_train_fold)}, Testing size: {len(X_test_fold)}")

    # Define the Full ML Pipeline
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
    ])
    
    # Perform Randomized Search on the CURRENT Training Data
    random_search = RandomizedSearchCV(
        full_pipeline, 
        param_distributions=param_dist, 
        n_iter=n_iter_search, 
        # We use NEGATIVE MSE because RandomizedSearchCV maximizes the scoring function (minimize RMSE/MAE)
        scoring='neg_root_mean_squared_error', 
        cv=inner_cv, 
        random_state=42,
        n_jobs=-1,
        verbose=0
    )
    
    print(f"  Tuning Random Forest on current training window (n_iter={n_iter_search})...")
    random_search.fit(X_train_fold, y_train_fold)
    
    # Use the Best Model found to predict on the outer test fold
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test_fold)
    
    # Evaluate Metrics for this Fold
    rmse = np.sqrt(mean_squared_error(y_test_fold, y_pred))
    mae = mean_absolute_error(y_test_fold, y_pred)
    mse_rf = mean_squared_error(y_test_fold, y_pred)
    r2_rf = r2_score(y_test_fold, y_pred)
    
    
    cv_metrics.append({'RMSE': rmse,
                       'MAE': mae, 
                       'MSE': mse_rf,
                       'r2':r2_rf,
                       'Best_Params': random_search.best_params_})
    
    print(f"  Best Parameters: {random_search.best_params_}")
    print(f"  Fold Metrics: RMSE={rmse:.2f}, MAE={mae:.2f}")

# Calculate and Report Averages
avg_rmse = np.mean([m['RMSE'] for m in cv_metrics])
avg_mae = np.mean([m['MAE'] for m in cv_metrics])
avg_mse = np.mean([m['MSE'] for m in cv_metrics])
avg_r2 = np.mean([m['r2'] for m in cv_metrics])

print("\n--- Cross-Validation Summary ---")
print(f"Average RMSE over {n_splits} folds: {avg_rmse:.2f}")
print(f"Average MAE over {n_splits} folds: {avg_mae:.2f}")
print(f"Average MSE over {n_splits} folds: {avg_mse:.2f}")
print(f"Average r2 over {n_splits} folds: {avg_r2:.2f}")

print('avg_RMSE', avg_rmse, 'avg_MAE', avg_mae, 'avg_r2', avg_r2, 'avg_mse', avg_mse, 'individual_folds', cv_metrics)


--- Starting Expanding Window Cross-Validation with Tuning ---

[Fold 1/5] Training size: 126, Testing size: 121
  Tuning Random Forest on current training window (n_iter=10)...
  Best Parameters: {'regressor__n_estimators': 100, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 2, 'regressor__max_depth': 10}
  Fold Metrics: RMSE=937.16, MAE=789.00

[Fold 2/5] Training size: 247, Testing size: 121
  Tuning Random Forest on current training window (n_iter=10)...
  Best Parameters: {'regressor__n_estimators': 100, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 2, 'regressor__max_depth': 10}
  Fold Metrics: RMSE=1125.95, MAE=976.87

[Fold 3/5] Training size: 368, Testing size: 121
  Tuning Random Forest on current training window (n_iter=10)...
  Best Parameters: {'regressor__n_estimators': 100, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 2, 'regressor__max_depth': 10}
  Fold Metrics: RMSE=2081.79, MAE=1930.06

[Fold 4/5] Training 

: 

In [None]:
pd.DataFrame(cv_metrics)

Unnamed: 0,RMSE,MAE,MSE,r2,Best_Params
0,1556.949522,1480.772912,2424092.0,-4.38901,"{'regressor__n_estimators': 300, 'regressor__m..."
1,1774.552702,1594.677451,3149037.0,-2.472972,"{'regressor__n_estimators': 400, 'regressor__m..."
2,1128.167306,1039.023279,1272761.0,-2.379099,"{'regressor__n_estimators': 400, 'regressor__m..."
3,1089.491947,1013.736033,1186993.0,0.586857,"{'regressor__n_estimators': 400, 'regressor__m..."
4,855.118378,768.509039,731227.4,0.036933,"{'regressor__n_estimators': 400, 'regressor__m..."


: 

: 

: 

: 

: 

: 

: 

: 

: 

In [None]:
855/12

71.25

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

In [None]:
X_train_fold.head()


Unnamed: 0,season,weathersit,temp,atemp,hum,windspeed
0,1,2,0.344167,0.363625,0.805833,0.160446
1,1,2,0.363478,0.353739,0.696087,0.248539
2,1,1,0.196364,0.189405,0.437273,0.248309
3,1,1,0.2,0.212122,0.590435,0.160296
4,1,1,0.226957,0.22927,0.436957,0.1869


: 

: 

: 

: 

: 

: 

In [None]:
# Assuming these imports are already done
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Assume X_train, X_test, y_train, y_test are already created 
# from your time-based split or cross-validation fold

n_splits = 5
INITIAL_TRAIN_SIZE = 7 

# 1. Initialize TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit

# We define the inner CV's test size (168 samples = 7 days if data is hourly)
MIN_TEST_SAMPLES = 168 
tscv = TimeSeriesSplit(
    n_splits=n_splits, 
    max_train_size=None, 
    test_size=INITIAL_TRAIN_SIZE 
)

# 2. Get Indices for the First Fold (Fold 1/5)
# We use next() to get the first (train_index, test_index) pair from the generator
train_index, test_index = next(tscv.split(X))


# 1. Define the Categorical Transformer (One-Hot Encoding)
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# --- 3. Split the Data into Train and Test Sets for the First Fold ---

# Create X_train, X_test by slicing the original DataFrame using the indices
X_train = X.iloc[train_index]
X_test = X.iloc[test_index]

# Create y_train, y_test by slicing the original Series using the indices
y_train = y.iloc[train_index]
y_test = y.iloc[test_index]

# Print sizes to confirm the split
print(f"X_train size: {len(X_train)} samples")
print(f"X_test size: {len(X_test)} samples")
print(f"y_train size: {len(y_train)} samples")
print(f"y_test size: {len(y_test)} samples")

# 2. Combine transformers using ColumnTransformer
# Note: NUMERICAL_FEATURES and CATEGORICAL_FEATURES must be defined lists of column names
preprocessor = ColumnTransformer(
    transformers=[
        # Numerical features pass straight through
        ('num', 'passthrough', NUMERICAL_FEATURES), 
        # Categorical features go through the defined encoding pipeline
        ('cat', categorical_transformer, CATEGORICAL_FEATURES)
    ],
    remainder='drop' # Drop any columns not explicitly listed (like 'datetime')
)

# 3. Fit the preprocessor on the training data and transform it
# The output will be a NumPy array.
X_train_processed = preprocessor.fit_transform(X_train)

X_train size: 696 samples
X_test size: 7 samples
y_train size: 696 samples
y_test size: 7 samples


: 

: 

: 

: 

: 

: 

In [None]:
yy = pd.DataFrame(X_train_processed)
yy

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.344167,0.363625,0.805833,0.160446,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.363478,0.353739,0.696087,0.248539,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.196364,0.189405,0.437273,0.248309,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.200000,0.212122,0.590435,0.160296,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.226957,0.229270,0.436957,0.186900,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
691,0.340000,0.350371,0.580417,0.052871,0.0,0.0,0.0,1.0,1.0,0.0,0.0
692,0.368333,0.378779,0.568750,0.148021,0.0,0.0,0.0,1.0,1.0,0.0,0.0
693,0.278333,0.248742,0.404583,0.376871,0.0,0.0,0.0,1.0,1.0,0.0,0.0
694,0.245833,0.257583,0.468333,0.150500,0.0,0.0,0.0,1.0,1.0,0.0,0.0


: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

In [None]:
def get_processed_feature_names(column_transformer):
    """
    Retrieves the final feature names after all transformations 
    (passthrough and OneHotEncoding).
    """
    feature_names = []
    
    # Iterate through the transformers defined in the ColumnTransformer
    for name, transformer, features in column_transformer.transformers_:
        
        # Check if the transformer is 'passthrough' (numerical features)
        if transformer == 'passthrough':
            feature_names.extend(features)
        
        # Check if the transformer is the OneHotEncoder pipeline
        elif name == 'cat':
            # Use get_feature_names_out() from the OneHotEncoder step
            onehot_features = transformer.named_steps['onehot'].get_feature_names_out(features)
            feature_names.extend(onehot_features)
            
        # Note: We skip 'remainder' columns since we set remainder='drop'
            
    return feature_names


# --- Assuming X_train and the fitted preprocessor are available ---

# A. Transform the X_train data (using fit_transform, which you already did)
# NOTE: If your preprocessor is already fitted from a previous step, 
# you only need to use 'transform(X_train)' here.
X_train_processed = preprocessor.fit_transform(X_train) 

print(f"✅ Data transformed into a NumPy array with shape: {X_train_processed.shape}")


# B. Get the final, expanded column names
processed_column_names = get_processed_feature_names(preprocessor)

print(f"\nTotal Number of Features after encoding: {len(processed_column_names)}")


# C. Convert the array back to a DataFrame for inspection
X_processed_df = pd.DataFrame(
    X_train_processed, 
    columns=processed_column_names
)

# D. Inspect the final DataFrame structure
print("\n--- First 5 Rows of X_train_processed (as DataFrame) ---")
print(X_processed_df.head())

✅ Data transformed into a NumPy array with shape: (696, 11)

Total Number of Features after encoding: 7


ValueError: Shape of passed values is (696, 11), indices imply (696, 7)

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- (Assuming load_data, create_datetime_features, create_preprocessing_pipeline, 
# and the main execution block are defined as per previous steps) ---

# --- Corrected Expanding Window Cross-Validation Function ---

def run_expanding_window_cv(X, y, preprocessor_template, param_dist, n_splits=5, initial_train_size=None, n_iter_search=10):
    """
    Performs Expanding Window Cross-Validation with Hyperparameter Tuning 
    using Randomized Search.
    
    Args:
        X, y: Full feature/target data.
        preprocessor_template: The unfitted ColumnTransformer template.
        param_dist: The hyperparameter distribution for tuning.
        ...
    """
    
    # Configuration for TimeSeriesSplit
    MIN_TEST_SAMPLES = 168 
    
    tscv = TimeSeriesSplit(
        n_splits=n_splits, 
        max_train_size=None, 
        test_size=initial_train_size or MIN_TEST_SAMPLES 
    )
    
    cv_metrics = []
    inner_cv = TimeSeriesSplit(n_splits=3, test_size=MIN_TEST_SAMPLES)
    
    print("\n--- Starting Expanding Window Cross-Validation with Tuning ---")
    
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        
        # 1. Split Data for Current Fold
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        
        print(f"\n[Fold {fold + 1}/{n_splits}] Training size: {len(X_train_fold)}, Testing size: {len(X_test_fold)}")

        # 2. Define the Full ML Pipeline (Preprocessor + Estimator)
        # We use the unfitted template here. The RandomizedSearchCV will handle fitting 
        # the preprocessor on the inner CV folds.
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor_template), # Pass the TEMPLATE here
            ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # 3. Perform Randomized Search on the CURRENT Training Data
        random_search = RandomizedSearchCV(
            full_pipeline, 
            param_distributions=param_dist, 
            n_iter=n_iter_search, 
            scoring='neg_mean_squared_error',
            cv=inner_cv, 
            random_state=42,
            n_jobs=-1,
            verbose=0
        )
        
        print(f"  Tuning Random Forest on current training window (n_iter={n_iter_search})...")
        
        # --- CRITICAL STEP ---
        # search.fit() handles:
        # a) Splitting X_train_fold using inner_cv.
        # b) On each inner split, it fits the preprocessor and the regressor 
        #    on the inner train set, then evaluates on the inner test set.
        random_search.fit(X_train_fold, y_train_fold)
        
        # 4. Use the Best Model to predict on the outer test fold
        best_model = random_search.best_estimator_
        
        # --- CRITICAL STEP ---
        # best_model (a Pipeline) handles:
        # a) Transforming X_test_fold using the preprocessor learned during the final
        #    best fit on the entire X_train_fold.
        # b) Predicting with the regressor.
        y_pred = best_model.predict(X_test_fold)
        
        # 5. Evaluate Metrics for this Fold
        rmse = np.sqrt(mean_squared_error(y_test_fold, y_pred))
        mae = mean_absolute_error(y_test_fold, y_pred)
        
        cv_metrics.append({'RMSE': rmse, 'MAE': mae, 'Best_Params': random_search.best_params_})
        
        print(f"  Best Parameters: {random_search.best_params_}")
        print(f"  Fold Metrics: RMSE={rmse:.2f}, MAE={mae:.2f}")

    # ... (Averaging logic remains the same) ...
    avg_rmse = np.mean([m['RMSE'] for m in cv_metrics])
    avg_mae = np.mean([m['MAE'] for m in cv_metrics])
    
    print("\n--- Cross-Validation Summary ---")
    print(f"Average RMSE over {n_splits} folds: {avg_rmse:.2f}")
    print(f"Average MAE over {n_splits} folds: {avg_mae:.2f}")
    
    return {'avg_RMSE': avg_rmse, 'avg_MAE': avg_mae, 'individual_folds': cv_metrics}

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

In [None]:
# --- 1. Data Loading and Preparation ---

def load_data(file_path):
    """Loads the dataset, ensuring the datetime column is parsed correctly."""
    print(f"Loading data from: {file_path}")
    # Assuming 'datetime' is the time column and 'demand' is the target
    df = pd.read_csv(file_path, parse_dates=['dteday'])
    df = df.sort_values(by='dteday').reset_index(drop=True)
    
    # Simple check for target variable
    if 'cnt' not in df.columns:
        raise ValueError("Dataset must contain a 'demand' column.")
    
    print(f"Data loaded with {len(df)} rows and {len(df.columns)} columns.")
    return df



: 

: 

: 

: 

: 

: 

In [None]:
df.index

RangeIndex(start=0, stop=731, step=1)

: 

: 

: 

: 

: 

: 

Loading data from: data/dataset/day.csv
Data loaded with 731 rows and 16 columns.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     731 non-null    int64         
 1   dteday      731 non-null    datetime64[ns]
 2   season      731 non-null    int64         
 3   yr          731 non-null    int64         
 4   mnth        731 non-null    int64         
 5   holiday     731 non-null    int64         
 6   weekday     731 non-null    int64         
 7   workingday  731 non-null    int64         
 8   weathersit  731 non-null    int64         
 9   temp        731 non-null    float64       
 10  atemp       731 non-null    float64       
 11  hum         731 non-null    float64       
 12  windspeed   731 non-null    float64       
 13  casual      731 non-null    int64         
 14  registered  731 non-null    int64       

: 

: 

: 

: 

: 

: 

In [None]:
def create_preprocessing_pipeline(numerical_cols, categorical_cols):
    """
    Creates a column transformer for applying different transformations 
    to numerical and categorical features.
    """
    # Define steps for numerical features (e.g., scaling could be added here)
    numerical_transformer = Pipeline(steps=[
        # For Random Forest, often no scaling is needed, but we keep the structure.
        ('scaler', StandardScaler()) 
    ])
    
    # Define steps for categorical features (One-Hot Encoding)
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combine transformers using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ],
        remainder='passthrough' # Keep any other columns if needed
    )
    
    return preprocessor

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

In [None]:


# --- 3. Model Training and Evaluation ---

def train_and_evaluate_model(X_train, X_test, y_train, y_test, preprocessor):
    """
    Trains the Random Forest model and evaluates it using RMSE and MAE.
    """
    # 1. Define the full ML Pipeline (Preprocessor + Estimator)
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(
            n_estimators=100,      # Number of trees in the forest
            random_state=42,       # For reproducibility
            n_jobs=-1              # Use all available cores
        ))
    ])
    
    # 2. Train the model
    print("\n--- Training Random Forest Regressor ---")
    model_pipeline.fit(X_train, y_train)
    print("Training complete.")
    
    # 3. Predict on the test set
    y_pred = model_pipeline.predict(X_test)
    
    # 4. Evaluate Metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    
    # --- 4. Cross-Validation Block (New Function) ---



: 

: 

: 

: 

: 

: 

In [None]:
# --- 4. Cross-Validation Block (Updated Function) ---

def run_expanding_window_cv(X, y, preprocessor, param_dist, n_splits=5, initial_train_size=None, n_iter_search=10):
    """
    Performs Expanding Window Cross-Validation with Hyperparameter Tuning 
    using Randomized Search.
    
    Args:
        ... (existing args) ...
        param_dist (dict): The hyperparameter distribution for tuning.
        n_iter_search (int): Number of parameter settings that are sampled.
    """
    
    tscv = TimeSeriesSplit(
        n_splits=n_splits, 
        max_train_size=None, 
        test_size=initial_train_size
    )
    
    cv_metrics = []
    
    print("\n--- Starting Expanding Window Cross-Validation with Tuning ---")
    
    # Define a smaller, inner TimeSeriesSplit for tuning the current training fold
    inner_cv = TimeSeriesSplit(n_splits=3)
    
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        
        # 1. Prepare Data for Current Fold
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        
        print(f"\n[Fold {fold + 1}/{n_splits}] Training size: {len(X_train_fold)}, Testing size: {len(X_test_fold)}")

        # 2. Define the Full ML Pipeline (Preprocessor + Estimator)
        # We need this pipeline inside the tuning step
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1)) # Use base estimator here
        ])
        
        # 3. Perform Randomized Search on the CURRENT Training Data
        # We use NEGATIVE MSE because RandomizedSearchCV maximizes the scoring function (neg_mean_squared_error is minimized)
        random_search = RandomizedSearchCV(
            full_pipeline, 
            param_distributions=param_dist, 
            n_iter=n_iter_search, 
            scoring='neg_mean_squared_error',
            cv=inner_cv, # Use the inner TimeSeriesSplit for tuning
            random_state=42,
            n_jobs=-1,
            verbose=0
        )
        
        print(f"  Tuning Random Forest on current training window (n_iter={n_iter_search})...")
        random_search.fit(X_train_fold, y_train_fold)
        
        # 4. Use the Best Model found by the search to predict on the outer test fold
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X_test_fold)
        
        # 5. Evaluate Metrics for this Fold
        rmse = np.sqrt(mean_squared_error(y_test_fold, y_pred))
        mae = mean_absolute_error(y_test_fold, y_pred)
        
        cv_metrics.append({'RMSE': rmse, 'MAE': mae, 'Best_Params': random_search.best_params_})
        
        print(f"  Best Parameters: {random_search.best_params_}")
        print(f"  Fold Metrics: RMSE={rmse:.2f}, MAE={mae:.2f}")

    # 6. Calculate and Report Averages
    # ... (same averaging logic as before) ...
    avg_rmse = np.mean([m['RMSE'] for m in cv_metrics])
    avg_mae = np.mean([m['MAE'] for m in cv_metrics])
    
    print("\n--- Cross-Validation Summary ---")
    print(f"Average RMSE over {n_splits} folds: {avg_rmse:.2f}")
    print(f"Average MAE over {n_splits} folds: {avg_mae:.2f}")
    
    return {'avg_RMSE': avg_rmse, 'avg_MAE': avg_mae, 'individual_folds': cv_metrics}

: 

: 

: 

: 

: 

: 

In [None]:
def run_pipeline(file_path, test_split_date, FEATURE_COLS, TARGET_COL):
    """Main function to run the entire pipeline."""
    
    # 1. Load Data and Engineer Datetime Features
    data = load_data(file_path)
    
    

    data_with_features = data.copy()
    # Filter down to the columns we actually use
    X = data_with_features[FEATURE_COLS]
    y = data_with_features[TARGET_COL]
    
    # 3. Time-based Train-Test Split
    # The split should be based on time to simulate real-world prediction
    split_date = pd.to_datetime(test_split_date)
    
    X_train = X[data_with_features['dteday'] < split_date]
    X_test = X[data_with_features['dteday'] >= split_date]
    y_train = y[data_with_features['dteday'] < split_date]
    y_test = y[data_with_features['dteday'] >= split_date]
    
    print(f"\nTraining set size: {len(X_train)} samples")
    print(f"Testing set size: {len(X_test)} samples")
    
    # 4. Define Column Types for Preprocessing
    # This is crucial for applying OneHotEncoder correctly
    NUMERICAL_FEATURES = []
    CATEGORICAL_FEATURES = []
    
    for col in FEATURE_COLS:
        col_dtype = X_train[col].dtype
        num_unique = X_train[col].nunique()
        
        # Rule 1: If dtype is float (e.g., 'temp'), it's numerical.
        if np.issubdtype(col_dtype, np.number) and 'float' in str(col_dtype):
            NUMERICAL_FEATURES.append(col)
        
        # Rule 2: If it's a number (int) but has a low number of unique values,
        # treat it as categorical for the OneHotEncoder.
        elif np.issubdtype(col_dtype, np.number) and num_unique <= 50:
            CATEGORICAL_FEATURES.append(col)
            
        # Rule 3: If it's a high-cardinality integer or a true continuous variable.
        elif np.issubdtype(col_dtype, np.number):
             NUMERICAL_FEATURES.append(col)
        
        # Rule 4: If it's an object/string (shouldn't happen much here)
        elif col_dtype == 'object':
            CATEGORICAL_FEATURES.append(col)
            
    print("\n--- Identified Feature Types ---")
    print(f"Numerical Features: {NUMERICAL_FEATURES}")
    print(f"Categorical Features: {CATEGORICAL_FEATURES}")
    
    # 4. Create Preprocessing Pipeline
    preprocessor = create_preprocessing_pipeline(NUMERICAL_FEATURES, CATEGORICAL_FEATURES)
    
    # 5. Run Expanding Window Cross-Validation!
    # Using 5 splits (5 separate training/testing rounds)
    # Adjust initial_train_size if you want a different starting window (e.g., 6 months of data)
    final_metrics = run_expanding_window_cv(
        X, y, preprocessor, 
        n_splits=5, 
        initial_train_size=180
        # Example: Start with 20% of data for the first training set
    )
    
    return final_metrics # Only returns the metrics now
    
    # # 5. Create Preprocessing Pipeline
    # preprocessor = create_preprocessing_pipeline(NUMERICAL_FEATURES, CATEGORICAL_FEATURES)
    
    # # 6. Train and Evaluate
    # model, metrics = train_and_evaluate_model(X_train, X_test, y_train, y_test, preprocessor)
    
    # return model, metrics



: 

: 

: 

: 

: 

: 

### Execution

In [None]:
df.info()
df.tail(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     731 non-null    int64         
 1   dteday      731 non-null    datetime64[ns]
 2   season      731 non-null    int64         
 3   yr          731 non-null    int64         
 4   mnth        731 non-null    int64         
 5   holiday     731 non-null    int64         
 6   weekday     731 non-null    int64         
 7   workingday  731 non-null    int64         
 8   weathersit  731 non-null    int64         
 9   temp        731 non-null    float64       
 10  atemp       731 non-null    float64       
 11  hum         731 non-null    float64       
 12  windspeed   731 non-null    float64       
 13  casual      731 non-null    int64         
 14  registered  731 non-null    int64         
 15  cnt         731 non-null    int64         
dtypes: datetime64[ns](1), floa

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
729,730,2012-12-30,1,1,12,0,0,0,1,0.255833,0.2317,0.483333,0.350754,364,1432,1796
730,731,2012-12-31,1,1,12,0,1,1,2,0.215833,0.223487,0.5775,0.154846,439,2290,2729


: 

: 

: 

: 

: 

: 

In [None]:
# Hyperparameter Grid for Random Forest
# Defines the distributions from which parameters will be sampled


: 

: 

: 

: 

: 

: 

In [None]:
# --- Execution Example ---

if __name__ == '__main__':
    # NOTE: Replace 'your_bike_sharing_data.csv' with your actual file path!
    FILE_PATH = "data/dataset/day.csv" 
    # NOTE: Set the date to split your data (e.g., use all data before '2012-01-01' for training)
    TEST_SPLIT_DATE = '2012-10-31' 
    
    param_dist = {
        'regressor__n_estimators': [100, 200, 300, 400], # Number of trees
        'regressor__max_depth': [10, 20, 30, None],      # Max depth of the trees (None means nodes are expanded until all leaves are pure)
        'regressor__min_samples_split': [2, 5, 10],      # Minimum number of samples required to split an internal node
        'regressor__min_samples_leaf': [1, 2, 4],        # Minimum number of samples required to be at a leaf node
        }
    
    N_ITER_SEARCH = 15
    
    FEATURE_COLS = [
        # 'dteday',
        'season',
        'yr',
        'mnth',
        'holiday',
        'weekday',
        'workingday',
        'weathersit',
        'temp',
        'atemp',
        'hum',
        'windspeed',
        # 'casual',
        # 'registered',
        # 'bikes_cnt',
        # 'day'
        ]
    TARGET_COL = 'cnt'

    
    try:
        # final_model, 
        final_metrics = run_expanding_window_cv(
            X, y, preprocessor, 
            param_dist=param_dist,
            n_splits=5, 
            initial_train_size=int(len(X) * 0.2),
            n_iter_search=N_ITER_SEARCH
        )
        final_metrics = run_pipeline(FILE_PATH, TEST_SPLIT_DATE, FEATURE_COLS, TARGET_COL)
        
        print("\n✅ Pipeline Execution Complete.")
        print(f"Final Model Metrics: {final_metrics}")
        
    except FileNotFoundError:
        print(f"\n❌ ERROR: File not found at '{FILE_PATH}'. Please update the FILE_PATH.")
    except Exception as e:
        print(f"\n❌ An unexpected error occurred: {e}")

Loading data from: data/dataset/day.csv
Data loaded with 731 rows and 16 columns.

Training set size: 669 samples
Testing set size: 62 samples

--- Identified Feature Types ---
Numerical Features: ['temp', 'atemp', 'hum', 'windspeed']
Categorical Features: ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

❌ An unexpected error occurred: run_expanding_window_cv() missing 1 required positional argument: 'param_dist'


: 

: 

: 

: 

: 

: 

In [None]:
data

NameError: name 'data' is not defined

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 