In [1009]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import mlflow
import json 
from sklearn.metrics import make_scorer
from prophet import Prophet
from xgboost import XGBRegressor

In [1010]:
# --- MLFLOW SETUP ---
EXPERIMENT_NAME = "Bike_Sharing_Demand_TSCV"

mlflow.set_experiment(EXPERIMENT_NAME)

FEATURE_COLS = [
    # 'dteday',
    'season',
    'yr',
    'mnth',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
    'temp',
    'atemp',
    'hum',
    'windspeed',
    # 'casual',
    # 'registered',
    # 'bikes_cnt',
    # 'day'
    ] 



In [1011]:
# Custom function to calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Calculates the Mean Absolute Percentage Error (MAPE).
    
    Handles zero values in y_true by replacing them with a very small 
    epsilon value to avoid division by zero errors.
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    
    # Add a small epsilon to avoid division by zero
    # np.where(y_true == 0, epsilon, y_true) ensures division is safe
    epsilon = 1e-8 
    
    # Calculate |(Actual - Predicted) / Actual|
    percentage_error = np.abs((y_true - y_pred) / np.where(y_true == 0, epsilon, y_true))
    
    # Calculate the mean and multiply by 100 to get a percentage
    return np.mean(percentage_error) * 100

def neg_mean_absolute_percentage_error(y_true, y_pred):
    return -1.0 * mean_absolute_percentage_error(y_true, y_pred)

mape_scorer = make_scorer(neg_mean_absolute_percentage_error, greater_is_better=True)

# CONFIG

In [1012]:

# CONFIG

FILE_PATH = "data/dataset/day.csv"
TARGET_COL = 'cnt'
MIN_TEST_SAMPLES = None #≈ # 210 #30
MAX_TRAIN_SAMPLE = None # 360 #None # 180 # day

SCORING_MARTIX = "neg_mean_absolute_error"
# SCORING_MARTIX = 'neg_mean_squared_error'
# SCORING_MARTIX = "neg_root_mean_squared_error"
# SCORING_MARTIX = mape_scorer

N_ITER_SEARCH = 15
n_splits = 5
# n_iter_search = 10

# 1. Loading the data
df_ = pd.read_csv(FILE_PATH, parse_dates=['dteday'])
df_ = df_.sort_values(by='dteday').reset_index(drop=True)

# split dataset
df_last30 = df_.tail(30)
df = df_.iloc[:-30, :]

param_dist = {
    'regressor__n_estimators': [200, 500, 800, 1000, 1500],
    'regressor__max_depth': [15, 30, 45, 60, None], # Exploring deeper values
    'regressor__min_samples_split': [2, 5, 10, 20, 40], # Testing higher regularization
    'regressor__min_samples_leaf': [1, 3, 5, 10, 15],  # Testing higher regularization       
    }



# Feature Engg.

In [1013]:
# 2. Feature engineering

df['is_weekend'] = np.where(df['weekday'].isin([5, 6]), 1, 0)

df['lag_demand_1d'] = df['cnt'].shift(1)
df['lag_demand_1d'] = df['lag_demand_1d'].fillna(0) 

df['lag_demand_7d'] = df['cnt'].shift(1)
df['lag_demand_7d'] = df['lag_demand_7d'].fillna(0) 

# --- Day of Week (7-day cycle) ---

df['temp_lag_1d'] = df['temp'].shift(1)
df['temp_lag_1d'] = df['temp_lag_1d'].fillna(0)

df['temp_lag_7d'] = df['temp'].shift(7)
df['temp_lag_7d'] = df['temp_lag_7d'].fillna(0)

df['atemp_lag_1d'] = df['atemp'].shift(1)
df['atemp_lag_1d'] = df['atemp_lag_1d'].fillna(0)

df['atemp_lag_7d'] = df['atemp'].shift(7)
df['atemp_lag_7d'] = df['atemp_lag_7d'].fillna(0)

df['hum_lag_1d'] = df['hum'].shift(1)
df['hum_lag_1d'] = df['hum_lag_1d'].fillna(0)

df['hum_lag_7d'] = df['hum'].shift(7)
df['hum_lag_7d'] = df['hum_lag_7d'].fillna(0)

df['windspeed_lag_1d'] = df['windspeed'].shift(1)
df['windspeed_lag_1d'] = df['windspeed_lag_1d'].fillna(0)

df['windspeed_lag_7d'] = df['windspeed'].shift(7)
df['windspeed_lag_7d'] = df['windspeed_lag_7d'].fillna(0)

# Interaction: Temperature impact during peak usage hours

# Interaction: Demand on weekends vs. workdays
df['temp_x_is_weekend'] = df['temp'] * df['is_weekend']

# Interaction: Demand on weekends vs. workdays
df['atemp_x_is_weekend'] = df['atemp'] * df['is_weekend']

FEATURE_COLS.extend([
    'atemp_x_is_weekend', 
    'temp_x_is_weekend', 
    'windspeed_lag_7d', 
    'windspeed_lag_1d', 
    'hum_lag_7d', 
    'hum_lag_1d', 
    'atemp_lag_7d', 
    'atemp_lag_1d', 
    'temp_lag_7d', 
    'temp_lag_1d', 
    'is_weekend', 
    'lag_demand_7d', 
    'trend',
    'yearly',
    'weekly'
    ]
                    )
 
print(f"Data loaded with {len(df)} rows and {len(df.columns)} columns.")
print("\n", df.columns)



Data loaded with 701 rows and 29 columns.

 Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt', 'is_weekend', 'lag_demand_1d',
       'lag_demand_7d', 'temp_lag_1d', 'temp_lag_7d', 'atemp_lag_1d',
       'atemp_lag_7d', 'hum_lag_1d', 'hum_lag_7d', 'windspeed_lag_1d',
       'windspeed_lag_7d', 'temp_x_is_weekend', 'atemp_x_is_weekend'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_weekend'] = np.where(df['weekday'].isin([5, 6]), 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lag_demand_1d'] = df['cnt'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lag_demand_1d'] = df['lag_demand_1d'].fillna(0)
A value is trying to be set on a copy of a slice f

In [1014]:
m = Prophet(
    growth='linear',
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=True # Set to True if your data is daily or finer
)
# 1. Prepare data for Prophet
prophet_df = df[['dteday', 'cnt']].rename(columns={'dteday': 'ds', 'cnt': 'y'})
m.fit(prophet_df)

19:46:44 - cmdstanpy - INFO - Chain [1] start processing


19:46:44 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x14e904a90>

In [1015]:
# 2. Predict on the historical dataset to get component values
future = m.make_future_dataframe(periods=0, freq='D')
forecast = m.predict(future)

# 3. Extract key Prophet components (components capture trend, seasonality, etc.)
prophet_components = forecast[['ds', 'trend', 'yearly', 'weekly']]

# 4. Merge these components back into your main training DataFrame (X)
X_features = df.merge(prophet_components, 
                     left_on='dteday', 
                     right_on='ds', 
                     how='left')
# Drop the redundant 'ds' column
X_features = X_features.drop(columns=['ds'])

In [1016]:
df = X_features.copy()
df.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt', 'is_weekend', 'lag_demand_1d',
       'lag_demand_7d', 'temp_lag_1d', 'temp_lag_7d', 'atemp_lag_1d',
       'atemp_lag_7d', 'hum_lag_1d', 'hum_lag_7d', 'windspeed_lag_1d',
       'windspeed_lag_7d', 'temp_x_is_weekend', 'atemp_x_is_weekend', 'trend',
       'yearly', 'weekly'],
      dtype='object')

In [1017]:
NUMERICAL_FEATURES = []
CATEGORICAL_FEATURES = []

# Feature Type Identification
for col in FEATURE_COLS:
    col_dtype = df[col].dtype
    num_unique = df[col].nunique()
    
    if np.issubdtype(col_dtype, np.number) and 'float' in str(col_dtype):
        NUMERICAL_FEATURES.append(col)
    elif np.issubdtype(col_dtype, np.number) and num_unique <= 50:
        CATEGORICAL_FEATURES.append(col)
    elif np.issubdtype(col_dtype, np.number):
            NUMERICAL_FEATURES.append(col)
    elif col_dtype == 'object':
        CATEGORICAL_FEATURES.append(col)
        
print("\n--- Identified Feature Types ---")
print(f"Numerical Features: {NUMERICAL_FEATURES}")
print(f"Categorical Features: {CATEGORICAL_FEATURES}")

X = df[FEATURE_COLS]
y = df[TARGET_COL]

# 2a. Create pre-processing pipeline
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', NUMERICAL_FEATURES), 
        ('cat', categorical_transformer, CATEGORICAL_FEATURES)
    ],
    remainder='drop' 
)

tscv = TimeSeriesSplit(
        n_splits=n_splits, 
        max_train_size=MAX_TRAIN_SAMPLE, 
        test_size= MIN_TEST_SAMPLES
    )
    
cv_metrics = []

print("\n--- Starting Expanding Window Cross-Validation with Tuning ---")

# Inner split for tuning (used inside RandomizedSearchCV)
inner_cv = TimeSeriesSplit(n_splits=3,
                           test_size=MIN_TEST_SAMPLES)

# --- START MLFLOW RUN ---
# This context manager automatically starts and ends a run
with mlflow.start_run():
    
    # 1. Log the featured columns as a **parameter**
    mlflow.log_param("featured_columns", json.dumps(FEATURE_COLS)) 
    
    # Also log other parameters that define the experiment
    mlflow.log_param("FILE_PATH", FILE_PATH)
    mlflow.log_param("n_splits", n_splits)
    mlflow.log_param("n_iter_search", n_iter_search)
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("SCORING_MARTIX", SCORING_MARTIX)   
    
    N_ITER_SEARCH = 10 
    
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        
        # Prepare Data for Current Fold
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        
        print(f"\n[Fold {fold + 1}/{n_splits}] Training size: {len(X_train_fold)}, Testing size: {len(X_test_fold)}")

        # Define the Full ML Pipeline
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # Perform Randomized Search on the CURRENT Training Data
        random_search = RandomizedSearchCV(
            full_pipeline, 
            param_distributions=param_dist, 
            n_iter=n_iter_search, 
            scoring= SCORING_MARTIX, 
            cv=inner_cv, 
            random_state=42,
            n_jobs=-1,
            verbose=0
        )
        
        print(f"  Tuning Random Forest on current training window (n_iter={n_iter_search})...")
        random_search.fit(X_train_fold, y_train_fold)
        
        # Use the Best Model found to predict on the outer test fold
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X_test_fold)
        
        # Evaluate Metrics for this Fold
        rmse = np.sqrt(mean_squared_error(y_test_fold, y_pred))
        mae = mean_absolute_error(y_test_fold, y_pred)
        mse_rf = mean_squared_error(y_test_fold, y_pred)
        r2_rf = r2_score(y_test_fold, y_pred)
        mape = mean_absolute_percentage_error(y_test_fold, y_pred)
        
        # 2. Log Fold Metrics as **metrics**
        mlflow.log_metric(f"fold_{fold+1}_rmse", rmse)
        mlflow.log_metric(f"fold_{fold+1}_mae", mae)
        mlflow.log_metric(f"fold_{fold+1}_r2", r2_rf)
        mlflow.log_metric(f"fold_{fold+1}_mse", mse_rf)
        mlflow.log_metric(f"fold_{fold+1}_mape", mape)
        
        # 3. Log Best Parameters for the Fold as **parameters**
        # Note: You may want to simplify this or only log the final best parameters
        # For simplicity, we log all best params for the fold as a parameter
        fold_params = {f"fold_{fold+1}_best_params": json.dumps(random_search.best_params_)}
        mlflow.log_params(fold_params) 
        
        cv_metrics.append({'RMSE': rmse,
                           'MAE': mae, 
                           'MSE': mse_rf,
                           'r2':r2_rf,
                           'MAPE': mape,
                           'Best_Params': random_search.best_params_})
        
        print(f"  Best Parameters: {random_search.best_params_}")
        print(f"  Fold Metrics: RMSE={rmse:.2f}, MAE={mae:.2f}")

    # Calculate and Report Averages
    avg_rmse = np.mean([m['RMSE'] for m in cv_metrics])
    avg_mae = np.mean([m['MAE'] for m in cv_metrics])
    avg_mse = np.mean([m['MSE'] for m in cv_metrics])
    avg_r2 = np.mean([m['r2'] for m in cv_metrics])
    avg_mape = np.mean([m['MAPE'] for m in cv_metrics])

    # 4. Log the final average metrics
    mlflow.log_metric("avg_rmse", avg_rmse)
    mlflow.log_metric("avg_mae", avg_mae)
    mlflow.log_metric("avg_mse", avg_mse)
    mlflow.log_metric("avg_r2", avg_r2)
    mlflow.log_metric("avg_mape", avg_mape)

    print("\n--- Cross-Validation Summary ---")
    print(f"Average RMSE over {n_splits} folds: {avg_rmse:.2f}")
    print(f"Average MAE over {n_splits} folds: {avg_mae:.2f}")
    print(f"Average MSE over {n_splits} folds: {avg_mse:.2f}")
    print(f"Average r2 over {n_splits} folds: {avg_r2:.2f}")
    print(f"Average MAPE over {n_splits} folds: {avg_mape:.2f}%")

    print('avg_MAPE', avg_mape,'avg_RMSE', avg_rmse, 'avg_MAE', avg_mae, 'avg_r2', avg_r2, 'avg_mse', avg_mse, 'individual_folds', cv_metrics)
# The run is automatically ended here


--- Identified Feature Types ---
Numerical Features: ['temp', 'atemp', 'hum', 'windspeed', 'atemp_x_is_weekend', 'temp_x_is_weekend', 'windspeed_lag_7d', 'windspeed_lag_1d', 'hum_lag_7d', 'hum_lag_1d', 'atemp_lag_7d', 'atemp_lag_1d', 'temp_lag_7d', 'temp_lag_1d', 'lag_demand_7d', 'trend', 'yearly', 'weekly']
Categorical Features: ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'is_weekend']

--- Starting Expanding Window Cross-Validation with Tuning ---

[Fold 1/5] Training size: 121, Testing size: 116
  Tuning Random Forest on current training window (n_iter=10)...
  Best Parameters: {'regressor__n_estimators': 800, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 3, 'regressor__max_depth': None}
  Fold Metrics: RMSE=643.21, MAE=528.10

[Fold 2/5] Training size: 237, Testing size: 116
  Tuning Random Forest on current training window (n_iter=10)...
  Best Parameters: {'regressor__n_estimators': 800, 'regressor__min_samples_split': 2, 'regre

In [1018]:
print(df['cnt'].max())
print(df['cnt'].min())
print(df.shape)

8714
22
(701, 32)


## XGBoost

In [1019]:
param_dist_xgb = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__max_depth': [3, 6, 9, 12],
    'regressor__learning_rate': [0.05, 0.1, 0.2],
    'regressor__subsample': [0.7, 0.9],
    'regressor__reg_alpha': [0, 0.1], # L1 regularization
}


NUMERICAL_FEATURES = []
CATEGORICAL_FEATURES = []

# Feature Type Identification
for col in FEATURE_COLS:
    col_dtype = df[col].dtype
    num_unique = df[col].nunique()
    
    if np.issubdtype(col_dtype, np.number) and 'float' in str(col_dtype):
        NUMERICAL_FEATURES.append(col)
    elif np.issubdtype(col_dtype, np.number) and num_unique <= 50:
        CATEGORICAL_FEATURES.append(col)
    elif np.issubdtype(col_dtype, np.number):
            NUMERICAL_FEATURES.append(col)
    elif col_dtype == 'object':
        CATEGORICAL_FEATURES.append(col)
        
print("\n--- Identified Feature Types ---")
print(f"Numerical Features: {NUMERICAL_FEATURES}")
print(f"Categorical Features: {CATEGORICAL_FEATURES}")

X = df[FEATURE_COLS]
y = df[TARGET_COL]

# 2a. Create pre-processing pipeline
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', NUMERICAL_FEATURES), 
        ('cat', categorical_transformer, CATEGORICAL_FEATURES)
    ],
    remainder='drop' 
)

tscv = TimeSeriesSplit(
        n_splits=n_splits, 
        max_train_size=MAX_TRAIN_SAMPLE, 
        test_size= MIN_TEST_SAMPLES
    )
    
cv_metrics = []

print("\n--- Starting Expanding Window Cross-Validation with Tuning ---")

# Inner split for tuning (used inside RandomizedSearchCV)
inner_cv = TimeSeriesSplit(n_splits=3,
                           test_size=MIN_TEST_SAMPLES)



--- Identified Feature Types ---
Numerical Features: ['temp', 'atemp', 'hum', 'windspeed', 'atemp_x_is_weekend', 'temp_x_is_weekend', 'windspeed_lag_7d', 'windspeed_lag_1d', 'hum_lag_7d', 'hum_lag_1d', 'atemp_lag_7d', 'atemp_lag_1d', 'temp_lag_7d', 'temp_lag_1d', 'lag_demand_7d', 'trend', 'yearly', 'weekly']
Categorical Features: ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'is_weekend']

--- Starting Expanding Window Cross-Validation with Tuning ---


In [1020]:
# --- 4. MLFLOW RUN LOOP ---
with mlflow.start_run():
    
    # Log initial parameters
    mlflow.log_param("featured_columns", json.dumps(FEATURE_COLS)) 
    mlflow.log_param("n_splits", n_splits)
    mlflow.log_param("n_iter_search", N_ITER_SEARCH)
    mlflow.log_param("model_type", "XGBoostRegressor")
    
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        
        print(f"\n[Fold {fold + 1}/{n_splits}] Training size: {len(X_train_fold)}, Testing size: {len(X_test_fold)}")

        # Define the Full ML Pipeline with XGBoost
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', XGBRegressor(random_state=42, n_jobs=-1, eval_metric='rmse', verbosity=0)) 
        ])
        
        # Perform Randomized Search on the CURRENT Training Data
        random_search = RandomizedSearchCV(
            full_pipeline, 
            param_distributions=param_dist_xgb, 
            n_iter=N_ITER_SEARCH, 
            scoring=mape_scorer, # Optimized scoring using NEGATED MAPE
            cv=inner_cv, 
            random_state=42,
            n_jobs=-1,
            verbose=0
        )
        
        print(f"  Tuning XGBoost on current training window (n_iter={N_ITER_SEARCH})...")
        random_search.fit(X_train_fold, y_train_fold)
        
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X_test_fold)
        
        # --- CALCULATE ALL METRICS ---
        rmse = np.sqrt(mean_squared_error(y_test_fold, y_pred))
        mae = mean_absolute_error(y_test_fold, y_pred)
        mse_rf = mean_squared_error(y_test_fold, y_pred)
        r2_rf = r2_score(y_test_fold, y_pred)
        mape = mean_absolute_percentage_error(y_test_fold, y_pred) # Positive MAPE
        
        # Log Fold Metrics 
        mlflow.log_metric(f"fold_{fold+1}_rmse", rmse)
        mlflow.log_metric(f"fold_{fold+1}_mae", mae)
        mlflow.log_metric(f"fold_{fold+1}_r2", r2_rf)
        mlflow.log_metric(f"fold_{fold+1}_mape", mape) # Log the positive value
        
        # Log Best Parameters
        fold_params = {f"fold_{fold+1}_best_params": json.dumps(random_search.best_params_)}
        mlflow.log_params(fold_params) 
        
        cv_metrics.append({'RMSE': rmse,
                           'MAE': mae, 
                           'MSE': mse_rf,
                           'r2':r2_rf,
                           'MAPE': mape,
                           'Best_Params': random_search.best_params_})
        
        print(f"  Best Parameters: {random_search.best_params_}")
        print(f"  Fold Metrics: RMSE={rmse:.2f}, MAE={mae:.2f}, MAPE={mape:.2f}%")

    # --- 5. LOG FINAL AVERAGE METRICS ---
    
    avg_rmse = np.mean([m['RMSE'] for m in cv_metrics])
    avg_mae = np.mean([m['MAE'] for m in cv_metrics])
    avg_mse = np.mean([m['MSE'] for m in cv_metrics])
    avg_r2 = np.mean([m['r2'] for m in cv_metrics])
    avg_mape = np.mean([m['MAPE'] for m in cv_metrics])

    mlflow.log_metric("avg_rmse", avg_rmse)
    mlflow.log_metric("avg_mae", avg_mae)
    mlflow.log_metric("avg_mse", avg_mse)
    mlflow.log_metric("avg_r2", avg_r2)
    mlflow.log_metric("avg_mape", avg_mape)

    print("\n--- Cross-Validation Summary ---")
    print(f"Average RMSE over {n_splits} folds: {avg_rmse:.2f}")
    print(f"Average MAE over {n_splits} folds: {avg_mae:.2f}")
    print(f"Average MAPE over {n_splits} folds: {avg_mape:.2f}%")
    print(f"Average MSE over {n_splits} folds: {avg_mse:.2f}")
    print(f"Average r2 over {n_splits} folds: {avg_r2:.2f}")

    print('avg_RMSE', avg_rmse, 'avg_MAE', avg_mae, 'avg_MAPE', avg_mape, 'avg_r2', avg_r2, 'avg_mse', avg_mse)


[Fold 1/5] Training size: 121, Testing size: 116
  Tuning XGBoost on current training window (n_iter=10)...
  Best Parameters: {'regressor__subsample': 0.7, 'regressor__reg_alpha': 0, 'regressor__n_estimators': 100, 'regressor__max_depth': 12, 'regressor__learning_rate': 0.2}
  Fold Metrics: RMSE=581.92, MAE=467.85, MAPE=9.98%

[Fold 2/5] Training size: 237, Testing size: 116
  Tuning XGBoost on current training window (n_iter=10)...
  Best Parameters: {'regressor__subsample': 0.9, 'regressor__reg_alpha': 0.1, 'regressor__n_estimators': 300, 'regressor__max_depth': 6, 'regressor__learning_rate': 0.05}
  Fold Metrics: RMSE=852.40, MAE=663.93, MAPE=25.57%

[Fold 3/5] Training size: 353, Testing size: 116
  Tuning XGBoost on current training window (n_iter=10)...
  Best Parameters: {'regressor__subsample': 0.9, 'regressor__reg_alpha': 0, 'regressor__n_estimators': 500, 'regressor__max_depth': 6, 'regressor__learning_rate': 0.2}
  Fold Metrics: RMSE=1224.46, MAE=929.35, MAPE=26.74%

[Fold

In [1021]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

In [1022]:
df = pd.read_csv("data/dataset/day.csv")
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [1023]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


In [1024]:
df = df.drop(columns=['instant', 'dteday', 'casual', 'registered'])  # Dropping unnecessary columns

# Apply One-Hot Encoding to categorical features
df_encoded = pd.get_dummies(df, columns=['season', 'yr', 'holiday', 'workingday', 'mnth', 'weekday', 'weathersit'], drop_first=True)

print(df_encoded.shape)

# Display the first few rows of the encoded dataset
df_encoded.head()


(731, 30)


Unnamed: 0,temp,atemp,hum,windspeed,cnt,season_2,season_3,season_4,yr_1,holiday_1,...,mnth_11,mnth_12,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_2,weathersit_3
0,0.344167,0.363625,0.805833,0.160446,985,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,0.363478,0.353739,0.696087,0.248539,801,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,0.196364,0.189405,0.437273,0.248309,1349,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,0.2,0.212122,0.590435,0.160296,1562,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
4,0.226957,0.22927,0.436957,0.1869,1600,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [1025]:
X = df_encoded.drop(columns=['cnt'])  # Features
y = df_encoded['cnt']  # Target variable

# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1026]:
X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]