In [29]:
# 1. Loading the data
df_ = pd.read_csv(FILE_PATH, parse_dates=['dteday'])
df_ = df_.sort_values(by='dteday').reset_index(drop=True)

# split dataset
df_last30 = df_.tail(30)
df = df_.iloc[:-30, :]

In [30]:
BEST_PARAMS = {
    'regressor__n_estimators': 800, 
     'regressor__min_samples_split': 2,
     'regressor__min_samples_leaf': 3,
     'regressor__max_depth': None
    }


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import mlflow
import json 
from sklearn.metrics import make_scorer
from prophet import Prophet
from xgboost import XGBRegressor 
import os

# --- MLFLOW SETUP ---
# Set the experiment name for the final run
EXPERIMENT_NAME = "Bike_Sharing_Demand_FINAL_TEST" 
mlflow.set_experiment(EXPERIMENT_NAME)

# --- CUSTOM METRIC FUNCTIONS ---
def mean_absolute_percentage_error(y_true, y_pred):
    """Calculates the Mean Absolute Percentage Error (MAPE)."""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    epsilon = 1e-8 
    percentage_error = np.abs((y_true - y_pred) / np.where(y_true == 0, epsilon, y_true))
    return np.mean(percentage_error) * 100

def neg_mean_absolute_percentage_error(y_true, y_pred):
    """Negated MAPE for scikit-learn scoring."""
    return -1.0 * mean_absolute_percentage_error(y_true, y_pred)

mape_scorer = make_scorer(neg_mean_absolute_percentage_error, greater_is_better=True) 

# --- CONFIG ---
FILE_PATH = "data/dataset/day.csv"
TARGET_COL = 'cnt'
MIN_TEST_SAMPLES = None 
MAX_TRAIN_SAMPLE = None 

# Set scoring for CV (use MAPE scorer if optimizing for MAPE)
SCORING_MARTIX = "neg_mean_squared_error" #mape_scorer 
N_ITER_SEARCH = 15
n_splits = 5

param_dist = {
    'regressor__n_estimators': [200, 500, 800, 1000, 1500],
    'regressor__max_depth': [15, 30, 45, 60, None], 
    'regressor__min_samples_split': [2, 5, 10, 20, 40],
    'regressor__min_samples_leaf': [1, 3, 5, 10, 15],       
    }

# --- PLACEHOLDER FOR BEST PARAMETERS ---
BEST_PARAMS_FOUND = {"regressor__n_estimators": 800,
                     "regressor__min_samples_split": 2,
                     "regressor__min_samples_leaf": 3,
                     "regressor__max_depth": None }


# --- 1. DATA LOADING AND SPLIT ---
if not os.path.exists(FILE_PATH):
    raise FileNotFoundError(f"Data file not found at {FILE_PATH}")

df_ = pd.read_csv(FILE_PATH, parse_dates=['dteday'])
df_ = df_.sort_values(by='dteday').reset_index(drop=True)

# Split dataset: df is the FULL TRAINING SET, df_last30 is the FINAL TEST SET
df_last30 = df_.tail(30).copy()  # Final held-out test data
df = df_.iloc[:-30, :].copy()    # Data used for CV/Tuning

# --- 2. FEATURE ENGINEERING FUNCTION ---

# List of columns to be engineered and used in the final model
FEATURE_COLS = [
    'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 
    'temp', 'atemp', 'hum', 'windspeed', 'is_weekend', 'lag_demand_1d', 
    'lag_demand_7d', 'temp_lag_1d', 'temp_lag_7d', 'atemp_lag_1d', 
    'atemp_lag_7d', 'hum_lag_1d', 'hum_lag_7d', 'windspeed_lag_1d', 
    'windspeed_lag_7d', 'temp_x_is_weekend', 'atemp_x_is_weekend',
    'trend', 'yearly', 'weekly'
]

def apply_feature_engineering(data_df, prophet_model=None, train_df=None):
    """Applies all feature engineering steps to a given DataFrame."""
    df_eng = data_df.copy()

    # Time-based and Interaction Features
    df_eng['is_weekend'] = np.where(df_eng['weekday'].isin([5, 6]), 1, 0)
    df_eng['temp_x_is_weekend'] = df_eng['temp'] * df_eng['is_weekend']
    df_eng['atemp_x_is_weekend'] = df_eng['atemp'] * df_eng['is_weekend']

    # Lag Features (Handles both training and test data)
    if train_df is not None:
        # For Test Data: Pull last values from the Training Data
        last_values = {col: train_df[col].iloc[-1] for col in ['cnt', 'temp', 'atemp', 'hum', 'windspeed']}
        last_values_7d = {col: train_df[col].iloc[-7] if len(train_df) >= 7 else 0 for col in ['cnt', 'temp', 'atemp', 'hum', 'windspeed']}
        
        # Demand Lags
        df_eng['lag_demand_1d'] = df_eng['cnt'].shift(1).fillna(last_values['cnt'])
        df_eng['lag_demand_7d'] = df_eng['cnt'].shift(7).fillna(last_values_7d['cnt'])
        
        # Weather Lags
        for col in ['temp', 'atemp', 'hum', 'windspeed']:
            df_eng[f'{col}_lag_1d'] = df_eng[col].shift(1).fillna(last_values[col])
            df_eng[f'{col}_lag_7d'] = df_eng[col].shift(7).fillna(last_values_7d[col])
            
    else:
        # For Training Data: Fill the first rows with 0 (or drop them)
        for lag in ['lag_demand_1d', 'lag_demand_7d']:
            df_eng[lag] = df_eng['cnt'].shift(int(lag.split('_')[-1].replace('d', ''))).fillna(0)
        
        for col in ['temp', 'atemp', 'hum', 'windspeed']:
            df_eng[f'{col}_lag_1d'] = df_eng[col].shift(1).fillna(0)
            df_eng[f'{col}_lag_7d'] = df_eng[col].shift(7).fillna(0)

    # Prophet Components (Requires fitted model)
    if prophet_model:
        future = df_eng[['dteday']].rename(columns={'dteday': 'ds'})
        forecast = prophet_model.predict(future)
        prophet_components = forecast[['ds', 'trend', 'yearly', 'weekly']]
        df_eng = df_eng.merge(prophet_components, left_on='dteday', right_on='ds', how='left').drop(columns=['ds'])
    
    return df_eng.dropna(subset=FEATURE_COLS) # Drop rows that still have NaNs (e.g. from large lags)


# --- 3. TRAIN PROPHET MODEL ON FULL TRAINING DATA (`df`) ---
prophet_df = df[['dteday', 'cnt']].rename(columns={'dteday': 'ds', 'cnt': 'y'})
m = Prophet(growth='linear', yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
m.fit(prophet_df) 

# Apply feature engineering to the full training set (`df`)
df = apply_feature_engineering(df, prophet_model=m)


# --- 4. FEATURE TYPE IDENTIFICATION and PREPROCESSING ---
X = df[FEATURE_COLS]
y = df[TARGET_COL]

NUMERICAL_FEATURES = []
CATEGORICAL_FEATURES = []
for col in FEATURE_COLS:
    col_dtype = df[col].dtype
    num_unique = df[col].nunique()
    
    if np.issubdtype(col_dtype, np.number) and ('float' in str(col_dtype) or num_unique > 50):
        NUMERICAL_FEATURES.append(col)
    elif np.issubdtype(col_dtype, np.number) and num_unique <= 50:
        CATEGORICAL_FEATURES.append(col)
        
print(f"Numerical Features: {NUMERICAL_FEATURES}")
print(f"Categorical Features: {CATEGORICAL_FEATURES}")

categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', NUMERICAL_FEATURES), 
        ('cat', categorical_transformer, CATEGORICAL_FEATURES)
    ],
    remainder='drop' 
)


# --- 5. TIME SERIES CROSS-VALIDATION (CV) LOOP ---
tscv = TimeSeriesSplit(n_splits=n_splits)
inner_cv = TimeSeriesSplit(n_splits=3)
cv_metrics = []

with mlflow.start_run(run_name="Hyperparameter_Tuning_CV"):
    mlflow.log_param("SCORING_MARTIX", str(SCORING_MARTIX))   
    mlflow.log_param("model_type", "RandomForestRegressor")
    
    print("\n--- Starting CV Tuning (Run this once to finalize BEST_PARAMS_FOUND) ---")
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        random_search = RandomizedSearchCV(
            full_pipeline, param_distributions=param_dist, n_iter=N_ITER_SEARCH, 
            scoring=SCORING_MARTIX, cv=inner_cv, random_state=42, n_jobs=-1, verbose=0
        )
        
        random_search.fit(X_train_fold, y_train_fold)
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X_test_fold)
        
        rmse = np.sqrt(mean_squared_error(y_test_fold, y_pred))
        mape = mean_absolute_percentage_error(y_test_fold, y_pred)
        
        # Log minimal metrics in the loop
        mlflow.log_metric(f"fold_{fold+1}_rmse", rmse)
        mlflow.log_metric(f"fold_{fold+1}_mape", mape)
        cv_metrics.append({'MAPE': mape, 'RMSE': rmse, 'Best_Params': random_search.best_params_})

    avg_mape = np.mean([m['MAPE'] for m in cv_metrics])
    mlflow.log_metric("avg_mape", avg_mape)
    print(f"\nAverage MAPE over {n_splits} folds: {avg_mape:.2f}%")
    print("CV tuning finished. Update BEST_PARAMS_FOUND and run again for final test.")



20:03:09 - cmdstanpy - INFO - Chain [1] start processing
20:03:09 - cmdstanpy - INFO - Chain [1] done processing


Numerical Features: ['temp', 'atemp', 'hum', 'windspeed', 'lag_demand_1d', 'lag_demand_7d', 'temp_lag_1d', 'temp_lag_7d', 'atemp_lag_1d', 'atemp_lag_7d', 'hum_lag_1d', 'hum_lag_7d', 'windspeed_lag_1d', 'windspeed_lag_7d', 'temp_x_is_weekend', 'atemp_x_is_weekend', 'trend', 'yearly', 'weekly']
Categorical Features: ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'is_weekend']

--- Starting CV Tuning (Run this once to finalize BEST_PARAMS_FOUND) ---

Average MAPE over 5 folds: 54.36%
CV tuning finished. Update BEST_PARAMS_FOUND and run again for final test.

FINAL MODEL TRAINING & TESTING ON UNSEEN DATA (Requires BEST_PARAMS_FOUND)
Final Test Set Size (After Engineering): 30 rows
Fitting final model on full training set...





--- Final Model Performance on Unseen Test Data (Last 30 Days) ---
Final Test RMSE: 1332.95
Final Test MAE: 1111.75
Final Test MAPE: 66.60%
Final Test R2 Score: 0.4451

Run ID for Final Evaluation: 1e4e058373e94968b845aac7b644fe30

Script execution complete. Run 'mlflow ui' to view all results.


In [None]:
# The CV run ends here

# --- 6. FINAL MODEL TRAINING AND TESTING ON UNSEEN DATA ---

print("\n" + "="*70)
print("FINAL MODEL TRAINING & TESTING ON UNSEEN DATA (Requires BEST_PARAMS_FOUND)")
print("="*70)

# --- 6.1. Prepare Test Data (`df_last30`) ---
df_test = apply_feature_engineering(df_last30, prophet_model=m, train_df=df)
X_test_final = df_test[FEATURE_COLS]
y_test_final = df_test[TARGET_COL]
print(f"Final Test Set Size (After Engineering): {len(X_test_final)} rows")

# --- 6.2. Retrain and Test ---
with mlflow.start_run(run_name="Final_Test_Evaluation_on_Held_Out_Set"):
    
    # 1. Define the Final ML Pipeline with Best Parameters
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1)) 
    ])

    # 2. Set the best parameters found during CV tuning
    final_pipeline.set_params(**BEST_PARAMS_FOUND)

    # 3. Fit on the ENTIRE training data (X, y)
    print("Fitting final model on full training set...")
    final_pipeline.fit(X, y)

    # 4. Make predictions on the UNSEEN final test set
    y_pred_final = final_pipeline.predict(X_test_final)

    # 5. Evaluate final performance
    rmse_final = np.sqrt(mean_squared_error(y_test_final, y_pred_final))
    mae_final = mean_absolute_error(y_test_final, y_pred_final)
    r2_final = r2_score(y_test_final, y_pred_final)
    mape_final = mean_absolute_percentage_error(y_test_final, y_pred_final)

    # 6. Log the Final Results
    mlflow.log_params(BEST_PARAMS_FOUND)
    mlflow.log_metric("final_test_rmse", rmse_final)
    mlflow.log_metric("final_test_mae", mae_final)
    mlflow.log_metric("final_test_r2", r2_final)
    mlflow.log_metric("final_test_mape", mape_final)
    mlflow.sklearn.log_model(final_pipeline, "final_validated_model")
    
    print("\n--- Final Model Performance on Unseen Test Data (Last 30 Days) ---")
    print(f"Final Test RMSE: {rmse_final:.2f}")
    print(f"Final Test MAE: {mae_final:.2f}")
    print(f"Final Test MAPE: {mape_final:.2f}%")
    print(f"Final Test R2 Score: {r2_final:.4f}")
    print("\nRun ID for Final Evaluation:", mlflow.active_run().info.run_id)

print("\nScript execution complete. Run 'mlflow ui' to view all results.")