In [612]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder



In [613]:
# --- MLFLOW IMPORTS ---
import mlflow
import json 
# ----------------------

# --- MLFLOW SETUP ---
# Set the experiment name (it will create a folder/database in ./mlruns)
EXPERIMENT_NAME = "Bike_Sharing_Demand_TSCV"
mlflow.set_experiment(EXPERIMENT_NAME)
# --------------------

FEATURE_COLS = [
    # 'dteday',
    'season',
    'yr',
    'mnth',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
    'temp',
    'atemp',
    'hum',
    'windspeed',
    'hr',
    # 'casual',
    # 'registered',
    # 'bikes_cnt',
    # 'day'
    ] 



In [614]:
df1 = pd.read_csv("data/dataset/hour.csv")
df1.head()

# df1.info()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [615]:

# CONFIG
FILE_PATH = "data/dataset/hour.csv"
# FILE_PATH = "data/dataset/day.csv"
# 1. Loading the data
df = pd.read_csv(FILE_PATH, parse_dates=['dteday'])
df = df.sort_values(by='dteday').reset_index(drop=True)

TARGET_COL = 'cnt'

MIN_TEST_SAMPLES =  210 #30  day
# MIN_TEST_SAMPLES =  168 # hour

MAX_TRAIN_SAMPLE = None # 180 # day
# MAX_TRAIN_SAMPLE = 90 # hour

# SCORING_MARTIX = "neg_mean_absolute_error"
# SCORING_MARTIX = 'neg_mean_squared_error'
SCORING_MARTIX = "neg_root_mean_squared_error"
 
# TEST_SPLIT_DATE = '2012-10-31' 

print(f"Data loaded with {len(df)} rows and {len(df.columns)} columns.")
print("\n", df.info())

# 2. Feature engineering

df['is_weekend'] = np.where(df['weekday'].isin([5, 6]), 1, 0)

# Lag Feature Engineering (CRITICAL FOR TIME SERIES)
LAG_PERIOD = 1
df['lag_demand_1h'] = df['cnt'].shift(LAG_PERIOD)

# Fill NaN created by shifting
df['lag_demand_1h'] = df['lag_demand_1h'].fillna(0) 

# IMPORTANT: Add the lag feature to your FEATURE_COLS for training

print("Added lag feature: 'lag_demand_1h'")

hour_cycle = 24
df['hour_sin'] = np.sin(2 * np.pi * df['hr'] / hour_cycle)
df['hour_cos'] = np.cos(2 * np.pi * df['hr'] / hour_cycle)

# --- Day of Week (7-day cycle) ---
# Calculate the cycle period (7 days)
day_cycle = 7
df['dayofweek_sin'] = np.sin(2 * np.pi * df['weekday'] / day_cycle)
df['dayofweek_cos'] = np.cos(2 * np.pi * df['weekday'] / day_cycle)

# Assuming the data is sorted by time and the index frequency matches the lag unit (e.g., hourly data for shift(1))

# Lagged Demand (Previous Hour)
# Shift the target variable ('count') 

# Lagged Demand (Same Hour Yesterday)
# Shift by 24 time steps (24 hours ago)
df['lag_24h'] = df[TARGET_COL].shift(24)
df['lag_24h'] = df['lag_24h'].fillna(0) 

# Lagged Demand (Same Hour Last Week)
# Shift by 168 time steps (24 hours * 7 days ago)
df['lag_1wk'] = df[TARGET_COL].shift(168)
df['lag_1wk'] = df['lag_1wk'].fillna(0)

# You can apply shift to other features, e.g., weather conditions 
# df['temp_lag_1h'] = df['temp'].shift(1)

# Interaction: Temperature impact during peak usage hours
# Assuming 'temp' is a normalized temperature column
df['temp_x_hour_sin'] = df['temp'] * df['hour_sin']

# Interaction: Demand on weekends vs. workdays
df['temp_x_is_weekend'] = df['temp'] * df['is_weekend']

df['atemp_x_hour_sin'] = df['atemp'] * df['hour_sin']

# Interaction: Demand on weekends vs. workdays
df['atemp_x_is_weekend'] = df['atemp'] * df['is_weekend']

FEATURE_COLS.extend(['lag_demand_1h', 'atemp_x_is_weekend','atemp_x_hour_sin', 'temp_x_is_weekend', 'temp_x_hour_sin','lag_1wk', 'lag_24h', 'dayofweek_cos', 'dayofweek_sin', 'hour_cos', 'hour_sin',
                     'is_weekend']
                     ) 
df.head()



Data loaded with 17379 rows and 17 columns.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     17379 non-null  int64         
 1   dteday      17379 non-null  datetime64[ns]
 2   season      17379 non-null  int64         
 3   yr          17379 non-null  int64         
 4   mnth        17379 non-null  int64         
 5   hr          17379 non-null  int64         
 6   holiday     17379 non-null  int64         
 7   weekday     17379 non-null  int64         
 8   workingday  17379 non-null  int64         
 9   weathersit  17379 non-null  int64         
 10  temp        17379 non-null  float64       
 11  atemp       17379 non-null  float64       
 12  hum         17379 non-null  float64       
 13  windspeed   17379 non-null  float64       
 14  casual      17379 non-null  int64         
 15  registered  17379 non-null

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,...,hour_sin,hour_cos,dayofweek_sin,dayofweek_cos,lag_24h,lag_1wk,temp_x_hour_sin,temp_x_is_weekend,atemp_x_hour_sin,atemp_x_is_weekend
0,1,2011-01-01,1,0,1,0,0,6,0,1,...,0.0,1.0,-0.781831,0.62349,0.0,0.0,0.0,0.24,0.0,0.2879
1,24,2011-01-01,1,0,1,23,0,6,0,2,...,-0.258819,0.965926,-0.781831,0.62349,0.0,0.0,-0.119057,0.46,-0.117633,0.4545
2,23,2011-01-01,1,0,1,22,0,6,0,2,...,-0.5,0.866025,-0.781831,0.62349,0.0,0.0,-0.2,0.4,-0.20455,0.4091
3,22,2011-01-01,1,0,1,21,0,6,0,2,...,-0.707107,0.707107,-0.781831,0.62349,0.0,0.0,-0.282843,0.4,-0.289277,0.4091
4,21,2011-01-01,1,0,1,20,0,6,0,2,...,-0.866025,0.5,-0.781831,0.62349,0.0,0.0,-0.34641,0.4,-0.354291,0.4091


In [616]:
FEATURE_COLS

['season',
 'yr',
 'mnth',
 'holiday',
 'weekday',
 'workingday',
 'weathersit',
 'temp',
 'atemp',
 'hum',
 'windspeed',
 'hr',
 'lag_demand_1h',
 'atemp_x_is_weekend',
 'atemp_x_hour_sin',
 'temp_x_is_weekend',
 'temp_x_hour_sin',
 'lag_1wk',
 'lag_24h',
 'dayofweek_cos',
 'dayofweek_sin',
 'hour_cos',
 'hour_sin',
 'is_weekend']

In [617]:


# Mostly this one is used
# param_dist = {
#     'regressor__n_estimators': [100, 200, 300, 400], 
#     'regressor__max_depth': [10, 20, 30, None],      
#     'regressor__min_samples_split': [2, 5, 10],      
#     'regressor__min_samples_leaf': [1, 2, 4],        
#     }

param_dist = {
    'regressor__n_estimators': [200, 500, 800, 1000, 1500],
    'regressor__max_depth': [15, 30, 45, 60, None], # Exploring deeper values
    'regressor__min_samples_split': [2, 5, 10, 20, 40], # Testing higher regularization
    'regressor__min_samples_leaf': [1, 3, 5, 10, 15],  # Testing higher regularization       
    }

N_ITER_SEARCH = 15


NUMERICAL_FEATURES = []
CATEGORICAL_FEATURES = []

# Feature Type Identification
for col in FEATURE_COLS:
    col_dtype = df[col].dtype
    num_unique = df[col].nunique()
    
    if np.issubdtype(col_dtype, np.number) and 'float' in str(col_dtype):
        NUMERICAL_FEATURES.append(col)
    elif np.issubdtype(col_dtype, np.number) and num_unique <= 50:
        CATEGORICAL_FEATURES.append(col)
    elif np.issubdtype(col_dtype, np.number):
            NUMERICAL_FEATURES.append(col)
    elif col_dtype == 'object':
        CATEGORICAL_FEATURES.append(col)
        
print("\n--- Identified Feature Types ---")
print(f"Numerical Features: {NUMERICAL_FEATURES}")
print(f"Categorical Features: {CATEGORICAL_FEATURES}")

X = df[FEATURE_COLS]
y = df[TARGET_COL]

# 2a. Create pre-processing pipeline
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', NUMERICAL_FEATURES), 
        ('cat', categorical_transformer, CATEGORICAL_FEATURES)
    ],
    remainder='drop' 
)

n_splits = 5
n_iter_search = 10



tscv = TimeSeriesSplit(
        n_splits=n_splits, 
        max_train_size=MAX_TRAIN_SAMPLE, 
        test_size= MIN_TEST_SAMPLES
    )
    
cv_metrics = []

print("\n--- Starting Expanding Window Cross-Validation with Tuning ---")

# Inner split for tuning (used inside RandomizedSearchCV)
inner_cv = TimeSeriesSplit(n_splits=3,
                           test_size=MIN_TEST_SAMPLES)

# --- START MLFLOW RUN ---
# This context manager automatically starts and ends a run
with mlflow.start_run():
    
    # 1. Log the featured columns as a **parameter**
    mlflow.log_param("featured_columns", json.dumps(FEATURE_COLS)) 
    
    # Also log other parameters that define the experiment
    mlflow.log_param("FILE_PATH", FILE_PATH)
    mlflow.log_param("n_splits", n_splits)
    mlflow.log_param("n_iter_search", n_iter_search)
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("SCORING_MARTIX", SCORING_MARTIX)    
    
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        
        # Prepare Data for Current Fold
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        
        print(f"\n[Fold {fold + 1}/{n_splits}] Training size: {len(X_train_fold)}, Testing size: {len(X_test_fold)}")

        # Define the Full ML Pipeline
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
        ])
        
        # Perform Randomized Search on the CURRENT Training Data
        random_search = RandomizedSearchCV(
            full_pipeline, 
            param_distributions=param_dist, 
            n_iter=n_iter_search, 
            scoring= SCORING_MARTIX, 
            cv=inner_cv, 
            random_state=42,
            n_jobs=-1,
            verbose=0
        )
        
        print(f"  Tuning Random Forest on current training window (n_iter={n_iter_search})...")
        random_search.fit(X_train_fold, y_train_fold)
        
        # Use the Best Model found to predict on the outer test fold
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X_test_fold)
        
        # Evaluate Metrics for this Fold
        rmse = np.sqrt(mean_squared_error(y_test_fold, y_pred))
        mae = mean_absolute_error(y_test_fold, y_pred)
        mse_rf = mean_squared_error(y_test_fold, y_pred)
        r2_rf = r2_score(y_test_fold, y_pred)
        
        # 2. Log Fold Metrics as **metrics**
        mlflow.log_metric(f"fold_{fold+1}_rmse", rmse)
        mlflow.log_metric(f"fold_{fold+1}_mae", mae)
        mlflow.log_metric(f"fold_{fold+1}_r2", r2_rf)
        mlflow.log_metric(f"fold_{fold+1}_mse", mse_rf)
        
        # 3. Log Best Parameters for the Fold as **parameters**
        # Note: You may want to simplify this or only log the final best parameters
        # For simplicity, we log all best params for the fold as a parameter
        fold_params = {f"fold_{fold+1}_best_params": json.dumps(random_search.best_params_)}
        mlflow.log_params(fold_params) 
        
        cv_metrics.append({'RMSE': rmse,
                           'MAE': mae, 
                           'MSE': mse_rf,
                           'r2':r2_rf,
                           'Best_Params': random_search.best_params_})
        
        print(f"  Best Parameters: {random_search.best_params_}")
        print(f"  Fold Metrics: RMSE={rmse:.2f}, MAE={mae:.2f}")

    # Calculate and Report Averages
    avg_rmse = np.mean([m['RMSE'] for m in cv_metrics])
    avg_mae = np.mean([m['MAE'] for m in cv_metrics])
    avg_mse = np.mean([m['MSE'] for m in cv_metrics])
    avg_r2 = np.mean([m['r2'] for m in cv_metrics])

    # 4. Log the final average metrics
    mlflow.log_metric("avg_rmse", avg_rmse)
    mlflow.log_metric("avg_mae", avg_mae)
    mlflow.log_metric("avg_mse", avg_mse)
    mlflow.log_metric("avg_r2", avg_r2)

    print("\n--- Cross-Validation Summary ---")
    print(f"Average RMSE over {n_splits} folds: {avg_rmse:.2f}")
    print(f"Average MAE over {n_splits} folds: {avg_mae:.2f}")
    print(f"Average MSE over {n_splits} folds: {avg_mse:.2f}")
    print(f"Average r2 over {n_splits} folds: {avg_r2:.2f}")

    print('avg_RMSE', avg_rmse, 'avg_MAE', avg_mae, 'avg_r2', avg_r2, 'avg_mse', avg_mse, 'individual_folds', cv_metrics)
# The run is automatically ended here


--- Identified Feature Types ---
Numerical Features: ['temp', 'atemp', 'hum', 'windspeed', 'lag_demand_1h', 'atemp_x_is_weekend', 'atemp_x_hour_sin', 'temp_x_is_weekend', 'temp_x_hour_sin', 'lag_1wk', 'lag_24h', 'dayofweek_cos', 'dayofweek_sin', 'hour_cos', 'hour_sin']
Categorical Features: ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'hr', 'is_weekend']

--- Starting Expanding Window Cross-Validation with Tuning ---

[Fold 1/5] Training size: 16329, Testing size: 210
  Tuning Random Forest on current training window (n_iter=10)...
  Best Parameters: {'regressor__n_estimators': 800, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 3, 'regressor__max_depth': None}
  Fold Metrics: RMSE=88.08, MAE=55.76

[Fold 2/5] Training size: 16539, Testing size: 210
  Tuning Random Forest on current training window (n_iter=10)...
  Best Parameters: {'regressor__n_estimators': 800, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 3, 'reg

KeyboardInterrupt: 

In [605]:
print(df['cnt'].max())
print(df['cnt'].min())
print(df.shape)

977
1
(17379, 29)


In [606]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

In [607]:
df = pd.read_csv("data/dataset/day.csv")
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [608]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


In [609]:
df = df.drop(columns=['instant', 'dteday', 'casual', 'registered'])  # Dropping unnecessary columns

# Apply One-Hot Encoding to categorical features
df_encoded = pd.get_dummies(df, columns=['season', 'yr', 'holiday', 'workingday', 'mnth', 'weekday', 'weathersit'], drop_first=True)

print(df_encoded.shape)

# Display the first few rows of the encoded dataset
df_encoded.head()


(731, 30)


Unnamed: 0,temp,atemp,hum,windspeed,cnt,season_2,season_3,season_4,yr_1,holiday_1,...,mnth_11,mnth_12,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_2,weathersit_3
0,0.344167,0.363625,0.805833,0.160446,985,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,0.363478,0.353739,0.696087,0.248539,801,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,0.196364,0.189405,0.437273,0.248309,1349,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,0.2,0.212122,0.590435,0.160296,1562,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
4,0.226957,0.22927,0.436957,0.1869,1600,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [610]:
X = df_encoded.drop(columns=['cnt'])  # Features
y = df_encoded['cnt']  # Target variable

# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [611]:
X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

IndexError: positional indexers are out-of-bounds