<a href="https://colab.research.google.com/github/SaiRaghavTelugu/SaiRaghav_RideWise/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1 — mount drive (Colab) & imports
from google.colab import drive           # comment/remove if using local VS Code
drive.mount('/content/drive')

import os, warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # Import seaborn for enhanced visualizations
%matplotlib inline

In [None]:
# Cell 2 — load datasets (edit paths if needed)
# Locations of the two input CSVs under the Drive mount point (/content/drive).
DAY_PATH = "/content/drive/MyDrive/day.csv"   # change if your file is in a folder
HOUR_PATH = "/content/drive/MyDrive/hour.csv"

# Read the daily and hourly tables into DataFrames.
day = pd.read_csv(DAY_PATH)
hour = pd.read_csv(HOUR_PATH)

# Parse the date column to datetime64.
# NOTE: only the hourly frame is guarded; `day` must contain a 'dteday' column
# or the next line raises KeyError.
day['dteday'] = pd.to_datetime(day['dteday'])
if 'dteday' in hour.columns:
    hour['dteday'] = pd.to_datetime(hour['dteday'])

print("day shape:", day.shape)
print("hour shape:", hour.shape)

# Peek at the first 5 rows of each frame.
display(day.head(5))
display(hour.head(5))


In [None]:
# Cell 3 — schema, dtypes and null counts
# Column dtypes and per-column null counts, daily frame first, then hourly.
for frame_name, frame, dtype_heading in (
    ("day", day, "=== day columns and dtypes ==="),
    ("hour", hour, "\n=== hour columns and dtypes ==="),
):
    print(dtype_heading)
    print(frame.dtypes)
    print(f"\nNull counts ({frame_name}):")
    print(frame.isnull().sum())

# Number of fully duplicated rows, hourly frame first, then daily.
for duplicate_heading, frame in (
    ("\n=== duplicates in hour dataset===", hour),
    ("\n=== duplicates in day dataset===", day),
):
    print(duplicate_heading)
    print(frame.duplicated().sum())



In [None]:
# Cell 4 — target quick stats + histogram (day)
# Summary statistics and a 30-bin histogram of the target `cnt`,
# first for the daily frame and then for the hourly frame.
for frame_name, frame, stats_heading in (
    ("day", day, "day cnt stats:"),
    ("hour", hour, "\nhour cnt stats:"),
):
    target = frame['cnt']

    print(stats_heading)
    print(target.describe())

    plt.figure(figsize=(6, 3))
    plt.hist(target, bins=30)
    plt.title(f"{frame_name}: cnt distribution")
    plt.xlabel("cnt")
    plt.ylabel("frequency")
    plt.show()


In [None]:
# Cell 5 — simple time-series plot (day)
# Daily series: one point per calendar day.
plt.figure(figsize=(12,4))
plt.plot(day['dteday'], day['cnt'], marker='.', linewidth=0.5)
plt.title("Daily rentals over time")
plt.xlabel("date"); plt.ylabel("cnt")
plt.tight_layout()
plt.show()

# Hourly series (first 1000 rows only; the full series is too dense to read).
# 'dteday' carries only the calendar date, so plotting against it puts every
# row of a day at the same x position and draws vertical zig-zags. Adding the
# 'hr' offset gives each row its own timestamp on the x-axis.
hour_subset = hour.iloc[:1000]
hour_timestamps = hour_subset['dteday'] + pd.to_timedelta(hour_subset['hr'], unit='h')

plt.figure(figsize=(12,3))
plt.plot(hour_timestamps, hour_subset['cnt'], linewidth=0.5)
plt.title("Hourly rentals (first 1000 rows) — noisy")
plt.xlabel("date"); plt.ylabel("cnt")
plt.tight_layout()
plt.show()


In [None]:
# Line plot of cnt vs hour
# Average rental count for each hour of the day, drawn as a marked line.
plt.figure(figsize=(10, 4))
hourly_profile = hour.groupby('hr')['cnt'].mean()
hourly_profile.plot.line(marker='o')
plt.title("Average hourly rentals by Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Average rental count")
plt.xticks(range(24))  # one tick per hour, 0..23
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Cell 8 — Scatter plots for numerical features vs cnt (day)

# One scatter plot of daily `cnt` against each continuous weather feature.
# Each spec is (column, figure title, x-axis label).
day_scatter_specs = (
    ('temp', "Daily rentals vs Temperature", "Temperature (normalized)"),
    ('atemp', "Daily rentals vs 'Feeling' Temperature", "'Feeling' Temperature (normalized)"),
    ('hum', "Daily rentals vs Humidity", "Humidity (normalized)"),
    ('windspeed', "Daily rentals vs Windspeed", "Windspeed (normalized)"),
)

for column, figure_title, x_label in day_scatter_specs:
    plt.figure(figsize=(8, 4))
    plt.scatter(day[column], day['cnt'], alpha=0.5)
    plt.title(figure_title)
    plt.xlabel(x_label)
    plt.ylabel("Rental count (cnt)")
    plt.tight_layout()
    plt.show()

In [None]:
# Mean daily `cnt` per level of each categorical feature, one bar chart each.
# Each spec is (column, figure title, x-axis label, figure size).
day_bar_specs = (
    ('season', "Average daily rentals by Season",
     "Season (1: spring, 2: summer, 3: fall, 4: winter)", (8, 4)),
    ('yr', "Average daily rentals by Year",
     "Year (0: 2011, 1: 2012)", (8, 4)),
    ('mnth', "Average daily rentals by Month",
     "Month", (10, 4)),
    ('holiday', "Average daily rentals by Holiday",
     "Holiday (0: no, 1: yes)", (8, 4)),
    ('weekday', "Average daily rentals by Weekday",
     "Weekday (0: Sun, 1: Mon, ..., 6: Sat)", (10, 4)),
    ('workingday', "Average daily rentals by Workingday",
     "Workingday (0: no, 1: yes)", (8, 4)),
    ('weathersit', "Average daily rentals by Weather Situation",
     "Weather Situation (1: good, 2: moderate, 3: bad, 4: worse)", (8, 4)),
)

for column, figure_title, x_label, figure_size in day_bar_specs:
    plt.figure(figsize=figure_size)
    day.groupby(column)['cnt'].mean().plot.bar()
    plt.title(figure_title)
    plt.xlabel(x_label)
    plt.ylabel("Average rental count")
    plt.xticks(rotation=0)  # keep category labels horizontal
    plt.tight_layout()
    plt.show()

NameError: name 'plt' is not defined

## Explore hourly data

### Subtask:
Conduct exploratory data analysis on the `hour` dataset, similar to what was done for the `day` dataset, to understand hourly patterns in bike rentals.


**Reasoning**:
Mirror the daily EDA on the `hour` DataFrame: (1) plot a histogram of `cnt`; (2) plot `cnt` over time; (3) compute and display the correlation matrix of the numerical features; (4) draw a scatter plot of each numerical feature against `cnt`; and (5) draw a bar plot of the average `cnt` for each category of every categorical feature.



In [None]:
# 1. Histogram of cnt (hour)
plt.figure(figsize=(6,3))
plt.hist(hour['cnt'], bins=50)
plt.title("hour: cnt distribution")
plt.xlabel("cnt")
plt.ylabel("frequency")
plt.show()

# 2. Simple time-series plot (hour) - plotting a subset due to dataset size/noise
# 'dteday' holds only the calendar date, so every row of a day would share one
# x position; add the 'hr' offset so each row is drawn at its own timestamp.
hour_ts_subset = hour.iloc[:2000]
hour_ts_index = hour_ts_subset['dteday'] + pd.to_timedelta(hour_ts_subset['hr'], unit='h')

plt.figure(figsize=(12,4))
plt.plot(hour_ts_index, hour_ts_subset['cnt'], linewidth=0.5)
plt.title("Hourly rentals (first 2000 rows)")
plt.xlabel("date"); plt.ylabel("cnt")
plt.tight_layout()
plt.show()

# 3. Correlation matrix for numerical features (hour)
# 'casual' and 'registered' are included here for inspection only; the feature
# engineering step drops them from the model inputs.
numerical_cols_hour = ['temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt']
correlation_matrix_hour = hour[numerical_cols_hour].corr()
print("\nCorrelation matrix (hour):")
display(correlation_matrix_hour)

# 4. Scatter plots for numerical features vs cnt (hour)
numerical_features_hour = ['temp', 'atemp', 'hum', 'windspeed']
for feature in numerical_features_hour:
    plt.figure(figsize=(8, 4))
    plt.scatter(hour[feature], hour['cnt'], alpha=0.2) # Use alpha for better visualization with many points
    plt.title(f"Hourly rentals vs {feature.capitalize()}")
    plt.xlabel(f"{feature.capitalize()} (normalized)")
    plt.ylabel("Rental count (cnt)")
    plt.tight_layout()
    plt.show()

# 5. Bar plots for categorical features vs cnt (hour)
categorical_features_hour = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']
for feature in categorical_features_hour:
    plt.figure(figsize=(8, 4))
    hour.groupby(feature)['cnt'].mean().plot(kind='bar')
    plt.title(f"Average hourly rentals by {feature.capitalize()}")
    plt.xlabel(feature.capitalize())
    plt.ylabel("Average rental count")
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

In [None]:
# Feature Engineering (Day)
def fe_day(df):
    """Build the feature matrix and target for the daily table.

    Rows are ordered by ``dteday``; calendar parts, cyclical month/weekday
    encodings and a day-count trend are added; ``season`` and ``weathersit``
    are one-hot encoded with the first level dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``dteday`` column plus ``season``, ``weathersit``
        and ``cnt``. The caller's frame is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The features (``instant``, ``dteday``, ``casual``, ``registered`` and
        ``cnt`` removed when present) and the ``cnt`` target, in date order.
    """
    frame = df.copy().sort_values('dteday').reset_index(drop=True)
    dates = frame['dteday']

    # Calendar parts. 'weekday' uses the pandas convention (Monday == 0) and
    # overwrites any 'weekday' column already present in the input.
    frame['year'] = dates.dt.year
    frame['month'] = dates.dt.month
    frame['weekday'] = dates.dt.weekday

    # Cyclical encodings: map month (period 12) and weekday (period 7) to
    # angles so the first and last values end up adjacent.
    month_angle = 2 * np.pi * frame['month'] / 12
    weekday_angle = 2 * np.pi * frame['weekday'] / 7
    frame['sin_month'] = np.sin(month_angle)
    frame['cos_month'] = np.cos(month_angle)
    frame['sin_weekday'] = np.sin(weekday_angle)
    frame['cos_weekday'] = np.cos(weekday_angle)

    # Linear trend: whole days elapsed since the earliest date.
    frame['trend'] = (dates - dates.min()).dt.days

    frame = pd.get_dummies(frame, columns=['season', 'weathersit'], drop_first=True)

    # Split into target and features; identifiers and the casual/registered
    # counts (which sum into the target) are excluded from the features.
    target = frame['cnt']
    non_feature_cols = ['instant', 'dteday', 'casual', 'registered', 'cnt']
    features = frame.drop(columns=non_feature_cols, errors='ignore')
    return features, target

# Build the daily feature matrix and target from the loaded `day` frame,
# then report the (rows, columns) shape of the features.
X_day, y_day = fe_day(day)
print("Day features:", X_day.shape)

In [None]:
# Feature Engineering (Hour)
def fe_hour(df):
    """Build the feature matrix and target for the hourly table.

    Rows are put in chronological order (by ``dteday`` then ``hr``); calendar
    parts, cyclical month/weekday/hour encodings and a fractional-day trend
    are added; ``season`` and ``weathersit`` are one-hot encoded with the
    first level dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``dteday`` column, an integer hour-of-day ``hr``
        column, plus ``season``, ``weathersit`` and ``cnt``. The caller's
        frame is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The features (``instant``, ``dteday``, ``casual``, ``registered``,
        ``cnt`` and ``hr`` removed when present) and the ``cnt`` target, both
        in chronological order.
    """
    # Sort on date AND hour. Sorting on 'dteday' alone leaves the order of the
    # rows inside each day unspecified, which makes any positional
    # (time-aware) split taken from the result unreliable at the boundary.
    df = df.copy().sort_values(['dteday', 'hr']).reset_index(drop=True)

    # Date parts. 'weekday' uses the pandas convention (Monday == 0) and
    # overwrites any 'weekday' column already present in the input.
    df['year'] = df['dteday'].dt.year
    df['month'] = df['dteday'].dt.month
    df['weekday'] = df['dteday'].dt.weekday
    df['hour'] = df['hr'] # 'hr' already exists in hour data

    # Seasonality: cyclical encodings with periods 12 (month), 7 (weekday)
    # and 24 (hour of day).
    df['sin_month'] = np.sin(2 * np.pi * df['month'] / 12)
    df['cos_month'] = np.cos(2 * np.pi * df['month'] / 12)
    df['sin_weekday'] = np.sin(2 * np.pi * df['weekday'] / 7)
    df['cos_weekday'] = np.cos(2 * np.pi * df['weekday'] / 7)
    df['sin_hour'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['cos_hour'] = np.cos(2 * np.pi * df['hour'] / 24)

    # Trend: fractional days elapsed since the earliest observation. The
    # origin is the minimum of the combined date+hour timestamp rather than
    # (min date + min hour), which are not guaranteed to be the same row.
    timestamp = df['dteday'] + pd.to_timedelta(df['hr'], unit='h')
    df['trend'] = (timestamp - timestamp.min()).dt.total_seconds() / (24 * 3600)

    # One-hot encode categorical
    df = pd.get_dummies(df, columns=['season', 'weathersit'], drop_first=True)

    # Final: drop identifiers, the target and its casual/registered
    # components, and 'hr' (kept as 'hour').
    drop_cols = ['instant', 'dteday', 'casual', 'registered', 'cnt', 'hr']
    X = df.drop(columns=[c for c in drop_cols if c in df.columns])
    y = df['cnt']
    return X, y

# Build the hourly feature matrix and target from the loaded `hour` frame,
# then report the feature shape and preview the first five feature rows.
X_hour, y_hour = fe_hour(hour)
print("Hour features:", X_hour.shape)
print(X_hour.head(5))

In [None]:
from sklearn.model_selection import train_test_split


def _show_split_shapes(heading, **named_parts):
    """Print `heading`, then one 'name: shape' line per keyword argument."""
    print(heading)
    for part_name, part in named_parts.items():
        print(f"{part_name}:", part.shape)


# ---------------------------------------------------------------- day ----
# Shuffled 80/20 hold-out.
(X_day_train_rand, X_day_test_rand,
 y_day_train_rand, y_day_test_rand) = train_test_split(
    X_day, y_day, test_size=0.2, random_state=42)

# Chronological 80/20 hold-out: order by date, rebuild the features on the
# ordered frame, then cut positionally so the test rows are the latest ones.
day_sorted = day.sort_values('dteday').reset_index(drop=True)
X_day_sorted, y_day_sorted = fe_day(day_sorted)

split_index_day = int(len(day_sorted) * 0.8)

X_day_train_time, X_day_test_time = (
    X_day_sorted.iloc[:split_index_day], X_day_sorted.iloc[split_index_day:])
y_day_train_time, y_day_test_time = (
    y_day_sorted.iloc[:split_index_day], y_day_sorted.iloc[split_index_day:])

_show_split_shapes(
    "Day dataset random split shapes:",
    X_day_train_rand=X_day_train_rand,
    X_day_test_rand=X_day_test_rand,
    y_day_train_rand=y_day_train_rand,
    y_day_test_rand=y_day_test_rand,
)
_show_split_shapes(
    "\nDay dataset time-aware split shapes:",
    X_day_train_time=X_day_train_time,
    X_day_test_time=X_day_test_time,
    y_day_train_time=y_day_train_time,
    y_day_test_time=y_day_test_time,
)

# --------------------------------------------------------------- hour ----
# Shuffled 80/20 hold-out.
(X_hour_train_rand, X_hour_test_rand,
 y_hour_train_rand, y_hour_test_rand) = train_test_split(
    X_hour, y_hour, test_size=0.2, random_state=42)

# Chronological 80/20 hold-out, built the same way as for the daily data.
hour_sorted = hour.sort_values('dteday').reset_index(drop=True)
X_hour_sorted, y_hour_sorted = fe_hour(hour_sorted)

split_index_hour = int(len(hour_sorted) * 0.8)

X_hour_train_time, X_hour_test_time = (
    X_hour_sorted.iloc[:split_index_hour], X_hour_sorted.iloc[split_index_hour:])
y_hour_train_time, y_hour_test_time = (
    y_hour_sorted.iloc[:split_index_hour], y_hour_sorted.iloc[split_index_hour:])

_show_split_shapes(
    "\nHour dataset random split shapes:",
    X_hour_train_rand=X_hour_train_rand,
    X_hour_test_rand=X_hour_test_rand,
    y_hour_train_rand=y_hour_train_rand,
    y_hour_test_rand=y_hour_test_rand,
)
_show_split_shapes(
    "\nHour dataset time-aware split shapes:",
    X_hour_train_time=X_hour_train_time,
    X_hour_test_time=X_hour_test_time,
    y_hour_train_time=y_hour_train_time,
    y_hour_test_time=y_hour_test_time,
)

NameError: name 'X_day' is not defined

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Define the models to evaluate
# Candidate regressors keyed by the display name used in the results table.
# Every estimator that accepts a seed gets the same one (42).
_seed_kwargs = {"random_state": 42}
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("Random Forest", RandomForestRegressor(**_seed_kwargs)),
    ("Extra Trees", ExtraTreesRegressor(**_seed_kwargs)),
    ("XGBoost", xgb.XGBRegressor(**_seed_kwargs)),
])

In [None]:
def evaluate_models(X_train, X_test, y_train, y_test, label, split_type):
    """Fit every estimator in the module-level ``models`` dict and score it.

    Each estimator is fitted in place on ``(X_train, y_train)`` and scored on
    ``(X_test, y_test)``.

    Parameters
    ----------
    X_train, X_test : feature matrices for fitting and scoring.
    y_train, y_test : matching targets.
    label : value stored in the ``dataset`` column of every result row.
    split_type : value stored in the ``split_type`` column of every result row.

    Returns
    -------
    pandas.DataFrame
        One row per estimator, in ``models`` order, with columns ``dataset``,
        ``split_type``, ``model``, ``MAE``, ``RMSE`` and ``R2``.
    """
    def _score(model_name, estimator):
        # Fit, predict on the hold-out set, and collect the three metrics.
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)
        return {
            "dataset": label,
            "split_type": split_type,
            "model": model_name,
            "MAE": mean_absolute_error(y_test, predicted),
            "RMSE": np.sqrt(mean_squared_error(y_test, predicted)),
            "R2": r2_score(y_test, predicted),
        }

    score_rows = [_score(model_name, estimator) for model_name, estimator in models.items()]
    return pd.DataFrame(score_rows)

# Evaluate models on day dataset with random split
# Score every candidate model on the shuffled hold-out of each dataset.
res_day_rand = evaluate_models(
    X_day_train_rand, X_day_test_rand, y_day_train_rand, y_day_test_rand,
    label="day", split_type="random")

res_hour_rand = evaluate_models(
    X_hour_train_rand, X_hour_test_rand, y_hour_train_rand, y_hour_test_rand,
    label="hour", split_type="random")

# Concatenate results. ignore_index=True gives the combined table a unique
# 0..n-1 index; without it the per-dataset row labels repeat, so a label-based
# lookup such as results_df.loc[0] would return one row per dataset.
results_df = pd.concat([res_day_rand, res_hour_rand], ignore_index=True)
print(results_df)

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 hold-out for both datasets (same seed for both).

# Random split for day dataset
X_day_train_rand, X_day_test_rand, y_day_train_rand, y_day_test_rand = train_test_split(
    X_day, y_day, test_size=0.2, random_state=42)

# Random split for hour dataset
X_hour_train_rand, X_hour_test_rand, y_hour_train_rand, y_hour_test_rand = train_test_split(
    X_hour, y_hour, test_size=0.2, random_state=42)

print("Day dataset random split shapes:")
print("X_day_train_rand:", X_day_train_rand.shape)
print("X_day_test_rand:", X_day_test_rand.shape)
print("y_day_train_rand:", y_day_train_rand.shape)
print("y_day_test_rand:", y_day_test_rand.shape)

print("\nHour dataset random split shapes:")
print("X_hour_train_rand:", X_hour_train_rand.shape)
print("X_hour_test_rand:", X_hour_test_rand.shape)
print("y_hour_train_rand:", y_hour_train_rand.shape)
# Report the TEST target here; this line previously printed the training
# target's shape under the test label.
print("y_hour_test_rand:", y_hour_test_rand.shape)

In [None]:
# Visualize model performance (MAE, RMSE, R2) for each dataset separately

# Melt the results_df for easier plotting
# Long format: one row per (dataset, split_type, model, metric).
results_melted = results_df.melt(
    id_vars=['dataset', 'split_type', 'model'],
    value_vars=['MAE', 'RMSE', 'R2'],
    var_name='Metric',
    value_name='Score',
)

datasets = results_melted['dataset'].unique()

# (value in the 'Metric' column, y-axis label) in plotting order.
metric_axis_labels = (('MAE', 'MAE'), ('RMSE', 'RMSE'), ('R2', 'R2 Score'))

# One bar chart per dataset and metric, comparing the models side by side.
for dataset in datasets:
    dataset_results = results_melted[results_melted['dataset'] == dataset]

    for metric, y_axis_label in metric_axis_labels:
        metric_rows = dataset_results[dataset_results['Metric'] == metric]

        plt.figure(figsize=(12, 6))
        sns.barplot(data=metric_rows, x='model', y='Score', palette='viridis')
        plt.title(f'Model {metric} for {dataset.capitalize()} Dataset')
        plt.ylabel(y_axis_label)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

In [None]:
# # Fine-tune XGBoost for Day Dataset (Time-Aware Split) using Randomized and Grid Search

# # Step 1: Select Data
# X_train = X_day_train_time
# y_train = y_day_train_time
# X_test = X_day_test_time
# y_test = y_day_test_time

# # Step 2: Define a Wide Hyperparameter Distribution for Randomized Search
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# import xgboost as xgb
# import numpy as np
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# param_dist = {
#     'n_estimators': [100, 200, 500, 1000],
#     'learning_rate': [0.01, 0.05, 0.1, 0.2],
#     'max_depth': [3, 4, 5, 6, 7],
#     'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
#     'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
#     'gamma': [0, 0.1, 0.2, 0.3, 0.4]
# }

# # Step 3: Perform Randomized Search with Cross-Validation
# xgb_model = xgb.XGBRegressor(random_state=42)

# # Set a reasonable number of iterations for RandomizedSearchCV
# n_iter_search = 50

# random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist,
#                                    n_iter=n_iter_search, scoring='neg_mean_squared_error',
#                                    cv=5, verbose=1, random_state=42, n_jobs=-1)

# print("Performing randomized search for XGBoost (Day Dataset)...")
# random_search.fit(X_train, y_train)

# print("\nBest parameters from randomized search for XGBoost (Day Dataset):")
# best_params_rand_xgb_day = random_search.best_params_
# print(best_params_rand_xgb_day)

# # Step 4: Define a Narrower Hyperparameter Grid based on Randomized Search results
# # Adjust the grid based on the best parameters found in the randomized search.
# # This is an example, you would typically refine this based on the actual output
# param_grid = {
#     'n_estimators': [best_params_rand_xgb_day['n_estimators'] - 100, best_params_rand_xgb_day['n_estimators'], best_params_rand_xgb_day['n_estimators'] + 100],
#     'learning_rate': [best_params_rand_xgb_day['learning_rate'] * 0.9, best_params_rand_xgb_day['learning_rate'], best_params_rand_xgb_day['learning_rate'] * 1.1],
#     'max_depth': [best_params_rand_xgb_day['max_depth'] - 1, best_params_rand_xgb_day['max_depth'], best_params_rand_xgb_day['max_depth'] + 1],
#     'colsample_bytree': [best_params_rand_xgb_day['colsample_bytree'] * 0.9, best_params_rand_xgb_day['colsample_bytree'], best_params_rand_xgb_day['colsample_bytree'] * 1.1],
#     'subsample': [best_params_rand_xgb_day['subsample'] * 0.9, best_params_rand_xgb_day['subsample'], best_params_rand_xgb_day['subsample'] * 1.1],
#     'gamma': [max(0, best_params_rand_xgb_day['gamma'] - 0.1), best_params_rand_xgb_day['gamma'], best_params_rand_xgb_day['gamma'] + 0.1]
# }

# # Ensure values are within reasonable bounds (e.g., colsample_bytree, subsample between 0 and 1)
# for param in ['learning_rate', 'colsample_bytree', 'subsample', 'gamma']:
#     if param in param_grid:
#         param_grid[param] = [max(0, p) for p in param_grid[param]]
#         if param in ['colsample_bytree', 'subsample']:
#              param_grid[param] = [min(1.0, p) for p in param_grid[param]]
#         # Remove duplicates and sort
#         param_grid[param] = sorted(list(set(param_grid[param])))

# # Ensure n_estimators and max_depth are integers
# for param in ['n_estimators', 'max_depth']:
#      if param in param_grid:
#          param_grid[param] = [int(p) for p in param_grid[param] if p is not None and p > 0]
#          param_grid[param] = sorted(list(set(param_grid[param])))


# print("\nDefined narrower grid for grid search for XGBoost (Day Dataset):")
# print(param_grid)


# # Step 5: Perform Grid Search with Cross-Validation
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
#                            scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# print("\nPerforming grid search for XGBoost (Day Dataset)...")
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters from grid search
# best_params_grid_xgb_day = grid_search.best_params_
# print("\nBest parameters from grid search for XGBoost (Day Dataset):")
# print(best_params_grid_xgb_day)

# # Step 6: Train Final Model with Best Hyperparameters
# tuned_xgb_model_day = xgb.XGBRegressor(**best_params_grid_xgb_day, random_state=42)
# tuned_xgb_model_day.fit(X_train, y_train)

# # Step 7: Evaluate Tuned Model
# preds_xgb_day = tuned_xgb_model_day.predict(X_test)
# mae_xgb_day = mean_absolute_error(y_test, preds_xgb_day)
# rmse_xgb_day = np.sqrt(mean_squared_error(y_test, preds_xgb_day))
# r2_xgb_day = r2_score(y_test, preds_xgb_day)

# print("\nTuned XGBoost Model Performance (Day Dataset, Time-Aware Split - Randomized + Grid Search):")
# print(f"MAE: {mae_xgb_day:.4f}")
# print(f"RMSE: {rmse_xgb_day:.4f}")
# print(f"R2 Score: {r2_xgb_day:.4f}")

# # Step 8: Finish task (Summary will be provided in a separate message)

In [None]:
# Fine-tune XGBoost for Hour Dataset (Random Split) using Randomized and Grid Search

# Step 1: Select Data (using random split for hour dataset)
# Tune on the shuffled hourly split: train parts feed the searches, test parts
# are held back for the final evaluation.
X_train, y_train = X_hour_train_rand, y_hour_train_rand
X_test, y_test = X_hour_test_rand, y_hour_test_rand

# Step 2: Define a Wide Hyperparameter Distribution for Randomized Search
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Candidate values sampled by the randomized search, keyed by XGBRegressor
# keyword argument.
param_dist_hour = dict(
    n_estimators=[100, 200, 500, 1000],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=list(range(3, 11)),  # 3..10: wider depth range for the hourly data
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.3, 0.4],
    reg_alpha=[0, 0.1, 0.5, 1],   # L1 regularization strength
    reg_lambda=[0, 0.1, 0.5, 1],  # L2 regularization strength
)

# Step 3: Perform Randomized Search with Cross-Validation
# Base estimator shared by the randomized search here and the grid search
# later in this cell; only the seed is fixed, everything else is searched.
xgb_model_hour = xgb.XGBRegressor(random_state=42)

# Set a reasonable number of iterations for RandomizedSearchCV
# (50 sampled settings x 3 folds = 150 fits, plus the final refit).
n_iter_search_hour = 50 # Can adjust based on computational resources

# The score is negated MSE, so a higher (closer to zero) score is better.
random_search_hour = RandomizedSearchCV(estimator=xgb_model_hour, param_distributions=param_dist_hour,
                                   n_iter=n_iter_search_hour, scoring='neg_mean_squared_error', # Using neg_mean_squared_error for tuning
                                   cv=3, verbose=1, random_state=42, n_jobs=-1) # Reduced CV folds for faster search

print("Performing randomized search for XGBoost (Hour Dataset)...")
random_search_hour.fit(X_train, y_train)

# Keep the winning setting; it seeds the narrower grid built next.
print("\nBest parameters from randomized search for XGBoost (Hour Dataset):")
best_params_rand_xgb_hour = random_search_hour.best_params_
print(best_params_rand_xgb_hour)

# Step 4: Define a Narrower Hyperparameter Grid based on Randomized Search results
# Adjust the grid based on the best parameters found in the randomized search.
# This is an example, you would typically refine this based on the actual output
# Up to three candidates per hyperparameter, centred on the randomized-search
# winner: +/-100 trees, +/-1 depth level, x0.9 / x1.1 for the rate and the two
# sampling fractions, +/-0.1 for gamma and the two regularization terms.
param_grid_hour = {
    'n_estimators': [best_params_rand_xgb_hour['n_estimators'] - 100, best_params_rand_xgb_hour['n_estimators'], best_params_rand_xgb_hour['n_estimators'] + 100],
    'learning_rate': [best_params_rand_xgb_hour['learning_rate'] * 0.9, best_params_rand_xgb_hour['learning_rate'], best_params_rand_xgb_hour['learning_rate'] * 1.1],
    'max_depth': [best_params_rand_xgb_hour['max_depth'] - 1, best_params_rand_xgb_hour['max_depth'], best_params_rand_xgb_hour['max_depth'] + 1],
    'colsample_bytree': [best_params_rand_xgb_hour['colsample_bytree'] * 0.9, best_params_rand_xgb_hour['colsample_bytree'], best_params_rand_xgb_hour['colsample_bytree'] * 1.1],
    'subsample': [best_params_rand_xgb_hour['subsample'] * 0.9, best_params_rand_xgb_hour['subsample'], best_params_rand_xgb_hour['subsample'] * 1.1],
    'gamma': [max(0, best_params_rand_xgb_hour['gamma'] - 0.1), best_params_rand_xgb_hour['gamma'], best_params_rand_xgb_hour['gamma'] + 0.1],
    'reg_alpha': [max(0, best_params_rand_xgb_hour['reg_alpha'] - 0.1), best_params_rand_xgb_hour['reg_alpha'], best_params_rand_xgb_hour['reg_alpha'] + 0.1],
    'reg_lambda': [max(0, best_params_rand_xgb_hour['reg_lambda'] - 0.1), best_params_rand_xgb_hour['reg_lambda'], best_params_rand_xgb_hour['reg_lambda'] + 0.1]
}

# Clamp the float-valued parameters to valid ranges: every value is floored at
# 0.0, and the two sampling fractions are additionally capped at 1.0.
# NOTE: learning_rate is only floored at 0.0 here, not forced strictly above 0.
for param in ['learning_rate', 'colsample_bytree', 'subsample', 'gamma', 'reg_alpha', 'reg_lambda']:
    if param in param_grid_hour:
        param_grid_hour[param] = [max(0.0, p) for p in param_grid_hour[param]] # Ensure non-negative
        if param in ['colsample_bytree', 'subsample']:
             param_grid_hour[param] = [min(1.0, p) for p in param_grid_hour[param]] # Ensure max 1.0
        # Clamping can make neighbours collide (e.g. two values capped to 1.0),
        # so remove duplicates and sort
        param_grid_hour[param] = sorted(list(set(param_grid_hour[param])))

# Ensure n_estimators and max_depth are integers; non-positive candidates
# (e.g. 100 - 100 trees) are discarded, then duplicates removed and sorted.
for param in ['n_estimators', 'max_depth']:
     if param in param_grid_hour:
         param_grid_hour[param] = [int(p) for p in param_grid_hour[param] if p is not None and p > 0]
         param_grid_hour[param] = sorted(list(set(param_grid_hour[param])))


print("\nDefined narrower grid for grid search for XGBoost (Hour Dataset):")
print(param_grid_hour)


# Step 5: Perform Grid Search with Cross-Validation
# Reduce CV folds for Grid Search as well for faster execution
# Exhaustive search over param_grid_hour with 3-fold CV, scored by negated MSE.
# NOTE(review): the grid has 8 parameters with up to 3 values each, i.e. up to
# 3**8 = 6561 combinations x 3 folds; expect this to be by far the slowest
# step of the notebook.
grid_search_hour = GridSearchCV(estimator=xgb_model_hour, param_grid=param_grid_hour,
                           scoring='neg_mean_squared_error', cv=3, verbose=1, n_jobs=-1)

print("\nPerforming grid search for XGBoost (Hour Dataset)...")
grid_search_hour.fit(X_train, y_train)

# Get the best hyperparameters from grid search
best_params_grid_xgb_hour = grid_search_hour.best_params_
print("\nBest parameters from grid search for XGBoost (Hour Dataset):")
print(best_params_grid_xgb_hour)

# Step 6: Train Final Model with Best Hyperparameters
# Refit a fresh XGBoost regressor on the full training split using the
# grid-search winner (same seed as the searches).
tuned_xgb_model_hour = xgb.XGBRegressor(random_state=42, **best_params_grid_xgb_hour)
tuned_xgb_model_hour.fit(X_train, y_train)

# Step 7: Evaluate Tuned Model on the held-out test split
preds_xgb_hour = tuned_xgb_model_hour.predict(X_test)
mae_xgb_hour = mean_absolute_error(y_test, preds_xgb_hour)
rmse_xgb_hour = np.sqrt(mean_squared_error(y_test, preds_xgb_hour))
r2_xgb_hour = r2_score(y_test, preds_xgb_hour)

print("\nTuned XGBoost Model Performance (Hour Dataset, Random Split - Randomized + Grid Search):")
for metric_label, metric_value in (
    ("MAE", mae_xgb_hour),
    ("RMSE", rmse_xgb_hour),
    ("R2 Score", r2_xgb_hour),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Step 8: Finish task (Summary will be provided in a separate message)

NameError: name 'X_hour_train_rand' is not defined