In [92]:
#imports
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Create lag features and rolling statistics
df_resampled['car_lag_1'] = df_resampled['car'].shift(1)
df_resampled['car_lag_2'] = df_resampled['car'].shift(2)
df_resampled['car_rolling_mean'] = df_resampled['car'].rolling(window=4).mean()
df_resampled['hour'] = df_resampled.index.hour
df_resampled['day_of_week'] = df_resampled.index.dayofweek

# Define the features (X)
X = df_resampled.drop(columns=['car', 'bike', 'auto', 'bus', 'truck'])

# Define a list of vehicle types to forecast
vehicle_types = ['car', 'bike', 'auto', 'bus', 'truck']

In [94]:
# Define the mean_absolute_percentage_error function
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [95]:
# Create a TimeSeriesSplit object with 5 splits
tscv = TimeSeriesSplit(n_splits=5)

# Initialize a dictionary to store results for each vehicle type
vehicle_results = {}

# Loop through each vehicle type
for vehicle in vehicle_types:
    print(f"\nForecasting for {vehicle}:")
    
    # Define the target variable (y) for this vehicle type
    y = df_resampled[vehicle]
    
    # Initialize a list to store evaluation results for each fold
    fold_results = []
    
    # Define the hyperparameter grid for GridSearchCV
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'subsample': [0.7, 0.8, 0.9],
    }

    # Create the XGBoost model
    model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

    # Perform cross-validation with TimeSeriesSplit
    for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Set up GridSearchCV within each fold
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        # Get the best hyperparameters from grid search
        best_params = grid_search.best_params_
        print(f"Fold {fold} - Best hyperparameters: {best_params}")
        
        # Train the model with the best hyperparameters
        best_model = grid_search.best_estimator_
        best_model.fit(X_train, y_train)
        
        # Make predictions on the test set
        y_pred = best_model.predict(X_test)
        
        # Evaluate performance using Mean Squared Error (MSE)
        mse = mean_squared_error(y_test, y_pred)
        
        # Calculate MAPE
        mape = mean_absolute_percentage_error(y_test, y_pred)
        
        # Store the results for this fold
        fold_results.append({
            'fold': fold,
            'mse': mse,
            'mape': mape
        })
        
        # Print the evaluation results for the current fold
        print(f"Fold {fold} - MSE: {mse:.2f}, MAPE: {mape:.2f}%")

    # After cross-validation, forecast the next 30 minutes (future prediction)
    X_future = df_resampled.iloc[-2:].drop(columns=['car', 'bike', 'auto', 'bus', 'truck'])  # Select the last two time steps (excluding all vehicles)
    future_predictions = best_model.predict(X_future)  # Use the best model from GridSearchCV
    print(f'Future Predictions (next 30 minutes) for {vehicle}: {future_predictions}')
    
    # Store fold results for this vehicle type
    vehicle_results[vehicle] = fold_results
    
    # Print the overall results for all folds
    print(f"\nOverall results for {vehicle}:")
    for result in fold_results:
        print(f"Fold {result['fold']} - MSE: {result['mse']:.2f}, MAPE: {result['mape']:.2f}%")


Forecasting for car:
Fold 1 - Best hyperparameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.7}
Fold 1 - MSE: 107.77, MAPE: 7.47%
Fold 2 - Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.7}
Fold 2 - MSE: 251.80, MAPE: 18.07%
Fold 3 - Best hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.7}
Fold 3 - MSE: 6.81, MAPE: 2.38%
Fold 4 - Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Fold 4 - MSE: 48.56, MAPE: 4.66%
Fold 5 - Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
Fold 5 - MSE: 21.91, MAPE: 3.66%
Future Predictions (next 30 minutes) for car: [107.00789 110.3891 ]

Overall results for car:
Fold 1 - MSE: 107.77, MAPE: 7.47%
Fold 2 - MSE: 251.80, MAPE: 18.07%
Fold 3 - MSE: 6.81, MAPE: 2.38%
Fold 4 - MSE: 48.56, MAPE: 4.66%
Fold 5 - MSE: 21.91, MAPE: 3.66%

Fore

In [105]:
#save df_resampled to csv
df_resampled.to_csv('df_resampled.csv')