In [None]:
# Problem in the previous version of this code:
# the 'Unnamed: 0' column was mistakenly used as a model feature.

# Import basic modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

##### RF-based models
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from joblib import dump, load
from tqdm import tqdm
from sklearn.model_selection import RandomizedSearchCV,StratifiedKFold,GridSearchCV,cross_val_score,KFold
import xgboost as xgb
import joblib

########## Quantile Regression Models
from quantile_forest import RandomForestQuantileRegressor

##### For SARIMAX and TBATS models
from pmdarima import auto_arima
## For outliers detection
from sklearn import preprocessing, svm
## For stationarity test and decomposition
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import holidays
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

### XGBoost
import xgboost as xgb

import time
import joblib
import pickle

# parameter tuning
from sklearn.model_selection import TimeSeriesSplit

# Read the processed dataset, then keep only the rows whose 'date' string
# sorts before '2020-09-08' as the training subset.
df_15min = pd.read_csv('Data/case 1/processed_data.csv')
df_15min_train = df_15min.loc[lambda frame: frame['date'] < '2020-09-08']

In [2]:
# Preview the first rows of the training subset.
df_15min_train.head()

Unnamed: 0,date,DayofWeek,Hour,Quarter,OrZone,temp,wspd,prep,Is_Holiday,AR1,AR2,AR3,AR4,counts,zone_id
0,2020-04-01,2,11,3,zone 1,70,30,0,0,3.0,0.0,0.0,0.0,0.0,2
1,2020-04-01,2,11,3,zone 10,70,30,0,0,1.0,0.0,1.0,1.0,0.0,7
2,2020-04-01,2,11,3,zone 11,70,30,0,0,0.0,0.0,0.0,0.0,1.0,15
3,2020-04-01,2,11,3,zone 12,70,30,0,0,1.0,0.0,0.0,0.0,1.0,3
4,2020-04-01,2,11,3,zone 13,70,30,0,0,0.0,0.0,0.0,0.0,0.0,19


In [None]:
def save_model(model,model_name,zone_name,num_weeks):
    """Save a fitted model with joblib under ``EU_Models/week_<num_weeks>/``.

    Parameters
    ----------
    model : object
        The fitted estimator to persist.
    model_name : str
        Model label; first part of the file name.
    zone_name : str
        Zone label; second part of the file name.
    num_weeks : int or str
        Used to build the ``week_<num_weeks>`` sub-folder.

    The resulting path is
    ``EU_Models/week_<num_weeks>/<model_name>_<zone_name>.joblib``.
    """
    import os  # local import: `os` is not imported at the top of this notebook

    out_dir = f'EU_Models/week_{num_weeks}'
    # `dump` does not create missing folders; without this a fresh checkout
    # fails with FileNotFoundError only after the model has been trained.
    os.makedirs(out_dir, exist_ok=True)
    file_name = f'{out_dir}/{model_name}_{zone_name}'
    dump(model, f'{file_name}.joblib')

In [4]:
def evaluation(y_true, y_pred):
    """Return ``(mae, rmse, r2)`` for a set of predictions, rounded to 3 decimals.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        Mean absolute error, root mean squared error and R2 score.

    Notes
    -----
    RMSE is computed as ``sqrt(MSE)`` instead of passing ``squared=False`` to
    ``mean_squared_error``: that keyword is deprecated in recent scikit-learn
    releases and later removed. The two forms agree for single-output targets,
    which is how this notebook calls the function.
    """
    # np.round works whether the metric returns a NumPy scalar or a plain
    # Python float (which has no `.round` method).
    mae = np.round(mean_absolute_error(y_true, y_pred), 3)
    rmse = np.round(np.sqrt(mean_squared_error(y_true, y_pred)), 3)
    r2 = np.round(r2_score(y_true, y_pred), 3)
    return mae, rmse, r2

In [5]:
# List the columns of the training subset (the model functions below select
# their features from these by name).
df_15min_train.columns

Index(['date', 'DayofWeek', 'Hour', 'Quarter', 'OrZone', 'temp', 'wspd',
       'prep', 'Is_Holiday', 'AR1', 'AR2', 'AR3', 'AR4', 'counts', 'zone_id'],
      dtype='object')

In [None]:
'Model modules'
def RandomForest_predictor_with_hyperparam_tuning(df_train, df_test, 
                                                  model_name='LDRF',
                                                  num_weeks_=4):
    """Tune, train, evaluate and save one random forest regressor per zone.

    For every distinct ``OrZone`` value in ``df_train`` this runs a
    200-iteration ``RandomizedSearchCV`` with a 10-split ``TimeSeriesSplit``
    (scored by negative RMSE), fits a fresh ``RandomForestRegressor`` with the
    best parameters on the zone's training rows, prints MAE / RMSE / R2 on the
    zone's training and test rows, and saves the model via ``save_model``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain ``OrZone``, ``counts`` and the feature columns used below.
        ``TimeSeriesSplit`` splits by row position, so rows are presumably in
        chronological order within each zone -- confirm against the data prep.
    model_name : str, default 'LDRF'
        ``'LDRF'`` adds the lag columns ``AR1``-``AR4`` to the features; any
        other value uses the base features only. Also used in the saved file
        name.
    num_weeks_ : int, default 4
        Forwarded to ``save_model`` to pick the output folder.

    Returns
    -------
    None
        Results are printed and models are written to disk.
    """
    # define hyperparameters
    param_grid_ = {
    'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 200, num = 6)],
    'max_depth': [None, 3,4,5,6],
    'min_samples_split': [4, 6, 8, 10],
    'min_samples_leaf': [2, 3,5,7,10],
    # 1.0 (= all features) replaces the former 'auto', which meant the same for
    # forest regressors but has been removed from scikit-learn; with 'auto' every
    # such candidate fails to fit and is scored NaN by the search.
    'max_features' : [1.0, 'sqrt'],
    'bootstrap' : [True, False]}

    base_features = ['DayofWeek','Hour','temp','wspd','prep','Is_Holiday']
    lag_features = ['AR1','AR2','AR3','AR4']

    # fit a model per grid
    for grid in tqdm(df_train.OrZone.unique()):
        print('Training model for', grid)
        df_train_zone = df_train[df_train.OrZone == grid]
        df_test_zone = df_test[df_test.OrZone == grid]
        if model_name == 'LDRF':
            feature_cols = base_features + lag_features
            # NOTE(review): message kept as-is, but Is_Holiday is part of both
            # feature sets; what this branch adds is the AR1-AR4 lag columns.
            print('holiday included')
        else:
            feature_cols = base_features
        X_train, y_train = df_train_zone[feature_cols], df_train_zone['counts']
        X_test, y_test = df_test_zone[feature_cols], df_test_zone['counts']
        
        # define random forest regressor
        rf = RandomForestRegressor(random_state=42)
        
        # Create time series cross-validation
        tscv = TimeSeriesSplit(n_splits=10)
        # create RandomizedSearchCV
        gscv = RandomizedSearchCV(estimator=rf, 
                                    param_distributions=param_grid_, 
                                    cv=tscv, n_jobs=6, verbose=0,
                                    n_iter=200, random_state=39,
                                    scoring='neg_root_mean_squared_error')

        start_time = time.time()
        gscv.fit(X_train, y_train)
        train_time_ = time.time() - start_time
        print('Total parameter tuning time:', train_time_)
        # Get the best hyperparameters
        best_params = gscv.best_params_
        print(f"{grid} -- Best Hyperparameters:", best_params)

        # Refit separately (rather than reusing the search's refit model) so the
        # training time of the final configuration can be reported on its own.
        start_train_time = time.time()
        best_model = RandomForestRegressor(n_estimators= best_params['n_estimators'], 
                                            min_samples_split= best_params['min_samples_split'], 
                                            min_samples_leaf = best_params['min_samples_leaf'], 
                                            max_features= best_params['max_features'], 
                                            max_depth= best_params['max_depth'], 
                                            bootstrap= best_params['bootstrap'], random_state=42)
        best_model.fit(X_train, y_train)
        end_train_time = time.time() - start_train_time
        print(f"Best Model Training Time : {end_train_time}")
        
        print(f'\n ** Training Set Results ** \n')
        y_pred = best_model.predict(X_train)
        mae, rmse, r2 = evaluation(y_train, y_pred)
        print('MAE:', mae, 'RMSE:', rmse, 'R2:', r2)

        print(f'\n ** Testing Set Results ** \n')
        y_pred = best_model.predict(X_test)
        mae, rmse, r2 = evaluation(y_test, y_pred)
        print('MAE:', mae, 'RMSE:', rmse, 'R2:', r2)

        # Save first, then report, so the message is only printed on success.
        save_model(best_model, model_name, grid, num_weeks_)
        print('Model saved for', grid)

In [None]:
def LDQRF_predictor_with_hyperparam_tuning(df_train, df_test, 
                                           model_name='LDQRF',num_weeks_=4):
    """Tune, train, evaluate and save one quantile random forest per zone.

    For every distinct ``OrZone`` value in ``df_train`` this runs a
    200-iteration ``RandomizedSearchCV`` with a 10-split ``TimeSeriesSplit``,
    fits a fresh ``RandomForestQuantileRegressor`` with the best parameters,
    prints MAE / RMSE / R2 of ``predict`` (called with its default arguments)
    on the zone's training and test rows, and saves the model via
    ``save_model``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain ``OrZone``, ``counts`` and the feature columns used below.
        ``TimeSeriesSplit`` splits by row position, so rows are presumably in
        chronological order within each zone -- confirm against the data prep.
    model_name : str, default 'LDQRF'
        ``'LDQRF'`` adds the lag columns ``AR1``-``AR4`` to the features; any
        other value uses the base features only. Also used in the saved file
        name.
    num_weeks_ : int, default 4
        Forwarded to ``save_model`` to pick the output folder.

    Returns
    -------
    None
        Results are printed and models are written to disk.
    """
    # define hyperparameters
    param_grid_ = {
    'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 200, num = 6)],
    'max_depth': [None, 3,4,5,6],
    'min_samples_split': [4, 6, 8, 10],
    'min_samples_leaf': [2,3,5,7,10],
    # 1.0 (= all features) replaces the former 'auto', which meant the same for
    # forest regressors but has been removed from scikit-learn; with 'auto' every
    # such candidate fails to fit and is scored NaN by the search.
    'max_features' : [1.0, 'sqrt'],
    'bootstrap' : [True, False]}

    base_features = ['DayofWeek','Hour','temp','wspd','prep','Is_Holiday']
    lag_features = ['AR1','AR2','AR3','AR4']

    for grid in tqdm(df_train.OrZone.unique()):
        print('Training model for', grid)
        # Seeded like the sibling RF function so repeated runs give the same
        # forests and the same sampled candidates.
        arqrf = RandomForestQuantileRegressor(random_state=42)
        df_train_zone = df_train[df_train.OrZone == grid]
        df_test_zone = df_test[df_test.OrZone == grid]
        if model_name == 'LDQRF':
            feature_cols = base_features + lag_features
            # NOTE(review): message kept as-is, but Is_Holiday is part of both
            # feature sets; what this branch adds is the AR1-AR4 lag columns.
            print('holiday included')
        else:
            feature_cols = base_features
        X_train, y_train = df_train_zone[feature_cols], df_train_zone['counts']
        X_test, y_test = df_test_zone[feature_cols], df_test_zone['counts']
    
        # Create time series cross-validation
        tscv = TimeSeriesSplit(n_splits=10)
        # Random search of parameters over 10 time-ordered splits.
        # NOTE(review): no `scoring` is given, so candidates are ranked by the
        # estimator's own `score` method, whereas the RF and XGBoost functions
        # in this notebook rank by negative RMSE -- confirm this is intended.
        gscv = RandomizedSearchCV(estimator = arqrf, 
                                            param_distributions = param_grid_, 
                                            n_iter = 200, cv = tscv, 
                                            verbose=2, n_jobs = 6,
                                            random_state=39)
        start_time = time.time()
        gscv.fit(X_train, y_train)
        train_time_ = time.time() - start_time
        print('Total parameter tuning time:', train_time_)
        # Get the best hyperparameters
        best_params = gscv.best_params_
        print(f"{grid} -- Best Hyperparameters:", best_params)

        # Refit separately (rather than reusing the search's refit model) so the
        # training time of the final configuration can be reported on its own.
        start_train_time = time.time()
        best_model = RandomForestQuantileRegressor(n_estimators= best_params['n_estimators'], 
                                            min_samples_split= best_params['min_samples_split'], 
                                            min_samples_leaf = best_params['min_samples_leaf'], 
                                            max_features= best_params['max_features'], 
                                            max_depth= best_params['max_depth'], 
                                            bootstrap= best_params['bootstrap'],
                                            random_state=42)
        best_model.fit(X_train, y_train)
        end_train_time = time.time() - start_train_time
        print(f"Best Model Training Time : {end_train_time}")

        print(f'\n ** Training Set Results ** \n')
        y_pred = best_model.predict(X_train)
        mae, rmse, r2 = evaluation(y_train, y_pred)
        print('MAE:', mae, 'RMSE:', rmse, 'R2:', r2)

        print(f'\n ** Testing Set Results ** \n')
        y_pred = best_model.predict(X_test)
        mae, rmse, r2 = evaluation(y_test, y_pred)
        print('MAE:', mae, 'RMSE:', rmse, 'R2:', r2)

        # Save first, then report, so the message is only printed on success.
        save_model(best_model, model_name, grid, num_weeks_)
        print('Model saved for', grid)


In [None]:
def XGBoost_predictor_with_hyperparam_tuning(df_train, df_test, 
                                             model_name='LDXGB',num_weeks_=4):
    """Tune, train, evaluate and save one XGBoost regressor per zone.

    For each distinct ``OrZone`` in ``df_train``: run a 200-iteration
    randomized search (10-split ``TimeSeriesSplit``, negative RMSE scoring),
    fit a new ``XGBRegressor`` with the winning parameters, print MAE / RMSE /
    R2 on the zone's training and test rows, and save the model through
    ``save_model``.

    ``model_name == 'LDXGB'`` adds the ``AR1``-``AR4`` columns to the feature
    set; any other value trains on the base columns only. ``model_name`` and
    ``num_weeks_`` are also forwarded to ``save_model``. Returns ``None``.
    """
    # Candidate values sampled by the randomized search.
    search_space = {'objective' : ["reg:squarederror"],
                    'n_estimators' : [int(v) for v in np.linspace(start = 50, stop = 200, num = 10)],
                    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
                    # linspace(3, 6, 3) is [3, 4.5, 6]; int() truncates it to [3, 4, 6]
                    'max_depth': [int(v) for v in np.linspace(3, 6, num = 3)],
                    'subsample':[0.5,0.75,1]
                    }
    base_cols = ['DayofWeek','Hour','temp','wspd','prep','Is_Holiday']
    lagged_cols = base_cols + ['AR1','AR2','AR3','AR4']
    use_lags = model_name == 'LDXGB'
    feature_cols = lagged_cols if use_lags else base_cols

    # One independent search + fit per zone.
    for zone in tqdm(df_train.OrZone.unique()):
        print('Training model for', zone)
        zone_train = df_train[df_train.OrZone == zone]
        zone_test = df_test[df_test.OrZone == zone]
        X_train, y_train = zone_train[feature_cols], zone_train['counts']
        X_test, y_test = zone_test[feature_cols], zone_test['counts']
        if use_lags:
            print('holiday included')

        # Randomized search over the space above with time-ordered CV splits.
        search = RandomizedSearchCV(estimator=xgb.XGBRegressor(),
                                    param_distributions=search_space,
                                    scoring='neg_root_mean_squared_error',
                                    cv=TimeSeriesSplit(n_splits=10),
                                    n_iter = 200, verbose=2, n_jobs = 6, random_state=42)
        tuning_started = time.time()
        search.fit(X_train, y_train)
        tuning_seconds = time.time() - tuning_started
        print('Total parameter tuning time:', tuning_seconds)

        best_params = search.best_params_
        print(f"{zone} -- Best Hyperparameters:", best_params)

        # best_params holds exactly the five searched keys (objective,
        # n_estimators, learning_rate, max_depth, subsample), so unpacking it
        # passes the same arguments as listing them one by one.
        fit_started = time.time()
        best_model = xgb.XGBRegressor(nthread = 4, random_state=42, **best_params)
        best_model.fit(X_train, y_train)
        fit_seconds = time.time() - fit_started
        print(f"Best Model Training Time : {fit_seconds}")

        # Report the same three metrics on the training rows, then the test rows.
        for split_label, features, target in (('Training', X_train, y_train),
                                              ('Testing', X_test, y_test)):
            print(f'\n ** {split_label} Set Results ** \n')
            y_pred = best_model.predict(features)
            mae, rmse, r2 = evaluation(target, y_pred)
            print('MAE:', mae, 'RMSE:', rmse, 'R2:', r2)

        print('Model saved for', zone)
        save_model(best_model, model_name, zone, num_weeks_)

## models training

In [9]:
from datetime import datetime

# Span between two fixed dates, reported in days and in whole weeks + leftover days.
start_date = datetime(2020, 4, 13)
end_date = datetime(2020, 9, 14)

difference = end_date - start_date
days_difference = difference.days

# divmod gives the whole weeks and the remaining days in one step
weeks_difference, extra_days = divmod(days_difference, 7)

print(f"Total days: {days_difference}")
print(f"Total weeks: {weeks_difference} weeks and {extra_days} days")


Total days: 154
Total weeks: 22 weeks and 0 days


In [10]:
# Count the rows dated 2020-09-08 and divide by 20 (rows per time window,
# presumably the number of zones -- confirm).
print('num of time windows / day:', int((df_15min['date'] == '2020-09-08').sum())/20)

num of time windows / day: 43.0


In [11]:
# Positional split from the end of the frame in blocks of 20*308 rows.
# Judging by the variable names one block is one week (presumably 20 zones x 308
# time windows -- confirm). NOTE(review): the cell above reports 43 windows/day
# for 2020-09-08, and 43*7 = 301 != 308 -- verify the block size.
rows_per_block = 20*308
df_test = df_15min.iloc[-rows_per_block:]
df_train_21w = df_15min.iloc[-rows_per_block*22:-rows_per_block]
df_train_4w = df_15min.iloc[-rows_per_block*5:-rows_per_block]
for subset in (df_train_21w, df_train_4w, df_test):
    print(len(subset)/20)

6468.0
1232.0
308.0


In [None]:
# Quantile random forest on the 21-week window; model_name='LDQRF' selects the
# feature set that includes AR1-AR4. Models are saved under week_21.
LDQRF_predictor_with_hyperparam_tuning(df_train_21w, df_test, model_name='LDQRF', num_weeks_=21)

In [None]:
# Same quantile random forest routine; model_name='QRF' takes the branch that
# trains without the AR1-AR4 columns. Models are saved under week_21.
LDQRF_predictor_with_hyperparam_tuning(df_train_21w, df_test, model_name='QRF', num_weeks_=21)

In [None]:
# XGBoost on the 21-week window; model_name='LDXGB' selects the feature set
# that includes AR1-AR4. Models are saved under week_21.
XGBoost_predictor_with_hyperparam_tuning(df_train_21w, df_test, model_name='LDXGB',num_weeks_=21)

In [None]:
# Same XGBoost routine; model_name='XGB' takes the branch that trains without
# the AR1-AR4 columns. Models are saved under week_21.
XGBoost_predictor_with_hyperparam_tuning(df_train_21w, df_test, model_name='XGB',num_weeks_=21)

In [None]:
# Random forest on the 21-week window; model_name='LDRF' selects the feature
# set that includes AR1-AR4. Models are saved under week_21.
RandomForest_predictor_with_hyperparam_tuning(df_train_21w, df_test, model_name='LDRF',num_weeks_=21)

In [None]:
# Same random forest routine; model_name='RF' takes the branch that trains
# without the AR1-AR4 columns. Models are saved under week_21.
RandomForest_predictor_with_hyperparam_tuning(df_train_21w, df_test, model_name='RF',num_weeks_=21)