# Automated LightGBM Model Optimization and Logging
- Split into training and testing sets
- Optimize model parameters
- Automate training across n=6 countries and k=2 datasets
- Log results on MLflow server

Setup

In [1]:
import mlflow
import pandas as pd
import pickle
from pathlib import Path
import lightgbm as lgbm
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit 
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sbn
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from mlflow.models.signature import infer_signature

# Multiple outputs per notebook cell: display the value of every top-level
# expression in a cell, not only the last one
%config InteractiveShell.ast_node_interactivity = 'all'

# random_state for different processes
# NOTE(review): not referenced in the cells shown here - presumably passed as the
# seed to the search/model code elsewhere; confirm it is actually used.
RANDOM_STATE = 221

In [2]:
def num_rows_horizon(df, timeframe='7day'):
    '''
    Count the rows that fall inside the most recent `timeframe` of the data.

    Input: Dataframe with a datetime-like 'day' column,
           Timeframe (any string accepted by pd.to_timedelta, e.g. '7day')
    Output: Number of rows whose 'day' is strictly later than
            (latest 'day' - timeframe)
    '''
    # Honour the caller's timeframe; the window used to be fixed at 7 days
    # no matter what was passed in.
    cutoff = df['day'].max() - pd.to_timedelta(timeframe)
    return int((df['day'] > cutoff).sum())

# Hold out the most recent window as the final test set; the rest is train/validation
def trainval_test_split(df):
    '''
    Input: Dataframe from features prep (must contain a 'day' column)
    Output: (train_validate, final_test) where
            train_validate = rows with 'day' strictly before (latest 'day' - 7 days)
            final_test     = rows with 'day' strictly after  (latest 'day' - 8 days)
    '''
    last_day = df['day'].max()
    train_end = last_day - pd.to_timedelta('7day')
    test_start = last_day - pd.to_timedelta('8day')

    is_train = df['day'] < train_end
    is_test = df['day'] > test_start

    return df.loc[is_train], df.loc[is_test]

# Build the feature matrix and target used for the train/validate search
def load_train_test(df, drop_index=True):
    '''
    Input: Dataframe from features prep;
           drop_index - when True, the 'utc_timestamp' column created by
           resetting the index is removed from X
    Output: X (features) and y (single-column 'load_actual' frame), row-aligned
            and prepared for hyperparameter search
    '''
    features = df.drop(columns=['load_actual', 'country', 'day'])

    # Rows with any missing feature are excluded. The same row mask is applied
    # to the target so X and y keep the same length and order; previously only
    # X was filtered, which misaligned the two whenever a feature was NaN.
    # A positional numpy mask is used so a non-unique index cannot interfere.
    complete = features.notna().all(axis=1).to_numpy()

    X = features.loc[complete].reset_index()
    if drop_index:
        X = X.drop(columns='utc_timestamp')

    y = df.loc[complete, ['load_actual']].reset_index(drop=True)

    return X, y


### Load Data
- Create the train/test and validation sets, then verify that the date windows are correct

In [3]:
# Locate the pickled feature sets relative to the notebook's working directory
cd = Path.cwd()
data_dir = str(cd.parents[1])
# NOTE(review): variable names and file names disagree here ('upsampled' points at
# the *downsample* pickle and vice versa). The recorded output shows Xu at 15-minute
# resolution and Xd at hourly resolution - confirm which naming is intended before
# renaming anything.
upsampled = data_dir + '/datasets/country_energy/load_wthr_downsample_update.pickle'
downsampled = data_dir + '/datasets/country_energy/load_wthr_upsample_update.pickle'

# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by this project.
# Xu - upsampled
with open(upsampled, 'rb') as f:
    Xu = pickle.load(f)

# Xd - downsampled
with open(downsampled, 'rb') as f:
    Xd = pickle.load(f)

# Create test/train and validation set. Ensure dates are correct.
# The split reuses trainval_test_split so the windows checked here are exactly the
# ones that helper produces (hold-out starts 8 days back because the last day,
# 2019-4-30, has 0 hours).
Xu_check, final_test_u = trainval_test_split(Xu)
print(f'Ensure validation set is last 7 days; min date: {final_test_u.index.min()}, max date: {final_test_u.index.max()}')
print(f'Ensure train/test set excludes last 7 days; min date: {Xu_check.index.min()}, max date: {Xu_check.index.max()}')

Xd_check, final_test_d = trainval_test_split(Xd)
print(f'Ensure validation set is last 7 days; min date: {final_test_d.index.min()}, max date: {final_test_d.index.max()}')
print(f'Ensure train/test set excludes last 7 days; min date: {Xd_check.index.min()}, max date: {Xd_check.index.max()}')

Ensure validation set is last 7 days; min date: 2019-04-23 00:00:00+00:00, max date: 2019-04-30 00:00:00+00:00
Ensure train/test set excludes last 7 days; min date: 2016-01-01 01:45:00+00:00, max date: 2019-04-22 23:45:00+00:00
Ensure validation set is last 7 days; min date: 2019-04-23 00:00:00+00:00, max date: 2019-04-30 00:00:00+00:00
Ensure train/test set excludes last 7 days; min date: 2016-01-01 07:00:00+00:00, max date: 2019-04-22 23:00:00+00:00


Hyperparameter optimization and logging to MLflow

In [None]:
from forecast_pipeline.lightgbm_forecasting_pipeline import *

Automated forecasting per country

In [5]:

def automated_forecast(datasets, dataset_names, iter_per_model=25, nested_windows=10, experiment_name='No experiment name given'):
    '''
    Run hyperparameter optimization for every country in every dataset, with one
    MLflow run per dataset.

    Input: datasets - iterable of dataframes, each with a 'country' column
           dataset_names - one run-name prefix per dataset (a single string is
                           accepted when there is a single dataset)
           iter_per_model - forwarded to hyperparam_opt as `iterations`
           nested_windows - forwarded to hyperparam_opt as `nested_windows`
           experiment_name - MLflow experiment the runs are recorded under
    Output: None
    Raises: ValueError if the number of names differs from the number of datasets
    '''
    # A bare string would be zipped character by character, so the run would be
    # named after its first letter only.
    if isinstance(dataset_names, str):
        dataset_names = [dataset_names]

    datasets = list(datasets)
    dataset_names = list(dataset_names)
    if len(datasets) != len(dataset_names):
        raise ValueError(
            f'Expected one name per dataset, got {len(dataset_names)} name(s) '
            f'for {len(datasets)} dataset(s)'
        )

    # Initialize server
    # (The former `if __name__ == "__main__"` guard was removed: inside a function
    # it only turned the call into a silent no-op when imported from a module.)
    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    mlflow.set_experiment(experiment_name)

    for name, dataset in zip(dataset_names, datasets):
        # The context manager closes the run when the block exits (also on error),
        # so the next dataset can open its own run instead of failing because one
        # is still active.
        with mlflow.start_run(run_name=f'{name} Country Energy Forcast'):
            # Group data, begin process for each group in data
            for country, country_df in dataset.groupby('country'):
                hyperparam_opt(country_df, country, iterations=iter_per_model, nested_windows=nested_windows)


# Runs when this cell is executed: close any run left active by an earlier,
# interrupted execution.
mlflow.end_run()

Run automated forecast

In [None]:
# # Params for testing
# datasets = [Xd, Xu]
# dataset_names = ['1hour', '15min']
# iterations = [15, 30]

# Close any run left open by an earlier execution before starting a new one
mlflow.end_run()
automated_forecast(datasets=[Xu],
                   # One name per dataset: a bare string here would be zipped
                   # character by character against `datasets`.
                   dataset_names=['15min Intervals'],
                   iter_per_model=15, # 20
                   experiment_name='Added Variables',
                   nested_windows=10)
