In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 12})
plt.rcParams["font.family"] = "Times New Roman"

from utils import get_hdf_keys
from train import training
from evaluation import evaluate, get_run_results
import warnings
import pickle
warnings.filterwarnings('ignore')
# Set seed
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set working directory
import wandb
wandb.login()

# The notebook is expected to be started one level below the git repo root.
# Only move up when we are not already at the root, so that re-running this cell
# (or starting the kernel from the root) does not climb one directory too far.
REPO_DIR_SUFFIX = "WattCast"
if not os.getcwd().endswith(REPO_DIR_SUFFIX):
    os.chdir(r"..") # should be the git repo root directory, checking below:
print("Current working directory: " + os.getcwd())
assert os.getcwd().endswith(REPO_DIR_SUFFIX), (
    f"Expected the working directory to end with '{REPO_DIR_SUFFIX}', got: {os.getcwd()}"
)
# Locations of the cleaned input data and of the persisted models, relative to the repo root.
dir_path = os.path.join(os.getcwd(), 'data', 'clean_data')
model_dir = os.path.join(os.getcwd(), 'models')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnikolaushouben[0m ([33mwattcast[0m). Use [1m`wandb login --relogin`[0m to force relogin


Current working directory: /Users/nikolaushouben/Desktop/WattCast


In [3]:
# Collect the HDF keys found under dir_path. As the next cell's output shows, locations_dict
# maps a file name such as '2_town.h5' to the list of location keys stored in that file.
# NOTE(review): resolutions_dict is not used anywhere in the visible cells — confirm it is still needed.
locations_dict, resolutions_dict = get_hdf_keys(dir_path)

In [4]:
# Display the file-name -> location-keys mapping returned by get_hdf_keys.
locations_dict

{'2_town.h5': ['GLENDOVEER-13596',
  'GLENDOVEER-13597',
  'GLENDOVEER-13598',
  'GLENDOVEER-13599',
  'GLENDOVEER-CLIFFGATE',
  'GLENDOVEER-NORTHEAST',
  'KELLY',
  'LENTS-13101',
  'LENTS-HAPPY',
  'LENTS-MT',
  'LENTS-NORTH',
  'MIDWAY-DIVISION',
  'MIDWAY-DOUGLAS',
  'MIDWAY-LYNCH',
  'MIDWAY-POWELLHURST',
  'RAMAPO-EMERALD',
  'RAMAPO-GILBERT',
  'RAMAPO-RAMAPO']}

### Loop through all spatial scales and locations

In [None]:
# Train, evaluate and score every location of every entry in locations_dict.
for scale, locations in locations_dict.items():

    # Keys are file names such as '2_town.h5'; keep only the part before the first dot as the scale name.
    scale = scale.split('.')[0]
    for location in locations:

        print(f"Training {location} at {scale} scale")
        # NOTE(review): this call passes (scale, location, tuned_models=...), whereas the
        # single-location cell further down calls training(config_run) with one dict —
        # confirm which signature `training` currently expects.
        config, models = training(scale, location, tuned_models=['xgb']) # loads existing models (from disk) if they exist, otherwise trains new models with optimal hyperparameters (from wandb) if they exist
        eval_dict = evaluate(config, models) # loads existing run results (from wandb) if they exist, otherwise runs a backtest for each model on the val and test set, and then formats it into the various horizons
        df_metrics = get_run_results(eval_dict, config) # calculates the error scores and produces plots, logging them to wandb if possible

        # End the current wandb run before moving on to the next location.
        wandb.finish()

### Run for a single location

In [None]:
# Same train -> evaluate -> score pipeline as the loop above, but for one hand-picked location.
scale = '2_town'
location = 'GLENDOVEER-13596'

print(f"Training {location} at {scale} scale")

# Run configuration handed to `training` as a single dict.
# NOTE(review): the key is spelled 'liklihood' (not 'likelihood'); it is a runtime key, so it is
# left untouched here — confirm that this is the spelling the `train` module reads.
# NOTE(review): unlike init_config in the debugging section, this dict has no
# 'use_cov_as_past_cov' entry — confirm a default is derived for it.
config_run = {
    'spatial_scale': scale,
    'temp_resolution': 60,
    'location': location,
    'models_to_train': ['rf', 'xgb'],
    'horizon_in_hours': 24,
    'lookback_in_hours': 24,
    'boxcox': True,
    'liklihood': None,
    'weather_available': True,
    'datetime_encodings': True,
    'heat_wave_binary': True,
    'datetime_attributes': ["dayofweek", "week"]
}

# Train (or load) the models for this configuration.
config, models = training(config_run)

# Evaluate the models; the result is consumed by get_run_results below.
eval_dict = evaluate(config, models)

# Turn the evaluation results into a metrics table.
df_metrics = get_run_results(eval_dict, config)

# End the wandb run opened during training/evaluation.
wandb.finish()

## Debugging

In [5]:
from train import Config, derive_config_params, load_data, get_model_instances, load_trained_models, get_best_run_config


# Base configuration for stepping through the training pipeline by hand.
# NOTE(review): the key is spelled 'liklihood' (not 'likelihood'); confirm that this is the
# spelling the `train` module reads.
init_config = {
    'spatial_scale': "2_town",
    'temp_resolution': 60,
    'location': 'GLENDOVEER-13596',
    'models_to_train': ["rf", "xgb"],
    'horizon_in_hours': 24,
    'lookback_in_hours': 24,
    'boxcox': True,
    'liklihood': None,
    'weather_available': True,
    'datetime_encodings': True,
    'heat_wave_binary': True,
    'datetime_attributes': ["dayofweek", "week"],
    'use_cov_as_past_cov': False,
}


# Build a Config object from the dict and let derive_config_params add the derived settings.
config = Config().from_dict(init_config)
config = derive_config_params(config)
models_to_train = config.models_to_train

# Importing hyperparameters from wandb for models that have previously been tuned
config_per_model = {}
for model in models_to_train:
    # Only the first element of the returned pair is used here.
    model_config, _ = get_best_run_config(
        "Wattcast_tuning", "-eval_loss", model, config.spatial_scale
    )
    # update model_config with basic config if they are not yet in the keys of the model config
    # NOTE(review): membership is tested against model_config.data, but the value is written via
    # model_config[key]; presumably Config forwards item assignment to .data — TODO confirm.
    for key, value in config.data.items():
        if key not in model_config.data.keys():
            model_config[key] = value
    
    config_per_model[model] = model_config




[34m[1mwandb[0m: Sorting runs by -summary_metrics.eval_loss


Fetched sweep with name pleasant-sweep-24 for model rf


[34m[1mwandb[0m: Sorting runs by -summary_metrics.eval_loss


Fetched sweep with name ethereal-sweep-2 for model xgb


In [6]:
# Display the per-model configs (one train.Config object per model abbreviation).
config_per_model

{'rf': <train.Config at 0x11311f6d0>, 'xgb': <train.Config at 0x1130f5450>}

In [7]:
# getting the model instances for all models
# Note: per the recorded output, the result also contains an 'lr' LinearRegressionModel
# in addition to the models listed in models_to_train.
model_instances = get_model_instances(models_to_train, config_per_model)


Getting model instance for rf...
Getting model instance for xgb...
Getting model instance for linear regression...


In [8]:
# Display the instantiated (not yet fitted in this notebook) models and their hyperparameters.
model_instances

{'rf': RandomForest(lags=24, lags_past_covariates=None, lags_future_covariates=None, output_chunk_length=24, add_encoders=None, n_estimators=500, max_depth=2, multi_models=True, use_static_covariates=True, random_state=42, min_samples_split=10, min_samples_leaf=2),
 'xgb': XGBModel(lags=24, lags_past_covariates=None, lags_future_covariates=None, output_chunk_length=24, add_encoders=None, likelihood=None, quantiles=None, random_state=42, multi_models=True, use_static_covariates=True, n_estimators=200, learning_rate=0.2, max_depth=15, min_child_weight=10, objective=reg:pseudohubererror, reg_lambda=0.1),
 'lr': LinearRegressionModel(lags=24, lags_past_covariates=None, lags_future_covariates=[0], output_chunk_length=24, add_encoders=None, likelihood=None, quantiles=None, random_state=42, multi_models=True, use_static_covariates=True)}

In [9]:
# loading the trained models from disk, which have been trained already
# Returns two collections: models that could be loaded and models that still need training.
# NOTE(review): the recorded output shows a ValueError message for a missing XGBModel.joblib,
# yet a later cell goes on to train only XGBModel — so the message appears to be reported
# rather than raised; confirm against load_trained_models.
trained_models, untrained_models = load_trained_models(config, model_instances)

ValueError: The file /Users/nikolaushouben/Desktop/WattCast/models/2_town_GLENDOVEER-13596/XGBModel.joblib doesn't exist


In [10]:
import time

from train import data_pipeline

def _fit_with_optional_validation(model, train_series, fit_kwargs, val_kwargs):
    """Fit `model`, first with validation data and, if that fails, without it.

    Args:
        model: object exposing ``fit(series, **kwargs)``.
        train_series: the training target series, passed positionally to ``fit``.
        fit_kwargs: keyword arguments that are always passed (e.g. the training covariates).
        val_kwargs: validation-related keyword arguments that are only used on the first attempt.
    """
    try:
        model.fit(train_series, **fit_kwargs, **val_kwargs)
    except Exception as err:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt/SystemExit still stop
        # the run instead of silently triggering a second training pass.
        print(
            f"Fit with validation data failed for {model.__class__.__name__} ({err!r}); "
            "retrying without validation data"
        )
        model.fit(train_series, **fit_kwargs)


def train_models(untrained_models, config_per_model):
    """
    This function does the actual training and is used by 'training'.

    Fits every model in `untrained_models` on the training split and, where the model's `fit`
    accepts it, also passes the validation split. If fitting with validation data raises,
    the model is fitted again on the training split only.

    Note: the raw data is loaded with the notebook-level `config` object (it is not a parameter),
    so that variable must be defined before this function is called.

    Args:
        untrained_models: dict mapping a model abbreviation to a model object that exposes
            `fit` and `supports_future_covariates`.
        config_per_model: dict mapping the same abbreviations to the model-specific config that
            is passed to `data_pipeline`; its `use_cov_as_past_cov` attribute is read for models
            that do not support future covariates.

    Returns:
        A tuple ``(models, run_times)``: the list of fitted models, in iteration order, and a
        dict mapping each model's class name to its training time in seconds. Because the key is
        the class name, two models of the same class share one entry (the later one wins).
    """

    run_times = {}

    data = load_data(config)

    models = []

    for model_abbr, model in untrained_models.items():
        start_time = time.time()
        print(f"Training {model.__class__.__name__}")

        model_config = config_per_model[model_abbr]

        # The pipeline is re-run per model because each model has its own config.
        piped_data, _ = data_pipeline(data, model_config)

        # The test splits are produced by the pipeline but are not needed for training.
        (ts_train_piped,
         ts_val_piped,
         _ts_test_piped,
         ts_train_weather_piped,
         ts_val_weather_piped,
         _ts_test_weather_piped) = piped_data

        if model.supports_future_covariates:
            # Weather is handed over as future covariates.
            fit_kwargs = {"future_covariates": ts_train_weather_piped}
            val_kwargs = {
                "val_series": ts_val_piped,
                "val_future_covariates": ts_val_weather_piped,
            }
        elif model_config.use_cov_as_past_cov:
            # Model cannot use future covariates; feed the weather as past covariates instead.
            fit_kwargs = {"past_covariates": ts_train_weather_piped}
            val_kwargs = {
                "val_series": ts_val_piped,
                "val_past_covariates": ts_val_weather_piped,
            }
        else:
            # No covariates at all.
            fit_kwargs = {}
            val_kwargs = {"val_series": ts_val_piped}

        _fit_with_optional_validation(model, ts_train_piped, fit_kwargs, val_kwargs)

        models.append(model)

        end_time = time.time()
        run_times[model.__class__.__name__] = end_time - start_time
    return models, run_times


In [11]:
# Train the models that could not be loaded from disk; per the recorded output this is only XGBModel.
models, runtimes = train_models(untrained_models, config_per_model)

Training XGBModel
[0]	validation_0-mphe:0.01837
[1]	validation_0-mphe:0.01174
[2]	validation_0-mphe:0.00763
[3]	validation_0-mphe:0.00506
[4]	validation_0-mphe:0.00342
[5]	validation_0-mphe:0.00237
[6]	validation_0-mphe:0.00170
[7]	validation_0-mphe:0.00126
[8]	validation_0-mphe:0.00097
[9]	validation_0-mphe:0.00079
[10]	validation_0-mphe:0.00067
[11]	validation_0-mphe:0.00059
[12]	validation_0-mphe:0.00054
[13]	validation_0-mphe:0.00050
[14]	validation_0-mphe:0.00048
[15]	validation_0-mphe:0.00047
[16]	validation_0-mphe:0.00046
[17]	validation_0-mphe:0.00045
[18]	validation_0-mphe:0.00044
[19]	validation_0-mphe:0.00044
[20]	validation_0-mphe:0.00044
[21]	validation_0-mphe:0.00044
[22]	validation_0-mphe:0.00044
[23]	validation_0-mphe:0.00043
[24]	validation_0-mphe:0.00043
[25]	validation_0-mphe:0.00043
[26]	validation_0-mphe:0.00043
[27]	validation_0-mphe:0.00043
[28]	validation_0-mphe:0.00043
[29]	validation_0-mphe:0.00043
[30]	validation_0-mphe:0.00043
[31]	validation_0-mphe:0.00043
