In [1]:
from datetime import datetime
import os

import heliopy.data.omni as omni
from matplotlib import pyplot as plt
import optuna
from optuna import visualization as viz
import numpy as np
from tensorflow import keras

from typing import *

In [2]:
START_TIME = datetime(1995, 1, 1)
END_TIME = datetime(2018, 2, 28)
INPUT_LENGTH = 24

## Table of Contents

1. Load Data
2. Split data into time series sections
3. Split into train/val/test
4. Baseline models
5. Initial LSTM
6. Multivar LSTM (repeating 2-5)
7. Optuna multivar LSTM 
8. CNN preprocessing

### 1. Load in data

In [3]:
def get_omni_rtn_data(start_time, end_time):
    identifier = 'OMNI_COHO1HR_MERGED_MAG_PLASMA'  # COHO 1HR data
    omni_data = omni._omni(start_time, end_time, identifier=identifier, intervals='yearly', warn_missing_units=False)
    return omni_data

In [4]:
data = get_omni_rtn_data(START_TIME, END_TIME).to_dataframe()

In [5]:
mag_field_strength = np.array(data["BR"])

### 2. Split into INPUT_LENGTH sections 

In [6]:
len(np.array([mag_field_strength[i:i + INPUT_LENGTH] 
                        for i in range(len(mag_field_strength) - INPUT_LENGTH)])[:, :, np.newaxis])

203014

In [7]:
inputs = np.array([mag_field_strength[i:i + INPUT_LENGTH] 
                        for i in range(len(mag_field_strength) - INPUT_LENGTH)])[:, :, np.newaxis]
outputs = np.array(mag_field_strength[INPUT_LENGTH:])

nan_check = np.array([mag_field_strength[i:i + INPUT_LENGTH + 1] 
                      for i in range(len(mag_field_strength) - INPUT_LENGTH)])

inputs = inputs[np.where([~np.any(np.isnan(i)) for i in nan_check])]
outputs = outputs[np.where([~np.any(np.isnan(i)) for i in nan_check])]

print("Input shape:", inputs.shape)
print("Output shape:", outputs.shape)

print("Any Nans?:", np.any(np.isnan(outputs)) or np.any(np.isnan(inputs)))

Input shape: (201389, 24, 1)
Output shape: (201389,)
Any Nans?: False


### 3. Split into train/val/test

In [8]:
# Doing this time-based, so most recent = test set, earliest = train set 
train_size = 0.7
val_size = 0.15
data_size = len(inputs)

inputs_train, outputs_train = inputs[:int(train_size * data_size)], outputs[:int(train_size * data_size)]
inputs_val, outputs_val = inputs[int(train_size * data_size):int((train_size + val_size) * data_size)], outputs[int(train_size * data_size):int((train_size + val_size) * data_size)]
inputs_test, outputs_test = inputs[int((train_size + val_size) * data_size):], outputs[int((train_size + val_size) * data_size):]

print("Train size:", len(inputs_train))
print("Val size:", len(inputs_val))
print("Test size:", len(inputs_test))

Train size: 140972
Val size: 30208
Test size: 30209


### 4. Baselines

In [9]:
baselines = {}

##### 1: Last timestep

In [10]:
for name, dset, out in zip(["last_train", "last_val", "last_test"], 
                           [inputs_train, inputs_val, inputs_test],
                           [outputs_train, outputs_val, outputs_test]):
    baselines[name] = dset[:, -1, 0]
    print("MSE {}:".format(name), np.mean((baselines[name] - out) ** 2))

MSE last_train: 3.14658
MSE last_val: 2.4640298
MSE last_test: 3.3855927


##### 2: Mean 

In [11]:
for name, dset, out in zip(["mean_train", "mean_val", "mean_test"], 
                           [inputs_train, inputs_val, inputs_test],
                           [outputs_train, outputs_val, outputs_test]):
    baselines[name] = np.mean(dset, axis=(1, 2))
    print("MSE {}:".format(name), np.mean((baselines[name] - out) ** 2))

MSE mean_train: 6.9930334
MSE mean_val: 5.7877436
MSE mean_test: 6.955669


##### 3: Median

In [12]:
for name, dset, out in zip(["median_train", "median_val", "median_test"], 
                           [inputs_train, inputs_val, inputs_test],
                           [outputs_train, outputs_val, outputs_test]):
    baselines[name] = np.median(dset, axis=(1, 2))
    print("MSE {}:".format(name), np.mean((baselines[name] - out) ** 2))

MSE median_train: 7.735584
MSE median_val: 6.426136
MSE median_test: 7.6656737


##### 4: Start

In [13]:
for name, dset, out in zip(["start_train", "start_val", "start_test"], 
                           [inputs_train, inputs_val, inputs_test],
                           [outputs_train, outputs_val, outputs_test]):
    baselines[name] = dset[:, 0, 0]
    print("MSE {}:".format(name), np.mean((baselines[name] - out) ** 2))

MSE start_train: 15.318962
MSE start_val: 12.723228
MSE start_test: 15.642863


### 5. Initial LSTM 

In [None]:
model = keras.models.Sequential(
    [
        keras.layers.LSTM(20, activation="relu", name="lstm_initial", input_shape=(None, 1), return_sequences=True),
        keras.layers.LSTM(20, activation="relu", name="lstm_second"),
        keras.layers.Dense(1, name="dense_final", activation="linear"),
    ]
)
model.summary()

In [None]:
optimizer = keras.optimizers.Adam(lr=1e-4)
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
model.fit(inputs_train, outputs_train, validation_data=(inputs_val, outputs_val),
          batch_size=32, epochs=500, 
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10),
                     keras.callbacks.ModelCheckpoint("../models/baseline_model.h5", save_best_only=True)])

##### Test set evaluation

In [14]:
model = keras.models.load_model("../models/baseline_model.h5")

In [16]:
model.evaluate(inputs_test, outputs_test, verbose=2)

945/945 - 3s - loss: 3.0155 - mean_absolute_error: 1.2339


[3.0155394077301025, 1.2338838577270508]

### 6. Repeat above but use more variables

In [17]:
# Standardise all variables 
data_array = np.array(data)

In [32]:
# Split into INPUT_LENGTH sections
def split_data_into_sections(input_length=INPUT_LENGTH):
    inputs = np.array([data_array[i:i + input_length] for i in range(len(data_array) - input_length)])
    outputs = np.array(mag_field_strength[input_length:]) # np.array(data_array[INPUT_LENGTH:, 2])

    nan_check = np.array([data_array[i:i + input_length + 1] for i in range(len(data_array) - input_length)])

    inputs = inputs[np.where([~np.any(np.isnan(i)) for i in nan_check])]
    outputs = outputs[np.where([~np.any(np.isnan(i)) for i in nan_check])]

    print("Input shape:", inputs.shape)
    print("Output shape:", outputs.shape)

    print("Any Nans?:", np.any(np.isnan(outputs)) or np.any(np.isnan(inputs)))
    return inputs, outputs
inputs, outputs = split_data_into_sections()

Input shape: (189110, 24, 11)
Output shape: (189110,)
Any Nans?: False


In [33]:
def train_val_test_split(train_size=0.7, val_size=0.15):
    train_size = 0.7
    val_size = 0.15
    data_size = len(inputs)

    inputs_train, outputs_train = inputs[:int(train_size * data_size)], outputs[:int(train_size * data_size)]
    inputs_val, outputs_val = inputs[int(train_size * data_size):int((train_size + val_size) * data_size)], outputs[int(train_size * data_size):int((train_size + val_size) * data_size)]
    inputs_test, outputs_test = inputs[int((train_size + val_size) * data_size):], outputs[int((train_size + val_size) * data_size):]


    # standardize
    inputs_val = (inputs_val - np.nanmean(inputs_train, axis=(0, 1))) / np.nanstd(inputs_train, axis=(0, 1))
    inputs_test = (inputs_test - np.nanmean(inputs_train, axis=(0, 1))) / np.nanstd(inputs_train, axis=(0, 1))
    inputs_train = (inputs_train - np.nanmean(inputs_train, axis=(0, 1))) / np.nanstd(inputs_train, axis=(0, 1))


    print("Train size:", len(inputs_train))
    print("Val size:", len(inputs_val))
    print("Test size:", len(inputs_test))
    return inputs_train, inputs_val, inputs_test, outputs_train, outputs_val, outputs_test
inputs_train, inputs_val, inputs_test, outputs_train, outputs_val, outputs_test = train_val_test_split()

Train size: 132377
Val size: 28366
Test size: 28367


In [None]:
model = keras.models.Sequential(
    [
        keras.layers.LSTM(16, activation="relu", name="lstm_initial", input_shape=(None, 11), return_sequences=True),
        keras.layers.LSTM(32, activation="relu", name="lstm_second", return_sequences=True),
        keras.layers.LSTM(64, activation="relu", name="lstm_third"),
        keras.layers.Dense(128, name="dense_initial", activation="relu"),
        keras.layers.Dense(1, name="dense_final", activation="linear")
    ]
)
model.summary()

In [None]:
optimizer = keras.optimizers.Adam(lr=1e-5)
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
model.fit(inputs_train, outputs_train, validation_data=(inputs_val, outputs_val),
          batch_size=32, epochs=500, 
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10),
                     keras.callbacks.ModelCheckpoint("../models/baseline_model_multivar.h5", save_best_only=True)])

In [None]:
model.evaluate(inputs_test, outputs_test)

### 7. Optuna

In [14]:
def create_model(lstm_layers: int, num_in_lstm_layers: List[int], 
                 dense_layers: int, num_in_dense_layers: List[int]) -> keras.models.Model:
    model = keras.models.Sequential()
    for i in range(lstm_layers):
        if i == 0:
            if i + 1 != lstm_layers:
                model.add(keras.layers.LSTM(num_in_lstm_layers[i], input_shape=(None, 11), return_sequences=True))
            else:
                model.add(keras.layers.LSTM(num_in_lstm_layers[i], input_shape=(None, 11)))
        elif i != 0 and i + 1 == lstm_layers:
            model.add(keras.layers.LSTM(num_in_lstm_layers[i]))
        else:
            model.add(keras.layers.LSTM(num_in_lstm_layers[i], input_shape=(None, 11), return_sequences=True))
            
    for i in range(dense_layers):
        if i + 1 == dense_layers:
            model.add(keras.layers.Dense(num_in_dense_layers[i], activation="linear"))
        else:
            model.add(keras.layers.Dense(num_in_dense_layers[i], activation="relu"))
    
    return model 

In [42]:
def objective(trial):
    
    lstm_layers = trial.suggest_int("lstm_layers", 1, 3)  # Reducing to 3 to save time for now 
    layer_choices = [[4, 8, 16, 32, 64, 128, 256], [8, 16, 32, 64, 128, 256, 512],
                     [16, 32, 64, 128, 256, 512, 1024]]
    lstm_layer_choice = trial.suggest_categorical("lstm_layer_choice", layer_choices)
    lstm_layer_choice = lstm_layer_choice[:lstm_layers]
    
    dense_layers = trial.suggest_int("dense_layers", 1, 2)  # Reducing to 2 to save time for now 
    layer_choices = [[1024, 512, 256, 1], [512, 256, 128, 1], [128, 64, 32, 1]]
    dense_layer_choice = trial.suggest_categorical("dense_layer_choice", layer_choices)
    dense_layer_choice = dense_layer_choice[4 - dense_layers:]
    
    lr = trial.suggest_categorical("lr", [1e-5, 1e-4, 1e-3, 2e-1, 1e-2])
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    
    model = create_model(lstm_layers, lstm_layer_choice, dense_layers, dense_layer_choice)
    
    optimizer = keras.optimizers.Adam(lr=lr)
    model.compile(optimizer=optimizer, loss="mse")
    model.fit(inputs_train, outputs_train, validation_data=(inputs_val, outputs_val),
              batch_size=batch_size, epochs=500, 
              callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10),
                         keras.callbacks.ModelCheckpoint("../models/optuna_{}.h5".format(trial.number), save_best_only=True)],
              verbose=2)

    return model.evaluate(inputs_val, outputs_val)[0]

In [None]:
study = optuna.create_study()  # Create a new study.
study.optimize(objective, n_trials=100)  # Invoke optimization of the objective function.

### 8. CNN Preprocess

In [70]:
inputs, outputs = split_data_into_sections(input_length=24)
inputs_train, inputs_val, inputs_test, outputs_train, outputs_val, outputs_test = train_val_test_split()

Input shape: (189110, 24, 11)
Output shape: (189110,)
Any Nans?: False
Train size: 132377
Val size: 28366
Test size: 28367


In [94]:
model = keras.models.Sequential(
    [
        keras.layers.Conv1D(8, kernel_size=4, strides=2, activation="relu", input_shape=(24, 11)),
        keras.layers.LSTM(16, activation="relu", name="lstm_initial", return_sequences=True),
        keras.layers.Dense(32, name="dense_initial", activation="relu"),
        keras.layers.Dense(1, name="dense_final", activation="linear")
    ]
)
model.summary()

Model: "sequential_47"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_110 (Conv1D)          (None, 11, 8)             360       
_________________________________________________________________
lstm_initial (LSTM)          (None, 11, 16)            1600      
_________________________________________________________________
dense_initial (Dense)        (None, 11, 32)            544       
_________________________________________________________________
dense_final (Dense)          (None, 11, 1)             33        
Total params: 2,537
Trainable params: 2,537
Non-trainable params: 0
_________________________________________________________________


In [None]:
optimizer = keras.optimizers.Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
model.fit(inputs_train, outputs_train, validation_data=(inputs_val, outputs_val),
          batch_size=32, epochs=500, verbose=2,
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=30),
                     keras.callbacks.ModelCheckpoint("../models/cnn_model_multivar.h5", save_best_only=True)])

### 9. WaveNet