In [1]:
from datetime import datetime
import os

import heliopy.data.omni as omni
from matplotlib import pyplot as plt
import numpy as np
from tensorflow import keras

In [2]:
# Time span of OMNI data to request (passed to get_omni_rtn_data below).
START_TIME = datetime(year=1995, month=1, day=1)
END_TIME = datetime(year=2018, month=2, day=28)

# Number of consecutive timesteps that make up one model input window.
INPUT_LENGTH = 24

### 1. Load in data

In [28]:
def get_omni_rtn_data(start_time, end_time):
    """Fetch the merged OMNI COHO 1HR magnetic-field/plasma dataset.

    Parameters
    ----------
    start_time, end_time
        Bounds of the requested interval, forwarded unchanged to heliopy.

    Returns
    -------
    Whatever ``omni._omni`` returns for the
    ``OMNI_COHO1HR_MERGED_MAG_PLASMA`` identifier; the caller converts it
    with ``.to_dataframe()``.
    """
    # NOTE(review): `_omni` is a private heliopy helper and may change
    # between releases - confirm against the installed heliopy version.
    return omni._omni(
        start_time,
        end_time,
        identifier='OMNI_COHO1HR_MERGED_MAG_PLASMA',  # COHO 1HR data
        intervals='yearly',
        warn_missing_units=False,
    )

In [29]:
# Download (or load from heliopy's cache) the OMNI data, then convert it to a
# pandas DataFrame for column access.
omni_result = get_omni_rtn_data(START_TIME, END_TIME)
data = omni_result.to_dataframe()

In [30]:
# Univariate series that the models below are trained to forecast.
# NOTE(review): despite the variable name, this is the "BR" column, which is
# presumably the radial field component rather than the total field
# strength - confirm against the OMNI column definitions.
mag_field_strength = np.array(data["BR"])

### 2. Split into windows of `INPUT_LENGTH` timesteps

In [31]:
# Number of sliding windows of INPUT_LENGTH timesteps that still have a
# following value to use as the target. Computed arithmetically instead of
# building the full (n_windows, INPUT_LENGTH, 1) array just to measure it.
max(len(mag_field_strength) - INPUT_LENGTH, 0)

203014

In [32]:
# Each sample is INPUT_LENGTH consecutive values (the model input) followed by
# the next value in the series (the prediction target).
n_windows = len(mag_field_strength) - INPUT_LENGTH

# Shape (n_windows, INPUT_LENGTH, 1): trailing axis is the single feature.
inputs = np.array([mag_field_strength[i:i + INPUT_LENGTH]
                   for i in range(n_windows)])[:, :, np.newaxis]
# outputs[i] is the value immediately after input window i.
outputs = np.array(mag_field_strength[INPUT_LENGTH:])

# Input window plus its target, used only to detect missing values.
nan_check = np.array([mag_field_strength[i:i + INPUT_LENGTH + 1]
                      for i in range(n_windows)])

# Keep a sample only if neither its inputs nor its target contain a NaN.
# The mask is computed once (vectorised) and applied to both arrays so they
# stay aligned.
valid = ~np.isnan(nan_check).any(axis=1)
inputs = inputs[valid]
outputs = outputs[valid]

print("Input shape:", inputs.shape)
print("Output shape:", outputs.shape)

print("Any Nans?:", np.any(np.isnan(outputs)) or np.any(np.isnan(inputs)))

Input shape: (201389, 24, 1)
Output shape: (201389,)
Any Nans?: False


### 3. Split into train/val/test

In [33]:
# Chronological split (no shuffling): the earliest samples form the training
# set, the next slice is validation, and the most recent samples are the test set.
train_size = 0.7
val_size = 0.15
data_size = len(inputs)

# Boundaries between the three consecutive slices.
train_end = int(train_size * data_size)
val_end = int((train_size + val_size) * data_size)

inputs_train, inputs_val, inputs_test = inputs[:train_end], inputs[train_end:val_end], inputs[val_end:]
outputs_train, outputs_val, outputs_test = outputs[:train_end], outputs[train_end:val_end], outputs[val_end:]

for label, subset in (("Train", inputs_train), ("Val", inputs_val), ("Test", inputs_test)):
    print("{} size:".format(label), len(subset))

Train size: 140972
Val size: 30208
Test size: 30209


### 4. Baselines

In [34]:
# Maps "<baseline>_<split>" (e.g. "last_train") to that baseline's predictions.
baselines = dict()

##### 1: Last timestep

In [35]:
# Persistence baseline: predict that the next value equals the final value of
# the input window.
splits = {
    "train": (inputs_train, outputs_train),
    "val": (inputs_val, outputs_val),
    "test": (inputs_test, outputs_test),
}
for split, (windows, targets) in splits.items():
    key = "last_" + split
    baselines[key] = windows[:, -1, 0]
    squared_error = (baselines[key] - targets) ** 2
    print("MSE {}:".format(key), np.mean(squared_error))

MSE last_train: 3.14658
MSE last_val: 2.4640298
MSE last_test: 3.3855927


##### 2: Mean 

In [36]:
# Mean baseline: predict the average of all values in the input window.
splits = {
    "train": (inputs_train, outputs_train),
    "val": (inputs_val, outputs_val),
    "test": (inputs_test, outputs_test),
}
for split, (windows, targets) in splits.items():
    key = "mean_" + split
    # Reduce over the time and feature axes, leaving one value per sample.
    baselines[key] = np.mean(windows, axis=(1, 2))
    squared_error = (baselines[key] - targets) ** 2
    print("MSE {}:".format(key), np.mean(squared_error))

MSE mean_train: 6.9930334
MSE mean_val: 5.7877436
MSE mean_test: 6.955669


##### 3: Median

In [37]:
# Median baseline: predict the median of all values in the input window.
splits = {
    "train": (inputs_train, outputs_train),
    "val": (inputs_val, outputs_val),
    "test": (inputs_test, outputs_test),
}
for split, (windows, targets) in splits.items():
    key = "median_" + split
    # Reduce over the time and feature axes, leaving one value per sample.
    baselines[key] = np.median(windows, axis=(1, 2))
    squared_error = (baselines[key] - targets) ** 2
    print("MSE {}:".format(key), np.mean(squared_error))

MSE median_train: 7.735584
MSE median_val: 6.426136
MSE median_test: 7.6656737


##### 4: Start

In [38]:
# Start-of-window baseline: predict that the next value equals the first value
# of the input window.
splits = {
    "train": (inputs_train, outputs_train),
    "val": (inputs_val, outputs_val),
    "test": (inputs_test, outputs_test),
}
for split, (windows, targets) in splits.items():
    key = "start_" + split
    baselines[key] = windows[:, 0, 0]
    squared_error = (baselines[key] - targets) ** 2
    print("MSE {}:".format(key), np.mean(squared_error))

MSE start_train: 15.318962
MSE start_val: 12.723228
MSE start_test: 15.642863


### 5. Initial LSTM 

In [51]:
# Two stacked LSTM layers followed by a single linear unit that emits the
# one-step-ahead prediction. The first LSTM returns the full sequence so the
# second LSTM can consume it; the second returns only its final state.
model = keras.models.Sequential()
model.add(keras.layers.LSTM(20, activation="relu", name="lstm_initial",
                            input_shape=(None, 1), return_sequences=True))
model.add(keras.layers.LSTM(20, activation="relu", name="lstm_second"))
model.add(keras.layers.Dense(1, name="dense_final", activation="linear"))
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_initial (LSTM)          (None, None, 20)          1760      
_________________________________________________________________
lstm_second (LSTM)           (None, 20)                3280      
_________________________________________________________________
dense_final (Dense)          (None, 1)                 21        
Total params: 5,061
Trainable params: 5,061
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Train on MSE with Adam. Early stopping ends training once the monitored
# metric has not improved for 10 epochs and restores the best weights; the
# checkpoint callback also writes the best model seen so far to disk.
checkpoint_path = "../models/baseline_model.h5"
# Make sure the target directory exists so the checkpoint can be written on a
# fresh checkout.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
model.fit(inputs_train, outputs_train, validation_data=(inputs_val, outputs_val),
          batch_size=32, epochs=500,
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10),
                     keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500


<tensorflow.python.keras.callbacks.History at 0x6434b5390>

##### Test set evaluation

In [53]:
# Score the trained model on the held-out test split; with the compile
# settings used above this yields [mse loss, mae].
model.evaluate(x=inputs_test, y=outputs_test)



[3.0155394077301025, 1.2338838577270508]

### 6. Repeat the above using more input variables

In [68]:
# Convert the full DataFrame (every column, not just "BR") to a NumPy array.
# No standardisation happens here: the inputs are standardised later, after
# the train/val/test split, using statistics from the training set only.
data_array = np.array(data)

In [69]:
# Split into INPUT_LENGTH sections: each sample is INPUT_LENGTH consecutive
# rows of every column (the model input); the target is the value of
# mag_field_strength immediately after the window.
n_windows = len(data_array) - INPUT_LENGTH

# Shape (n_windows, INPUT_LENGTH, n_columns).
inputs = np.array([data_array[i:i + INPUT_LENGTH] for i in range(n_windows)])
outputs = np.array(mag_field_strength[INPUT_LENGTH:])  # np.array(data_array[INPUT_LENGTH:, 2])

# Input window plus the row holding its target, used only to detect missing
# values. Note that a NaN in any column of the target row drops the sample.
nan_check = np.array([data_array[i:i + INPUT_LENGTH + 1] for i in range(n_windows)])

# Keep a sample only if its whole (time x column) block is NaN-free. The mask
# is computed once (vectorised) and applied to both arrays so they stay aligned.
valid = ~np.isnan(nan_check).any(axis=(1, 2))
inputs = inputs[valid]
outputs = outputs[valid]

print("Input shape:", inputs.shape)
print("Output shape:", outputs.shape)

print("Any Nans?:", np.any(np.isnan(outputs)) or np.any(np.isnan(inputs)))

Input shape: (189110, 24, 11)
Output shape: (189110,)
Any Nans?: False


In [86]:
# Chronological split (no shuffling): earliest samples -> train, then
# validation, with the most recent samples held out as the test set.
train_size = 0.7
val_size = 0.15
data_size = len(inputs)

# Boundaries between the three consecutive slices.
train_end = int(train_size * data_size)
val_end = int((train_size + val_size) * data_size)

inputs_train, outputs_train = inputs[:train_end], outputs[:train_end]
inputs_val, outputs_val = inputs[train_end:val_end], outputs[train_end:val_end]
inputs_test, outputs_test = inputs[val_end:], outputs[val_end:]


# standardize
# Per-column statistics come from the training inputs only and are computed
# once, before any split is overwritten, so validation/test data never
# influence the scaling and the result does not depend on statement order.
# The targets (outputs_*) are left unscaled.
feature_mean = np.nanmean(inputs_train, axis=(0, 1))
feature_std = np.nanstd(inputs_train, axis=(0, 1))

inputs_train = (inputs_train - feature_mean) / feature_std
inputs_val = (inputs_val - feature_mean) / feature_std
inputs_test = (inputs_test - feature_mean) / feature_std


print("Train size:", len(inputs_train))
print("Val size:", len(inputs_val))
print("Test size:", len(inputs_test))

Train size: 132377
Val size: 28366
Test size: 28367


In [87]:
# Three stacked LSTM layers of increasing width, then a ReLU dense layer and a
# single linear unit for the one-step-ahead prediction. The first two LSTMs
# return full sequences so the next LSTM can consume them; the third returns
# only its final state. Each input timestep carries 11 features.
model = keras.models.Sequential()
model.add(keras.layers.LSTM(16, activation="relu", name="lstm_initial",
                            input_shape=(None, 11), return_sequences=True))
model.add(keras.layers.LSTM(32, activation="relu", name="lstm_second", return_sequences=True))
model.add(keras.layers.LSTM(64, activation="relu", name="lstm_third"))
model.add(keras.layers.Dense(128, name="dense_initial", activation="relu"))
model.add(keras.layers.Dense(1, name="dense_final", activation="linear"))
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_initial (LSTM)          (None, None, 16)          1792      
_________________________________________________________________
lstm_second (LSTM)           (None, None, 32)          6272      
_________________________________________________________________
lstm_third (LSTM)            (None, 64)                24832     
_________________________________________________________________
dense_initial (Dense)        (None, 128)               8320      
_________________________________________________________________
dense_final (Dense)          (None, 1)                 129       
Total params: 41,345
Trainable params: 41,345
Non-trainable params: 0
_________________________________________________________________


In [88]:
# Train on MSE with Adam. Early stopping ends training once the monitored
# metric has not improved for 10 epochs and restores the best weights; the
# checkpoint callback also writes the best model seen so far to disk.
checkpoint_path = "../models/baseline_model_multivar.h5"
# Make sure the target directory exists so the checkpoint can be written on a
# fresh checkout.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
model.fit(inputs_train, outputs_train, validation_data=(inputs_val, outputs_val),
          batch_size=32, epochs=500,
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10),
                     keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500


<tensorflow.python.keras.callbacks.History at 0x64390bef0>

In [89]:
# Score the multivariate model on the held-out test split; with the compile
# settings used above this yields [mse loss, mae].
model.evaluate(x=inputs_test, y=outputs_test)



[2.9669766426086426, 1.2264602184295654]