In [1]:
from datetime import datetime
import os
import warnings

import heliopy.data.omni as omni
from matplotlib import pyplot as plt
import optuna
from optuna import visualization as viz
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from scipy.stats import ks_2samp
from tensorflow import keras

from typing import *

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings; consider a narrower filter.
warnings.filterwarnings("ignore")
# Matplotlib 3.6 renamed its bundled "seaborn" style to "seaborn-v0_8" and later
# releases no longer accept the old name, so pick whichever name is installed.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")

In [3]:
# Start dates of cycles 21-25. Each consecutive pair is used as the
# (start, end) range when the data of one cycle is downloaded.
# NOTE(review): the dates look like commonly quoted solar-cycle minima - confirm the source.
START_TIME_CYCLE_21 = datetime(year=1976, month=3, day=1)
START_TIME_CYCLE_22 = datetime(year=1986, month=9, day=1)
START_TIME_CYCLE_23 = datetime(year=1996, month=8, day=1)
START_TIME_CYCLE_24 = datetime(year=2008, month=12, day=1)
START_TIME_CYCLE_25 = datetime(year=2019, month=12, day=1)

# Number of consecutive samples in one model input window.
INPUT_LENGTH = 100
# Number of samples directly after the input window that form the prediction target.
OUTPUT_LENGTH = 24

In [4]:
def get_omni_rtn_data(start_time, end_time):
    """Load the merged hourly OMNI/COHO magnetic-field and plasma dataset for a time range.

    Thin wrapper around heliopy's private ``omni._omni`` loader with the dataset
    identifier, yearly download intervals and unit warnings fixed.

    Parameters
    ----------
    start_time, end_time
        Bounds of the requested interval, forwarded unchanged to ``omni._omni``
        (the notebook passes ``datetime`` objects).

    Returns
    -------
    The object returned by ``omni._omni``; the notebook calls ``.to_dataframe()`` on it.
    """
    return omni._omni(
        start_time,
        end_time,
        identifier='OMNI_COHO1HR_MERGED_MAG_PLASMA',  # COHO 1HR data
        intervals='yearly',
        warn_missing_units=False,
    )

In [5]:
# Download one DataFrame per cycle; every cycle ends where the next one starts.
cycle_21, cycle_22, cycle_23, cycle_24 = [
    get_omni_rtn_data(cycle_start, cycle_end).to_dataframe()
    for cycle_start, cycle_end in [
        (START_TIME_CYCLE_21, START_TIME_CYCLE_22),
        (START_TIME_CYCLE_22, START_TIME_CYCLE_23),
        (START_TIME_CYCLE_23, START_TIME_CYCLE_24),
        (START_TIME_CYCLE_24, START_TIME_CYCLE_25),
    ]
]

# Keep only the "BR" column of each cycle as a standalone NumPy array.
# NOTE(review): "BR" is presumably the radial field component rather than the field
# magnitude, so the "strength" in the variable names may be misleading - confirm.
mag_field_strength_21, mag_field_strength_22, mag_field_strength_23, mag_field_strength_24 = [
    np.array(cycle_frame["BR"])
    for cycle_frame in (cycle_21, cycle_22, cycle_23, cycle_24)
]

In [29]:
def lstm_prepare_1d(mag_field_strength, input_length=INPUT_LENGTH, output_length=OUTPUT_LENGTH):
    """Cut a 1-D series into NaN-free (input window, target window) pairs.

    Candidate window ``k`` uses ``series[k:k + input_length]`` as the model input
    and the ``output_length`` samples that directly follow it as the target.
    Candidates with a NaN anywhere in the input or the target are dropped.

    Parameters
    ----------
    mag_field_strength : array-like, 1-D
        The series to window; may contain NaNs.
    input_length : int
        Number of samples per input window.
    output_length : int
        Number of samples per target window.

    Returns
    -------
    inputs : np.ndarray, shape (n_windows, input_length, 1)
    outputs : np.ndarray, shape (n_windows, output_length)
        Both are empty (with the shapes above) when the series is too short to
        hold a single candidate window.

    Raises
    ------
    ValueError
        If ``mag_field_strength`` is not one-dimensional.

    Notes
    -----
    Prints the resulting shapes and a NaN sanity check to stdout.
    """
    series = np.asarray(mag_field_strength)
    if series.ndim != 1:
        raise ValueError(
            "lstm_prepare_1d expects a 1-D series, got shape {}".format(series.shape)
        )

    window_length = input_length + output_length
    # Candidate starts are range(len(series) - window_length), so the very last
    # full window is not generated. The count is clamped at zero so that a
    # too-short series yields empty, correctly shaped arrays instead of an error.
    n_candidates = max(len(series) - window_length, 0)
    candidate_starts = np.arange(n_candidates)

    # nan_before[k] is the number of NaNs in series[:k]; the NaN count of any
    # window is then a difference of two entries, without materialising windows.
    nan_before = np.concatenate(([0], np.cumsum(np.isnan(series))))
    nans_in_window = nan_before[candidate_starts + window_length] - nan_before[candidate_starts]
    starts = candidate_starts[nans_in_window == 0]

    # Gather only the valid windows with integer index grids (one row per start).
    input_index = starts[:, np.newaxis] + np.arange(input_length)
    output_index = starts[:, np.newaxis] + input_length + np.arange(output_length)
    inputs = series[input_index][:, :, np.newaxis]
    outputs = series[output_index]

    print("Input shape:", inputs.shape)
    print("Output shape:", outputs.shape)
    print("Any Nans?:", np.any(np.isnan(outputs)) or np.any(np.isnan(inputs)))
    print("")
    return inputs, outputs

In [30]:
# Window each cycle in the order 21, 23, 22, 24, which is also the order of the
# printed summaries. Cycles 21 and 23 are later concatenated into the training
# set; cycle 22 is the validation split and cycle 24 the test split.
(
    (inputs_21, outputs_21),
    (inputs_23, outputs_23),
    (inputs_val, outputs_val),
    (inputs_test, outputs_test),
) = [
    lstm_prepare_1d(cycle_series)
    for cycle_series in (
        mag_field_strength_21,
        mag_field_strength_23,
        mag_field_strength_22,
        mag_field_strength_24,
    )
]

Input shape: (5132, 100, 1)
Output shape: (5132, 24)
Any Nans?: False

Input shape: (105288, 100, 1)
Output shape: (105288, 24)
Any Nans?: False

Input shape: (11376, 100, 1)
Output shape: (11376, 24)
Any Nans?: False

Input shape: (95144, 100, 1)
Output shape: (95144, 24)
Any Nans?: False



In [31]:
# Training set: the windows of cycles 21 and 23 stacked along the sample axis.
inputs_train = np.concatenate((inputs_21, inputs_23), axis=0)
outputs_train = np.concatenate((outputs_21, outputs_23), axis=0)

### Baselines

In [41]:
# Collects the prediction array of every naive baseline, keyed by "<baseline>_<split>".
baselines = dict()

##### Last 24 Timesteps

In [61]:
# Persistence baseline: reuse the most recent OUTPUT_LENGTH observed samples as the forecast.
# The keys carry a "last_window_" prefix because the last-timestep baseline
# stores its results under "last_*"; sharing that prefix would silently
# overwrite the predictions saved here.
for name, dset, out in zip(["last_window_train", "last_window_val", "last_window_test"],
                           [inputs_train, inputs_val, inputs_test],
                           [outputs_train, outputs_val, outputs_test]):
    # The slice must be exactly as long as the target for the element-wise difference.
    baselines[name] = dset[:, -OUTPUT_LENGTH:, 0]
    # Mean over the forecast steps of each sample first, then over samples.
    print("MSE {}:".format(name), np.mean(np.mean((baselines[name] - out) ** 2, axis=1)))

MSE last_train: 16.976336
MSE last_val: 13.899673
MSE last_test: 12.40154


##### Last Timestep

In [62]:
# Persistence baseline: the final observed sample is used for every forecast step.
for name, dset, out in (
    ("last_train", inputs_train, outputs_train),
    ("last_val", inputs_val, outputs_val),
    ("last_test", inputs_test, outputs_test),
):
    # Shape (n_samples, 1); broadcasts against the (n_samples, n_steps) targets.
    baselines[name] = dset[:, -1:, 0]
    # Per-sample MSE over the forecast steps, averaged over all samples.
    print("MSE {}:".format(name), np.square(baselines[name] - out).mean(axis=1).mean())

MSE last_train: 12.91327
MSE last_val: 11.227758
MSE last_test: 9.525721


##### Mean of last day

In [69]:
# Mean baseline: the average of the last 24 input samples is used for every forecast step.
for name, dset, out in (
    ("mean_train", inputs_train, outputs_train),
    ("mean_val", inputs_val, outputs_val),
    ("mean_test", inputs_test, outputs_test),
):
    # keepdims leaves a (n_samples, 1) column that broadcasts against the targets.
    baselines[name] = dset[:, -24:, 0].mean(axis=1, keepdims=True)
    print("MSE {}:".format(name), np.square(baselines[name] - out).mean())

MSE mean_train: 11.312171
MSE mean_val: 9.051903
MSE mean_test: 8.162513


##### Median of last day

In [70]:
# Median baseline: the median of the last 24 input samples is used for every forecast step.
for name, dset, out in (
    ("median_train", inputs_train, outputs_train),
    ("median_val", inputs_val, outputs_val),
    ("median_test", inputs_test, outputs_test),
):
    # keepdims leaves a (n_samples, 1) column that broadcasts against the targets.
    baselines[name] = np.median(dset[:, -24:, 0], axis=1, keepdims=True)
    print("MSE {}:".format(name), np.square(baselines[name] - out).mean())

MSE median_train: 12.128397
MSE median_val: 9.716257
MSE median_test: 8.827617


##### Start of last day

In [71]:
# "Start" baseline: the sample 24 steps before the end of the input window is
# used for every forecast step.
for name, dset, out in (
    ("start_train", inputs_train, outputs_train),
    ("start_val", inputs_val, outputs_val),
    ("start_test", inputs_test, outputs_test),
):
    # Pick the single column, then restore a length-1 axis so it broadcasts over the targets.
    baselines[name] = dset[:, -24, 0][:, np.newaxis]
    print("MSE {}:".format(name), np.square(baselines[name] - out).mean())

MSE start_train: 18.744093
MSE start_val: 15.315161
MSE start_test: 13.658504


### LSTM

In [85]:
# Unwindowed series of the two training cycles (21 and 23), joined end to end.
# It is used with the NaN-aware mean/std to derive the normalisation statistics.
train_mag = np.concatenate((mag_field_strength_21, mag_field_strength_23))

##### Normalised

In [86]:
# Normalisation statistics are taken from the training series only and computed
# once, instead of re-scanning the full series for every array below.
# The NaN-aware variants are used because train_mag is the unfiltered series.
train_mag_mean = np.nanmean(train_mag)
train_mag_std = np.nanstd(train_mag)

inputs_train_norm = (inputs_train - train_mag_mean) / train_mag_std
outputs_train_norm = (outputs_train - train_mag_mean) / train_mag_std

inputs_val_norm = (inputs_val - train_mag_mean) / train_mag_std
outputs_val_norm = (outputs_val - train_mag_mean) / train_mag_std

# Test targets stay in original units; predictions are mapped back before scoring.
inputs_test_norm = (inputs_test - train_mag_mean) / train_mag_std

In [None]:
i = 100  # number of trailing input timesteps fed to the model
j = 32   # number of units in the LSTM layer

# Single LSTM layer followed by a linear layer emitting all OUTPUT_LENGTH forecast steps at once.
model = keras.models.Sequential(
    [
        keras.layers.LSTM(j, name="lstm_initial", input_shape=(None, 1)),
        keras.layers.Dense(OUTPUT_LENGTH, name="dense_final", activation="linear"),
    ]
)
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
# Training stops once 10 epochs pass without improvement of the monitored metric
# and the best weights seen so far are restored.
# NOTE(review): no random seed is set, so results vary from run to run.
model.fit(inputs_train_norm[:, -i:], outputs_train_norm, validation_data=(inputs_val_norm[:, -i:], outputs_val_norm),
          batch_size=32, epochs=500, verbose=2,
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10)])


In [88]:
# Feed the same trailing `i` timesteps that were used for training and validation,
# then map the normalised predictions back to the original units.
# NOTE(review): `model` is whichever model was fitted last in this kernel.
preds = model.predict(inputs_test_norm[:, -i:]) * np.nanstd(train_mag) + np.nanmean(train_mag)

In [89]:
# Test MSE in original units: mean over the forecast steps of each sample, then over samples.
np.square(preds - outputs_test).mean(axis=1).mean()

6.6148252

##### Non-normalised

In [81]:
i = 100  # number of trailing input timesteps fed to the model
j = 32   # number of units in the LSTM layer

# Single LSTM layer followed by a linear layer emitting all OUTPUT_LENGTH forecast steps at once.
model = keras.models.Sequential(
    [
        keras.layers.LSTM(j, name="lstm_initial", input_shape=(None, 1)),
        keras.layers.Dense(OUTPUT_LENGTH, name="dense_final", activation="linear"),
    ]
)
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])

# Create the checkpoint directory up front so a missing folder cannot abort the
# run when the first checkpoint is written at the end of epoch 1.
checkpoint_path = "../models/baseline_model_cycle_{}_{}.h5".format(i, j)
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Training stops once 10 epochs pass without improvement of the monitored metric;
# the best model is both restored in memory and saved to `checkpoint_path`.
# NOTE(review): no random seed is set, so results vary from run to run.
model.fit(inputs_train[:, -i:], outputs_train, validation_data=(inputs_val[:, -i:], outputs_val),
          batch_size=32, epochs=500, verbose=2,
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10),
                     keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)])


Epoch 1/500
3451/3451 - 68s - loss: 9.2058 - mae: 2.1950 - val_loss: 7.6166 - val_mae: 2.0915
Epoch 2/500
3451/3451 - 64s - loss: 8.9666 - mae: 2.1555 - val_loss: 7.6237 - val_mae: 2.1002
Epoch 3/500
3451/3451 - 64s - loss: 8.9366 - mae: 2.1547 - val_loss: 7.6309 - val_mae: 2.0946
Epoch 4/500
3451/3451 - 72s - loss: 8.9116 - mae: 2.1512 - val_loss: 7.7363 - val_mae: 2.1313
Epoch 5/500
3451/3451 - 73s - loss: 8.8589 - mae: 2.1450 - val_loss: 7.8493 - val_mae: 2.1175
Epoch 6/500
3451/3451 - 71s - loss: 8.7945 - mae: 2.1377 - val_loss: 7.7321 - val_mae: 2.1256
Epoch 7/500
3451/3451 - 79s - loss: 8.7154 - mae: 2.1310 - val_loss: 7.7520 - val_mae: 2.1133
Epoch 8/500
3451/3451 - 75s - loss: 8.6372 - mae: 2.1220 - val_loss: 7.9955 - val_mae: 2.1278
Epoch 9/500
3451/3451 - 74s - loss: 8.5847 - mae: 2.1159 - val_loss: 7.8537 - val_mae: 2.1248
Epoch 10/500
3451/3451 - 73s - loss: 8.5448 - mae: 2.1128 - val_loss: 7.9252 - val_mae: 2.1393
Epoch 11/500
3451/3451 - 75s - loss: 8.4902 - mae: 2.1075 -

<tensorflow.python.keras.callbacks.History at 0x7f9af9be9f28>

In [82]:
# Feed the same trailing `i` timesteps that were used for training and validation.
# NOTE(review): `model` is whichever model was fitted last in this kernel.
preds = model.predict(inputs_test[:, -i:])

In [83]:
# Test MSE: mean over the forecast steps of each sample, then over samples.
np.square(preds - outputs_test).mean(axis=1).mean()

6.6369243