In [1]:
from datetime import datetime
import os
import warnings

import heliopy.data.omni as omni
from matplotlib import pyplot as plt
import optuna
from optuna import visualization as viz
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from scipy.stats import ks_2samp
from sklearn.utils import class_weight
import tensorflow as tf
from tensorflow import keras

from typing import *

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from the numerical libraries; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases dropped the old names, so use whichever spelling this install ships.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

### Data Preparation

In [2]:
# Start dates of solar cycles 21-25. Each cycle's data is fetched from its own
# start date up to the start date of the following cycle.
START_TIME_CYCLE_21 = datetime(year=1976, month=3, day=1)
START_TIME_CYCLE_22 = datetime(year=1986, month=9, day=1)
START_TIME_CYCLE_23 = datetime(year=1996, month=8, day=1)
START_TIME_CYCLE_24 = datetime(year=2008, month=12, day=1)
START_TIME_CYCLE_25 = datetime(year=2019, month=12, day=1)

# Number of consecutive samples fed to the network per example.
INPUT_LENGTH = 100
# Number of samples immediately after the input window that form the target.
OUTPUT_LENGTH = 24
# Percentile of the training-set field values used as the storm threshold.
PERCENTILE = 10

In [3]:
def get_omni_rtn_data(start_time, end_time):
    """Fetch the OMNI COHO 1HR merged magnetic-field/plasma product for a time range.

    Args:
        start_time: Beginning of the requested interval (the notebook passes
            ``datetime`` objects).
        end_time: End of the requested interval.

    Returns:
        Whatever ``omni._omni`` returns for the request; callers in this
        notebook convert it with ``.to_dataframe()``.
    """
    # NOTE(review): `_omni` is a private heliopy helper and may change between
    # releases - confirm whether a public wrapper exists for this dataset.
    return omni._omni(
        start_time,
        end_time,
        identifier='OMNI_COHO1HR_MERGED_MAG_PLASMA',
        intervals='yearly',
        warn_missing_units=False,
    )

In [4]:
# Download each solar cycle as a DataFrame; a cycle runs from its own start date
# to the start date of the next cycle.
cycle_21, cycle_22, cycle_23, cycle_24 = (
    get_omni_rtn_data(cycle_start, cycle_end).to_dataframe()
    for cycle_start, cycle_end in (
        (START_TIME_CYCLE_21, START_TIME_CYCLE_22),
        (START_TIME_CYCLE_22, START_TIME_CYCLE_23),
        (START_TIME_CYCLE_23, START_TIME_CYCLE_24),
        (START_TIME_CYCLE_24, START_TIME_CYCLE_25),
    )
)

# Keep only the "BR" column of each cycle as a plain NumPy array.
# NOTE(review): presumably "BR" is the radial field component in RTN
# coordinates rather than a field magnitude - confirm against the dataset docs.
mag_field_strength_21, mag_field_strength_22, mag_field_strength_23, mag_field_strength_24 = (
    np.array(cycle_frame["BR"])
    for cycle_frame in (cycle_21, cycle_22, cycle_23, cycle_24)
)

In [5]:
def lstm_prepare_1d(mag_field_strength, input_length=INPUT_LENGTH, output_length=OUTPUT_LENGTH):
    """Slice a 1-D series into NaN-free (input window, output window) pairs.

    Every window start ``i`` in ``range(len(series) - input_length - output_length)``
    yields one candidate example: ``series[i:i + input_length]`` as the input and
    the following ``output_length`` samples as the output. Candidates containing
    a NaN anywhere in the combined span are discarded.

    Args:
        mag_field_strength: 1-D array-like of samples; may contain NaNs.
        input_length: Number of samples per input window.
        output_length: Number of samples per output window.

    Returns:
        Tuple ``(inputs, outputs)`` with shapes ``(n, input_length, 1)`` and
        ``(n, output_length)``. Series too short to hold a single example give
        ``n == 0`` (with the same trailing shapes) instead of raising.
    """
    series = np.asarray(mag_field_strength)
    window_length = input_length + output_length

    # Window count deliberately matches the previous implementation (the start
    # at len - window_length is not generated) so dataset sizes stay comparable
    # with earlier runs of this notebook.
    n_windows = max(len(series) - window_length, 0)
    window_starts = np.arange(n_windows)

    # Prefix sum of NaN flags: a span is NaN-free exactly when the running count
    # is unchanged between its first and one-past-last positions.
    nan_prefix = np.concatenate(([0], np.cumsum(np.isnan(series))))
    nan_free = nan_prefix[window_starts + window_length] == nan_prefix[window_starts]
    valid_starts = window_starts[nan_free]

    # Gather only the surviving windows in one fancy-indexing pass.
    windows = series[valid_starts[:, np.newaxis] + np.arange(window_length)]
    inputs = windows[:, :input_length, np.newaxis]
    outputs = windows[:, input_length:]

    print("Input shape:", inputs.shape)
    print("Output shape:", outputs.shape)
    print("Any Nans?:", np.any(np.isnan(outputs)) or np.any(np.isnan(inputs)))
    print("")
    return inputs, outputs

### Train/Val/Test

In [6]:
# Training data comes from cycles 21 and 23.
inputs_21, outputs_21 = lstm_prepare_1d(mag_field_strength=mag_field_strength_21)
inputs_23, outputs_23 = lstm_prepare_1d(mag_field_strength=mag_field_strength_23)
# Cycle 22 is held out for validation, cycle 24 for testing.
inputs_val, outputs_val = lstm_prepare_1d(mag_field_strength=mag_field_strength_22)
inputs_test, outputs_test = lstm_prepare_1d(mag_field_strength=mag_field_strength_24)

Input shape: (5132, 100, 1)
Output shape: (5132, 24)
Any Nans?: False

Input shape: (105288, 100, 1)
Output shape: (105288, 24)
Any Nans?: False

Input shape: (11376, 100, 1)
Output shape: (11376, 24)
Any Nans?: False

Input shape: (95144, 100, 1)
Output shape: (95144, 24)
Any Nans?: False



In [7]:
# Stack the two training cycles along the example axis (cycle 21 first).
inputs_train = np.concatenate((inputs_21, inputs_23), axis=0)
outputs_train = np.concatenate((outputs_21, outputs_23), axis=0)

### Convert to classification

In [8]:
# Pool the raw samples of both training cycles and discard missing values.
training_field_strength = np.concatenate((mag_field_strength_21, mag_field_strength_23))
training_field_strength = training_field_strength[~np.isnan(training_field_strength)]

# Values at or below this training-set percentile are labelled as a storm.
solar_storm = np.percentile(training_field_strength, q=PERCENTILE)

In [9]:
# An example is positive (1) when any sample in its output window is at or
# below the storm threshold, otherwise negative (0).
# NOTE: this overwrites the 2-D output windows with 1-D labels, so the cell is
# not re-runnable on its own - re-run the window-preparation cells first.
outputs_train = (outputs_train <= solar_storm).any(axis=1).astype(int)
outputs_val = (outputs_val <= solar_storm).any(axis=1).astype(int)
outputs_test = (outputs_test <= solar_storm).any(axis=1).astype(int)

### Normalise

In [10]:
# Standardise every split with statistics taken from the training samples only,
# so no information from the validation/test cycles leaks into the scaling.
training_mean = np.nanmean(training_field_strength)
training_std = np.nanstd(training_field_strength)

inputs_train_norm = (inputs_train - training_mean) / training_std
inputs_val_norm = (inputs_val - training_mean) / training_std
inputs_test_norm = (inputs_test - training_mean) / training_std

In [11]:
# Accuracy of a trivial classifier that always predicts "no storm" (class 0):
# one minus the fraction of positive labels in the split.
for split_name, split_labels in (("val", outputs_val), ("train", outputs_train)):
    positive_fraction = split_labels.sum() / len(split_labels)
    print(f"Baseline {split_name} acc:", 1 - positive_fraction)

Baseline val acc: 0.6044303797468354
Baseline train acc: 0.6679224778119905


### LSTM

In [12]:
# Weight each class inversely to its frequency in the training labels so the
# loss is not dominated by the majority (no-storm) class.
# `classes` and `y` are passed by keyword: recent scikit-learn releases make
# them keyword-only and reject the positional form.
label_values = np.unique(outputs_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=label_values,
    y=outputs_train,
)

# Keras expects a {class_index: weight} mapping.
class_weights = {
    int(label): weight for label, weight in zip(label_values, balanced_weights)
}

In [13]:
# Display names of the classes; position in this list is the integer label.
classes = [0, 1]


class AccuracyCallback(tf.keras.callbacks.Callback):
    """Print overall and per-class accuracy on a fixed dataset after each epoch.

    Predictions are rounded to 0/1 and compared with the stored labels. For
    every entry of the module-level ``classes`` list the accuracy among samples
    whose label equals that entry's index is printed (``-1`` when the class has
    no samples).

    Args:
        data: ``(x, y)`` pair; ``x`` is passed to ``model.predict`` and ``y``
            holds the matching integer labels.
    """

    def __init__(self, data):
        # Let the Keras base class set up its own bookkeeping attributes.
        super().__init__()
        self.data = data

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the current model on ``self.data`` and print the summary."""
        x_data, y_data = self.data
        probabilities = np.asarray(self.model.predict(x_data, verbose=0)).reshape(-1)

        # np.round keeps the original thresholding rule (exactly 0.5 -> 0).
        predicted = np.round(probabilities).astype(int)
        actual = np.asarray(y_data).astype(int).reshape(-1)
        hits = predicted == actual

        correct = int(hits.sum())
        incorrect = int(hits.size - correct)

        print("\tCorrect: %d" %(correct))
        print("\tIncorrect: %d" %(incorrect))

        for class_index in range(len(classes)):
            in_class = actual == class_index
            tot = int(in_class.sum())
            class_acc = -1
            if tot > 0:
                class_acc = float(hits[in_class].sum()) / tot

            print("\t%s: %.3f" %(classes[class_index], class_acc))

        total = correct + incorrect
        # Report NaN rather than dividing by zero when no samples were given.
        acc = float(correct) / float(total) if total else float("nan")
        print("\tCurrent Network Accuracy: %.3f" %(acc))

In [None]:
# Baseline network: one 32-unit LSTM over the input window followed by a single
# sigmoid unit that outputs the storm probability.
model = keras.models.Sequential(
    [
        keras.layers.LSTM(32, name="lstm_initial", input_shape=(None, 1)),
        keras.layers.Dense(1, name="dense_final", activation="sigmoid"),
    ]
)

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

# Train until the validation loss has not improved for 10 epochs, then restore
# the best weights; per-class validation accuracy is printed every epoch.
model.fit(inputs_train_norm, outputs_train, validation_data=(inputs_val_norm, outputs_val),
          batch_size=32, epochs=500, verbose=2,
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10),
                     AccuracyCallback((inputs_val_norm, outputs_val))],
          class_weight=class_weights)


### Larger

In [None]:
# Number of most-recent samples of each input window fed to the network.
# With i equal to INPUT_LENGTH (100) the slice below keeps the whole window.
i = 100

# Deeper network: two stacked LSTMs (16 then 32 units) and a 64-unit dense
# layer ahead of the sigmoid output.
model = keras.models.Sequential(
    [
        keras.layers.LSTM(16, input_shape=(None, 1), return_sequences=True),
        keras.layers.LSTM(32),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(1, activation="sigmoid")
    ]
)

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

# Early stopping on validation loss (patience 10, best weights restored);
# per-class validation accuracy is printed every epoch.
model.fit(inputs_train_norm[:, -i:], outputs_train, validation_data=(inputs_val_norm[:, -i:], outputs_val),
          batch_size=32, epochs=500, verbose=2,
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10),
                     AccuracyCallback((inputs_val_norm[:, -i:], outputs_val))],
          class_weight=class_weights)


### With Regularisation

In [None]:
# Number of most-recent samples of each input window fed to the network.
# With i equal to INPUT_LENGTH (100) the slice below keeps the whole window.
i = 100

# Same stacked architecture as the larger model, regularised with recurrent
# dropout in both LSTMs and dropout after the dense layer.
# NOTE(review): non-zero recurrent_dropout reportedly prevents Keras from using
# its fast cuDNN LSTM kernel - expect slower GPU training; confirm in the docs.
model = keras.models.Sequential(
    [
        keras.layers.LSTM(16, input_shape=(None, 1), return_sequences=True, recurrent_dropout=0.2),
        keras.layers.LSTM(32, recurrent_dropout=0.2),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1, activation="sigmoid")
    ]
)

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

# Early stopping on validation loss (patience 10, best weights restored);
# per-class validation accuracy is printed every epoch.
model.fit(inputs_train_norm[:, -i:], outputs_train, validation_data=(inputs_val_norm[:, -i:], outputs_val),
          batch_size=32, epochs=500, verbose=2,
          callbacks=[keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10),
                     AccuracyCallback((inputs_val_norm[:, -i:], outputs_val))],
          class_weight=class_weights)


### Questions Today

In [None]:
# 1. What baselines are we doing? AnEn, SVM, mean/median/last/first, all zeros
# 2. Which percentile do we pick? 
# 3. Sounds like we only care about a large negative value right? A: Yes
# 4. So it sounds like it's not relative value, or did he misinterpret the question? A: Probably

In [None]:
# Max task
# 1. Get test subset for AnEn
# 2. Change output to not do absolute value 
# 3. Run some initial baselines

# Their data range: 01 Jan 1995 to end of Dec 2019