In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn

In [2]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device, torch.cuda.is_available(), torch.version.cuda, torch.cuda.device_count(), torch.cuda.get_device_name(0)

('cuda:0', True, '12.4', 1, 'NVIDIA GeForce GTX 1650 Ti')

# Data preparation

## Data Processing Functions

In [11]:
def generate_batches(a, b, c, d, y, n):
    '''
    Function creates batches out of a, b, c, d data of size 8 hours,
    8-hour window is moved throughout the arrays with step one hour,
    Than all the 8-hour batches are concatenated together to create one big 32 size batch.
    Finally appended to main array.

    Also cuts first 8 hours out of y data, so that for every 8 hours of x data
    there is one hour of y data hour ahead of input neurons data.

    Parameters
    ----------
    a - Bz
    b - Sigma Bz
    c - n
    d = v
    y - y training data (DST)
    n - length of event

    Returns
    -------
    batches - Matrix where rows correspond to 8 hours of x array data
          y - y training data (DST) that has first 8 hours cut off
    '''

    y = y[8:]
    y = np.array(y)

    batches = []

    for i in range(n):
        if (i+8) <= n:
            # print(i, i+8)
            batch_a = a[i:i+8]
            batch_b = b[i:i+8]
            batch_c = c[i:i+8]
            batch_d = d[i:i+8]

            final_batch = np.concatenate((batch_a, batch_b, batch_c, batch_d), axis=None)

            batches.append(final_batch)

    batches = batches[:-1]
    batches = np.array(batches)

    return batches, y

In [12]:
def DataProcessing():
    '''
    Function Loops through trainind data in steps of 146. Because every event is 146 hours long.
    For every 146 hours of every parameter (Bz, sigma Bz, n, v, DST) is created a matrix with dimensions 146 X 32. of x data
    and vector of  size 146 values of y data. One y value for 32 values of parameters.
    This matrix is than added to final array. Which in the end corresponds to tensor of size 60 X 146 X 32 and matrix 60 X 146.
    60 Because there are 60 events in training data. And we need to have 32 values for every DST values so there is 146 X 32 matrix
    of parameters.

    Returns
    -------
    x_train data
    y_train data

    '''

    data = pd.read_csv('./data/train_dst_new.csv')

    x_train_batches = []
    y_train_batches = []

    for i in range(0, len(data), 147):
        Bz       = data.loc[i:i+146]['Bz_GSE'][:-1].to_numpy()
        Bz_sigma = data.loc[i:i+146]['Sigma_Bz_GSE'][:-1].to_numpy()
        n        = data.loc[i:i+146]['Proton_density'][:-1].to_numpy()
        v        = data.loc[i:i+146]['Plasma_speed'][:-1].to_numpy()
        DST      = data.loc[i:i+146]['Dst_index'][:-1].to_numpy()

        x_train, y_train = generate_batches(Bz, Bz_sigma, n, v, DST, 146)

        y_train_batches.append(y_train)
        x_train_batches.append(x_train)

    return (x_train_batches, y_train_batches)


x_train: There is 60 events, for event there is 138 arrays of 32 values. 32 values correspond to 8 hours of every parameter. 

In [13]:
x_train, y_train = DataProcessing()
x_train = np.array(x_train)
y_train = np.array(y_train)
x_train.shape, y_train.shape

((60, 138, 32), (60, 138))

## Split events to matrix $\mathbb{R}^{60\times138\times8\times4}$
The way data are passed into LSTM are different than normal neural network. We have 8 hours for every parameter for 138 hours concatenated together, which results in matrix $\mathbb{R}^{138\times32}$. However LSTM needs every feature separatly, matter of fact, it needs the 8 hours for the 4 features in columns in a matrix $\mathbb{R}^{8\times4}$ times 138

In [18]:
x_tra = x_train.reshape(60, 138, 4, 8)
x_tra = np.transpose(x_tra, (0, 1, 3, 2))
x_tra = torch.from_numpy(x_tra)
x_tra.shape

torch.Size([60, 138, 8, 4])

In [19]:
y_tra = y_train.reshape(60, -1, 1)
y_tra = torch.from_numpy(y_tra)
y_tra.shape

torch.Size([60, 138, 1])

# Create LSTM model

LSTM or Long-Short-Term-Memory is a type of neural network that is able to remember complex relations within data. It's a special type of recurrent neural network. However, basic RNNs are highly prone to fail due to the vanishing or exploding gradient, which the LSTM NN fixes to some extent. The main difference between RNN and LSTM is that RNN has normal basic structure just like FFNN just there are some conections from the output to some hidden layer or something like that, nevertheless you just have input layers, recurrent layers, hidden layers and input layer and that's it. LSTM is quite different. In LSTM you have this things called cells. Every LSTM cell is a little recurrent neural network on its own. These cells, besides having NNs in them, they also have gates. These gates could be interpreted as parameters that are trainable and decide which data should be forgotten and which should remebered. In other words, what should be passed further and which should be discarded when new data comes.

We could also show the distinction between the RNN and LSTM mathemathically and also learn about their innerworkings:


In [14]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_stacked_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers
        self.rnn = nn.LSTM(
            input_size, hidden_size, num_stacked_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size).to(device)
        c0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size).to(device)
        out, _ = self.rnn(x, (h0, c0))
        out    = self.fc(out[:, -1, :])
        return out
        

In [122]:
hidden_n  = 69
stacked_n = 3

model = LSTM(4, hidden_n, stacked_n)
model.to(device)
model

LSTM(
  (rnn): LSTM(4, 69, num_layers=3, batch_first=True)
  (fc): Linear(in_features=69, out_features=1, bias=True)
)

# Training loop

In [102]:
def train(epochs, x, y, loss_f, optimizerm, silent):
    for i in range(epochs):
        index = 0
        model.train(True)
        running_loss = 0.0

        if silent != True:
            print(f'EPOCH : {i}')
        
        for batch_x, batch_y in zip(x, y):
            index += 1
            
            batch_x = batch_x.to(torch.float32).to(device)
            batch_y = batch_y.to(torch.float32).to(device)

            output = model(batch_x)
            loss = loss_f(output, batch_y)
            running_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if silent != True:
                if index % 60 == 0:
                    avg_loss_across_batches = running_loss / 100
                    print(f'LOSS : {avg_loss_across_batches}')
                    running_loss = 0.0

    print("DONE Training")

In [123]:
lr = 0.001
num_epochs = 75
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

train(num_epochs, x_tra, y_tra, loss_function, optimizer, silent=False)

EPOCH : 0
LOSS : 0.007425057608634234
EPOCH : 1
LOSS : 0.006735195342916995
EPOCH : 2
LOSS : 0.005313104898668826
EPOCH : 3
LOSS : 0.0051571848907042295
EPOCH : 4
LOSS : 0.004895049746264704
EPOCH : 5
LOSS : 0.004638868366018869
EPOCH : 6
LOSS : 0.004471949061262422
EPOCH : 7
LOSS : 0.004382791297975927
EPOCH : 8
LOSS : 0.004326287002768368
EPOCH : 9
LOSS : 0.004280085172504187
EPOCH : 10
LOSS : 0.004234302748809569
EPOCH : 11
LOSS : 0.004182976743904874
EPOCH : 12
LOSS : 0.0041229681496042754
EPOCH : 13
LOSS : 0.004051450149854645
EPOCH : 14
LOSS : 0.003980912353727035
EPOCH : 15
LOSS : 0.00392408981861081
EPOCH : 16
LOSS : 0.003877423732774332
EPOCH : 17
LOSS : 0.0038388708350248634
EPOCH : 18
LOSS : 0.0038051761157112196
EPOCH : 19
LOSS : 0.0037751639616908504
EPOCH : 20
LOSS : 0.0037477149412734434
EPOCH : 21
LOSS : 0.0037223138508852573
EPOCH : 22
LOSS : 0.0036984588694758715
EPOCH : 23
LOSS : 0.0036757255991688
EPOCH : 24
LOSS : 0.003654122989973985
EPOCH : 25
LOSS : 0.0036330973

# Validation Data Processing

In [22]:
def ValidationDataProcessing(valid_data):

    year_1979 = valid_data[valid_data.index.year == 1979]
    year_1980 = valid_data[valid_data.index.year == 1980]
    year_1981 = valid_data[valid_data.index.year == 1981]

    x_valid_79, y_valid_79 = generate_batches(
        year_1979['Bz_GSE'],
        year_1979['Sigma_Bz_GSE'],
        year_1979['Proton_density'],
        year_1979['Plasma_speed'],
        year_1979['Dst_index'],
        len(year_1979)
    )

    x_valid_80, y_valid_80 = generate_batches(
        year_1980['Bz_GSE'],
        year_1980['Sigma_Bz_GSE'],
        year_1980['Proton_density'],
        year_1980['Plasma_speed'],
        year_1980['Dst_index'],
        len(year_1980)
    )

    x_valid_81, y_valid_81 = generate_batches(
        year_1981['Bz_GSE'],
        year_1981['Sigma_Bz_GSE'],
        year_1981['Proton_density'],
        year_1981['Plasma_speed'],
        year_1981['Dst_index'],
        len(year_1981)
    )

    return x_valid_79, y_valid_79, x_valid_80, y_valid_80, x_valid_81, y_valid_81

In [23]:
valid_data = pd.read_csv('./data/test_dst_new.csv')
valid_data.set_index('index', inplace=True)
valid_data.index = pd.to_datetime(valid_data.index)

x_valid_79, y_valid_79, x_valid_80, y_valid_80, x_valid_81, y_valid_81 = ValidationDataProcessing(valid_data)

In [47]:
x_79 = x_valid_79.reshape(x_valid_79.shape[0], 4, 8)
x_79 = torch.transpose(torch.from_numpy(x_79), 2, 1)
x_79 = x_79.to(torch.float32).to(device)

x_80 = x_valid_80.reshape(x_valid_80.shape[0], 4, 8)
x_80 = torch.transpose(torch.from_numpy(x_80), 2, 1)
x_80 = x_80.to(torch.float32).to(device)

x_81 = x_valid_81.reshape(x_valid_81.shape[0], 4, 8)
x_81 = torch.transpose(torch.from_numpy(x_81), 2, 1)
x_81 = x_81.to(torch.float32).to(device)

# Model Evaluation

In [124]:
model.eval()
preds_79 = model(x_79).cpu().detach()
preds_80 = model(x_80).cpu().detach()
preds_81 = model(x_81).cpu().detach()

In [33]:
%matplotlib qt

plt.figure(figsize=(15,6))
plt.plot(y_valid_79, label='real')
plt.plot(preds_79, label='prediction', c='orange')
plt.legend()

<matplotlib.legend.Legend at 0x25d7ae86ce0>

### Combine All Year, 79-80-81, of validation data and prediction data

In [55]:
Predictions = np.concatenate((preds_79, preds_80, preds_81))
Predictions = pd.DataFrame(data={
    'LSTM_DST+1': Predictions.flatten()
})

stripped_valid = pd.concat([
    valid_data['Dst_index'][valid_data.index.year == 1979].iloc[8:],
    valid_data['Dst_index'][valid_data.index.year == 1980].iloc[8:],
    valid_data['Dst_index'][valid_data.index.year == 1981].iloc[8:]
])
Predictions.index = stripped_valid.index

In [59]:
plt.figure(figsize=(15,6))
plt.plot(np.concatenate((y_valid_79, y_valid_80, y_valid_81)), label='real')
plt.plot(Predictions["LSTM_DST+1"].values)
plt.legend()

<matplotlib.legend.Legend at 0x25de46d6aa0>

In [60]:
 Predictions.to_csv('./data/DST_prediction_LSTM.csv')

## Evaluation metrics

In [28]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy import stats

In [125]:
def metrics(results):
    data = []
    for res in results:
        mae = mean_absolute_error(res[0], res[1])
        mse = mean_squared_error(res[0], res[1])
        rmse = np.sqrt(mse)
        r2 = r2_score(res[0], res[1])
        data.append([mae, mse, rmse, r2])
    
    return pd.DataFrame(data, columns=["MAE", "MSE", "RMSE", "R^2"])
    
metrics(
    [[y_valid_79, preds_79], 
     [y_valid_80, preds_80], 
     [y_valid_81, preds_81]]
)

Unnamed: 0,MAE,MSE,RMSE,R^2
0,0.048028,0.003698,0.060807,0.328861
1,0.042027,0.002713,0.05209,0.360834
2,0.046877,0.003746,0.061203,0.547739


In [66]:
torch.save(model.state_dict(), './data/LSTM_h128_SL3_lr001_ep75.pth')

# Optimizing Hyperparameters

In [99]:
import math
math.floor(8280/(6*(4+1)))

276

In [127]:
all_r2s = []

for s in range(1, 10):
    print()
    print(f"===Stacked Number Of Layers {s}===")

    r2s = []
    
    for i in range(5, 20):
        hidden_n  = math.floor(8280/(i*(4+1)))
        stacked_n = s
        
        model = LSTM(4, hidden_n, stacked_n)
        model.to(device)
    
        print()
        print(f'training {i}, hidden_n = {hidden_n}, stacked_n = {stacked_n}')
        train(num_epochs, x_tra, y_tra, loss_function, optimizer, silent=True)
    
        model.eval()
        preds_79 = model(x_79).cpu().detach()
        preds_80 = model(x_80).cpu().detach()
        preds_81 = model(x_81).cpu().detach()
    
        metrics_r2s = metrics(
            [[y_valid_79, preds_79], 
             [y_valid_80, preds_80], 
             [y_valid_81, preds_81]]
        )['R^2'].values
        r2s.append(metrics_r2s)
        
        print(
            f"R^2 values [79, 80, 81] = {metrics_r2s} with hidden = {hidden_n}, stacked_n = {stacked_n}"
        )
    all_r2s.append(r2s)


===Stacked Number Of Layers 1===

training 5, hidden_n = 331, stacked_n = 1
DONE Training
R^2 values [79, 80, 81] = [-1.4858253  -1.30266096 -1.58573781] with hidden = 331, stacked_n = 1

training 6, hidden_n = 276, stacked_n = 1
DONE Training
R^2 values [79, 80, 81] = [-0.08137745 -0.02267548 -0.24116823] with hidden = 276, stacked_n = 1

training 7, hidden_n = 236, stacked_n = 1
DONE Training
R^2 values [79, 80, 81] = [-2.18459495 -2.07081033 -2.16394647] with hidden = 236, stacked_n = 1

training 8, hidden_n = 207, stacked_n = 1
DONE Training
R^2 values [79, 80, 81] = [-1.88336329 -1.73329576 -1.91506521] with hidden = 207, stacked_n = 1

training 9, hidden_n = 184, stacked_n = 1
DONE Training
R^2 values [79, 80, 81] = [-0.03496847 -0.08261447 -0.1202734 ] with hidden = 184, stacked_n = 1

training 10, hidden_n = 165, stacked_n = 1
DONE Training
R^2 values [79, 80, 81] = [ 0.00696039 -0.06052716 -0.05197656] with hidden = 165, stacked_n = 1

training 11, hidden_n = 150, stacked_n =

KeyboardInterrupt: 