In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import root_mean_squared_error
from scipy.ndimage import shift
import math

# Data preparation

In [2]:
NM_data = pd.read_csv('../data/LMKS_NM_1981-2023_hour.csv')
data = pd.read_csv('../data/omni_full_1964-2022.csv')
data

Unnamed: 0.1,Unnamed: 0,time1,Rot$,IMF,PLS,IMF_PTS,PLS_PTS,ABS_B,F,THETA_AV,...,F10_INDEX+48,BZ_GSE+1,BZ_GSE+2,BZ_GSE+3,BZ_GSE+4,BZ_GSE+6,BZ_GSE+8,BZ_GSE+12,BZ_GSE+24,BZ_GSE+48
0,0,1963-01-01 01:00:00,1771.0,99.0,99.0,999.0,999.0,,,,...,,,,,,,,,,
1,1,1963-01-01 02:00:00,1771.0,99.0,99.0,999.0,999.0,,,,...,,,,,,,,,,
2,2,1963-01-01 03:00:00,1771.0,99.0,99.0,999.0,999.0,,,,...,,,,,,,,,,
3,3,1963-01-01 04:00:00,1771.0,99.0,99.0,999.0,999.0,,,,...,,,,,,,,,,
4,4,1963-01-01 05:00:00,1771.0,99.0,99.0,999.0,999.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520438,520438,2022-05-13 12:00:00,9999.0,99.0,99.0,999.0,999.0,,,,...,135.600006,,,,,,,,,
520439,520439,2022-05-13 13:00:00,9999.0,99.0,99.0,999.0,999.0,,,,...,135.600006,,,,,,,,,
520440,520440,2022-05-13 14:00:00,9999.0,99.0,99.0,999.0,999.0,,,,...,135.600006,,,,,,,,,
520441,520441,2022-05-13 15:00:00,9999.0,99.0,99.0,999.0,999.0,,,,...,135.600006,,,,,,,,,


In [3]:
NM_data.index = NM_data['Unnamed: 0']
NM_data.drop("Unnamed: 0", axis=1, inplace=True)
NM_data

Unnamed: 0_level_0,H_COR
Unnamed: 0,Unnamed: 1_level_1
1981-12-01 00:00:00,87.2680
1981-12-01 01:00:00,86.9240
1981-12-01 02:00:00,86.8670
1981-12-01 03:00:00,86.4080
1981-12-01 04:00:00,86.5230
...,...
2023-07-10 19:00:00,92.4196
2023-07-10 20:00:00,92.4196
2023-07-10 21:00:00,92.4196
2023-07-10 22:00:00,92.4196


In [4]:
NM_data.dropna(inplace=True)
NM_data.isna().sum()

H_COR    0
dtype: int64

In [5]:
data.drop('Unnamed: 0', axis=1, inplace=True)
data.index = data['time1']
data.drop('time1', axis=1, inplace=True)
data

Unnamed: 0_level_0,Rot$,IMF,PLS,IMF_PTS,PLS_PTS,ABS_B,F,THETA_AV,PHI_AV,BX_GSE,...,F10_INDEX+48,BZ_GSE+1,BZ_GSE+2,BZ_GSE+3,BZ_GSE+4,BZ_GSE+6,BZ_GSE+8,BZ_GSE+12,BZ_GSE+24,BZ_GSE+48
time1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1963-01-01 01:00:00,1771.0,99.0,99.0,999.0,999.0,,,,,,...,,,,,,,,,,
1963-01-01 02:00:00,1771.0,99.0,99.0,999.0,999.0,,,,,,...,,,,,,,,,,
1963-01-01 03:00:00,1771.0,99.0,99.0,999.0,999.0,,,,,,...,,,,,,,,,,
1963-01-01 04:00:00,1771.0,99.0,99.0,999.0,999.0,,,,,,...,,,,,,,,,,
1963-01-01 05:00:00,1771.0,99.0,99.0,999.0,999.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-13 12:00:00,9999.0,99.0,99.0,999.0,999.0,,,,,,...,135.600006,,,,,,,,,
2022-05-13 13:00:00,9999.0,99.0,99.0,999.0,999.0,,,,,,...,135.600006,,,,,,,,,
2022-05-13 14:00:00,9999.0,99.0,99.0,999.0,999.0,,,,,,...,135.600006,,,,,,,,,
2022-05-13 15:00:00,9999.0,99.0,99.0,999.0,999.0,,,,,,...,135.600006,,,,,,,,,


In [6]:
data_relevant = data[['BZ_GSE', 'V', 'N', 'DST', 'SIGMA$Bz']].loc["1995-01-01 00:00:00":]
data_relevant.isna().sum()

BZ_GSE       977
V            903
N           5586
DST            0
SIGMA$Bz     977
dtype: int64

In [7]:
data_relevant.index = pd.to_datetime(data_relevant.index)
data_relevant_interpolated = data_relevant.interpolate(method='time')
data_relevant_interpolated.isna().sum()

BZ_GSE      0
V           0
N           0
DST         0
SIGMA$Bz    0
dtype: int64

## Intersection of omni data and NM data

In [8]:
data_relevant_interpolated.index = pd.to_datetime(data_relevant_interpolated.index)
NM_data.index = pd.to_datetime(NM_data.index)

common_index = data_relevant_interpolated.index.intersection(NM_data.index)

data_relevant_interpolated_common = data_relevant_interpolated.loc[common_index]
nm_data_common = NM_data.loc[common_index]

data_relevant_interpolated_common['NM'] = nm_data_common.iloc[:, 0]
data_relevant_interpolated = data_relevant_interpolated_common

data_relevant_interpolated.isna().sum()

BZ_GSE      0
V           0
N           0
DST         0
SIGMA$Bz    0
NM          0
dtype: int64

## Data normalization

In [9]:
min_val = data_relevant_interpolated.min()
max_val = data_relevant_interpolated.max()

# Normalize between -1 and 1
data_relevant_interpolated_normalized = 2 * (data_relevant_interpolated - min_val) / (max_val - min_val) - 1
data_relevant_interpolated_normalized

Unnamed: 0,BZ_GSE,V,N,DST,SIGMA$Bz,NM
1995-01-01 00:00:00,0.131579,-0.818939,-0.763676,0.683367,-0.976134,0.412806
1995-01-01 00:00:00,0.131579,-0.818939,-0.763676,0.683367,-0.976134,0.412806
1995-01-01 01:00:00,0.149123,-0.818939,-0.727206,0.703407,-0.961814,0.405224
1995-01-01 02:00:00,0.168860,-0.808533,-0.719912,0.715431,-0.914081,0.403689
1995-01-01 03:00:00,0.184211,-0.814776,-0.762217,0.711423,-0.961814,0.399131
...,...,...,...,...,...,...
2022-05-13 12:00:00,0.184211,-0.829344,-0.905179,0.699399,-0.904535,0.714550
2022-05-13 13:00:00,0.184211,-0.829344,-0.905179,0.707415,-0.904535,0.715932
2022-05-13 14:00:00,0.184211,-0.829344,-0.905179,0.711423,-0.904535,0.717940
2022-05-13 15:00:00,0.184211,-0.829344,-0.905179,0.711423,-0.904535,0.707895


# Regression model

In [10]:
data_relevant_interpolated_normalized

Unnamed: 0,BZ_GSE,V,N,DST,SIGMA$Bz,NM
1995-01-01 00:00:00,0.131579,-0.818939,-0.763676,0.683367,-0.976134,0.412806
1995-01-01 00:00:00,0.131579,-0.818939,-0.763676,0.683367,-0.976134,0.412806
1995-01-01 01:00:00,0.149123,-0.818939,-0.727206,0.703407,-0.961814,0.405224
1995-01-01 02:00:00,0.168860,-0.808533,-0.719912,0.715431,-0.914081,0.403689
1995-01-01 03:00:00,0.184211,-0.814776,-0.762217,0.711423,-0.961814,0.399131
...,...,...,...,...,...,...
2022-05-13 12:00:00,0.184211,-0.829344,-0.905179,0.699399,-0.904535,0.714550
2022-05-13 13:00:00,0.184211,-0.829344,-0.905179,0.707415,-0.904535,0.715932
2022-05-13 14:00:00,0.184211,-0.829344,-0.905179,0.711423,-0.904535,0.717940
2022-05-13 15:00:00,0.184211,-0.829344,-0.905179,0.711423,-0.904535,0.707895


In [11]:
DST_shift = data_relevant_interpolated_normalized['DST'].shift(1)
DST_shift.fillna(value=0,inplace=True)
DST_shift

1995-01-01 00:00:00    0.000000
1995-01-01 00:00:00    0.683367
1995-01-01 01:00:00    0.683367
1995-01-01 02:00:00    0.703407
1995-01-01 03:00:00    0.715431
                         ...   
2022-05-13 12:00:00    0.699399
2022-05-13 13:00:00    0.699399
2022-05-13 14:00:00    0.707415
2022-05-13 15:00:00    0.711423
2022-05-13 16:00:00    0.711423
Name: DST, Length: 239877, dtype: float64

In [47]:
N = data_relevant_interpolated_normalized['N'].to_numpy()
V = data_relevant_interpolated_normalized['V'].to_numpy()
NV = N * V
Bz= data_relevant_interpolated_normalized['BZ_GSE'].to_numpy()
sigma_bz = data_relevant_interpolated_normalized['SIGMA$Bz'].to_numpy()
nm = data_relevant_interpolated_normalized['NM'].to_numpy()
DST = data_relevant_interpolated_normalized['DST'].to_numpy()

Definicia persistancneho modela ako linearana kombinacia zvysnych parametrov modelu FFNN.
$$
DST_{t+1} = DST_t + \sum_{i = 1}^{|N|} = \omega_i(\psi_i - \psi_i^{ref})
$$
To iste v podobe matic a vektorov.
$$
\vec{y} = \vec{\beta} + \mathbf{X}\vec{\omega}
$$

In [13]:
y = DST_shift.to_numpy()
b = data_relevant_interpolated_normalized['DST'].to_numpy()
b.shape, y.shape

((239877,), (239877,))

## Vypocet matice $X$

In [14]:
X = []

In [15]:
N_mean = N.mean()
NV_mean = NV.mean()
V_mean = V.mean()
Bz_mean = Bz.mean()
sigma_bz_mean = sigma_bz.mean()
nm_mean = nm.mean()
N_mean, V_mean, Bz_mean, sigma_bz_mean, nm_mean, NV_mean

(np.float64(-0.9092295346299987),
 np.float64(-0.5875355186766423),
 np.float64(0.1771662958918724),
 np.float64(-0.9377969344570631),
 np.float64(0.4876614210538083),
 np.float64(0.5285874443950322))

In [16]:
params = [N, V, Bz, sigma_bz, nm]

In [17]:
X = np.array([
    N - N_mean, V - V_mean, Bz - Bz_mean, sigma_bz - sigma_bz_mean, nm - nm_mean, NV - NV_mean
])
X.shape, X.T.shape

((6, 239877), (239877, 6))

In [18]:
X = X.T
X.shape

(239877, 6)

## Vypocet vah
$$
\vec{\omega} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T(\vec{y} - \vec{\beta})
$$

In [19]:
w = (np.linalg.inv(X.T @ X) @ X.T) @ (y - b)

In [20]:
w

array([ 0.06198545,  0.08515494, -0.08380987,  0.02530192,  0.00141577,
        0.09118692])

## Finalny model

In [21]:
persistance = b + np.sum(X*w, axis=1)

In [22]:
persistance

array([0.6842569 , 0.6842569 , 0.70271538, ..., 0.71191417, 0.71189995,
       0.7078833 ], shape=(239877,))

In [23]:
%matplotlib qt
plt.plot(y, label='DST_t+1')
plt.plot(persistance, label='prediction')
plt.legend()

<matplotlib.legend.Legend at 0x1fa3be62d20>

In [24]:
mean_squared_error(y, persistance)

0.0002600635399853744

# Shapely values
Nech $\psi$ je nejaky parameter, $P$ je mnozina parametrov a $f : P \rightarrow DST_{t+1}$ je model a $x_0$ je jedna hodnota DST. Potom shapely hodnota parametru $\psi$ v bode $x_0$ je
$$
\varphi_{\psi}(x_0) = \frac{1}{|P|!} \sum_{S \subseteq P \setminus \{\psi\}} |S|!\cdot(|P| - |S| - 1)! \cdot (f(S \cup \{\psi\}) - f(S))
$$
Vypocet si bude vyzadovat postavit niekolko krokov.
- Napjprv je potrebne spravit **mnozinu podmnozin** mnoziny $P$
- Vytvorit **funkciu** ktora **vytvori model s novym poctom parametrov** na vstupe, kedze $f(S)$ bude vzdy predikcia modelu s inym poctom parametrov.
- Skombinovat

## Mnozina podmnozin $P$
Na $S \subseteq P \setminus \{\psi\}$ sa mozeme pozerat trochu inak. V podstate to hovori ze $S$ je kazda podmnozina mnoziny $P$. Preto mozeme zapisat $S$ ako $S \subseteq \mathcal{P(\mathbf{P})}$, cize $S$ je kazda mnozina potencnej mnozinu $P$.

In [45]:
from itertools import chain, combinations

def powerset(iterable):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

In [48]:
AllParams = [N, V, Bz, sigma_bz, nm, DST]

In [62]:
def power_set_minus(P, target):
    new_P = []
    for i in P:
        if i == target: continue
        else: new_P.append(i)
    power_P = list(powerset(new_P))

    return power_P

## Dynamicky persistencny model

In [28]:
def f(P, y, b):
    '''
        P - set of parameters ([N, V, Bz,...]) (2D matrix)
        y - DST_t+1 (array)
        b - DST_t   (array)
    '''
    if not P: return np.array([])
    X = np.array([
        p - p.mean() for p in P 
    ])
    X = X.T

    w = (np.linalg.inv(X.T @ X) @ X.T) @ (y - b)

    return w

def pp(X, b, w):
    '''
        X - shape [p1 - p1_mean, p2 - p2_mean, ...]
        b - shape [DST1, DST2, DST3]
        w - weights for parameters p_1 - p_n
    '''
    X = X.T
    if len(X.shape) <= 1: return b + np.sum(X*w)
    else: return b + np.sum(X*w, axis=1)
    

In [29]:
f([], y, b)

array([], dtype=float64)

In [30]:
pp(
    np.array([N[10] - N.mean(), V[10] - V.mean(), Bz[10] - Bz.mean(), sigma_bz[10] - sigma_bz.mean(), nm[10] - nm.mean(), NV[10] - NV.mean()]),
    b[10],
    f([N, V, Bz, sigma_bz, nm, NV], y, b)
)

np.float64(0.6985369524786599)

## Shapely values calculation

In [71]:
def shapely(P, target, point):
    '''
    P - set of parameters
    target - parameter for which we are calculating shapely value
    point - int value in range of the size of the parameters from P
    '''
    # Creating set of parameters without target parameter
    mask = ~np.all(P == target, axis=1)
    indexes = [i for i in range(len(mask)) if mask[i] == True]
    P_wo_target = [P[i] for i in indexes]

    # power set of set w/o target
    p_set = list(powerset(P_wo_target))[1:]

    # main loop 'sum' of the shapely formula
    summ = 0
    for sub_set in p_set:

        # finding the position of DST in every subset
        # because we need to extract DST out of the whole array as it is treated differently in comparison to other parameters
        mask_dst = ~np.all(sub_set == data_relevant_interpolated_normalized['DST'].to_numpy(), axis=1)
        indexes_dst = [i for i in range(len(mask_dst)) if mask_dst[i] == False]


        # Creating X matrix and b vector for linear model
        # based on the existance of DST in subset
        X = []
        b = []
        if not indexes_dst: # indexes_dst is empty, no dst in that subset
            X = [p[point] - p.mean() for p in sub_set]
            b = np.zeros(len(y))
        else: # DST is in the subset, we need to extract it
            X = [p[point] - p.mean() for p in sub_set[:indexes_dst[0]]]
            b = sub_set[indexes_dst[0]]
            sub_set = list(sub_set)
            del sub_set[indexes_dst[0]]

        # contribution value f(S U {i}) - f(S)
        if np.all(target != data_relevant_interpolated_normalized['DST'].to_numpy()):
            prediction          = pp(np.array(X), b[point], f(sub_set, y, b))
            X.append(target[point] - target.mean())
            sub_set = list(sub_set)
            sub_set.append(target)
            prediction_w_target = pp(np.array(X), b[point], f(sub_set, y, b))
    
            contribution = prediction_w_target - prediction
    
            # # multiplication factor |S|! * (|P| - |S| - 1)!
            mult_factor = math.factorial(len(sub_set)) * math.factorial((len(P) - len(sub_set) - 1))

            summ += mult_factor * contribution
        else:
            prediction          = pp(np.array(X), b[point], f(sub_set, y, b))
            b = target
            prediction_w_target = pp(np.array(X), b[point], f(sub_set, y, b))
    
            contribution = prediction_w_target - prediction
    
            # # multiplication factor |S|! * (|P| - |S| - 1)!
            mult_factor = math.factorial(len(sub_set)) * math.factorial((len(P) - len(sub_set) - 1))

            summ += mult_factor * contribution

    return summ/math.factorial(len(P))

def shapely2(P, p, T):
    '''
    P - set of parameters
    p - values for which we wish to calculate shapely values of every parameters
    T - target parameter, integer value from mask
    '''
    if len(P) != len(p):
        print("P and p has bad lengths")
        return

    mask = [i for i in range(len(P))]
    DST_idx = len(P)-1

    pset = power_set_minus(mask, T)

    for S in pset:

        '''
        Creating model
        '''
        X = None
        b = None
        if DST_idx in S:
            b = P[DST_idx]
            S = S[:-1]
            X = [P[i] - P[i].mean() for i in S]
        else:
            b = np.zeros(len(P[DST_idx]))
            X = [P[i] - P[i].mean() for i in S]
        
                

    return pset


In [72]:
shapely2(
    AllParams,
    [N[0], V[0], Bz[0], sigma_bz[0], nm[0], DST[0]],
    2
)

[(),
 (0,),
 (1,),
 (3,),
 (4,),
 (5,),
 (0, 1),
 (0, 3),
 (0, 4),
 (0, 5),
 (1, 3),
 (1, 4),
 (1, 5),
 (3, 4),
 (3, 5),
 (4, 5),
 (0, 1, 3),
 (0, 1, 4),
 (0, 1, 5),
 (0, 3, 4),
 (0, 3, 5),
 (0, 4, 5),
 (1, 3, 4),
 (1, 3, 5),
 (1, 4, 5),
 (3, 4, 5),
 (0, 1, 3, 4),
 (0, 1, 3, 5),
 (0, 1, 4, 5),
 (0, 3, 4, 5),
 (1, 3, 4, 5),
 (0, 1, 3, 4, 5)]

## Calculate shapely values for all parameters

In [33]:
# shapely_params_sum = []
# for p in [N, V, Bz, sigma_bz, nm, data_relevant_interpolated_normalized["DST"].to_numpy()]:
#     temp = 0
#     for i in range(len(data_relevant_interpolated_normalized["DST"].to_numpy())):
#         temp += float(shapely(
#             [N, V, Bz, sigma_bz, nm, data_relevant_interpolated_normalized["DST"].to_numpy()],
#             p,
#             i
#         ))
#     shapely_params_sum.append(temp)

In [34]:
%%time
event_shapely_values = []

for p, sign in zip([N, V, Bz, sigma_bz, nm, NV, data_relevant_interpolated_normalized["DST"].to_numpy()], ["N", "V", "Bz", "sBz", "nm", "NV", "DST"]):
    temp = []
    for i in range(77200, 77500):
        print(f"{sign} -> {i}")
        temp.append(float(shapely(
                    [N, V, Bz, sigma_bz, nm, NV, data_relevant_interpolated_normalized["DST"].to_numpy()],
                    p,
                    i
                )))
    event_shapely_values.append(temp)


N -> 77200
N -> 77201
N -> 77202
N -> 77203
N -> 77204
N -> 77205
N -> 77206
N -> 77207
N -> 77208
N -> 77209
N -> 77210
N -> 77211
N -> 77212
N -> 77213
N -> 77214
N -> 77215
N -> 77216
N -> 77217
N -> 77218
N -> 77219
N -> 77220
N -> 77221
N -> 77222
N -> 77223
N -> 77224
N -> 77225
N -> 77226
N -> 77227
N -> 77228
N -> 77229
N -> 77230
N -> 77231
N -> 77232
N -> 77233
N -> 77234
N -> 77235
N -> 77236
N -> 77237
N -> 77238
N -> 77239
N -> 77240
N -> 77241
N -> 77242
N -> 77243
N -> 77244
N -> 77245
N -> 77246
N -> 77247
N -> 77248
N -> 77249
N -> 77250
N -> 77251
N -> 77252
N -> 77253
N -> 77254
N -> 77255
N -> 77256
N -> 77257
N -> 77258
N -> 77259
N -> 77260
N -> 77261
N -> 77262
N -> 77263
N -> 77264
N -> 77265
N -> 77266
N -> 77267
N -> 77268
N -> 77269
N -> 77270
N -> 77271
N -> 77272
N -> 77273
N -> 77274
N -> 77275
N -> 77276
N -> 77277
N -> 77278
N -> 77279
N -> 77280
N -> 77281
N -> 77282
N -> 77283
N -> 77284
N -> 77285
N -> 77286
N -> 77287
N -> 77288
N -> 77289
N -> 77290

In [44]:
NM_shapely = []
for i in range(len(data_relevant_interpolated_normalized)):
    temp.append(float(shapely(
                [N, V, Bz, sigma_bz, nm, NV, data_relevant_interpolated_normalized["DST"].to_numpy()],
                nm,
                i
            )))

KeyboardInterrupt: 

## Plotting results

In [35]:
string_params = ["N", "V", "Bz", "sigma Bz", "NM", "NV", "DST_t"]
fig, axs = plt.subplots(len(event_shapely_values))
for i in range(len(axs)):
    axs[i].plot(event_shapely_values[i])
    axs[i].set_title(string_params[i])

In [39]:
event_shapely_values_mean = np.mean(event_shapely_values, axis=1)
plt.bar(string_params, event_shapely_values_mean)

<BarContainer object of 7 artists>

In [37]:
event_shapely_values_mean

array([-4.13817701e-04, -9.10952208e-03,  1.14700502e-03, -2.52452403e-04,
       -1.83445254e-02, -7.88439880e-03,  4.71692062e-01])

In [42]:
data_relevant_interpolated_normalized['DST'].iloc[77200:77500].plot()

<Axes: >

In [43]:
len(data_relevant_interpolated_normalized)

239877