In [1]:
import pandas as pd
from keras.models import Model
from keras.layers import LSTM, Dense, Dropout, Input, Bidirectional, Reshape, Conv1D
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from typing import Tuple

In [2]:
# Constants
SEQUENCE_LEN = 21 * 4              # 21 days of 6-hours data
OUTPUT_LEN = 5 * 4                 # 5 days of 6-hours data
INPUT_FEATURES = 23                # Number of input features (e.g., temperature, humidity, etc.)
OUTPUT_FEATURES = 5 * OUTPUT_LEN   # Predicting flatten 4 features for each timestamp (wind direction is described as two cols)

In [3]:
def wind_direction_sin_cos(data: pd.DataFrame):
    """ To avoid problem of huge MSE error degrees are replaced by sin and cos functions """
    data['Kierunek wiatru [sin]'] = np.sin(data['Kierunek wiatru  [°]'] * 2 * np.pi / 360)  # transform to radians and calculate sin
    data['Kierunek wiatru [cos]'] = np.cos(data['Kierunek wiatru  [°]'] * 2 * np.pi / 360)  # transform to radians and calculate sin
    data.drop('Kierunek wiatru  [°]', axis=1, inplace=True)
    return data

def create_sequences(data: pd.DataFrame):
    X, y = [], []
    for i in range(len(data) - SEQUENCE_LEN - OUTPUT_LEN):
        X.append(data[i: i + SEQUENCE_LEN])
        y.append(data[i + SEQUENCE_LEN: i + SEQUENCE_LEN + OUTPUT_LEN, :5].flatten()) # Taking the first 4 features for the next 24 hours
    return np.array(X), np.array(y)

def invert_scale_for_X(X: np.ndarray, scaler):
    copy_X = X.copy()
    for i in range(X.shape[-1]):
        print()
        std, mean = scaler.scale_[i], scaler.mean_[i]
        copy_X[:, :, i] = X[:, :, i] * std + mean
    return copy_X

def invert_scale_for_y(y: np.ndarray, scaler):
    copy_y = y.copy()
    for i in range(5):
        std, mean = scaler.scale_[i], scaler.mean_[i]
        copy_y[:, i::5] = y[:, i::5] * std + mean
    return copy_y

def back_to_degrees_for_X(X: np.ndarray):
    sin_part, cos_part = X[:, :, 2], X[:, :, 3]  # get all 
    X[:, :, 2] = np.arctan2(sin_part, cos_part) * 180 / np.pi
    replaced_X = np.delete(X, 3, axis=2)
    return replaced_X

def back_to_degrees_for_y(y: np.ndarray):
    sin_part, cos_part = y[:, 2::5], y[:, 3::5]  # get all 
    y[:, 2::5] = np.arctan2(sin_part, cos_part) * 180 / np.pi
    replaced_y = np.delete(y, list(range(3, 119, 5)), axis=1)
    return replaced_y

def plot_one_prediction(idx: int, X_true, y_true, y_pred, figsize: Tuple[int] = (15, 10)):
    x = range(-SEQUENCE_LEN, OUTPUT_LEN)
    x1, x2 = x[1:1 + SEQUENCE_LEN], x[SEQUENCE_LEN:]
    labels = ['Temperatura [°C]', 'Ciśnienie atmos. [hPa]', 'Kierunek wiatru [°]', 'Prędkość wiatru [m/s]']

    fig, axs = plt.subplots(4, figsize=figsize)
    fig.suptitle('Porównanie wyników modelu z faktycznym stanem pogody')
    for i in range(4):
        axs[i].plot(x1, X_true[idx + 1, :, i], label='Dane wejściowe', color='green')
        axs[i].plot(x2, y_true[:, i::4][idx], label='Wartość prawdziwa', color='blue')
        axs[i].plot(x2, y_pred[:, i::4][idx], label='Predykcja', color='red')
        axs[i].set_ylabel(labels[i])
        axs[i].grid()
        axs[i].legend()
    axs[3].set_xlabel('Czas [godz.]')
    plt.show()
    
def MSE(y_true: np.ndarray, y_pred: np.ndarray):
    T_mse = np.mean((y_true[:, 0::4] - y_pred[:, 0::4])**2, axis=0).reshape(1, -1)
    P_mse = np.mean((y_true[:, 1::4] - y_pred[:, 1::4])**2, axis=0).reshape(1, -1)
    WD_mse = np.mean((y_true[:, 2::4] - y_pred[:, 2::4])**2, axis=0).reshape(1, -1)
    WV_mse = np.mean((y_true[:, 3::4] - y_pred[:, 3::4])**2, axis=0).reshape(1, -1)
    MSE_by_hour = np.append(np.append(np.append(T_mse, P_mse, axis=0), WD_mse, axis=0), WV_mse, axis=0)
    
    return MSE_by_hour, np.mean(MSE_by_hour, axis=1), np.mean(MSE_by_hour)

In [4]:
data = pd.read_csv('../data/preprocessed_data/complete_krk_2017-22.csv')
data['timestamp'] = pd.to_datetime(data['timestamp'])
data = data.set_index('timestamp')
data_6h = data[data.index.hour.isin([0, 6, 12, 18])]
data_6h = data.drop('timestamp', axis=1)
data_6h = wind_direction_sin_cos(data_6h)
data_6h

Unnamed: 0,Temperatura powietrza [°C]/1,Ciśnienie na poziomie stacji [hPa],Ciśnienie na poziomie morza [hPa],Ciśnienie pary wodnej [hPa],Widzialność operatora [m],Zachmurzenie ogólne [oktanty],Zachmurzenie niskie [oktanty],Chmury CL [kod],Chmury CM [kod],Chmury CH [kod],...,Wilgotność względna [%],Niedosyt wilgotności [hPa],Wystąpienie rosy [0/1],Wartość tendencji [wartość],Opad godzinowy [mm],Rodzaj opadu za 6 godzin [kod],Wysokość pokrywy śnieżnej [cm],Gatunek śniegu [kod],Kierunek wiatru [sin],Kierunek wiatru [cos]
0,-6.8,995.8,1027.1,3.5,3500,0,0,0,0,0,...,95,0.2,0,-1.5,0.0,0.0,0.0,0,-0.052336,0.998630
1,-6.9,995.6,1026.9,3.5,3500,0,0,0,0,0,...,95,0.2,0,-1.4,0.0,0.0,0.0,0,0.990268,-0.139173
2,-3.9,994.9,1025.8,3.8,5000,0,0,0,0,0,...,83,0.8,0,-1.4,0.0,0.0,0.0,0,-0.422618,-0.906308
3,-6.5,994.1,1025.3,3.5,4000,0,0,0,0,0,...,92,0.3,0,-1.7,0.0,0.0,0.0,0,-0.515038,-0.857167
4,-6.1,993.6,1024.8,3.3,4000,0,0,0,0,0,...,85,0.6,0,-2.0,0.0,0.0,0.0,0,0.207912,-0.978148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52579,8.3,991.2,1020.7,9.5,20000,7,0,0,0,0,...,87,1.4,0,0.7,0.0,0.0,0.0,0,-0.939693,-0.342020
52580,9.1,991.4,1020.8,9.2,20000,7,0,0,0,0,...,80,2.3,0,0.5,0.0,0.0,0.0,0,-0.829038,-0.559193
52581,8.4,991.6,1021.1,9.0,20000,7,0,0,0,0,...,82,2.0,0,0.5,0.0,0.0,0.0,0,-0.342020,-0.939693
52582,9.3,991.6,1021.0,8.7,20000,7,0,0,0,0,...,74,3.0,0,0.4,0.0,0.0,0.0,0,-0.990268,-0.139173


In [5]:
cols = data_6h.columns
new_order = [0, 1, 21, 22, 11] + list(range(2, 11)) + list(range(12, 21))
cols = [cols[i] for i in new_order]
data_6h = data_6h[cols]
data

Unnamed: 0,Temperatura powietrza [°C]/1,Ciśnienie na poziomie stacji [hPa],Kierunek wiatru [sin],Kierunek wiatru [cos],Prędkość wiatru [m/s],Ciśnienie na poziomie morza [hPa],Ciśnienie pary wodnej [hPa],Widzialność operatora [m],Zachmurzenie ogólne [oktanty],Zachmurzenie niskie [oktanty],...,Wysokość podstawy chmur CL CM szyfrowana [kod],Temperatura punktu rosy [°C],Wilgotność względna [%],Niedosyt wilgotności [hPa],Wystąpienie rosy [0/1],Wartość tendencji [wartość],Opad godzinowy [mm],Rodzaj opadu za 6 godzin [kod],Wysokość pokrywy śnieżnej [cm],Gatunek śniegu [kod]
0,-6.8,995.8,-0.052336,0.998630,1,1027.1,3.5,3500,0,0,...,9,-7.5,95,0.2,0,-1.5,0.0,0.0,0.0,0
1,-6.9,995.6,0.990268,-0.139173,1,1026.9,3.5,3500,0,0,...,9,-7.6,95,0.2,0,-1.4,0.0,0.0,0.0,0
2,-3.9,994.9,-0.422618,-0.906308,3,1025.8,3.8,5000,0,0,...,9,-6.4,83,0.8,0,-1.4,0.0,0.0,0.0,0
3,-6.5,994.1,-0.515038,-0.857167,2,1025.3,3.5,4000,0,0,...,9,-7.6,92,0.3,0,-1.7,0.0,0.0,0.0,0
4,-6.1,993.6,0.207912,-0.978148,1,1024.8,3.3,4000,0,0,...,9,-8.2,85,0.6,0,-2.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52579,8.3,991.2,-0.939693,-0.342020,3,1020.7,9.5,20000,7,0,...,9,6.3,87,1.4,0,0.7,0.0,0.0,0.0,0
52580,9.1,991.4,-0.829038,-0.559193,3,1020.8,9.2,20000,7,0,...,9,5.8,80,2.3,0,0.5,0.0,0.0,0.0,0
52581,8.4,991.6,-0.342020,-0.939693,2,1021.1,9.0,20000,7,0,...,9,5.5,82,2.0,0,0.5,0.0,0.0,0.0,0
52582,9.3,991.6,-0.990268,-0.139173,2,1021.0,8.7,20000,7,0,...,9,4.9,74,3.0,0,0.4,0.0,0.0,0.0,0


In [6]:
# Assuming 'data' is your dataset with shape (total_hours, 26_features)
# and 'total_hours' is a multiple of 24

# Normalize your data
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data_6h)
data_normalized

array([[-1.84487084,  0.96334401,  0.20771005, ..., -0.6736711 ,
        -0.15217187, -0.28491282],
       [-1.85603007,  0.93829589,  1.79956292, ..., -0.6736711 ,
        -0.15217187, -0.28491282],
       [-1.52125303,  0.85062749, -0.35763876, ..., -0.6736711 ,
        -0.15217187, -0.28491282],
       ...,
       [-0.14866714,  0.43733359, -0.23458116, ..., -0.6736711 ,
        -0.15217187, -0.28491282],
       [-0.04823402,  0.43733359, -1.22432923, ..., -0.6736711 ,
        -0.15217187, -0.28491282],
       [-0.40532954,  0.42480953,  1.53830215, ..., -0.6736711 ,
        -0.15217187, -0.28491282]])

In [7]:
# Create sequences
X, y = create_sequences(data_normalized)

In [8]:
train_size = 4 * (365 * 3 + 366)  # train data includes first 4 years (2017-20)
val_size = 4 * 365                # validation data includes the next 2021 year  
test_size = 4 * 365               # test data includes the last 2022 year

X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:train_size+val_size], y[train_size:train_size+val_size]
X_test, y_test = X[train_size+val_size:], y[train_size+val_size:]

In [9]:
input_layer = Input(shape=(SEQUENCE_LEN, INPUT_FEATURES))  # 21 days * 4 hours (0,6,12,18), 23 parameters
reshape = Reshape((INPUT_FEATURES, SEQUENCE_LEN)) (input_layer)
# Downsampling
conv1 = Conv1D(filters=24, kernel_size=7, activation='relu', strides=1, padding="causal") (reshape)

lstm1 = Bidirectional(LSTM(84, return_sequences=True))(conv1)
lstm2 = Bidirectional(LSTM(42, return_sequences=False))(lstm1)
dense1 = Dense(60, activation='ReLU') (lstm2)
output_layer = Dense(60, activation='linear') (dense1)

# Build the model
model = Model(inputs=input_layer, outputs=output_layer)

model.compile(optimizer='adam', loss='mean_squared_error')  # Adjust optimizer and loss as needed
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 120, 96)           46080     
                                                                 
 lstm_1 (LSTM)               (None, 172)               185072    
                                                                 
 dense (Dense)               (None, 128)               22144     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 120)               15480     
                                                                 
Total params: 268776 (1.03 MB)
Trainable params: 268776 (1.03 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [None]:
model.save('LSTM_07-01-2024_final')

In [None]:

from keras.models import load_model

model = load_model('LSTM_07-01-2024_final')


In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
inv_y_pred = invert_scale_for_y(y_pred, scaler)
real_y_pred = back_to_degrees_for_y(inv_y_pred)

inv_y_test = invert_scale_for_y(y_test, scaler)
real_y_test = back_to_degrees_for_y(inv_y_test)

inv_X_test = invert_scale_for_X(X_test, scaler)
real_X_test = back_to_degrees_for_X(inv_X_test)

In [None]:
plot_one_prediction(700, real_X_test, real_y_test, real_y_pred)

In [None]:
MSE(real_y_test, real_y_pred)[0]

In [None]:
MSE(real_y_test, real_y_pred)[1]

In [None]:
X_test

In [None]:
inv_X_test