In [1]:
from keras.metrics import mean_squared_error
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
# Constants
INPUT_FEATURES = 26                # Number of input features (e.g., temperature, humidity, etc.)
SEQUENCE_LEN = 7 * 24              # 7 days of hourly data
OUTPUT_LEN = 1 * 24                # 24 hours in the next day
OUTPUT_FEATURES = 3 * OUTPUT_LEN   # Predicting 3 features for each day
LSTM_UNITS = 128                   # Number of LSTM units
DENSE_UNITS = 128                   # Number of Dense layer units
DROPOUT_RATE = 0.2                 # Dropout rate

In [3]:
data = pd.read_csv('../data/preprocessed_data/complete_krk_2017-22.csv')
data = data.drop('timestamp', axis=1)
cols = data.columns

In [4]:
# Assuming 'data' is your dataset with shape (total_hours, 26_features)
# and 'total_hours' is a multiple of 24

# Normalize your data
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data)
data_normalized

array([[-1.84487084,  0.96334401, -0.13402178, ..., -0.15217187,
        -0.12556201, -0.28491282],
       [-1.85603007,  0.93829589, -0.13402178, ..., -0.15217187,
        -0.12556201, -0.28491282],
       [-1.52125303,  0.85062749, -0.13402178, ..., -0.15217187,
        -0.12556201, -0.28491282],
       ...,
       [-0.14866714,  0.43733359, -0.13402178, ..., -0.15217187,
        -0.12556201, -0.28491282],
       [-0.04823402,  0.43733359, -0.13402178, ..., -0.15217187,
        -0.12556201, -0.28491282],
       [-0.40532954,  0.42480953, -0.13402178, ..., -0.15217187,
        -0.12556201, -0.28491282]])

In [5]:
# Function to create sequences
def create_sequences(data):
    X, y = [], []
    for i in range(len(data) - SEQUENCE_LEN - OUTPUT_LEN):
        X.append(data[i: i + SEQUENCE_LEN])
        y.append(data[i + SEQUENCE_LEN: i + SEQUENCE_LEN + OUTPUT_LEN, :3].flatten()) # Taking the first 3 features for the next 24 hours
    return np.array(X), np.array(y)

# Create sequences
X, y = create_sequences(data_normalized)

In [6]:
train_size = int(len(X) * 0.7)
val_size = int(len(X) * 0.15)

X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:train_size+val_size], y[train_size:train_size+val_size]
X_test, y_test = X[train_size+val_size:], y[train_size+val_size:]

In [8]:
# Define the model
model = Sequential()

# LSTM layers
model.add(LSTM(LSTM_UNITS, return_sequences=True, input_shape=(SEQUENCE_LEN, INPUT_FEATURES)))
model.add(LSTM(LSTM_UNITS, return_sequences=False))
# Dropout layer to prevent overfitting
model.add(Dropout(DROPOUT_RATE))
# Dense layers
model.add(Dense(DENSE_UNITS, activation='sigmoid'))
# Output layer
model.add(Dense(OUTPUT_FEATURES, activation='linear'))  # 'linear' activation for regression tasks
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 168, 128)          79360     
                                                                 
 lstm_1 (LSTM)               (None, 512)               1312768   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                                 
 dense_1 (Dense)             (None, 72)                9288      
                                                                 
Total params: 1467080 (5.60 MB)
Trainable params: 1467080 (5.60 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

 116/1147 [==>...........................] - ETA: 23:03 - loss: 0.5559

KeyboardInterrupt: 

In [None]:
model.save('LSTM_27-12-2023_improved')

In [None]:
from keras.models import load_model

restored_model = load_model('LSTM_27-12-2023_improved')
history = model.history

In [None]:
loss = history.history["loss"]
epochs = range(len(loss))
plt.figure()
plt.plot(epochs, loss, "b", label="Training loss")
plt.title("Training Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.show()

In [None]:
predictions = model.predict(X_val)

# Calculate MSE
mse = mean_squared_error(y_val, predictions).numpy()
print("Mean Squared Error (MSE):", mse)

# Calculate RMSE
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
IDX = 200  # choose number of prediction

P = 1      # parameter (0 - Temp, 1 - pressure, 2 - rain)

x = range(0, SEQUENCE_LEN + OUTPUT_LEN)
x1, x2 = x[:SEQUENCE_LEN], x[SEQUENCE_LEN:]
std, mean = scaler.scale_[P], scaler.mean_[P]

# Plotting
plt.figure(figsize=(15, 5))

plt.plot(x1, X_test[IDX, :, P] * std + mean, label='Given data', color='green')
plt.plot(x2, y_test[:, P::3][IDX] * std + mean, label='Actual', color='blue')
plt.plot(x2, y_pred[:, P::3][IDX] * std + mean, label='Predicted', color='red')

# Labeling the plot
plt.title('Weather Forecast: Actual vs Predicted')
plt.xlabel('Time (Hours)')
if P == 0:
    plt.ylabel('Temperature [C]')
elif P == 1:
    plt.ylabel('Pressure [hPa]')
else:
    plt.ylabel('Rain [mm]')
plt.legend()
plt.show()