In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional, Conv1D
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error

In [2]:
# Load the dataset
url = 'https://drive.google.com/file/d/1hPxlpOFPBruaCOI61U0xeb_ZkWkAEB3v/view?usp=share_link'
file_id = url.split('/')[-2]
read_url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(read_url)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14245 entries, 0 to 14244
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   maxtemp  14245 non-null  float64
 1   10u      14245 non-null  float64
 2   10v      14245 non-null  float64
 3   msl      14242 non-null  float64
 4   msnlwrf  14245 non-null  float64
 5   r        14245 non-null  float64
 6   ssr      14245 non-null  float64
dtypes: float64(7)
memory usage: 779.1 KB


Unnamed: 0,maxtemp,10u,10v,msl,msnlwrf,r,ssr
0,28.3200,-2.319565,-2.542244,101485.0561,-52.747184,73.319923,615010.8901
1,28.6500,-2.674598,-2.393440,101509.3420,-59.458288,67.202117,660833.8671
2,28.9900,-2.138307,-2.595200,101537.1517,-64.906373,68.976777,682528.1304
3,29.0900,-1.597971,-2.381015,101514.9530,-71.485442,70.842022,706449.4340
4,28.5500,-1.421151,-2.072065,101446.2626,-72.431924,74.168354,734606.9326
...,...,...,...,...,...,...,...
14240,29.6627,-2.407835,-3.229961,101341.4271,-73.118495,67.071725,681543.8981
14241,29.3466,-2.287320,-2.818933,101321.7717,-56.748524,60.809039,610582.9552
14242,29.8813,-2.167211,-2.782063,101339.4173,-53.864452,68.524707,525189.1380
14243,29.7621,-1.571721,-2.729545,101265.1718,-60.976579,78.350814,598660.3258


In [3]:
# Check for missing values
print(df.isnull().sum())

# Drop the missing values
df.dropna(inplace=True)


maxtemp    0
10u        0
10v        0
msl        3
msnlwrf    0
r          0
ssr        0
dtype: int64


In [4]:
# Scale the data
scaler = MinMaxScaler()
temp = df['maxtemp'].values.reshape(-1,1)
temp_scaled = scaler.fit_transform(temp)

In [5]:
train_split= 0.8
split_idx = int(len(df) * 0.8)
training_set = df.iloc[:,0:4].values
training_set = df[:split_idx].values
test_set = df[split_idx:].values

In [6]:
# 5-day prediction using 30 days data
x_train = []
y_train = []
n_future = 5 #Next 5 days rainfall forecast
n_past = 30 #Past 30 days
for i in range(0, len(training_set) - n_past - n_future + 1):
    x_train.append(training_set[i : i + n_past, 0])
    y_train.append(training_set[i + n_past : i + n_past + n_future, 0])
x_train, y_train = np.array(x_train), np.array(y_train)

x_test = []
y_test = []
for i in range(n_past, len(test_set) - n_future + 1):
    x_test.append(test_set[i - n_past:i, 0])
    y_test.append(test_set[i:i + n_future, 0])
x_test, y_test = np.array(x_test), np.array(y_test)


In [7]:
# Reshape the input data to LSTM format
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

In [8]:
# Add an additional dimension to the input data for Conv1D layer
x_train = np.expand_dims(x_train, axis=2)
x_test = np.expand_dims(x_test, axis=2)

In [9]:
# Define the model
model = Sequential()
model.add(Conv1D(filters=256, kernel_size=2, activation='relu',input_shape = (x_train.shape[1], 1)))
model.add(Conv1D(filters=128, kernel_size=2, activation='relu'))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=30, return_sequences=True)))
model.add(LSTM(units= 30, return_sequences=True))
model.add(LSTM(units= 30, return_sequences=True))
model.add(LSTM(units= 30))
model.add(Dropout(0.2))
model.add(Dense( units= 5 ,activation='relu'))
model.compile(loss='mean_squared_error',optimizer='adam', metrics=['accuracy'])

# Train the model
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
history = model.fit(x_train, y_train, epochs=500, batch_size=32, validation_data=(x_test, y_test), callbacks=[early_stop])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 24: early stopping


In [10]:
x_test = test_set[: n_past, 0]
y_test = test_set[n_past : n_past + n_future, 0]
x_test, y_test = np.array(x_test), np.array(y_test)
x_test = np.reshape(x_test, (1, x_test.shape[0], 1))
predicted_temperature = model.predict(x_test)
print('Predicted temperature {}'.format(predicted_temperature))
print('Real temperature {}'.format(y_test))

# Print the heatwave array
heatwaves = []
for i in range(len(predicted_temperature[0])):
    if predicted_temperature[0][i] >= y_test[i]:
        heatwaves.append(1)
    else:
        heatwaves.append(0)

print("Heatwave" + str(heatwaves))

total_accuracy = 0
for i in range(n_future):
    accuracy = 100 - abs((predicted_temperature[0][i] - y_test[i]) / y_test[i]) * 100
    total_accuracy += accuracy
    print("Day {}: {:.2f}%".format(i+1, accuracy))

overall_accuracy = total_accuracy / n_future
print("Overall Accuracy: {:.2f}%".format(overall_accuracy))


Predicted temperature [[33.41237  33.417213 33.43283  33.410736 33.44559 ]]
Real temperature [38.48 37.82 36.06 37.46 36.99]
Heatwave[0, 0, 0, 0, 0]
Day 1: 86.83%
Day 2: 88.36%
Day 3: 92.71%
Day 4: 89.19%
Day 5: 90.42%
Overall Accuracy: 89.50%
