In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("../data/final/master_dataset.csv")
df["dateTime"] = pd.to_datetime(df["dateTime"])
df = df.sort_values("dateTime")
df.head()


Unnamed: 0,dateTime,TN,TP,NH3,NO23,OP,SSC
0,1978-06-05,2.51,0.05,0.02,1.8,0.05,247.0
1,1978-07-12,2.13,0.19,0.02,1.3,0.05,359.0
2,1978-08-15,2.47,0.34,0.17,1.5,0.05,367.0
3,1978-09-14,2.4,0.16,0.02,0.6,0.05,159.0
4,1978-10-23,1.98,0.2,0.02,1.3,0.05,196.0


In [3]:
tn_series = df["TN"].values


In [4]:
def create_sequences(data, window_size=5):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

X, y = create_sequences(tn_series, window_size=5)

print(X.shape, y.shape)


(536, 5) (536,)


In [6]:
X = X.reshape((X.shape[0], X.shape[1], 1))
print(X.shape)


(536, 5, 1)


In [5]:
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


In [8]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test  = X_test.reshape(X_test.shape[0],  X_test.shape[1], 1)


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),

    LSTM(32),
    Dropout(0.2),

    Dense(1)
])

model.compile(
    optimizer="adam",
    loss="mse"
)

model.summary()


  if not hasattr(np, "object"):
  super().__init__(**kwargs)


In [10]:
history = model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=16,
    validation_data=(X_test, y_test)
)


Epoch 1/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 46ms/step - loss: 1.8259 - val_loss: 0.4616
Epoch 2/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.5478 - val_loss: 0.2667
Epoch 3/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.5049 - val_loss: 0.2623
Epoch 4/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.4877 - val_loss: 0.2686
Epoch 5/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - loss: 0.4943 - val_loss: 0.2633
Epoch 6/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.4840 - val_loss: 0.2619
Epoch 7/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.4714 - val_loss: 0.2550
Epoch 8/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.4766 - val_loss: 0.2388
Epoch 9/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━

In [11]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("LSTM RMSE:", rmse)
print("LSTM R2:", r2)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 180ms/step
LSTM RMSE: 0.43427792535225584
LSTM R2: 0.35324435257046116
