In [1]:
import pandas as pd

# Location of the raw measurement summary, relative to the notebook's working directory.
file_path = "Measurement_summary.csv"

# Read the whole CSV into a DataFrame; later cells derive features from `data`.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell's rich output.
data.head(5)

Unnamed: 0,Measurement date,Station code,Address,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5
0,2017-01-01 00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0
1,2017-01-01 01:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.058,0.002,1.2,71.0,59.0
2,2017-01-01 02:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,59.0
3,2017-01-01 03:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,58.0
4,2017-01-01 04:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.003,0.051,0.002,1.2,69.0,61.0


In [6]:
from sklearn.preprocessing import MinMaxScaler

# Parse the timestamp column so the `.dt` accessor can be used, then derive
# calendar features from it.
data["Measurement date"] = pd.to_datetime(data["Measurement date"])
data["Hour"] = data["Measurement date"].dt.hour
data["Day of Week"] = data["Measurement date"].dt.dayofweek
data["Month"] = data["Measurement date"].dt.month

pollution_columns = ["SO2", "NO2", "O3", "CO", "PM10", "PM2.5"]

# Min-max scale the pollutant readings in place.
# The fitted scaler stays bound to `scaler` (the original chained assignment
# overwrote it with the transformed ndarray), so predictions can later be
# mapped back to physical units with `scaler.inverse_transform(...)`.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed further down, so test-period min/max values influence the
# scaling. Consider fitting on the training portion only.
scaler = MinMaxScaler()
data[pollution_columns] = scaler.fit_transform(data[pollution_columns])

# Keep only the columns used downstream: identifiers, location, calendar
# features and the scaled pollutant readings.
organized_data = data[["Measurement date", "Station code", "Latitude", "Longitude", "Hour", "Day of Week", "Month"] + pollution_columns]

# Persist the prepared table (without the index) for the modelling cells.
organized_data.to_csv("organized_air_pollution_data.csv", index=False)

In [None]:
import numpy as np
import pandas as pd

# Reload the prepared table written by the preprocessing cell.
file_path = "./organized_air_pollution_data.csv"
data = pd.read_csv(file_path)

pollution_columns = ["SO2", "NO2", "O3", "CO", "PM10", "PM2.5"]

# Number of consecutive rows fed to the model for each prediction.
sequence_length = 24

# Select the pollutant columns once, outside the loop. Indexing the DataFrame
# with a column list on every iteration copies the whole subset each time,
# which makes window construction needlessly quadratic.
pollution_values = data[pollution_columns].to_numpy()

n_windows = len(pollution_values) - sequence_length
if n_windows <= 0:
    raise ValueError(
        f"Need more than {sequence_length} rows to build sequences, got {len(pollution_values)}"
    )

# NOTE(review): windows are taken over the rows in file order. The table has a
# "Station code" column, so if it holds several stations stacked one after
# another, some windows will straddle two stations - confirm, and consider
# building windows per station.

# X[i] holds rows i .. i+sequence_length-1; y[i] is the row right after that window.
X = np.stack([pollution_values[i:i + sequence_length] for i in range(n_windows)])
y = pollution_values[sequence_length:].copy()

print(f"Input shapes: {X.shape}, Target shape: {y.shape}")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the windows for testing. shuffle=False keeps the original
# row order, so the split is a contiguous cut rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# One input feature and one output value per pollutant column.
n_features = len(pollution_columns)

# Single LSTM layer that emits only its final hidden state, followed by a
# dense layer producing one value per pollutant for the next time step.
# The keyword is `return_sequences` (plural); the misspelled
# `return_sequence` is rejected by Keras as an unknown argument.
model = Sequential([
    LSTM(50, return_sequences=False, input_shape=(sequence_length, n_features)),
    Dense(n_features),
])

# Mean squared error as the training objective; mean absolute error is
# tracked as an additional metric.
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# validation_split=0.1 sets aside a tenth of the training arrays for
# validation during fitting.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

In [None]:
# Score the trained model on the held-out windows. The result is unpacked
# into the loss and the single extra metric.
evaluation = model.evaluate(X_test, y_test)
test_loss, test_mae = evaluation
print(f"Test loss: {test_loss}, Test MAE: {test_mae}")

# Predictions for every test window, kept for the plotting cells below.
y_pred = model.predict(X_test)

In [None]:
import matplotlib.pyplot as plt

# Column 5 of the target/prediction arrays is the one labelled PM2.5 here
# (last entry of the pollutant column list used to build the sequences).
pm25_index = 5

# Draw on the current axes, exactly as the implicit pyplot calls would.
ax = plt.gca()
ax.plot(y_test[:, pm25_index], label="True PM2.5")
ax.plot(y_pred[:, pm25_index], label="Predicted PM2.5")
ax.legend()
ax.set_title("True vs Predicted PM2.5")
plt.show()

In [None]:
# One figure per pollutant: true series against the model's prediction.
for col_idx, pollutant in zip(range(len(pollution_columns)), pollution_columns):
    plt.figure()  # start a fresh figure so each pollutant gets its own plot
    ax = plt.gca()
    ax.plot(y_test[:, col_idx], label=f"True {pollutant}")
    ax.plot(y_pred[:, col_idx], label=f"Predicted {pollutant}")
    ax.legend()
    ax.set_title(f"True vs Predicted {pollutant}")
    plt.show()

In [None]:
# Expand "Station code" into one indicator column per station.
data_encoded = pd.get_dummies(data, columns=["Station code"])

# Every column except the timestamp, in the sorted order that
# Index.difference returns.
# NOTE(review): this rebinds `pollution_columns` to ALL non-date columns
# (location, calendar features and station indicators included), not just
# pollutants. Re-running the per-pollutant plotting cell after this one would
# then index past the six model outputs - consider a separate name such as
# `feature_columns`.
non_date_columns = data_encoded.columns.difference(["Measurement date"])
pollution_columns = non_date_columns.tolist()

In [None]:
# Write the trained model to disk in the notebook's working directory.
model_path = "multi_output_lstm_model.h5"
model.save(model_path)