In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


In [3]:
df=pd.read_csv("C:\\Users\\Dell\\Downloads\\pollution.csv")

In [4]:
df.head()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [5]:
df.tail()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
43819,43820,2014,12,31,19,8.0,-23,-2.0,1034.0,NW,231.97,0,0
43820,43821,2014,12,31,20,10.0,-22,-3.0,1034.0,NW,237.78,0,0
43821,43822,2014,12,31,21,10.0,-22,-3.0,1034.0,NW,242.7,0,0
43822,43823,2014,12,31,22,8.0,-22,-4.0,1034.0,NW,246.72,0,0
43823,43824,2014,12,31,23,12.0,-21,-3.0,1034.0,NW,249.85,0,0


In [6]:
df.columns

Index(['No', 'year', 'month', 'day', 'hour', 'pm2.5', 'DEWP', 'TEMP', 'PRES',
       'cbwd', 'Iws', 'Is', 'Ir'],
      dtype='object')

In [7]:
df.dtypes

No         int64
year       int64
month      int64
day        int64
hour       int64
pm2.5    float64
DEWP       int64
TEMP     float64
PRES     float64
cbwd      object
Iws      float64
Is         int64
Ir         int64
dtype: object

In [8]:
df.isnull

<bound method DataFrame.isnull of           No  year  month  day  hour  pm2.5  DEWP  TEMP    PRES cbwd     Iws  \
0          1  2010      1    1     0    NaN   -21 -11.0  1021.0   NW    1.79   
1          2  2010      1    1     1    NaN   -21 -12.0  1020.0   NW    4.92   
2          3  2010      1    1     2    NaN   -21 -11.0  1019.0   NW    6.71   
3          4  2010      1    1     3    NaN   -21 -14.0  1019.0   NW    9.84   
4          5  2010      1    1     4    NaN   -20 -12.0  1018.0   NW   12.97   
...      ...   ...    ...  ...   ...    ...   ...   ...     ...  ...     ...   
43819  43820  2014     12   31    19    8.0   -23  -2.0  1034.0   NW  231.97   
43820  43821  2014     12   31    20   10.0   -22  -3.0  1034.0   NW  237.78   
43821  43822  2014     12   31    21   10.0   -22  -3.0  1034.0   NW  242.70   
43822  43823  2014     12   31    22    8.0   -22  -4.0  1034.0   NW  246.72   
43823  43824  2014     12   31    23   12.0   -21  -3.0  1034.0   NW  249.85   

     

In [9]:
df.isnull().sum()


No          0
year        0
month       0
day         0
hour        0
pm2.5    2067
DEWP        0
TEMP        0
PRES        0
cbwd        0
Iws         0
Is          0
Ir          0
dtype: int64

In [10]:
df['pm2.5'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['pm2.5'].fillna(0, inplace=True)


In [11]:
df = pd.get_dummies(df)


In [12]:
#Normalize and Prepare Supervised Dataset

In [13]:
def to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = data.shape[1]
    df = pd.DataFrame(data)
    cols, names = [], []

    # Input (t-1 to t-n)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [f'var{j+1}(t-{i})' for j in range(n_vars)]

    # Output (t)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        names += [f'var{j+1}(t+{i})' for j in range(n_vars)]

    agg = pd.concat(cols, axis=1)
    agg.columns = names

    if dropnan:
        agg.dropna(inplace=True)
    return agg


In [14]:
#Normalize and Transform

In [15]:
# Normalize values
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df)

# Frame as supervised
reframed = to_supervised(scaled, n_in=1, n_out=1)

# Only keep input vars and pm2.5 as output
n_features = df.shape[1]
reframed = reframed.iloc[:, :n_features + 1]


In [16]:
#Train-Test Split and Reshape


In [17]:
values = reframed.values

# 1 year of data for training
n_train = 365 * 24

train = values[:n_train, :]
test = values[n_train:, :]

# Split into input and output
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

# Reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(train_X.shape, test_X.shape)


(8760, 1, 16) (35063, 1, 16)


In [18]:
#Define and Train LSTM Model

In [None]:
# Define model
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')

# Fit model
history = model.fit(train_X, train_y, epochs=50, batch_size=72,
                    validation_data=(test_X, test_y), verbose=2, shuffle=False)


Epoch 1/50
122/122 - 8s - loss: 0.0085 - val_loss: 0.4101 - 8s/epoch - 68ms/step
Epoch 2/50
122/122 - 2s - loss: 0.0186 - val_loss: 0.4253 - 2s/epoch - 16ms/step
Epoch 3/50
122/122 - 2s - loss: 0.0097 - val_loss: 0.4355 - 2s/epoch - 16ms/step
Epoch 4/50
122/122 - 2s - loss: 0.0065 - val_loss: 0.4300 - 2s/epoch - 16ms/step
Epoch 5/50
122/122 - 2s - loss: 0.0085 - val_loss: 0.4442 - 2s/epoch - 15ms/step
Epoch 6/50
122/122 - 2s - loss: 0.0051 - val_loss: 0.4466 - 2s/epoch - 16ms/step
Epoch 7/50
122/122 - 2s - loss: 0.0029 - val_loss: 0.4450 - 2s/epoch - 15ms/step
Epoch 8/50
122/122 - 2s - loss: 0.0050 - val_loss: 0.4518 - 2s/epoch - 15ms/step
Epoch 9/50
122/122 - 2s - loss: 0.0057 - val_loss: 0.4560 - 2s/epoch - 16ms/step
Epoch 10/50
122/122 - 2s - loss: 0.0038 - val_loss: 0.4516 - 2s/epoch - 15ms/step
Epoch 11/50
122/122 - 2s - loss: 0.0051 - val_loss: 0.4515 - 2s/epoch - 16ms/step
Epoch 12/50
122/122 - 1s - loss: 0.0027 - val_loss: 0.4585 - 1s/epoch - 12ms/step
Epoch 13/50
122/122 - 1s 

In [None]:
#Plot Training History

In [None]:
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('MAE Loss')
plt.show()


In [None]:
#Predict and Invert Scaling

In [None]:
# Predict
yhat = model.predict(test_X)

# Reshape test_X for inverse transformation
test_X_reshaped = test_X.reshape((test_X.shape[0], test_X.shape[2]))

# Invert scaling for predicted
inv_yhat = np.concatenate((yhat, test_X_reshaped[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, 0]

# Invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y, test_X_reshaped[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:, 0]


In [None]:
# Evaluate Performance



In [None]:
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print(f'Test RMSE: {rmse:.3f}')


In [None]:
 #Plot Predictions vs Actual



In [None]:
plt.figure(figsize=(10,5))
plt.plot(inv_y[:200], label='Actual PM2.5')
plt.plot(inv_yhat[:200], label='Predicted PM2.5')
plt.xlabel('Hours')
plt.ylabel('PM2.5 Concentration')
plt.title('Actual vs Predicted PM2.5 (first 200 hours)')
plt.legend()
plt.show()
