#  STEP 3: Feature Engineering 

## Drop unnecessary variables

In [1]:
# Remove the row counter and the calendar/season columns in place; they are not
# among the feature columns named after scaling.
unused_columns = ['No', 'year', 'day', 'hour', 'month', 'Season']
data.drop(columns=unused_columns, inplace=True)

NameError: name 'data' is not defined

## Building ColumnTransformer

In [None]:
# Column-wise preprocessing:
#   * 'Wind_direction' goes through CountFrequencyEncoder,
#   * 'Pollution' goes through KNNImputer,
#   * every other column is passed through unchanged.
# NOTE(review): the imputer is given only the 'Pollution' column, so it has no
# other features to measure neighbour distance with -- confirm this is intended.
column_steps = [
    ("encoder", CountFrequencyEncoder(), ['Wind_direction']),
    ("imputer", KNNImputer(), ['Pollution']),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [None]:
# Fit the column-wise preprocessing on the whole frame and apply it in one pass.
# The result is handled positionally from here on: it is wrapped in a DataFrame
# next, and column names are only re-attached after scaling.
data_t = preprocessor.fit_transform(data)

In [None]:
# Wrap the preprocessed values in a DataFrame (default integer column labels for
# now) and display it for a quick visual check.
data_transformed = pd.DataFrame(data=data_t)
data_transformed

## Scaling

In [None]:
# Scaler for all feature columns. It is fitted in the next cell, and the same
# fitted instance is reused later to map predictions back to original units.
scaler = RobustScaler()

In [None]:
# Column labels in the order the preprocessor emits them: the two transformed
# columns first, then the passed-through ones.
feature_names = ['Wind_direction', 'Pollution', 'Dew_point', 'Temprerature', 'Pressure', 'Wind_speed', 'Snow', 'Rain']

# Scale every column, then rebuild the frame with readable column names and the
# original row index of `data`.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# further down, so test-period statistics influence the scaling -- confirm this
# is acceptable.
data_t = scaler.fit_transform(data_transformed)
data_transformed = pd.DataFrame(data_t, columns=feature_names, index=data.index)
data_transformed

#  STEP 4: Splitting to train / test sets 

In [None]:
# The target for each row is the (scaled) pollution value of the following row.
# shift(-1) moves the series up by one, which leaves NaN in the last row.
next_row_pollution = data_transformed['Pollution'].shift(periods=-1)
data_transformed['Predicted_pollution'] = next_row_pollution
data_transformed.head()

In [None]:
# The final row has no following value to predict (its target is NaN after the
# shift), so remove it. `data` and `data_transformed` share the same index.
last_row_label = data.tail(1).index
data_transformed.drop(index=last_row_label, inplace=True)
data_transformed.tail()

In [None]:
# Chronological split: the first 365 * 24 * 3 rows (three years, assuming hourly
# rows) are used for training, everything after that is held out.
n_train_rows = 365 * 24 * 3
all_rows = data_transformed.values
train, test = all_rows[:n_train_rows], all_rows[n_train_rows:]

# The last column is the target ('Predicted_pollution'); the rest are features.
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

# The LSTM layers take 3-D input (samples, timesteps, features); each sample is
# given a single time step.
train_rows, train_width = train_X.shape
test_rows, test_width = test_X.shape
train_X = train_X.reshape(train_rows, 1, train_width)
test_X = test_X.reshape(test_rows, 1, test_width)

train_X.shape, train_y.shape, test_X.shape, test_y.shape

#  STEP 5: Predicting pollution rates for the next hour 

## Callbacks

In [None]:
# Both callbacks watch the validation loss and treat lower values as better.
watch_val_loss = {'monitor': 'val_loss', 'mode': 'min', 'verbose': 1}

# Write the weights to disk whenever the validation loss improves (checked once
# per epoch).
# NOTE(review): some Keras versions require weight-only checkpoint paths to end in
# '.weights.h5'; confirm the '.h5py' suffix works with the installed version.
checkpoint = ModelCheckpoint(
    filepath='./base/model_hours_2_layers_weights.h5py',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
    **watch_val_loss,
)

# Stop training once the validation loss has failed to improve by at least 0.001
# for 50 consecutive epochs.
earlyStop = EarlyStopping(min_delta=0.001, patience=50, **watch_val_loss)

callbacks = [checkpoint, earlyStop]

#  Train RNN Model with 2 hidden layers 

In [None]:
# Input shape per sample is (timesteps, features), taken from the training array.
n_timesteps, n_features = train_X.shape[1], train_X.shape[2]

# Two stacked LSTM layers followed by a single-unit regression head. The first
# LSTM returns the full sequence so that the second LSTM can consume it.
model_hours_2_layers = Sequential()
model_hours_2_layers.add(LSTM(100, return_sequences=True, input_shape=(n_timesteps, n_features)))
model_hours_2_layers.add(LSTM(100))
model_hours_2_layers.add(Dense(1))

model_hours_2_layers.compile(optimizer='adam', loss='mean_squared_error')

# NOTE(review): the test split is passed as validation data, so early stopping and
# checkpoint selection are driven by the test set -- confirm this is intended.
history_hours_2_layers = model_hours_2_layers.fit(
    x=train_X,
    y=train_y,
    validation_data=(test_X, test_y),
    batch_size=30,
    epochs=100,
    shuffle=False,  # IMPORTANT: keep the rows in time order
    callbacks=callbacks,
    verbose=1,
)

#  Result analysis 

## Loss plot

In [None]:
# Training loss vs. validation loss per epoch (the validation data is the test split).
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history_hours_2_layers.history[history_key], label=curve_label)
plt.legend(['train', 'test'])

## Making predictions

In [None]:
# Position of 'Pollution' in the matrix the scaler was fitted on (column order:
# Wind_direction, Pollution, Dew_point, ...). The target 'Predicted_pollution' is
# the scaled 'Pollution' column shifted by one row, so it has to be un-scaled with
# the Pollution statistics -- not with those of column 0 (Wind_direction).
POLLUTION_COL = 1


def _to_pollution_units(scaled_values, scaled_features):
    """Convert scaled pollution values back to the original pollution units.

    ``scaler`` can only invert complete feature rows, so the values are written
    into the Pollution column of a copy of ``scaled_features``, the rows are
    inverted, and the Pollution column is read back.

    Args:
        scaled_values: scaled pollution values, shape (n,) or (n, 1).
        scaled_features: 2-D scaled feature matrix with n rows, in the column
            layout the scaler was fitted on. It is not modified.

    Returns:
        1-D array of n pollution values in original units.
    """
    rows = np.array(scaled_features, dtype=float, copy=True)
    rows[:, POLLUTION_COL] = np.ravel(scaled_values)
    return scaler.inverse_transform(rows)[:, POLLUTION_COL]


# test set
yhat_test = model_hours_2_layers.predict(test_X)

# Drop the single-timestep axis: (samples, 1, features) -> (samples, features).
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))

inv_yhat_test = _to_pollution_units(yhat_test, test_X)

test_y = test_y.reshape(-1, 1)
inv_y_test = _to_pollution_units(test_y, test_X)

# RMSE as sqrt(MSE); avoids the deprecated `squared=False` argument.
rmse_test = np.sqrt(mean_squared_error(inv_y_test, inv_yhat_test))
print('Test RMSE: {:.3f}'.format(rmse_test))

# train set
yhat_train = model_hours_2_layers.predict(train_X)

train_X = train_X.reshape((train_X.shape[0], train_X.shape[2]))

inv_yhat_train = _to_pollution_units(yhat_train, train_X)

train_y = train_y.reshape(-1, 1)
inv_y_train = _to_pollution_units(train_y, train_X)

rmse_train = np.sqrt(mean_squared_error(inv_y_train, inv_yhat_train))
print('Train RMSE: {:.3f}'.format(rmse_train))

## Predicted vs. actual plot

In [None]:
# Column vectors of actual and predicted pollution (original units) per split.
#train
y_train = inv_y_train.reshape(-1, 1)
y_pred_train = inv_yhat_train.reshape(-1, 1)

#test
y_test = inv_y_test.reshape(-1, 1)
y_pred_test = inv_yhat_test.reshape(-1, 1)

# One figure per split, showing the first 200 samples of predicted vs. actual.
figure_specs = (
    ('Air pollution prediction (Train)', y_pred_train, y_train),
    ('Air pollution prediction (Test)', y_pred_test, y_test),
)
for figure_title, predicted, actual in figure_specs:
    plt.figure(figsize=(16, 8))
    plt.plot(predicted[:200, :], color='green', label='Predicted pollution level')
    plt.plot(actual[:200, :], color='red', label='Actual pollution level')
    plt.title(figure_title)
    plt.xlabel('Date')
    plt.ylabel('Pollution level')
    plt.legend()
    plt.show()

## R2 Score

In [None]:
# Coefficient of determination for each split, on the un-scaled values.
r2_inputs = (
    ('****Train****', inv_y_train, inv_yhat_train),
    ('****Test****', inv_y_test, inv_yhat_test),
)
for banner, actual, predicted in r2_inputs:
    print(banner)
    print('R2 score: {}'.format(r2_score(actual, predicted)))

#  Saving the model 


In [None]:
# Serialise the model architecture to JSON. The JSON string is kept in its own
# variable so that `model_hours_2_layers` still refers to the trained model
# afterwards and this cell can be re-run safely.
# NOTE(review): to_json() is documented to export the architecture only; the
# weights are written separately by the ModelCheckpoint callback.
model_hours_2_layers_json = model_hours_2_layers.to_json()
with open('model_hours_2_layers.json', 'w', encoding='utf-8') as json_file:
    json_file.write(model_hours_2_layers_json)

#  Train RNN Model with 4 hidden layers 

In [None]:
# Rebuild the split from scratch: the previous prediction cell reshaped the
# train/test arrays to 2-D, and the model below needs 3-D input again.
split_at = 365 * 24 * 3  # first three years, assuming hourly rows
all_rows = data_transformed.values
train = all_rows[:split_at]
test = all_rows[split_at:]

# Features are all columns but the last; the last column is the target.
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

# (samples, features) -> (samples, 1, features): one time step per sample.
train_X = train_X.reshape((len(train_X), 1, train_X.shape[1]))
test_X = test_X.reshape((len(test_X), 1, test_X.shape[1]))

train_X.shape, train_y.shape, test_X.shape, test_y.shape

## Callbacks

In [None]:
# Settings shared by both callbacks: track the validation loss, lower is better.
val_loss_tracking = dict(monitor='val_loss', mode='min', verbose=1)

# Keep only the best weights seen so far, evaluated at the end of every epoch.
# NOTE(review): some Keras versions require weight-only checkpoint paths to end in
# '.weights.h5'; confirm the '.h5py' suffix works with the installed version.
checkpoint = ModelCheckpoint(
    filepath='./base/model_hours_4_layers_weights.h5py',
    save_weights_only=True,
    save_best_only=True,
    save_freq='epoch',
    **val_loss_tracking,
)

# Give up after 50 epochs without an improvement of at least 0.001.
earlyStop = EarlyStopping(patience=50, min_delta=0.001, **val_loss_tracking)

callbacks = [checkpoint, earlyStop]

In [None]:
# Four stacked LSTM layers with dropout after each of the first three, followed
# by a single-unit regression head. Every LSTM except the last returns the full
# sequence so that the next LSTM can consume it.
sample_shape = (train_X.shape[1], train_X.shape[2])  # (timesteps, features)
layer_stack = [
    LSTM(100, return_sequences=True, input_shape=sample_shape),
    Dropout(0.3),
    LSTM(100, return_sequences=True),
    Dropout(0.3),
    LSTM(100, return_sequences=True),
    Dropout(0.3),
    LSTM(100),
    Dense(1),
]
model_hours_4_layers = Sequential(layer_stack)

model_hours_4_layers.compile(optimizer='adam', loss='mean_squared_error')

# NOTE(review): the test split is passed as validation data, so early stopping and
# checkpoint selection are driven by the test set -- confirm this is intended.
history_hours_4_layers = model_hours_4_layers.fit(
    x=train_X,
    y=train_y,
    validation_data=(test_X, test_y),
    batch_size=30,
    epochs=100,
    shuffle=False,  # IMPORTANT: keep the rows in time order
    callbacks=callbacks,
    verbose=1,
)

# ========================= Result analysis =========================

## Loss plot

In [None]:
# Per-epoch training and validation loss of the 4-layer model.
loss_curves = history_hours_4_layers.history
plt.plot(loss_curves['loss'], label='train')
plt.plot(loss_curves['val_loss'], label='test')
plt.legend(['train', 'test'])

## Making predictions

In [None]:
# Position of 'Pollution' in the matrix the scaler was fitted on (column order:
# Wind_direction, Pollution, Dew_point, ...). The target 'Predicted_pollution' is
# the scaled 'Pollution' column shifted by one row, so it has to be un-scaled with
# the Pollution statistics -- not with those of column 0 (Wind_direction).
POLLUTION_COL = 1


def _to_pollution_units(scaled_values, scaled_features):
    """Convert scaled pollution values back to the original pollution units.

    ``scaler`` can only invert complete feature rows, so the values are written
    into the Pollution column of a copy of ``scaled_features``, the rows are
    inverted, and the Pollution column is read back.

    Args:
        scaled_values: scaled pollution values, shape (n,) or (n, 1).
        scaled_features: 2-D scaled feature matrix with n rows, in the column
            layout the scaler was fitted on. It is not modified.

    Returns:
        1-D array of n pollution values in original units.
    """
    rows = np.array(scaled_features, dtype=float, copy=True)
    rows[:, POLLUTION_COL] = np.ravel(scaled_values)
    return scaler.inverse_transform(rows)[:, POLLUTION_COL]


# test set
yhat_test = model_hours_4_layers.predict(test_X)

# Drop the single-timestep axis: (samples, 1, features) -> (samples, features).
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))

inv_yhat_test = _to_pollution_units(yhat_test, test_X)

test_y = test_y.reshape(-1, 1)
inv_y_test = _to_pollution_units(test_y, test_X)

# RMSE as sqrt(MSE); avoids the deprecated `squared=False` argument.
rmse_test = np.sqrt(mean_squared_error(inv_y_test, inv_yhat_test))
print('Test RMSE: {:.3f}'.format(rmse_test))

# train set
yhat_train = model_hours_4_layers.predict(train_X)

train_X = train_X.reshape((train_X.shape[0], train_X.shape[2]))

inv_yhat_train = _to_pollution_units(yhat_train, train_X)

train_y = train_y.reshape(-1, 1)
inv_y_train = _to_pollution_units(train_y, train_X)

rmse_train = np.sqrt(mean_squared_error(inv_y_train, inv_yhat_train))
print('Train RMSE: {:.3f}'.format(rmse_train))

## Predicted vs. actual plot

In [None]:
# Column vectors of actual and predicted pollution (original units) per split.
#train
y_train = inv_y_train.reshape(-1, 1)
y_pred_train = inv_yhat_train.reshape(-1, 1)

#test
y_test = inv_y_test.reshape(-1, 1)
y_pred_test = inv_yhat_test.reshape(-1, 1)

# One figure per split, showing the first 200 samples of predicted vs. actual.
figure_specs = (
    ('Air pollution prediction (Train)', y_pred_train, y_train),
    ('Air pollution prediction (Test)', y_pred_test, y_test),
)
for figure_title, predicted, actual in figure_specs:
    plt.figure(figsize=(16, 8))
    plt.plot(predicted[:200, :], color='green', label='Predicted pollution level')
    plt.plot(actual[:200, :], color='red', label='Actual pollution level')
    plt.title(figure_title)
    plt.xlabel('Date')
    plt.ylabel('Pollution level')
    plt.legend()
    plt.show()

## R2 Score

In [None]:
# Coefficient of determination for each split, on the un-scaled values.
r2_inputs = (
    ('****Train****', inv_y_train, inv_yhat_train),
    ('****Test****', inv_y_test, inv_yhat_test),
)
for banner, actual, predicted in r2_inputs:
    print(banner)
    print('R2 score: {}'.format(r2_score(actual, predicted)))

#  Saving the model 

In [None]:
# Serialise the model architecture to JSON. The JSON string is kept in its own
# variable so that `model_hours_4_layers` still refers to the trained model
# afterwards and this cell can be re-run safely.
# NOTE(review): to_json() is documented to export the architecture only; the
# weights are written separately by the ModelCheckpoint callback.
model_hours_4_layers_json = model_hours_4_layers.to_json()
with open('model_hours_4_layers.json', 'w', encoding='utf-8') as json_file:
    json_file.write(model_hours_4_layers_json)