# Comparison of Air Quality Index (AQI) Prediction Based on AlexNet, VGGNet, ResNet

Kelompok 01 Kecerdasan buatan 02:
* Fateen Najib Indramustika - 2006468522
* Joshevan - 2006577321
* Airell Ramadhan Budiraharjo - 2006535230

## Initialization

In [None]:
import os
import pandas as pd
from datetime import datetime
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.applications import ResNet101
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from datetime import datetime
import pytz
import matplotlib.pyplot as plt

## Load data
Load the AQI readings from the CSV file and the list of image files from the image directory.

In [None]:
# Show the contents of the starting directory, then switch to the project
# root so the relative dataset paths below resolve.
# NOTE(review): the project root is an absolute, machine-specific path.
print(os.listdir())
os.chdir('/home/fateenindramustika/predict-air-quality-with-ANN')

# os.listdir() returns entries in arbitrary order. Sorting fixes the row order
# of df_image, so the seeded train/test split further down is reproducible
# across machines and file systems.
image_files = sorted(os.listdir('image-dataset'))

# The first 15 characters of every file stem are parsed as the capture time
# in the form YYYYmmdd_HHMMSS.
image_timestamps = [
    datetime.strptime(os.path.splitext(file)[0][0:15], "%Y%m%d_%H%M%S")
    for file in image_files
]

data = {'File Name': image_files, 'Timestamp': image_timestamps}
df_image = pd.DataFrame(data)

# From here on image_timestamps holds the bare file stems (extension removed);
# df_image already captured the parsed datetimes above.
image_timestamps = [os.path.splitext(file)[0] for file in image_files]

aqi_data = pd.read_csv('aqi-dataset/air_quality_data.csv')

# Parse the ISO-like 'Now Timestamp' strings (with fractional seconds) into
# datetime objects so they can be compared with the image capture times.
aqi_timestamps = aqi_data['Now Timestamp'].tolist()
aqi_timestamps = [
    datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%f")
    for timestamp in aqi_timestamps
]

data = {'Timestamp': aqi_timestamps, 'AQI': aqi_data['AQI']}
df_aqi = pd.DataFrame(data)
print(df_aqi)

## Pairing the data from CSV and image files
Match each image to an AQI reading by timestamp: for every image, find the AQI record whose timestamp is closest to the image's capture time.

In [None]:
def find_nearest(row, df, column='Timestamp'):
    """Return the row of ``df`` whose ``column`` value is closest to ``row['Timestamp']``.

    Closeness is the absolute difference between each ``df[column]`` value and
    the reference value. When several rows are equally close, the first one
    reported by ``idxmin`` is returned.

    Args:
        row: Mapping-like record providing the reference value under the
            ``'Timestamp'`` key.
        df: DataFrame to search.
        column: Name of the column in ``df`` to compare against.

    Returns:
        The matching row of ``df`` as a Series (``df.loc[label]``).
    """
    target = row['Timestamp']
    gaps = (df[column] - target).abs()
    closest_label = gaps.idxmin()
    return df.loc[closest_label]

# For every image row, look up the AQI record with the closest timestamp.
nearest_aqi = df_image.apply(lambda image_row: find_nearest(image_row, df_aqi), axis=1)

# Put the matched AQI columns next to the image columns. Both frames carry a
# 'Timestamp' column and both copies are dropped, leaving only the file name
# and its AQI value.
df_image = pd.concat([df_image, nearest_aqi], axis=1).drop(columns=['Timestamp'])
df_image

## Image data preprocessing
Split the paired data into training and test sets, then preprocess the images by resizing them to 224x224 and normalizing the pixel values.

In [None]:
# Hold out 20% of the paired samples for testing; the fixed seed keeps the
# split repeatable.
train_data, test_data = train_test_split(df_image, test_size=0.2, random_state=42)

# The model is built on an ImageNet-pretrained ResNet101, so the images are
# passed through the matching Keras ResNet preprocessing function instead of
# an arbitrary rescale factor (the previous rescale=0.2 did not normalize the
# pixels to any range the pretrained weights expect).
resnet_preprocess = tf.keras.applications.resnet.preprocess_input
train_datagen = ImageDataGenerator(preprocessing_function=resnet_preprocess)
test_datagen = ImageDataGenerator(preprocessing_function=resnet_preprocess)

# class_mode='raw' yields the numeric AQI column unchanged as the regression
# target; images are resized to 224x224 on load.
train_generator = train_datagen.flow_from_dataframe(
    train_data,
    directory='image-dataset/',
    x_col='File Name',
    y_col='AQI',
    target_size=(224, 224),
    batch_size=32,
    class_mode='raw'
)

# shuffle=False keeps the test batches in the same order as test_data, so
# predictions can be lined up with the rows of test_data afterwards.
test_generator = test_datagen.flow_from_dataframe(
    test_data,
    directory='image-dataset/',
    x_col='File Name',
    y_col='AQI',
    target_size=(224, 224),
    batch_size=32,
    class_mode='raw',
    shuffle=False
)

## Model building
Build the model using a pretrained ResNet101 backbone with a custom regression output layer.

In [None]:
def r_squared(y_true, y_pred):
    """Coefficient of determination (R^2) for one batch of predictions.

    Both tensors are flattened first: the generator yields targets of shape
    (batch,) while the model outputs (batch, 1), and subtracting those
    directly would broadcast to a (batch, batch) matrix.

    Keras averages the per-batch values, so the reported number approximates
    the R^2 over the full dataset rather than matching it exactly.
    """
    y_true = tf.cast(tf.reshape(y_true, [-1]), tf.float32)
    y_pred = tf.cast(tf.reshape(y_pred, [-1]), tf.float32)
    residual_ss = tf.reduce_sum(tf.square(y_true - y_pred))
    total_ss = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true)))
    # epsilon guards against division by zero when all targets in a batch are equal.
    return 1.0 - residual_ss / (total_ss + tf.keras.backend.epsilon())


# ImageNet-pretrained ResNet101 without its classification head.
base_model = ResNet101(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Regression head: pooled features -> 1024-unit ReLU layer -> single linear
# output (the predicted AQI).
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(1)(x)

model = Model(inputs=base_model.input, outputs=predictions)

# Freeze the pretrained backbone so only the new head is trained.
for layer in base_model.layers:
    layer.trainable = False

# Compile the model
# NOTE: r_squared is a custom metric; loading the saved model later requires
# passing it via custom_objects (or loading with compile=False).
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[
        'mean_absolute_error',
        tf.keras.metrics.RootMeanSquaredError(),
        r_squared,
    ],
)

# Summary of the model
model.summary()

## Model training
Train the model on the image data, using the air quality index values as regression targets.

In [None]:
# Train the model
# Number of full 32-sample batches available per pass over each split
# (32 is the batch size the generators were created with).
steps_per_epoch = len(train_data) // 32
validation_steps = len(test_data) // 32

# The test generator doubles as the validation data during training.
history = model.fit(
    train_generator,
    epochs=500,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_generator,
    validation_steps=validation_steps,
)

Save the trained model and the training history to files.

In [None]:
# Timestamp (Jakarta local time) used to give each run's artifacts a unique name.
now = datetime.now(pytz.timezone('Asia/Jakarta'))
timestamp = now.strftime("%Y%m%d_%H%M%S")

# Create the output directories up front: saving into a directory that does
# not exist fails, which would lose the result of a long training run.
os.makedirs('models', exist_ok=True)
os.makedirs('histories', exist_ok=True)

model.save(f'models/resnet_aqi_prediction_{timestamp}.h5')

# One row per epoch, one column per tracked metric.
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(f'histories/history_{timestamp}.csv')

In [None]:
# Write the training history (one row per epoch) to CSV, reusing the run
# timestamp computed when the model was saved.
# Ensure the target directory exists so the export cannot fail on a fresh checkout.
os.makedirs('histories', exist_ok=True)

hist_df = pd.DataFrame(history.history)
hist_df.to_csv(f'histories/history_{timestamp}.csv')

## Plotting the training metrics
Plot the training metrics collected during the training process.

In [None]:
# Validation vs. training RMSE, one point per epoch.
val_rmse = history.history['val_root_mean_squared_error']
train_rmse = history.history['root_mean_squared_error']

plt.figure(figsize=(12, 6))
for series, label in ((val_rmse, 'Validation RMSE'), (train_rmse, 'RMSE')):
    plt.plot(series, label=label)
plt.title('RMSE over epochs')
plt.xlabel('Epochs')
plt.ylabel('RMSE')
plt.legend()
plt.show()

In [None]:
# Validation vs. training MAE, one point per epoch.
mae_curves = {
    'Validation MAE': history.history['val_mean_absolute_error'],
    'MAE': history.history['mean_absolute_error'],
}

plt.figure(figsize=(12, 6))
for label, series in mae_curves.items():
    plt.plot(series, label=label)
plt.title('MAE over epochs')
plt.xlabel('Epochs')
plt.ylabel('MAE')
plt.legend()
plt.show()

In [None]:
# Validation vs. training loss (mean squared error), one point per epoch.
val_loss = history.history['val_loss']
train_loss = history.history['loss']

plt.figure(figsize=(12, 6))
plt.plot(val_loss, label='Validation Loss')
plt.plot(train_loss, label='Loss')
plt.title('Loss over epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

Evaluate the model on the test data.

In [None]:
# Evaluate on the complete test generator. Passing steps=len(test_data) // 32
# would skip the final partial batch (and evaluate nothing at all when there
# are fewer than 32 test samples).
# return_dict=True gives the results by metric name, so this cell keeps working
# regardless of how many metrics the model was compiled with; unpacking a
# fixed number of values breaks as soon as a metric is added or removed.
test_metrics = model.evaluate(test_generator, return_dict=True)

test_loss = test_metrics['loss']
test_mae = test_metrics['mean_absolute_error']
test_rmse = test_metrics['root_mean_squared_error']
print(f"Test Loss: {test_loss}")
print(f"Test MAE: {test_mae}")
print(f"Test RMSE: {test_rmse}")

# Report any additional compiled metrics as well.
reported = {'loss', 'mean_absolute_error', 'root_mean_squared_error'}
for metric_name, metric_value in test_metrics.items():
    if metric_name not in reported:
        print(f"Test {metric_name}: {metric_value}")

In [None]:
# Predict the AQI for every test image. The test generator was created with
# shuffle=False, so the outputs follow the row order of test_data.
predictions = model.predict(test_generator)

# Put actual and predicted values side by side, keeping test_data's index.
df_result = (
    test_data[['File Name', 'AQI']]
    .rename(columns={'AQI': 'Actual AQI'})
    .assign(**{'Predicted AQI': predictions.reshape(-1)})
)
df_result

In [None]:
# Order the results by their original dataframe index before plotting.
df_result = df_result.sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
for column in ('Actual AQI', 'Predicted AQI'):
    ax.plot(df_result[column], label=column)
ax.set_title('Actual AQI vs Predicted AQI')
ax.set_xlabel('Index')
ax.set_ylabel('AQI')
ax.legend()
plt.show()