In [1]:
import pandas as pd
import numpy as np

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip"
!wget {url}
!unzip AirQualityUCI.zip

df = pd.read_csv('AirQualityUCI.csv', sep=';', decimal=',')

df = df.iloc[:, :-2].dropna(how='all')
df.head()

--2026-02-24 14:50:06--  https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘AirQualityUCI.zip’

AirQualityUCI.zip       [   <=>              ]   1.47M  3.62MB/s    in 0.4s    

2026-02-24 14:50:07 (3.62 MB/s) - ‘AirQualityUCI.zip’ saved [1543989]

Archive:  AirQualityUCI.zip
  inflating: AirQualityUCI.csv       
  inflating: AirQualityUCI.xlsx      


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


# -------------- Data Cleansing --------------

In [2]:
# -200 is treated as the missing-value marker: convert it to NaN so pandas
# recognises those cells as missing. Reassigning (instead of inplace=True)
# keeps the step explicit.
df = df.replace(-200, np.nan)

# Per-column count of missing values after the conversion.
print(df.isnull().sum())

# Forward-fill every gap with the last observed value in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated and
# emits a FutureWarning.
# NOTE(review): the printed null counts show NMHC(GT) with 8443 missing
# values, so that feature is dominated by filled-in values -- confirm it
# should stay in the feature list.
df = df.ffill()

# Predictors: gas readings / sensor responses plus temperature 'T'; target: 'RH'.
features = ['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
            'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T']
X = df[features]
y = df['RH']

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Date                0
Time                0
CO(GT)           1683
PT08.S1(CO)       366
NMHC(GT)         8443
C6H6(GT)          366
PT08.S2(NMHC)     366
NOx(GT)          1639
PT08.S3(NOx)      366
NO2(GT)          1642
PT08.S4(NO2)      366
PT08.S5(O3)       366
T                 366
RH                366
AH                366
dtype: int64


  df = df.fillna(method='ffill')


**-------------- Model 1: Voting ensemble (Random Forest + SVR + KNN), UCI Air Quality data --------------**

In [3]:
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Three base regressors whose predictions the VotingRegressor combines.
# random_state pins the forest's randomness so the printed score is
# repeatable across runs (the Bangkok ensemble further down uses the same seed).
m1 = RandomForestRegressor(n_estimators=100, random_state=42)
m2 = SVR(kernel='rbf')
m3 = KNeighborsRegressor(n_neighbors=5)

ensemble_model = VotingRegressor(estimators=[('rf', m1), ('svr', m2), ('knn', m3)])
ensemble_model.fit(X_train_scaled, y_train)

# Score the fitted ensemble on the held-out test split.
print(f"Ensemble Score: {ensemble_model.score(X_test_scaled, y_test)}")

Ensemble Score: 0.9143873421367117


**-------------- Model 2: Dense neural network (Keras), UCI Air Quality data --------------**

In [4]:
import tensorflow as tf
from tensorflow.keras import layers

# Seed Python, NumPy and TensorFlow together so weight initialisation and
# batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Fully connected regressor: 64 -> 32 -> 16 ReLU units, single linear output.
# The input width is declared with an explicit Input layer; passing
# input_shape= to the first Dense layer triggers a Keras UserWarning.
nn_model = tf.keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1)
])

# Optimise mean squared error; mean absolute error is reported alongside it.
nn_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# validation_split=0.2 holds out 20% of the training rows for per-epoch validation.
history = nn_model.fit(X_train_scaled, y_train, epochs=50, validation_split=0.2, batch_size=32)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 1788.2878 - mae: 37.1292 - val_loss: 329.9985 - val_mae: 14.5459
Epoch 2/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 243.8039 - mae: 12.3671 - val_loss: 85.1493 - val_mae: 7.1478
Epoch 3/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 75.6101 - mae: 6.5263 - val_loss: 50.6597 - val_mae: 5.4119
Epoch 4/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 52.2598 - mae: 5.3647 - val_loss: 43.9051 - val_mae: 4.9955
Epoch 5/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 45.8448 - mae: 5.0980 - val_loss: 40.8435 - val_mae: 4.9609
Epoch 6/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 42.3942 - mae: 4.9273 - val_loss: 38.2629 - val_mae: 4.7049
Epoch 7/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms

In [8]:
import pickle

# Pickle the fitted scikit-learn objects, one file per object.
for pkl_path, fitted_obj in (
    ('ensemble_model.pkl', ensemble_model),
    ('scaler.pkl', scaler),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_obj, f)

# The Keras network is written through its own save() method.
nn_model.save('nn_model.keras')

----------------------------------------------


In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Bangkok air-quality CSV, read straight from GitHub.
url_bkk = "https://github.com/prasertcbs/basic-dataset/raw/master/bangkok-air-quality.csv"
df_bkk = pd.read_csv(url_bkk)

# Normalise header names: trim surrounding whitespace and lower-case them so
# the column lookups below match.
df_bkk.columns = df_bkk.columns.str.strip().str.lower()

# Coerce the pollutant columns to numbers; anything unparseable becomes NaN.
cols_to_fix = ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co']
for col in cols_to_fix:
    if col in df_bkk.columns:
        df_bkk[col] = pd.to_numeric(df_bkk[col], errors='coerce')

# Fill gaps forward, then backward to cover leading gaps. ffill()/bfill()
# replace fillna(method=...), which is deprecated and emits a FutureWarning.
df_bkk = df_bkk.ffill().bfill()
# Drop any row that still contains a missing value after both fills.
df_bkk = df_bkk.dropna()

# Keep only rows with a non-negative pm25 reading.
if 'pm25' in df_bkk.columns:
    df_bkk = df_bkk[df_bkk['pm25'] >= 0]

# Predict pm25 from the other five pollutant columns.
# NOTE: X, y and the *_scaled names below overwrite the variables created for
# the UCI dataset earlier in this notebook.
features = ['pm10', 'o3', 'no2', 'so2', 'co']
X = df_bkk[features]
y = df_bkk['pm25']

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler_bkk = StandardScaler()
X_train_scaled = scaler_bkk.fit_transform(X_train)
X_test_scaled = scaler_bkk.transform(X_test)


  df_bkk = df_bkk.fillna(method='ffill')
  df_bkk = df_bkk.fillna(method='bfill')


**-------------- Model 1: Voting ensemble (Random Forest + SVR + KNN), Bangkok air-quality data --------------**

In [44]:
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Base learners for the Bangkok ensemble; the forest is seeded for repeatability.
m1, m2, m3 = (
    RandomForestRegressor(random_state=42, n_estimators=100),
    SVR(kernel='rbf'),
    KNeighborsRegressor(n_neighbors=5),
)

# Pair each learner with its short label, then fit the voting ensemble on the
# scaled training data (fit() returns the estimator itself).
ensemble_bkk = VotingRegressor(
    estimators=list(zip(('rf', 'svr', 'knn'), (m1, m2, m3)))
).fit(X_train_scaled, y_train)

# Report the score on the held-out test split, rounded to four decimals.
test_score_bkk = ensemble_bkk.score(X_test_scaled, y_test)
print(f"Ensemble Score: {test_score_bkk:.4f}")

Ensemble Score: 0.7583


**-------------- Model 2: Dense neural network (Keras) with early stopping, Bangkok air-quality data --------------**

In [42]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# NOTE(review): this cell was last executed as In [42], i.e. before the cell
# that builds X_train_scaled / y_train (In [43]); re-run the notebook top to
# bottom so the saved output matches the current data preparation.

# Seed Python, NumPy and TensorFlow together so weight initialisation and
# batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Fully connected regressor: 128 -> 64 -> 32 -> 16 ReLU units, single linear
# output. The input width is declared with an explicit Input layer; passing
# input_shape= to the first Dense layer triggers a Keras UserWarning.
nn_bkk = tf.keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1)
])

# Optimise mean squared error; mean absolute error is reported alongside it.
nn_bkk.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Stop once val_loss has not improved for 10 epochs and roll the weights back
# to the best epoch seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Up to 100 epochs; 20% of the training rows are held out for validation.
history_bkk = nn_bkk.fit(
    X_train_scaled, y_train,
    epochs=100,
    validation_split=0.2,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - loss: 6151.9868 - mae: 72.0593 - val_loss: 4077.7068 - val_mae: 54.8254
Epoch 2/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 2658.1270 - mae: 40.8834 - val_loss: 1049.5564 - val_mae: 20.5175
Epoch 3/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 572.3347 - mae: 17.6646 - val_loss: 747.6382 - val_mae: 17.4798
Epoch 4/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 468.9662 - mae: 16.0320 - val_loss: 643.3959 - val_mae: 16.3632
Epoch 5/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 425.3825 - mae: 14.6348 - val_loss: 593.9897 - val_mae: 15.4011
Epoch 6/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 384.1509 - mae: 13.9723 - val_loss: 514.1208 - val_mae: 14.6899
Epoch 7/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [46]:
import pickle

# Pickle the fitted scikit-learn objects for the Bangkok data, one file each.
for pkl_path, fitted_obj in (
    ('model1_ensemble_bkk.pkl', ensemble_bkk),
    ('scaler_bkk.pkl', scaler_bkk),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_obj, f)

# The Keras network is written through its own save() method.
nn_bkk.save('nn_bkk.keras')
