In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
import pandas as pd


In [3]:
df = pd.read_csv("clean_air_quality_data.csv")
df.head()

Unnamed: 0,from_date,to_date,pm25,pm10,no2,nox,nh3,so2,ozone,benzene,toluene,temperature,wind_speed,wind_direction,pressure
0,2019-06-20 14:00:00,2019-06-20 15:00:00,33.0,119.5,23.38,18.15,30.5,7.57,186.32,0.4,64.0,33.97,0.62,172.75,741.0
1,2019-06-20 15:00:00,2019-06-20 16:00:00,33.0,120.0,23.38,18.15,30.5,7.1,191.27,0.4,66.65,33.78,0.62,172.75,741.0
2,2019-06-20 16:00:00,2019-06-20 17:00:00,34.5,117.0,23.38,18.15,30.5,12.0,196.9,0.43,61.3,33.88,0.62,172.75,741.0
3,2019-06-20 17:00:00,2019-06-20 18:00:00,41.67,138.0,23.38,18.15,30.5,12.87,188.53,0.45,62.55,30.75,0.62,172.75,741.0
4,2019-06-20 18:00:00,2019-06-20 19:00:00,46.0,143.0,23.38,18.15,30.5,6.53,168.06,0.35,71.6,29.85,0.62,172.75,741.0


In [4]:
df.dtypes

from_date          object
to_date            object
pm25              float64
pm10              float64
no2               float64
nox               float64
nh3               float64
so2               float64
ozone             float64
benzene           float64
toluene           float64
temperature       float64
wind_speed        float64
wind_direction    float64
pressure          float64
dtype: object

In [5]:
df['from_date'] = pd.to_datetime(df['from_date'])
df['to_date'] = pd.to_datetime(df['to_date'])

In [6]:
df.dtypes

from_date         datetime64[ns]
to_date           datetime64[ns]
pm25                     float64
pm10                     float64
no2                      float64
nox                      float64
nh3                      float64
so2                      float64
ozone                    float64
benzene                  float64
toluene                  float64
temperature              float64
wind_speed               float64
wind_direction           float64
pressure                 float64
dtype: object

In [7]:
df = df.sort_values('from_date')
df = df.set_index('from_date')


In [8]:
df = df.drop(columns=['to_date'])


In [9]:
df.isna().sum().sum()

0

In [10]:
df['pm25_lag_3']  = df['pm25'].shift(3)
df['pm25_lag_6']  = df['pm25'].shift(6)
df['pm25_lag_24'] = df['pm25'].shift(24)  # same hour yesterday


In [11]:
df = df.dropna()


In [12]:
df['pm25_roll_6']  = df['pm25'].rolling(6).mean()
df['pm25_roll_24'] = df['pm25'].rolling(24).mean()


In [13]:
X_ts = df.drop(columns=['pm25'])
y_ts = df['pm25']


In [14]:
split_date = pd.to_datetime("2022-01-01")

X_train = X_ts[X_ts.index < split_date]
X_test  = X_ts[X_ts.index >= split_date]

y_train = y_ts[y_ts.index < split_date]
y_test  = y_ts[y_ts.index >= split_date]


In [15]:
rf_ts = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf_ts.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
y_pred = rf_ts.predict(X_test)


In [23]:
print("MAE :", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2  :", r2_score(y_test, y_pred))


MAE : 14.77124611263736
RMSE: 30.552481541500228
R2  : 0.8207671941970055


In [25]:
import joblib
joblib.dump(rf_ts, "pm25_random_forest_model11.pkl")


['pm25_random_forest_model11.pkl']

In [26]:
pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.3-py3-none-macosx_12_0_arm64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.3-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m3.2 MB/s[0m  [33m0:00:00[0m3.2 MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.3
Note: you may need to restart the kernel to use updated packages.


In [15]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [16]:
xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)


In [17]:
xgb_model.fit(X_train, y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [18]:
y_pred_xgb = xgb_model.predict(X_test)


In [19]:
mae  = mean_absolute_error(y_test, y_pred_xgb)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2   = r2_score(y_test, y_pred_xgb)

print("XGBoost MAE :", mae)
print("XGBoost RMSE:", rmse)
print("XGBoost R2  :", r2)


XGBoost MAE : 14.020803511639436
XGBoost RMSE: 26.49240971439282
XGBoost R2  : 0.8652379997177512


In [20]:
import joblib
joblib.dump(xgb_model, "pm25_XGBoost_model.pkl")

['pm25_XGBoost_model.pkl']

In [21]:
pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m5.0 MB/s[0m  [33m0:00:00[0mm [31m7.0 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [16]:
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


ValueError: Key backend: 'module://matplotlib_inline.backend_inline' is not a valid value for backend; supported values are ['gtk3agg', 'gtk3cairo', 'gtk4agg', 'gtk4cairo', 'macosx', 'nbagg', 'notebook', 'qtagg', 'qtcairo', 'qt5agg', 'qt5cairo', 'tkagg', 'tkcairo', 'webagg', 'wx', 'wxagg', 'wxcairo', 'agg', 'cairo', 'pdf', 'pgf', 'ps', 'svg', 'template']