In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [74]:
# Step 1: Load dataset from CSV
# Only the first MAX_ROWS records are used. Passing nrows lets pandas stop
# parsing at that point instead of reading the whole file and then discarding
# the tail. If the file is shorter than MAX_ROWS, every row is kept.
DATA_PATH = 'global_aqi_2020_2025.csv'  # point this at your own CSV if needed
MAX_ROWS = 10001  # keeps rows 0..10000, the same rows the earlier index[10001:] drop retained
data = pd.read_csv(DATA_PATH, nrows=MAX_ROWS)

In [75]:
# Step 2: Separate the predictors from the target
# Every column except the ones listed here is kept as a model input;
# 'AQI' is the value being predicted.
excluded_columns = ['AQI', 'Date', 'City', 'Country', 'PM2.5_subindex']
X = data.drop(columns=excluded_columns)
y = data['AQI']

In [76]:
# Step 3: Train/test split
# 20% of the rows are held out for testing; the fixed seed keeps the split
# repeatable between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [77]:
# Step 4: Preprocessing
# Derive the column groups from the dtypes instead of assuming every feature
# is numeric: a stray text column would otherwise be handed to StandardScaler
# and fail at fit time. Numeric and boolean columns are scaled; any other
# column is one-hot encoded. When X is entirely numeric, cat_features is empty
# and num_features is simply every column of X, in order.
num_features = X.select_dtypes(include=['number', 'bool']).columns.tolist()
cat_features = [col for col in X.columns if col not in num_features]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    # handle_unknown='ignore': a category that appears only at predict time is
    # encoded as all zeros rather than raising an error.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

In [78]:
# Step 5: Pipeline
# Chain the preprocessing stage and a seeded random-forest regressor so they
# are fitted and applied together as a single estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
]
model_pipeline = Pipeline(steps=pipeline_steps)

In [79]:
# Step 6: Train initial model
# Fit the untuned pipeline on the training split, predict the held-out split,
# and report the three regression metrics.
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

print("Initial Model Performance:")
for label, metric in (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
):
    print(label, metric(y_test, y_pred))

Initial Model Performance:
MSE: 6.039292674354548
MAE: 0.34484532867711815
R²: 0.9991643813788553


In [None]:
# Step 7: Hyperparameter tuning
# Exhaustive search over 2 x 3 x 2 = 12 random-forest settings, each scored
# with 3-fold cross-validation on the training split, using all CPU cores.
param_grid = {
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [10, 20, None],
    'regressor__min_samples_split': [2, 5],
}
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [None]:
# Step 8: Evaluate tuned model
# Predict the held-out split with the fitted grid search and report the same
# three metrics as for the initial model.
y_pred_tuned = grid_search.predict(X_test)

print("Tuned Model Performance:")
for label, metric in (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
):
    print(label, metric(y_test, y_pred_tuned))

In [None]:
# Step 9: Feature importance
best_pipeline = grid_search.best_estimator_
best_model = best_pipeline.named_steps['regressor']

# Take the labels from the fitted preprocessor rather than assuming they equal
# the numeric input columns: one-hot encoding would add output columns, and the
# importances must be paired with the columns the forest actually saw.
# ColumnTransformer prefixes each output name with '<transformer>__' by
# default; strip that prefix so the labels read like the input column names.
transformed_names = best_pipeline.named_steps['preprocessor'].get_feature_names_out()
all_features = [name.split('__', 1)[-1] for name in transformed_names]

importances = best_model.feature_importances_
feat_imp = pd.Series(importances, index=all_features).sort_values(ascending=False)

print("\nFeature Importances:")
print(feat_imp)

# Explicit figure/axes interface, with axis labels so the chart stands alone.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feat_imp.values, y=feat_imp.index, ax=ax)
ax.set_title('Feature Importance in AQI Prediction')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()