# Air Pollution Prediction: PM2.5 and NO₂
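This notebook models PM2.5 and NO₂ concentrations from traffic and weather data: Random Forest and Gradient Boosting regressors predict each pollutant, and K-Means clustering groups the observations into pollution hotspots.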

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans

In [None]:
# Load dataset
# The CSV path is relative, so it is resolved against the notebook's working directory.
df = (
    pd.read_csv('AirPollution_Traffic_Weather_Dataset.csv')
    # Drop missing values: any row with a NaN in any column is removed.
    .dropna(axis=0, how='any')
    # Take an independent copy so `df` owns its data for the column
    # assignments made by later cells.
    .copy()
)

In [None]:
# Normalize numerical features
# Each listed column is rescaled with MinMaxScaler (default settings) and
# written back into `df`, replacing the raw values.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split done later in the notebook — confirm this is intended.
scale_cols = [
    'Vehicle_Count',
    'Congestion_Index',
    'Temperature',
    'Humidity',
    'Wind_Speed',
]
scaler = MinMaxScaler()
scaler.fit(df[scale_cols])
df[scale_cols] = scaler.transform(df[scale_cols])

In [None]:
# Feature and target selection
# The same predictor matrix is shared by both pollutant targets.
# NOTE(review): 'Traffic_Level' is used as a predictor but is not among the
# scaled columns; its dtype is not visible here — confirm it is numeric (or
# encoded) before it reaches the regressors.
features = [
    'Vehicle_Count',
    'Traffic_Level',
    'Congestion_Index',
    'Temperature',
    'Humidity',
    'Wind_Speed',
]
X = df[features]
y_pm25, y_no2 = df['PM2.5'], df['NO2']

In [None]:
# Train-test split
# Both targets are split with the same options (20% test, seed 42) against the
# same X, one call per pollutant.
split_options = {'test_size': 0.2, 'random_state': 42}

X_train_pm25, X_test_pm25, y_train_pm25, y_test_pm25 = train_test_split(
    X, y_pm25, **split_options
)
X_train_no2, X_test_no2, y_train_no2, y_test_no2 = train_test_split(
    X, y_no2, **split_options
)

In [None]:
# PM2.5 Modeling
# Fit a Random Forest and a Gradient Boosting regressor on the PM2.5 training
# split, predict on the held-out split, and report RMSE and R² for each.
rf_pm25 = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pm25.fit(X_train_pm25, y_train_pm25)
pred_rf_pm25 = rf_pm25.predict(X_test_pm25)

gb_pm25 = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_pm25.fit(X_train_pm25, y_train_pm25)
pred_gb_pm25 = gb_pm25.predict(X_test_pm25)

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) yields the same number on every scikit-learn version.
rmse_rf_pm25 = np.sqrt(mean_squared_error(y_test_pm25, pred_rf_pm25))
rmse_gb_pm25 = np.sqrt(mean_squared_error(y_test_pm25, pred_gb_pm25))

print("PM2.5 - Random Forest")
print("  RMSE:", rmse_rf_pm25)
print("  R² Score:", r2_score(y_test_pm25, pred_rf_pm25))

print("\nPM2.5 - Gradient Boosting")
print("  RMSE:", rmse_gb_pm25)
print("  R² Score:", r2_score(y_test_pm25, pred_gb_pm25))

In [None]:
# NO2 Modeling
# Fit a Random Forest and a Gradient Boosting regressor on the NO2 training
# split, predict on the held-out split, and report RMSE and R² for each.
rf_no2 = RandomForestRegressor(n_estimators=100, random_state=42)
rf_no2.fit(X_train_no2, y_train_no2)
pred_rf_no2 = rf_no2.predict(X_test_no2)

gb_no2 = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_no2.fit(X_train_no2, y_train_no2)
pred_gb_no2 = gb_no2.predict(X_test_no2)

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) yields the same number on every scikit-learn version.
rmse_rf_no2 = np.sqrt(mean_squared_error(y_test_no2, pred_rf_no2))
rmse_gb_no2 = np.sqrt(mean_squared_error(y_test_no2, pred_gb_no2))

print("\nNO2 - Random Forest")
print("  RMSE:", rmse_rf_no2)
print("  R² Score:", r2_score(y_test_no2, pred_rf_no2))

print("\nNO2 - Gradient Boosting")
print("  RMSE:", rmse_gb_no2)
print("  R² Score:", r2_score(y_test_no2, pred_gb_no2))

In [None]:
# Feature Importance
# Draw one horizontal bar chart per fitted Random Forest, each on its own
# figure, showing the model's feature_importances_ against the feature names.
importance_charts = (
    (rf_pm25, "Feature Importance - PM2.5 (Random Forest)"),
    (rf_no2, "Feature Importance - NO2 (Random Forest)"),
)

for forest, chart_title in importance_charts:
    fig, ax = plt.subplots()
    sns.barplot(x=forest.feature_importances_, y=features, ax=ax)
    ax.set_title(chart_title)
    ax.set_xlabel("Importance")
    ax.set_ylabel("Features")
    plt.show()

In [None]:
# K-Means Clustering
# Group observations by location and pollutant levels, store the label in
# df['Cluster'], and plot the labelled points on a longitude/latitude scatter.
cluster_cols = ['Latitude', 'Longitude', 'PM2.5', 'NO2']
n_clusters = 3

# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, so relying on the default makes the cluster assignment
# depend on the installed version (and emits a FutureWarning on 1.2-1.3).
# NOTE(review): the four clustering columns are used unscaled here, so columns
# with larger numeric ranges will weigh more in the Euclidean distance —
# confirm whether that is intended.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(df[cluster_cols])

plt.figure(figsize=(10, 6))
cluster_points = plt.scatter(
    df['Longitude'], df['Latitude'], c=df['Cluster'], cmap='viridis', alpha=0.6
)
plt.title("K-Means Clustering of Pollution Hotspots")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
# Cluster labels are discrete integers, so tick the colorbar at each label
# instead of leaving a continuous scale.
plt.colorbar(cluster_points, ticks=list(range(n_clusters)), label='Cluster')
plt.show()