In [1]:
import pandas as pd


In [4]:
df = pd.read_csv("cleaned_air_quality_data.csv")

In [5]:
# 1. REMOVE UNNECESSARY COLUMNS
# - 'Date': We already extracted Year and Month.
# - 'AQI' & 'AQI_Bucket': These are calculated FROM PM2.5. Using them to predict PM2.5 is "cheating" (Data Leakage).
cols_to_drop = ['Date', 'AQI', 'AQI_Bucket', 'PM2.5']

# Check if they exist before dropping to avoid errors
existing_drop_cols = [col for col in cols_to_drop if col in df.columns]
X = df.drop(columns=existing_drop_cols)

In [6]:
y = df['PM2.5']

In [7]:
print("Encoding categorical data...")
X = pd.get_dummies(X, columns=['City'], drop_first=True)

Encoding categorical data...


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)

In [14]:
X_train_scaled.head()

Unnamed: 0,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,...,City_Jorapokhar,City_Kochi,City_Kolkata,City_Lucknow,City_Mumbai,City_Patna,City_Shillong,City_Talcher,City_Thiruvananthapuram,City_Visakhapatnam
6688,2.951769,0.547567,0.525519,1.03402,-1.10381,3.484848,0.733216,-0.556965,-0.429746,0.570951,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29333,0.091653,-0.568052,0.236121,-0.331999,-0.67162,0.212121,-0.137295,1.683178,1.046663,0.490342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
27996,-1.974072,-0.544416,-0.575386,-0.614969,-1.371458,-0.434343,-0.374885,-0.077472,0.615175,0.570951,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
26848,-2.095881,-0.765019,-0.355461,-0.572143,-1.319905,0.242424,4.698228,-0.543491,-0.429746,-0.53529,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
21186,0.0,-0.2143,0.277464,-0.051431,0.240075,-0.090909,-0.449861,-0.690113,-0.402793,-0.506686,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
X_train.shape

(23624, 38)

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

In [17]:
results = []

In [18]:
for name, model in models.items():
    # Train
    model.fit(X_train_scaled, y_train)
    # Predict
    y_pred = model.predict(X_test_scaled)
    # Evaluate
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results.append({"Model": name, "RMSE": rmse, "R2 Score": r2})



In [19]:
results_df = pd.DataFrame(results).sort_values(by="R2 Score", ascending=False)
display(results_df)

Unnamed: 0,Model,RMSE,R2 Score
2,Random Forest,674.032376,0.814642
3,Gradient Boosting,1057.299657,0.709244
1,Decision Tree,1447.854879,0.601841
0,Linear Regression,1893.730076,0.479226


In [20]:
from sklearn.model_selection import GridSearchCV

# Define the grid of settings to test
param_grid = {
    'n_estimators': [100, 200],      # Number of trees
    'max_depth': [10, 20, None],     # Maximum depth of tree
    'min_samples_split': [2, 5]      # Minimum samples to split a node
}

# Setup Grid Search with 3-fold Cross Validation
rf_grid = GridSearchCV(estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
                       param_grid=param_grid,
                       cv=3,
                       n_jobs=-1,
                       verbose=1,
                       scoring='r2')

# Fit search
rf_grid.fit(X_train_scaled, y_train)

# Get best model
best_model = rf_grid.best_estimator_

print(f"\nBest Parameters: {rf_grid.best_params_}")
print(f"Best CV R2 Score: {rf_grid.best_score_:.4f}")

# Final Test on the held-out test set
final_pred = best_model.predict(X_test_scaled)
print(f"Final Test R2 Score: {r2_score(y_test, final_pred):.4f}")



Fitting 3 folds for each of 12 candidates, totalling 36 fits

Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best CV R2 Score: 0.7654
Final Test R2 Score: 0.8107




In [21]:
import joblib

joblib.dump(best_model, 'best_pollution_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(X.columns.tolist(), 'model_columns.pkl')


['model_columns.pkl']