In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
warnings.simplefilter('ignore')

In [3]:
# Path of the CSV that holds the per-city, per-day pollutant readings.
DATA_PATH = '/content/city_day.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [5]:
# (row count, column count) of the loaded frame.
len(df.index), len(df.columns)

(29531, 16)

In [6]:
# Number of missing entries in each column before any cleaning.
df.isna().sum()

Unnamed: 0,0
City,0
Date,0
PM2.5,4598
PM10,11140
NO,3582
NO2,3585
NOx,4185
NH3,10328
CO,2059
SO2,3854


In [7]:
# Forward-fill gaps: each missing value takes the last valid value above it in
# the same column. DataFrame.ffill() is used instead of fillna(method='ffill')
# because the `method` keyword of fillna is deprecated since pandas 2.1.
# NOTE(review): the fill runs over the whole frame in row order, so a value can
# carry over from the last row of one city into the first rows of the next, and
# the AQI / AQI_Bucket columns are filled as well -- confirm this is intended.
df = df.ffill()

In [8]:
# (row count, column count) after forward filling; filling does not drop rows.
len(df.index), len(df.columns)

(29531, 16)

In [9]:
# Missing entries that remain per column after the forward fill
# (leading gaps have no earlier value to copy from).
df.isna().sum()

Unnamed: 0,0
City,0
Date,0
PM2.5,27
PM10,1594
NO,0
NO2,0
NOx,0
NH3,2009
CO,0
SO2,0


In [10]:
# Discard every row that still has a missing value in any column.
df = df.dropna()

In [11]:
# (row count, column count) once incomplete rows have been removed.
len(df.index), len(df.columns)

(27522, 16)

In [12]:
# First five surviving rows; the original index labels are kept, not renumbered.
df.head(n=5)

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
2009,Aizawl,2020-03-11,32.69,47.91,6.99,2.85,11.93,26.64,0.6,4.53,4.48,0.03,0.3,1.87,119.0,Moderate
2010,Aizawl,2020-03-12,31.21,38.66,7.2,1.27,10.65,25.63,0.56,4.22,2.81,0.01,0.08,1.87,52.0,Satisfactory
2011,Aizawl,2020-03-13,38.39,46.68,7.19,0.91,10.37,29.16,0.57,4.46,0.18,0.0,0.0,1.87,60.0,Satisfactory
2012,Aizawl,2020-03-14,43.23,50.83,7.14,1.07,10.48,28.95,0.57,4.53,0.41,0.0,0.0,1.87,62.0,Satisfactory
2013,Aizawl,2020-03-15,33.82,41.03,7.09,0.36,9.73,28.41,0.48,4.63,0.3,0.0,0.0,1.87,70.0,Satisfactory


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pickle


In [None]:
# Predictor columns passed to the model; the regression target is the AQI column.
features = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3']
X = df.loc[:, features]
y = df.loc[:, 'AQI']


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Two-stage estimator: standardize the inputs, then fit a random forest on them.
scaling_step = ('scaler', StandardScaler())
regressor_step = ('model', RandomForestRegressor(random_state=42))
pipeline = Pipeline(steps=[scaling_step, regressor_step])


In [None]:
# Fit both pipeline stages (scaler, then forest) on the training split.
pipeline.fit(X=x_train, y=y_train)


In [None]:
# Score the held-out split: R² between the true and the predicted AQI values.
y_pred = pipeline.predict(X=x_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Pipeline R² Score: {r2:.4f}")


In [None]:
# Persist the fitted pipeline (scaler + model) so it can be reloaded as one object.
# NOTE(review): only unpickle this file from a trusted source -- loading a pickle
# can execute arbitrary code.
model_path = 'best_model_pipeline.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(pipeline, model_file)

print("✅ Saved model with pipeline as best_model_pipeline.pkl")


In [23]:
# One hand-entered reading, keyed by the same column names used in `features`.
input_dict = {
    "PM2.5": 96.9,
    "PM10": 489.9,
    "NO2": 13.3,
    "SO2": 20.5,
    "CO": 436 / 1000,  # Convert CO µg/m³ → mg/m³ if needed
    "O3": 114,
}

# Wrap the reading in a one-row DataFrame so the pipeline receives named columns.
input_df = pd.DataFrame([input_dict])

# Run the reading through the fitted pipeline and show the single prediction.
y_pred = pipeline.predict(X=input_df)
print("Predicted AQI:", y_pred[0])

Predicted AQI: 394.09
