# AQI Prediction Model using Python

- PM2.5, PM10
- NO, NO2
- NH3 - Ammonia
- CO
- SO2
- O3
- Benzene, Toluene, Xylene

In [3]:
# pip install numpy pandas matplotlib seaborn scikit-learn

In [4]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [5]:
# Load the air-quality measurements from a CSV file (path is relative to the working directory)
df = pd.read_csv('air quality data.csv')
df.head()  # preview the first five rows

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [6]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

(29531, 16)

In [7]:
# Information: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [8]:
# Count the rows flagged as duplicates of another row
df.duplicated().sum()

np.int64(0)

In [9]:
# Number of missing entries in each column
df.isna().sum()

City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64

In [10]:
# Keep only the rows that have an AQI value; the result is rebound to `df`
# instead of mutating the frame in place.
df = df.dropna(subset=['AQI'])

In [11]:
# Missing-value counts per column, largest first
df.isna().sum().sort_values(ascending=False)

Xylene        15372
PM10           7086
NH3            6536
Toluene        5826
Benzene        3535
NOx            1857
O3              807
PM2.5           678
SO2             605
CO              445
NO2             391
NO              387
Date              0
City              0
AQI               0
AQI_Bucket        0
dtype: int64

In [12]:
# Dimensions after dropping the rows without an AQI value
df.shape

(24850, 16)

In [13]:
# Summary statistics, transposed so that each described column becomes one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PM2.5,24172.0,67.476613,63.075398,0.04,29.0,48.785,80.925,914.94
PM10,17764.0,118.454435,89.487976,0.03,56.7775,96.18,150.1825,917.08
NO,24463.0,17.622421,22.421138,0.03,5.66,9.91,20.03,390.68
NO2,24459.0,28.978391,24.627054,0.01,11.94,22.1,38.24,362.21
NOx,22993.0,32.289012,30.712855,0.0,13.11,23.68,40.17,378.24
NH3,18314.0,23.848366,25.875981,0.01,8.96,16.31,30.36,352.89
CO,24405.0,2.345267,7.075208,0.0,0.59,0.93,1.48,175.81
SO2,24245.0,14.362933,17.428693,0.01,5.73,9.22,15.14,186.08
O3,24043.0,34.912885,21.724525,0.01,19.25,31.25,46.08,257.73
Benzene,21315.0,3.458668,16.03602,0.0,0.23,1.29,3.34,455.03


In [14]:
# Share of missing values per column, in percent, largest first.
# The mean of a boolean mask is the fraction of True entries.
null_values_percentage = (df.isnull().mean() * 100).sort_values(ascending=False)
null_values_percentage

Xylene        61.859155
PM10          28.515091
NH3           26.301811
Toluene       23.444668
Benzene       14.225352
NOx            7.472837
O3             3.247485
PM2.5          2.728370
SO2            2.434608
CO             1.790744
NO2            1.573441
NO             1.557344
Date           0.000000
City           0.000000
AQI            0.000000
AQI_Bucket     0.000000
dtype: float64

#### Key Considerations:
- Xylene has the highest percentage of missing values - 61.86%
- PM10 and NH3 come next, with about 28.5% and 26.3% of their values missing, respectively

## Week 2 - Visualization

In [None]:
# Univariate analysis: histogram of the Xylene column
ax = df['Xylene'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the PM10 column
ax = df['PM10'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the NH3 column
ax = df['NH3'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the Toluene column
ax = df['Toluene'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the Benzene column
ax = df['Benzene'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the NOx column
ax = df['NOx'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the O3 column
ax = df['O3'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the PM2.5 column
ax = df['PM2.5'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the SO2 column
ax = df['SO2'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the CO column
ax = df['CO'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Histogram of the AQI column (the prediction target)
ax = df['AQI'].plot.hist(figsize=(10, 5))
ax.legend()
plt.show()

In [None]:
# Distribution of AQI from 2015 to 2020
sns.displot(df, x='AQI', color='purple')
# plt.show must be *called*; the bare name only echoes the function object
# as the cell's output and never explicitly shows the figure.
plt.show()

In [None]:
# Bivariate section: number of records per city, with the city labels
# rotated so they do not overlap.
sns.set_theme(style="darkgrid")
graph = sns.catplot(
    data=df,
    x="City",
    kind="count",
    height=5,
    aspect=3,
)
graph.set_xticklabels(rotation=90)

In [None]:
# Records per city, one panel per AQI bucket (two panels per row)
sns.set_theme(style="darkgrid")
graph = sns.catplot(
    data=df,
    x="City",
    kind="count",
    col="AQI_Bucket",
    col_wrap=2,
    height=3.5,
    aspect=3,
)
graph.set_xticklabels(rotation=90)

In [None]:
# Box plot of PM2.5 for each city
graph1 = sns.catplot(
    data=df,
    x='City',
    y='PM2.5',
    kind='box',
    height=5,
    aspect=3,
)
graph1.set_xticklabels(rotation=90)

In [None]:
# Box plot of O3 for each city
graph2 = sns.catplot(
    data=df,
    x='City',
    y='O3',
    kind='box',
    height=5,
    aspect=3,
)
graph2.set_xticklabels(rotation=90)

In [None]:
# Box plot of SO2 for each city
graph3 = sns.catplot(
    data=df,
    x='City',
    y='SO2',
    kind='box',
    height=5,
    aspect=3,
)
graph3.set_xticklabels(rotation=90)

In [None]:
# Box plot of NO2 for each city
graph4 = sns.catplot(
    data=df,
    x='City',
    y='NO2',
    kind='box',
    height=5,
    aspect=3,
)
graph4.set_xticklabels(rotation=90)

In [None]:
# Number of records in each AQI bucket
graph5 = sns.catplot(
    data=df,
    x='AQI_Bucket',
    kind='count',
    height=6,
    aspect=3,
)
graph5.set_xticklabels(rotation=90)

In [None]:
# Missing-value counts per column, largest first
df.isna().sum().sort_values(ascending=False)

In [None]:
# Mean of every described column (the 'mean' row of the describe() table)
df.describe().loc['mean']

In [None]:
# Fill the missing pollutant readings with each column's own mean.
# The means are computed from the frame itself instead of being copied by
# hand from the describe() output, so they stay exact and remain correct if
# the input data or the upstream filtering changes.
# NOTE(review): the means are taken over all rows, i.e. before the train/test
# split further down - confirm whether that leakage is acceptable here.
pollutant_columns = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2',
                     'O3', 'Benzene', 'Toluene', 'Xylene']
# fillna with a Series fills each column with the value stored under its name;
# columns that are absent from the Series are left untouched.
df = df.fillna(df[pollutant_columns].mean())

In [None]:
# Re-check the missing-value counts after imputation
df.isna().sum()

In [None]:
# Remove the AQI_Bucket column from the frame
df = df.drop(columns=['AQI_Bucket'])

In [None]:
# Preview the frame after imputation and after dropping AQI_Bucket
df.head()

In [None]:
# Box plots of PM2.5 and PM10, drawn before the outlier replacement below
sns.boxplot(data=df.loc[:, ['PM2.5', 'PM10']])

In [None]:
# Box plots of NO, NO2, NOx and NH3, drawn before the outlier replacement below
sns.boxplot(data=df.loc[:, ['NO', 'NO2', 'NOx', 'NH3']])

In [None]:
# Box plots of O3 and SO2, drawn before the outlier replacement below
sns.boxplot(data=df.loc[:, ['O3', 'SO2']])

In [None]:
# Box plots of Benzene, Toluene and Xylene, drawn before the outlier replacement below
sns.boxplot(data=df.loc[:, ['Benzene', 'Toluene', 'Xylene']])

In [None]:
# IQR Method - Q3 Q1
def replace_outliers(df):
    """Replace IQR outliers in every numeric column with the nearest quartile.

    For each numeric column the quartiles Q1 and Q3 are computed and the
    fences ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are derived from them.
    Values below the lower fence are replaced by Q1 and values above the
    upper fence by Q3 (the quartiles, not the fences). Missing values and
    non-numeric columns are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the outliers replaced.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    result = df.copy()
    for column in result.select_dtypes(include=['number']).columns:
        values = result[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lb = Q1 - 1.5 * IQR
        ub = Q3 + 1.5 * IQR
        # Both masks are built from the original values; comparisons with NaN
        # are False, so missing entries pass through unchanged.
        result[column] = values.mask(values < lb, Q1).mask(values > ub, Q3)
    return result

In [None]:
# Apply the IQR-based outlier replacement to every numeric column of df
# (AQI is numeric, so the target column is processed as well)
df = replace_outliers(df)

In [None]:
# Summary statistics after the outlier replacement, one row per column
df.describe().transpose()

In [None]:
# Box plots of PM2.5 and PM10 after the outlier replacement
sns.boxplot(data=df.loc[:, ['PM2.5', 'PM10']])

In [None]:
# Box plots of O3 and SO2 after the outlier replacement
sns.boxplot(data=df.loc[:, ['O3', 'SO2']])

In [None]:
# Box plots of NO, NO2, NOx and NH3 after the outlier replacement
sns.boxplot(data=df.loc[:, ['NO', 'NO2', 'NOx', 'NH3']])

In [None]:
# Box plots of Benzene, Toluene and Xylene after the outlier replacement
sns.boxplot(data=df.loc[:, ['Benzene', 'Toluene', 'Xylene']])

In [None]:
# Distribution of AQI after the outlier replacement
sns.displot(data=df, x='AQI', color='red')
plt.show()

In [None]:
# Copy of the frame without the City column, used for the correlation heatmap
df1 = df.drop(['City'], axis=1)

In [None]:
# Multivariate analysis: annotated heatmap of the pairwise correlations
# between the numeric columns of df1
plt.figure(figsize=(12, 8))
sns.heatmap(
    data=df1.select_dtypes(include='number').corr(),
    cmap='Pastel1',
    annot=True,
)
plt.show()

# Week 3 - Data Modeling - 10 March 2025

In [None]:
# Remove the Date and City columns; the result is rebound to `df` rather than
# mutating the frame in place.
df = df.drop(columns=['Date', 'City'])
df.head()

In [None]:
# Scaling - Standard Scaler
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns only. The target (AQI) stays in its original
# units so that the RMSE values and predictions below are expressed as AQI,
# and the fitted scaler is kept in `scaler` so the same transform can be
# applied to new samples later.
# NOTE(review): the scaler is still fitted on all rows, before the train/test
# split; fitting it on the training rows only would avoid leaking test
# statistics into the features.
scaler = StandardScaler()
feature_columns = [column for column in df.columns if column != 'AQI']
scaled = df.copy()
scaled[feature_columns] = scaler.fit_transform(df[feature_columns])
# Keep df1 a plain array with the same column order as df, as before.
df1 = scaled.to_numpy()
df1

In [None]:
# Wrap the array in df1 back into a DataFrame, reusing df's column names
df = pd.DataFrame(df1, columns=df.columns)
df.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn. tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# List the columns available for feature and target selection
df.columns

In [None]:
# Feature & target selection: the twelve pollutant columns are the inputs,
# AQI is the value to predict.
X = df.loc[:, [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
    'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene',
]]
y = df['AQI']

In [None]:
# Preview the first rows of the feature matrix
X.head()

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
for label, part in (
    ('Shape of X Train', X_train),
    ('Shape of X Test', X_test),
    ('Shape of y Train', y_train),
    ('Shape of y Test', y_test),
):
    print(label, part.shape)

In [None]:
# Linear Regression model, fitted on the training split (fit returns the estimator itself)
LR = LinearRegression().fit(X_train, y_train)
LR

In [None]:
# Linear Regression predictions for the training and the test split
train_pred, test_pred = LR.predict(X_train), LR.predict(X_test)

In [None]:
# Evaluation for Linear Regression: RMSE and R^2 on both splits
RMSE_train, RMSE_test = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, train_pred), (y_test, test_pred))
)
print('RMSE Train Data = ', RMSE_train)
print('RMSE Test Data = ', RMSE_test)
print('_' * 60)
print('R Squared value for Train = ', LR.score(X_train, y_train))
print('R Squared value on Test = ', LR.score(X_test, y_test))

In [None]:
# K-nearest-neighbours regressor with default settings, fitted on the training split
knn = KNeighborsRegressor().fit(X_train, y_train)
knn

In [None]:
# KNN: predict on both splits, then report RMSE and R^2 for each
train_pred, test_pred = knn.predict(X_train), knn.predict(X_test)

# Evaluation for KNN Regression
RMSE_train, RMSE_test = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, train_pred), (y_test, test_pred))
)
print('RMSE Train Data = ', RMSE_train)
print('RMSE Test Data = ', RMSE_test)
print('_' * 60)
# R^2 is taken from the predictions computed above, which is the quantity
# the estimator's score() reports for the same inputs.
print('R Squared value for Train = ', r2_score(y_train, train_pred))
print('R Squared value on Test = ', r2_score(y_test, test_pred))

In [None]:
# Decision Tree
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, and therefore the reported metrics, are reproducible across runs.
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train,y_train)

In [None]:
# Decision tree: predict on both splits, then report RMSE and R^2 for each
train_pred, test_pred = dtr.predict(X_train), dtr.predict(X_test)

# Evaluation for Decision Tree Regression
RMSE_train, RMSE_test = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, train_pred), (y_test, test_pred))
)
print('RMSE Train Data = ', RMSE_train)
print('RMSE Test Data = ', RMSE_test)
print('_' * 60)
# R^2 is taken from the predictions computed above, which is the quantity
# the estimator's score() reports for the same inputs.
print('R Squared value for Train = ', r2_score(y_train, train_pred))
print('R Squared value on Test = ', r2_score(y_test, test_pred))

In [None]:
# Random Forest Regressor
# The forest is a randomised ensemble; without a fixed random_state every run
# yields a different model and different metrics. The seed matches the one
# used for the train/test split.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train, y_train)

In [None]:
# Random forest: predict on both splits, then report RMSE and R^2 for each
train_pred, test_pred = rfr.predict(X_train), rfr.predict(X_test)

# Evaluation for Random Forest Regression
RMSE_train, RMSE_test = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, train_pred), (y_test, test_pred))
)
print('RMSE Train Data = ', RMSE_train)
print('RMSE Test Data = ', RMSE_test)
print('_' * 60)
# R^2 is taken from the predictions computed above, which is the quantity
# the estimator's score() reports for the same inputs.
print('R Squared value for Train = ', r2_score(y_train, train_pred))
print('R Squared value on Test = ', r2_score(y_test, test_pred))