IMPORTING LIBRARIES

In [32]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

DATASET LOADING

In [33]:
data = pd.read_csv(r"C:\Users\Asus\Downloads\Delhi_AQI_Dataset.csv")

# Explore the dataset 

In [34]:
data.head() # First 5 rows

Unnamed: 0,City,Date,AQI,PM2.5,PM10,NO2,SO2,CO,O3,Unnamed: 9,Unnamed: 10
0,Delhi,01/01/18,406,223.3,438.48,336.98,462.84,4.26,385.7,,
1,Delhi,02/01/18,418,229.9,451.44,346.94,476.52,4.39,397.1,,
2,Delhi,03/01/18,382,210.1,412.56,317.06,435.48,4.01,362.9,,
3,Delhi,04/01/18,366,201.3,395.28,303.78,417.24,3.84,347.7,,
4,Delhi,05/01/18,390,214.5,421.2,323.7,444.6,4.1,370.5,,


In [35]:
data.shape

(2191, 11)

In [36]:
print(data.columns)


Index(['City', 'Date', 'AQI', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3',
       'Unnamed: 9', 'Unnamed: 10'],
      dtype='object')


 # Remove useless columns 

In [37]:
data = data.drop(columns=["Unnamed: 9", "Unnamed: 10"], errors="ignore") 
data


Unnamed: 0,City,Date,AQI,PM2.5,PM10,NO2,SO2,CO,O3
0,Delhi,01/01/18,406,223.30,438.48,336.98,462.84,4.26,385.70
1,Delhi,02/01/18,418,229.90,451.44,346.94,476.52,4.39,397.10
2,Delhi,03/01/18,382,210.10,412.56,317.06,435.48,4.01,362.90
3,Delhi,04/01/18,366,201.30,395.28,303.78,417.24,3.84,347.70
4,Delhi,05/01/18,390,214.50,421.20,323.70,444.60,4.10,370.50
...,...,...,...,...,...,...,...,...,...
2186,Delhi,27/12/24,353,194.15,381.24,292.99,402.42,3.71,335.35
2187,Delhi,28/12/24,139,76.45,150.12,115.37,158.46,1.46,132.05
2188,Delhi,29/12/24,225,123.75,243.00,186.75,256.50,2.36,213.75
2189,Delhi,30/12/24,173,95.15,186.84,143.59,197.22,1.82,164.35


# Handle categorical/constant columns 

In [38]:
if "City" in data.columns and data["City"].nunique() == 1:
    data = data.drop(columns=["City"])
    


# Convert Date column

In [39]:
data["Date"] = pd.to_datetime(data["Date"], errors="coerce")
data

  data["Date"] = pd.to_datetime(data["Date"], errors="coerce")


Unnamed: 0,Date,AQI,PM2.5,PM10,NO2,SO2,CO,O3
0,2018-01-01,406,223.30,438.48,336.98,462.84,4.26,385.70
1,2018-02-01,418,229.90,451.44,346.94,476.52,4.39,397.10
2,2018-03-01,382,210.10,412.56,317.06,435.48,4.01,362.90
3,2018-04-01,366,201.30,395.28,303.78,417.24,3.84,347.70
4,2018-05-01,390,214.50,421.20,323.70,444.60,4.10,370.50
...,...,...,...,...,...,...,...,...
2186,2024-12-27,353,194.15,381.24,292.99,402.42,3.71,335.35
2187,2024-12-28,139,76.45,150.12,115.37,158.46,1.46,132.05
2188,2024-12-29,225,123.75,243.00,186.75,256.50,2.36,213.75
2189,2024-12-30,173,95.15,186.84,143.59,197.22,1.82,164.35


In [40]:
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day

In [41]:
data

Unnamed: 0,Date,AQI,PM2.5,PM10,NO2,SO2,CO,O3,Year,Month,Day
0,2018-01-01,406,223.30,438.48,336.98,462.84,4.26,385.70,2018,1,1
1,2018-02-01,418,229.90,451.44,346.94,476.52,4.39,397.10,2018,2,1
2,2018-03-01,382,210.10,412.56,317.06,435.48,4.01,362.90,2018,3,1
3,2018-04-01,366,201.30,395.28,303.78,417.24,3.84,347.70,2018,4,1
4,2018-05-01,390,214.50,421.20,323.70,444.60,4.10,370.50,2018,5,1
...,...,...,...,...,...,...,...,...,...,...,...
2186,2024-12-27,353,194.15,381.24,292.99,402.42,3.71,335.35,2024,12,27
2187,2024-12-28,139,76.45,150.12,115.37,158.46,1.46,132.05,2024,12,28
2188,2024-12-29,225,123.75,243.00,186.75,256.50,2.36,213.75,2024,12,29
2189,2024-12-30,173,95.15,186.84,143.59,197.22,1.82,164.35,2024,12,30


# Handle missing values

In [42]:
data.isnull().sum()

Date     0
AQI      0
PM2.5    0
PM10     0
NO2      0
SO2      0
CO       0
O3       0
Year     0
Month    0
Day      0
dtype: int64

# Remove duplicates

In [43]:
data.duplicated().sum()

np.int64(0)

# Feature Scaling (Normalization)

In [44]:
numeric_cols = data.select_dtypes(include=np.number).columns

In [45]:
numeric_cols

Index(['AQI', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'Year', 'Month',
       'Day'],
      dtype='object')

In [49]:
scaler = StandardScaler()

In [50]:
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
data

Unnamed: 0,Date,AQI,PM2.5,PM10,NO2,SO2,CO,O3,Year,Month,Day
0,2018-01-01,1.854899,1.854899,1.854899,1.854899,1.854899,1.851877,1.854899,-1.340092,-1.601882,-1.673704
1,2018-02-01,1.967480,1.967480,1.967480,1.967480,1.967480,1.968027,1.967480,-1.340092,-1.311894,-1.673704
2,2018-03-01,1.629738,1.629738,1.629738,1.629738,1.629738,1.628511,1.629738,-1.340092,-1.021906,-1.673704
3,2018-04-01,1.479630,1.479630,1.479630,1.479630,1.479630,1.476622,1.479630,-1.340092,-0.731918,-1.673704
4,2018-05-01,1.704792,1.704792,1.704792,1.704792,1.704792,1.708923,1.704792,-1.340092,-0.441930,-1.673704
...,...,...,...,...,...,...,...,...,...,...,...
2186,2024-12-27,1.357668,1.357668,1.357668,1.357668,1.357668,1.360472,1.357668,1.496303,1.587985,1.281240
2187,2024-12-28,-0.650019,-0.650019,-0.650019,-0.650019,-0.650019,-0.649823,-0.650019,1.496303,1.587985,1.394892
2188,2024-12-29,0.156809,0.156809,0.156809,0.156809,0.156809,0.154295,0.156809,1.496303,1.587985,1.508544
2189,2024-12-30,-0.331041,-0.331041,-0.331041,-0.331041,-0.331041,-0.328176,-0.331041,1.496303,1.587985,1.622195


In [53]:
print(data.head())

        Date       AQI     PM2.5      PM10       NO2       SO2        CO  \
0 2018-01-01  1.854899  1.854899  1.854899  1.854899  1.854899  1.851877   
1 2018-02-01  1.967480  1.967480  1.967480  1.967480  1.967480  1.968027   
2 2018-03-01  1.629738  1.629738  1.629738  1.629738  1.629738  1.628511   
3 2018-04-01  1.479630  1.479630  1.479630  1.479630  1.479630  1.476622   
4 2018-05-01  1.704792  1.704792  1.704792  1.704792  1.704792  1.708923   

         O3      Year     Month       Day  
0  1.854899 -1.340092 -1.601882 -1.673704  
1  1.967480 -1.340092 -1.311894 -1.673704  
2  1.629738 -1.340092 -1.021906 -1.673704  
3  1.479630 -1.340092 -0.731918 -1.673704  
4  1.704792 -1.340092 -0.441930 -1.673704  


In [54]:
print("Final dataset shape:", data.shape)


Final dataset shape: (2191, 11)
