In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('delhi_aqi.csv')

In [3]:
df.head()

Unnamed: 0,date,co,no,no2,o3,so2,pm2_5,pm10,nh3
0,2020-11-25 01:00:00,2616.88,2.18,70.6,13.59,38.62,364.61,411.73,28.63
1,2020-11-25 02:00:00,3631.59,23.25,89.11,0.33,54.36,420.96,486.21,41.04
2,2020-11-25 03:00:00,4539.49,52.75,100.08,1.11,68.67,463.68,541.95,49.14
3,2020-11-25 04:00:00,4539.49,50.96,111.04,6.44,78.2,454.81,534.0,48.13
4,2020-11-25 05:00:00,4379.27,42.92,117.9,17.17,87.74,448.14,529.19,46.61


In [5]:
df['date'] = pd.to_datetime(df['date'])

In [6]:
df.head()

Unnamed: 0,date,co,no,no2,o3,so2,pm2_5,pm10,nh3
0,2020-11-25 01:00:00,2616.88,2.18,70.6,13.59,38.62,364.61,411.73,28.63
1,2020-11-25 02:00:00,3631.59,23.25,89.11,0.33,54.36,420.96,486.21,41.04
2,2020-11-25 03:00:00,4539.49,52.75,100.08,1.11,68.67,463.68,541.95,49.14
3,2020-11-25 04:00:00,4539.49,50.96,111.04,6.44,78.2,454.81,534.0,48.13
4,2020-11-25 05:00:00,4379.27,42.92,117.9,17.17,87.74,448.14,529.19,46.61


In [7]:
df.set_index('date',inplace=True)

In [9]:
df = df.asfreq('H')

  df = df.asfreq('H')


In [12]:
df = df.fillna(method='ffill')

  df = df.fillna(method='ffill')


In [13]:
df.head()

Unnamed: 0_level_0,co,no,no2,o3,so2,pm2_5,pm10,nh3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-11-25 01:00:00,2616.88,2.18,70.6,13.59,38.62,364.61,411.73,28.63
2020-11-25 02:00:00,3631.59,23.25,89.11,0.33,54.36,420.96,486.21,41.04
2020-11-25 03:00:00,4539.49,52.75,100.08,1.11,68.67,463.68,541.95,49.14
2020-11-25 04:00:00,4539.49,50.96,111.04,6.44,78.2,454.81,534.0,48.13
2020-11-25 05:00:00,4379.27,42.92,117.9,17.17,87.74,448.14,529.19,46.61


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 18968 entries, 2020-11-25 01:00:00 to 2023-01-24 08:00:00
Freq: h
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   co      18968 non-null  float64
 1   no      18968 non-null  float64
 2   no2     18968 non-null  float64
 3   o3      18968 non-null  float64
 4   so2     18968 non-null  float64
 5   pm2_5   18968 non-null  float64
 6   pm10    18968 non-null  float64
 7   nh3     18968 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [20]:
aqi_breakpoints = {
    "pm2_5": [(0, 30, 0, 50), (31, 60, 51, 100), (61, 90, 101, 200), (91, 120, 201, 300), (121, 250, 301, 400), (251, 500, 401, 500)],
    "pm10": [(0, 50, 0, 50), (51, 100, 51, 100), (101, 250, 101, 200), (251, 350, 201, 300), (351, 430, 301, 400), (431, 600, 401, 500)],
    "no2": [(0, 40, 0, 50), (41, 80, 51, 100), (81, 180, 101, 200), (181, 280, 201, 300), (281, 400, 301, 400), (401, 1000, 401, 500)],
    "so2": [(0, 40, 0, 50), (41, 80, 51, 100), (81, 380, 101, 200), (381, 800, 201, 300), (801, 1600, 301, 400), (1601, 2500, 401, 500)],
    "o3": [(0, 50, 0, 50), (51, 100, 51, 100), (101, 168, 101, 200), (169, 208, 201, 300), (209, 748, 301, 400), (749, 1000, 401, 500)],
    "co": [(0, 1, 0, 50), (1.1, 2, 51, 100), (2.1, 10, 101, 200), (10.1, 17, 201, 300), (17.1, 34, 301, 400), (34.1, 50, 401, 500)]
}

In [21]:
def calculate_individual_aqi(concentration, breakpoints):
    for bp_low, bp_high, i_low, i_high in breakpoints:
        if bp_low <= concentration <= bp_high:
            return ((i_high - i_low) / (bp_high - bp_low)) * (concentration - bp_low) + i_low
    return np.nan

In [22]:
df["AQI_PM2_5"] = df["pm2_5"].apply(lambda x: calculate_individual_aqi(x, aqi_breakpoints["pm2_5"]))
df["AQI_PM10"] = df["pm10"].apply(lambda x: calculate_individual_aqi(x, aqi_breakpoints["pm10"]))
df["AQI_NO2"] = df["no2"].apply(lambda x: calculate_individual_aqi(x, aqi_breakpoints["no2"]))
df["AQI_SO2"] = df["so2"].apply(lambda x: calculate_individual_aqi(x, aqi_breakpoints["so2"]))
df["AQI_O3"] = df["o3"].apply(lambda x: calculate_individual_aqi(x, aqi_breakpoints["o3"]))
df["AQI_CO"] = df["co"].apply(lambda x: calculate_individual_aqi(x, aqi_breakpoints["co"]))

In [23]:
df.head()

Unnamed: 0_level_0,co,no,no2,o3,so2,pm2_5,pm10,nh3,AQI_PM2_5,AQI_PM10,AQI_NO2,AQI_SO2,AQI_O3,AQI_CO
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-11-25 01:00:00,2616.88,2.18,70.6,13.59,38.62,364.61,411.73,28.63,446.170241,377.104684,88.189744,48.275,13.59,
2020-11-25 02:00:00,3631.59,23.25,89.11,0.33,54.36,420.96,486.21,41.04,468.574458,433.341953,109.11,67.785641,0.33,
2020-11-25 03:00:00,4539.49,52.75,100.08,1.11,68.67,463.68,541.95,49.14,485.559518,465.994379,120.08,85.764872,1.11,
2020-11-25 04:00:00,4539.49,50.96,111.04,6.44,78.2,454.81,534.0,48.13,482.032892,461.337278,131.04,97.738462,6.44,
2020-11-25 05:00:00,4379.27,42.92,117.9,17.17,87.74,448.14,529.19,46.61,479.380964,458.519586,137.9,103.231639,17.17,


In [25]:
df.drop('AQI_CO',axis=1,inplace=True)

In [27]:
df["AQI"] = df[["AQI_PM2_5", "AQI_PM10", "AQI_NO2", "AQI_SO2", "AQI_O3"]].max(axis=1)

In [29]:
df[["pm2_5", "pm10", "no2", "so2", "o3", "co", "AQI"]].head()

Unnamed: 0_level_0,pm2_5,pm10,no2,so2,o3,co,AQI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-11-25 01:00:00,364.61,411.73,70.6,38.62,13.59,2616.88,446.170241
2020-11-25 02:00:00,420.96,486.21,89.11,54.36,0.33,3631.59,468.574458
2020-11-25 03:00:00,463.68,541.95,100.08,68.67,1.11,4539.49,485.559518
2020-11-25 04:00:00,454.81,534.0,111.04,78.2,6.44,4539.49,482.032892
2020-11-25 05:00:00,448.14,529.19,117.9,87.74,17.17,4379.27,479.380964


In [31]:
df.head()

Unnamed: 0_level_0,co,no,no2,o3,so2,pm2_5,pm10,nh3,AQI_PM2_5,AQI_PM10,AQI_NO2,AQI_SO2,AQI_O3,AQI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-11-25 01:00:00,2616.88,2.18,70.6,13.59,38.62,364.61,411.73,28.63,446.170241,377.104684,88.189744,48.275,13.59,446.170241
2020-11-25 02:00:00,3631.59,23.25,89.11,0.33,54.36,420.96,486.21,41.04,468.574458,433.341953,109.11,67.785641,0.33,468.574458
2020-11-25 03:00:00,4539.49,52.75,100.08,1.11,68.67,463.68,541.95,49.14,485.559518,465.994379,120.08,85.764872,1.11,485.559518
2020-11-25 04:00:00,4539.49,50.96,111.04,6.44,78.2,454.81,534.0,48.13,482.032892,461.337278,131.04,97.738462,6.44,482.032892
2020-11-25 05:00:00,4379.27,42.92,117.9,17.17,87.74,448.14,529.19,46.61,479.380964,458.519586,137.9,103.231639,17.17,479.380964


In [32]:
aqi_df = df['AQI']

In [35]:
aqi_df = pd.DataFrame(aqi_df)

In [36]:
aqi_df.head()

Unnamed: 0_level_0,AQI
date,Unnamed: 1_level_1
2020-11-25 01:00:00,446.170241
2020-11-25 02:00:00,468.574458
2020-11-25 03:00:00,485.559518
2020-11-25 04:00:00,482.032892
2020-11-25 05:00:00,479.380964


In [37]:
aqi_df.isnull().sum()

AQI    0
dtype: int64

In [39]:
aqi_df.to_csv('delhi_aqi_cleaned.csv')