In [1]:
# Air Quality Index

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the hourly per-station measurements and preview the first rows.
# NOTE(review): the path below is an absolute location on one machine;
# anyone else running the notebook has to edit it.
station_hour_csv = r"E:\open source dataset\station_hour.csv"
df = pd.read_csv(station_hour_csv)
df.head()

Unnamed: 0,StationId,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,AP001,2017-11-24 17:00:00,60.5,98.0,2.35,30.8,18.25,8.5,0.1,11.85,126.4,0.1,6.1,0.1,,
1,AP001,2017-11-24 18:00:00,65.5,111.25,2.7,24.2,15.07,9.77,0.1,13.17,117.12,0.1,6.25,0.15,,
2,AP001,2017-11-24 19:00:00,80.0,132.0,2.1,25.18,15.15,12.02,0.1,12.08,98.98,0.2,5.98,0.18,,
3,AP001,2017-11-24 20:00:00,81.5,133.25,1.95,16.25,10.23,11.58,0.1,10.47,112.2,0.2,6.72,0.1,,
4,AP001,2017-11-24 21:00:00,75.25,116.0,1.43,17.48,10.43,12.03,0.1,9.12,106.35,0.2,5.75,0.08,,


In [4]:
# Print the frame summary (row count, column dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2589083 entries, 0 to 2589082
Data columns (total 16 columns):
 #   Column      Dtype  
---  ------      -----  
 0   StationId   object 
 1   Datetime    object 
 2   PM2.5       float64
 3   PM10        float64
 4   NO          float64
 5   NO2         float64
 6   NOx         float64
 7   NH3         float64
 8   CO          float64
 9   SO2         float64
 10  O3          float64
 11  Benzene     float64
 12  Toluene     float64
 13  Xylene      float64
 14  AQI         float64
 15  AQI_Bucket  object 
dtypes: float64(13), object(3)
memory usage: 316.1+ MB


In [5]:
# Convert the Datetime column from object (string) dtype to pandas datetime.
df["Datetime"] = pd.to_datetime(df["Datetime"])

In [6]:
# Handling Missing Values

In [7]:
# List the column labels of the frame.
df.columns

Index(['StationId', 'Datetime', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
       'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket'],
      dtype='object')

In [8]:
# Pollutant measurement columns whose gaps are forward-filled.
cols = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
       'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']

# Forward-fill every pollutant column in one call and assign the result back.
# Calling `.ffill(inplace=True)` on `df[col]` is a chained in-place update:
# with pandas Copy-on-Write it modifies a temporary object and leaves `df`
# unchanged, so the explicit assignment is the reliable form.
# NOTE(review): the fill is not grouped by StationId, so a value can carry
# over from the last rows of the previous station -- confirm this is intended.
df[cols] = df[cols].ffill()

In [9]:
# Spot-check ten random rows after the fill.
# No random_state is passed, so a different set of rows is shown on each run.
df.sample(10)

Unnamed: 0,StationId,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
2293780,TN003,2018-09-05 06:00:00,164.24,4.85,8.92,13.39,22.31,22.16,0.69,23.39,99.05,0.26,0.0,0.4,178.0,Moderate
2537871,WB010,2019-06-08 01:00:00,12.88,24.16,5.09,11.38,16.93,1.57,0.41,1.36,37.48,3.75,1.83,2.06,48.0,Good
2555701,WB011,2019-10-26 08:00:00,22.42,43.77,10.65,14.88,25.55,16.69,0.18,10.66,18.84,0.67,2.66,1.02,40.0,Good
2542820,WB010,2019-12-31 06:00:00,94.72,187.25,60.08,45.92,105.0,26.7,1.25,4.85,4.25,117.03,129.7,2.77,292.0,Poor
1494151,KA005,2019-09-29 16:00:00,8.0,195.61,16.62,83.13,99.75,7.53,1.78,3.91,14.25,0.35,0.1,6.99,161.0,Moderate
2216747,TN001,2016-06-03 13:00:00,50.85,75.0,4.79,18.15,12.93,9.48,1.02,6.23,44.73,13.71,22.76,0.4,,
302503,DL007,2016-12-24 08:00:00,240.55,451.34,24.05,11.16,0.0,19.95,1.03,13.55,10.28,1.08,0.5,0.0,,
107992,BR007,2019-02-06 16:00:00,120.0,43.41,3.45,14.74,5.92,4.33,1.1,46.26,151.94,2.01,14.91,7.7,338.0,Very Poor
275536,DL006,2018-11-26 17:00:00,112.06,187.39,42.03,51.96,95.91,19.95,0.62,13.55,62.47,1.1,159.85,0.0,390.0,Very Poor
2270786,TN003,2016-01-21 04:00:00,35.16,4.85,4.41,5.08,8.01,232.13,0.16,2.09,99.05,0.26,0.0,0.4,59.0,Satisfactory


In [10]:
# Count the missing values remaining in each column.
df.isna().sum()

StationId          0
Datetime           0
PM2.5              0
PM10               0
NO                 0
NO2                0
NOx                0
NH3                0
CO                 0
SO2                0
O3                 0
Benzene            0
Toluene            0
Xylene             0
AQI           570190
AQI_Bucket    570190
dtype: int64

In [11]:
# Compute AQI manually from PM2.5 (AQI is often derived from PM2.5 or PM10; only PM2.5 is used here).
def calculate_pm25_aqi(pm25):
    """Map a PM2.5 concentration to an AQI value by piecewise-linear interpolation.

    Each concentration band is mapped linearly onto an AQI band:

        PM2.5 <= 30   -> 0-50
        PM2.5 <= 60   -> 50-100
        PM2.5 <= 90   -> 100-200
        PM2.5 <= 120  -> 200-300
        PM2.5 <= 250  -> 300-400
        otherwise     -> 400 upward, reaching 500 at PM2.5 = 500

    NOTE(review): the breakpoints look like India's CPCB PM2.5 table --
    confirm the source.

    Parameters
    ----------
    pm25 : float
        PM2.5 concentration.

    Returns
    -------
    float
        Interpolated AQI. The result is not clamped: concentrations above 500
        give values above 500, and negative concentrations give negative
        values. NaN input yields NaN, because every comparison is False and
        the open-ended top band is applied.
    """
    # Lowest band starts at the origin, so it is a plain proportional scale.
    if pm25 <= 30:
        return (50 / 30) * pm25

    # (conc_low, conc_high, aqi_low, aqi_high) for the bounded middle bands.
    bands = (
        (30, 60, 50, 100),
        (60, 90, 100, 200),
        (90, 120, 200, 300),
        (120, 250, 300, 400),
    )
    for conc_low, conc_high, aqi_low, aqi_high in bands:
        if pm25 <= conc_high:
            slope = (aqi_high - aqi_low) / (conc_high - conc_low)
            return aqi_low + slope * (pm25 - conc_low)

    # Open-ended top band (also reached by NaN).
    return 400 + ((500 - 400) / (500 - 250)) * (pm25 - 250)


In [12]:
# Keep the AQI values that are already present; where AQI is missing, use the
# value computed from that row's PM2.5 reading.
df["AQI"] = df["AQI"].where(
    df["AQI"].notna(),
    df["PM2.5"].apply(calculate_pm25_aqi),
)

In [14]:
# Show the (minimum, maximum) AQI after filling the missing values.
df["AQI"].min() , df["AQI"].max()

(np.float64(0.016666666666666666), np.float64(3133.0))

In [15]:
def custom_aqi_bucket(aqi):
    """Return the notebook's custom category label for an AQI value.

    Thresholds (upper bounds inclusive):

        <= 50    "Good"
        <= 300   "Moderate"
        <= 600   "Poor"
        <= 1000  "Very Poor"
        <= 2000  "Severe"
        > 2000   "Critical"

    Parameters
    ----------
    aqi : float
        AQI value; may be NaN/None when the AQI is unknown.

    Returns
    -------
    str or float
        The category label, or NaN when ``aqi`` is missing. Without this
        guard a NaN fails every ``<=`` comparison and would be labelled
        "Critical".
    """
    # Missing AQI has no category; keep it missing instead of mislabelling it.
    if pd.isna(aqi):
        return float("nan")

    if aqi <= 50:
        return "Good"
    elif aqi <= 300:
        return "Moderate"
    elif aqi <= 600:
        return "Poor"
    elif aqi <= 1000:
        return "Very Poor"
    elif aqi <= 2000:
        return "Severe"
    else:
        return "Critical"

In [16]:
# Relabel every row with the custom bucket derived from its AQI value;
# this replaces whatever was previously stored in AQI_Bucket.
df["AQI_Bucket"] = df["AQI"].map(custom_aqi_bucket)

In [17]:
# Spot-check ten random rows after AQI and AQI_Bucket were filled in.
# No random_state is passed, so a different set of rows is shown on each run.
df.sample(10)

Unnamed: 0,StationId,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
749587,DL021,2016-10-29 01:00:00,65.8,137.25,11.1,31.3,31.0,40.55,1.38,2.0,159.8,4.45,15.3,2.6,119.333333,Moderate
1365655,JH001,2019-09-18 02:00:00,60.0,43.6,9.19,8.74,6.37,6.44,0.39,2.19,2.62,2.02,6.04,6.99,33.0,Good
392422,DL009,2019-05-01 07:00:00,115.25,319.25,15.82,41.78,35.02,16.75,0.6,36.33,5.57,4.65,24.88,0.0,223.0,Moderate
1873266,OD002,2018-06-17 00:00:00,86.26,175.13,1.22,0.07,0.0,1.16,2.14,56.06,18.77,0.0,0.0,0.2,187.533333,Moderate
735617,DL021,2015-03-26 23:00:00,50.0,137.25,5.92,24.0,20.89,40.55,12.5,12.0,49.5,14.84,2.72,2.6,236.0,Moderate
114792,BR007,2019-11-17 00:00:00,167.0,43.41,57.08,83.7,56.3,4.33,0.97,50.55,23.57,6.63,21.7,20.38,312.0,Poor
1354567,JH001,2018-06-13 02:00:00,53.2,113.28,15.34,5.06,6.37,9.36,0.0,3.04,31.54,2.02,6.04,6.99,88.666667,Moderate
1603841,KA010,2016-01-31 22:00:00,20.56,146.22,40.63,87.05,78.08,10.55,1.54,3.81,17.05,0.1,0.69,6.99,146.0,Moderate
1840874,MP001,2020-02-28 13:00:00,42.25,129.5,12.05,11.78,18.18,21.35,0.49,45.2,122.1,0.0,0.02,0.2,141.0,Moderate
1687600,KL008,2019-05-03 18:00:00,31.5,105.25,0.68,10.32,6.05,2.47,1.13,5.7,43.83,0.0,0.0,0.0,102.0,Moderate
