In [1]:
# Air Quality Index

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas/sklearn deprecation and FutureWarning
# notices — consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

In [3]:
# Load the hourly station readings into a DataFrame and preview the first rows.
# NOTE(review): absolute, machine-specific path — the notebook will only run
# where this exact location exists; consider a configurable data directory.
DATA_PATH = r"E:\open source dataset\station_hour.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,StationId,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,AP001,2017-11-24 17:00:00,60.5,98.0,2.35,30.8,18.25,8.5,0.1,11.85,126.4,0.1,6.1,0.1,,
1,AP001,2017-11-24 18:00:00,65.5,111.25,2.7,24.2,15.07,9.77,0.1,13.17,117.12,0.1,6.25,0.15,,
2,AP001,2017-11-24 19:00:00,80.0,132.0,2.1,25.18,15.15,12.02,0.1,12.08,98.98,0.2,5.98,0.18,,
3,AP001,2017-11-24 20:00:00,81.5,133.25,1.95,16.25,10.23,11.58,0.1,10.47,112.2,0.2,6.72,0.1,,
4,AP001,2017-11-24 21:00:00,75.25,116.0,1.43,17.48,10.43,12.03,0.1,9.12,106.35,0.2,5.75,0.08,,


In [4]:
# Print the frame's summary (row count, column names, dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2589083 entries, 0 to 2589082
Data columns (total 16 columns):
 #   Column      Dtype  
---  ------      -----  
 0   StationId   object 
 1   Datetime    object 
 2   PM2.5       float64
 3   PM10        float64
 4   NO          float64
 5   NO2         float64
 6   NOx         float64
 7   NH3         float64
 8   CO          float64
 9   SO2         float64
 10  O3          float64
 11  Benzene     float64
 12  Toluene     float64
 13  Xylene      float64
 14  AQI         float64
 15  AQI_Bucket  object 
dtypes: float64(13), object(3)
memory usage: 316.1+ MB


In [5]:
# Convert the Datetime column from strings (object dtype) to pandas datetimes.
parsed_timestamps = pd.to_datetime(df["Datetime"])
df["Datetime"] = parsed_timestamps

In [6]:
# Handling Missing Values

In [7]:
# Show the column labels so the pollutant columns can be picked out below.
df.columns

Index(['StationId', 'Datetime', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
       'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket'],
      dtype='object')

In [8]:
# Pollutant measurement columns whose gaps are forward-filled below.
cols = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
       'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']

# Forward-fill all pollutant columns in a single explicit assignment.
# The earlier per-column `df[i].ffill(inplace=True)` modified a column object
# selected from the frame (chained in-place modification): pandas deprecates
# that pattern and, under Copy-on-Write, it leaves `df` unchanged.
# NOTE(review): the fill runs over the whole frame in row order, so a gap at the
# start of one station's rows would be filled from the preceding station's last
# reading — confirm this is intended, otherwise fill per station with
# df.groupby("StationId")[cols].ffill().
df[cols] = df[cols].ffill()

In [9]:
# Spot-check ten random rows after the forward fill. The sample is seeded
# (same seed as the model below) so a Restart & Run All shows the same rows.
df.sample(10, random_state=42)

Unnamed: 0,StationId,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
1449988,KA004,2019-05-07 00:00:00,39.5,63.5,5.08,32.55,21.45,22.15,0.29,2.82,96.06,4.78,9.75,6.99,149.0,Moderate
191849,DL002,2019-05-29 05:00:00,100.0,352.75,198.43,113.1,222.12,40.25,2.85,23.75,18.23,6.35,24.43,0.0,232.0,Poor
119711,BR007,2020-06-08 23:00:00,34.0,43.41,7.5,137.5,79.25,4.33,0.98,22.55,22.8,0.3,0.0,0.0,,
877399,DL026,2018-12-06 05:00:00,272.5,459.0,433.82,35.22,372.55,55.2,3.87,11.88,8.48,13.9,81.55,0.0,362.0,Very Poor
1045913,DL031,2020-06-16 10:00:00,16.0,70.0,3.3,25.62,16.35,30.55,0.83,8.0,14.88,0.5,0.0,0.0,95.0,Satisfactory
1544528,KA008,2019-06-18 21:00:00,18.0,70.75,1.83,12.23,7.98,8.38,0.54,3.92,41.11,0.2,1.7,6.99,67.0,Satisfactory
647567,DL017,2020-04-26 21:00:00,1.16,7.0,91.31,17.31,108.64,12.4,0.03,9.72,45.26,0.0,0.0,0.0,135.0,Moderate
1817607,MH013,2020-03-11 18:00:00,13.0,47.92,8.75,16.5,25.25,6.92,0.26,12.39,41.48,0.2,0.76,0.2,51.0,Satisfactory
1972406,RJ006,2017-12-25 15:00:00,28.3,90.74,7.13,12.02,0.0,21.72,0.0,4.41,111.39,0.58,2.85,8.82,181.0,Moderate
1024682,DL031,2018-01-13 19:00:00,117.0,200.0,151.08,134.61,0.0,32.23,2.85,48.8,44.83,2.18,9.5,0.0,394.0,Very Poor


In [10]:
# Count the remaining missing values in each column.
df.isnull().sum()

StationId          0
Datetime           0
PM2.5              0
PM10               0
NO                 0
NO2                0
NOx                0
NH3                0
CO                 0
SO2                0
O3                 0
Benzene            0
Toluene            0
Xylene             0
AQI           570190
AQI_Bucket    570190
dtype: int64

In [11]:
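# Imports for filling missing AQI values: a RandomForestRegressor is trained on rows with known AQI, with the predictors passed through SimpleImputer first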
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [12]:
# Partition the rows by whether AQI is present: rows with a value become the
# training set, rows without one are the ones still to be filled.
aqi_is_missing = df["AQI"].isna()
df_known = df.loc[~aqi_is_missing]
df_missing = df.loc[aqi_is_missing]

In [13]:
# Pollutant columns used as predictors of AQI.
features = [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
    'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene',
]

# Predictor matrix and target for the rows whose AQI is known ...
X_known, y_known = df_known[features], df_known["AQI"]

# ... and the predictor matrix for the rows whose AQI is missing.
X_missing = df_missing[features]

In [14]:
# Fill any gaps left in the predictors with SimpleImputer (default strategy:
# column mean).
imputer = SimpleImputer()

# Learn the fill values from the rows with known AQI and apply them.
X_known_imputed = imputer.fit_transform(X_known)

# Apply the SAME fitted statistics to the rows to be predicted. Calling
# fit_transform here would refit the imputer on X_missing, so the two matrices
# would be filled with different values and the model would see features
# prepared differently from the ones it was trained on.
X_missing_imputed = imputer.transform(X_missing)

In [None]:
# Fit a 100-tree random forest (fixed seed for repeatable results) that maps
# the imputed pollutant readings to the known AQI values.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X=X_known_imputed, y=y_known)