In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix

In [54]:
df=pd.read_csv('Air pollution.csv',encoding='unicode_escape')

In [55]:
df.head()
#Loading The Data Set

Unnamed: 0,State,SO2,NO2,PM25,PM10
0,Andhra Pradesh,4.8,17.4,,
1,Andhra Pradesh,3.1,7.0,,
2,Andhra Pradesh,6.2,28.5,,
3,Andhra Pradesh,6.3,14.7,,
4,Andhra Pradesh,4.7,7.5,,


In [56]:
df['State'].value_counts()

State
Maharashtra                    60384
Uttar Pradesh                  42816
Andhra Pradesh                 26368
Punjab                         25634
Rajasthan                      25589
Kerala                         24728
Himachal Pradesh               22896
West Bengal                    22463
Gujarat                        21279
Tamil Nadu                     20597
Madhya Pradesh                 19920
Assam                          19361
Odisha                         19279
Karnataka                      17119
Delhi                           8551
Chandigarh                      8520
Chhattisgarh                    7831
Goa                             6206
Jharkhand                       5968
Mizoram                         5338
Telangana                       3978
Meghalaya                       3853
Puducherry                      3785
Haryana                         3420
Nagaland                        2463
Bihar                           2275
Uttarakhand                     

In [57]:
df.shape
#To find Rows and Columns

(435742, 5)

In [58]:
df.info()
#Overall Info of Data Set

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435742 entries, 0 to 435741
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   State   435742 non-null  object 
 1   SO2     401096 non-null  float64
 2   NO2     419509 non-null  float64
 3   PM25    395520 non-null  float64
 4   PM10    198355 non-null  float64
dtypes: float64(4), object(1)
memory usage: 16.6+ MB


In [59]:
df.isnull().sum()
#Give the clear picture of Null or missing values

State         0
SO2       34646
NO2       16233
PM25      40222
PM10     237387
dtype: int64

In [60]:
df.describe()

Unnamed: 0,SO2,NO2,PM25,PM10
count,401096.0,419509.0,395520.0,198355.0
mean,10.829414,25.809623,108.832784,220.78348
std,11.177187,18.503086,74.87243,151.395457
min,0.0,0.0,0.0,0.0
25%,5.0,14.0,56.0,111.0
50%,8.0,22.0,90.0,187.0
75%,13.7,32.2,142.0,296.0
max,909.0,876.0,6307.033333,3380.0


In [61]:
df.nunique()
#Show unique values

State      37
SO2      4197
NO2      6864
PM25     6065
PM10     6668
dtype: int64

In [62]:
nullvalues=df.isnull().sum().sort_values(ascending=False)
#checking all Null values

In [63]:
nullvalues

PM10     237387
PM25      40222
SO2       34646
NO2       16233
State         0
dtype: int64

In [64]:
null_value_percentage = (df.isnull().sum()/df.isnull().count()*100).sort_values(ascending=False)
#Count Return Non-NAN values
missing_data_with_percentage=pd.concat([nullvalues,null_value_percentage],axis=1,keys=['Total','Percent'])
#concatenating total null values and their percentage of missing value  for further imputation or column deletion
missing_data_with_percentage

Unnamed: 0,Total,Percent
PM10,237387,54.478797
PM25,40222,9.230692
SO2,34646,7.951035
NO2,16233,3.72537
State,0,0.0


In [65]:
#Null values imputation for categorical data
df['State']=df['State'].fillna(df['State'].mode()[0])

In [66]:
df.fillna(0,inplace=True)
#Null Values get replace with zero for the numerical data

In [67]:
df.isnull().sum()

State    0
SO2      0
NO2      0
PM25     0
PM10     0
dtype: int64

In [68]:
df
#The following is important for our machine learning models|

Unnamed: 0,State,SO2,NO2,PM25,PM10
0,Andhra Pradesh,4.8,17.4,0.0,0.0
1,Andhra Pradesh,3.1,7.0,0.0,0.0
2,Andhra Pradesh,6.2,28.5,0.0,0.0
3,Andhra Pradesh,6.3,14.7,0.0,0.0
4,Andhra Pradesh,4.7,7.5,0.0,0.0
...,...,...,...,...,...
435737,West Bengal,22.0,50.0,143.0,0.0
435738,West Bengal,20.0,46.0,171.0,0.0
435739,andaman-and-nicobar-islands,0.0,0.0,0.0,0.0
435740,Lakshadweep,0.0,0.0,0.0,0.0


In [69]:
def cal_SOi(SO2): 
    si=0
    if (SO2<=40):
        si=SO2*(50/40)
    elif (SO2>40 and SO2<=80):
        si=50+(SO2-40)*(50/40)
    elif (SO2>80 and SO2<=380):
        si=100+(SO2-80)*(100/300)
    elif (SO2>380 and SO2<=800):
        si=200+ (SO2-380)*(100/800)
    elif (SO2>800 and SO2<=1600):
        si=300+(SO2-800)*(100/800)
    elif (SO2>1600):
        si=400+(SO2-1600)*(100/800)
    return si
df['SOi']=df['SO2'].apply(cal_SOi)
data=df[['SO2', 'SOi']]
data.head()
#Calculating the individual pollutant index for SO2 

Unnamed: 0,SO2,SOi
0,4.8,6.0
1,3.1,3.875
2,6.2,7.75
3,6.3,7.875
4,4.7,5.875


In [70]:
def cal_NOi(NO2):
    ni=0
    if(NO2<=40):
        ni=NO2*50/40
    elif(NO2>40 and NO2<=80):
        ni=50+(NO2-40)*(50/40)
    elif(NO2>80 and NO2<=180):
        ni=100+(NO2-80)*(100/100)
    elif(NO2>180 and NO2<=280):
        ni=200+(NO2-180)*(100/100)
    elif(NO2>280 and NO2<=400):
        ni=300+(NO2-280)*(100/120)
    else:
        ni=400+(NO2-400)*(100/120)
    return ni
df['NOi']=df['NO2'].apply(cal_NOi)
data=df[['NO2','NOi']]
data.head()
#calculating the individual pollutant index for no2

Unnamed: 0,NO2,NOi
0,17.4,21.75
1,7.0,8.75
2,28.5,35.625
3,14.7,18.375
4,7.5,9.375


In [71]:
def cal_PM25i(PM25):
    pm25i=0
    if(PM25<=30):
        pm25i=PM25*50/30
    elif(PM25>30 and PM25<=60):
        pm25i=50+(PM25-30)*(50/30)
    elif(PM25>60 and PM25<=90):
        pm25i=100+(PM25-60)*(100/30)
    elif(PM25>90 and PM25<=120):
        pm25i=200+(PM25-90)*(100/30)
    elif(PM25>120 and PM25<=250):
        pm25i=300+(PM25-120)*(100/130)
    else:
        pm25i=400+(PM25-250)*(100/130)
    return pm25i
df['PM25i']=df['PM25'].apply(cal_PM25i)
data=df[['PM25','PM25i']]
data.head()
#calculating the individual pollutant index for RSMP

Unnamed: 0,PM25,PM25i
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [72]:
def cal_PM10i(PM10):
    pm10i=0
    if(PM10<=50):
        pm10i=PM10*50/50
    elif(PM10>50 and PM10<=100):
        pm10i=50+(PM10-50)*(50/50)
    elif(PM10>100 and PM10<=250):
        pm10i=100+(PM10-100)*(100/150)
    elif(PM10>250 and PM10<=350):
        pm10i=200+(PM10-250)*(100/100)
    elif(PM10>120 and PM10<=250):
        pm10i=(300+PM10-350)*(100/80)
    else:
        pm10i=400+(PM10-430)*(100/430)
    return pm10i
df['PM10i']=df['PM10'].apply(cal_PM10i)
data=df[['PM10','PM10i']]
data.head()

Unnamed: 0,PM10,PM10i
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [73]:
def cal_aqi(si,ni,pm25i,pm10i):
    aqi=0
    if(si>ni and si>pm25i and si>pm10i):
        aqi=si
    if(ni>si and ni>pm25i and ni>pm10i):
        aqi=ni
    if(pm25i>ni and pm25i>si and pm25i>pm10i):
        aqi=pm25i
    if(pm10i>si and pm10i>pm25i and pm10i>ni):
        aqi=pm10i
    return aqi

df['AQI']=df.apply(lambda x:cal_aqi(x['SOi'],x['NOi'],x['PM25i'],x['PM10i']),axis=1)
data=df[['State','SOi','NOi','PM25i','PM10i','AQI']]
data.head()
#Calculating the AQI

Unnamed: 0,State,SOi,NOi,PM25i,PM10i,AQI
0,Andhra Pradesh,6.0,21.75,0.0,0.0,21.75
1,Andhra Pradesh,3.875,8.75,0.0,0.0,8.75
2,Andhra Pradesh,7.75,35.625,0.0,0.0,35.625
3,Andhra Pradesh,7.875,18.375,0.0,0.0,18.375
4,Andhra Pradesh,5.875,9.375,0.0,0.0,9.375


In [74]:
def AQI_Range(x):
    if x<=50:
        return "Good"
    elif x>50 and x<=100:
        return 'Moderate'
    elif x>100 and x<=200:
        return 'Poor'
    elif x>200 and x<=300:
        return 'Unhealty'
    elif x>300 and x<=400:
        return 'Very Unhealthy'
    elif x>400:
        return 'Hazardous'
df["AQI_Range"]=df["AQI"].apply(AQI_Range)
df.head()

Unnamed: 0,State,SO2,NO2,PM25,PM10,SOi,NOi,PM25i,PM10i,AQI,AQI_Range
0,Andhra Pradesh,4.8,17.4,0.0,0.0,6.0,21.75,0.0,0.0,21.75,Good
1,Andhra Pradesh,3.1,7.0,0.0,0.0,3.875,8.75,0.0,0.0,8.75,Good
2,Andhra Pradesh,6.2,28.5,0.0,0.0,7.75,35.625,0.0,0.0,35.625,Good
3,Andhra Pradesh,6.3,14.7,0.0,0.0,7.875,18.375,0.0,0.0,18.375,Good
4,Andhra Pradesh,4.7,7.5,0.0,0.0,5.875,9.375,0.0,0.0,9.375,Good


In [75]:
df["AQI_Range"].value_counts()
#Count of the values of present in the AQI range Column

AQI_Range
Poor              116204
Very Unhealthy    108498
Moderate           79865
Unhealty           71434
Hazardous          32209
Good               27532
Name: count, dtype: int64

In [97]:
x=df[["SOi",'NOi','PM25i','PM10i']]
y=df["AQI"]
x.head()
#only selected column like soi, noi, rpi, spmi

Unnamed: 0,SOi,NOi,PM25i,PM10i
0,6.0,21.75,0.0,0.0
1,3.875,8.75,0.0,0.0
2,7.75,35.625,0.0,0.0
3,7.875,18.375,0.0,0.0
4,5.875,9.375,0.0,0.0


In [98]:
y.head()
#Target Column

0    21.750
1     8.750
2    35.625
3    18.375
4     9.375
Name: AQI, dtype: float64

# Testing and Training Data

In [99]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=70)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
#Splitting the Data into train and test

(348593, 4) (87149, 4) (348593,) (87149,)


In [100]:
model=LinearRegression()
model.fit(x_train,y_train)

#predicting_train
train_pred=model.predict(x_train)
#predicting on test
test_pred=model.predict(x_test)

RMSE_train=(np.sqrt(metrics.mean_squared_error(y_train,train_pred)))
RMSE_test=(np.sqrt(metrics.mean_squared_error(y_test,test_pred)))
print("RMSE TainingData = ",str(RMSE_train))
print("RMSE TestData = ",str(RMSE_test))
print('-'*50)
print('RSquared Value on train : ',model.score(x_train,y_train))
print('RSquared Value on test : ',model.score(x_test,y_test))

RMSE TainingData =  47.27376002716514
RMSE TestData =  47.48858264207249
--------------------------------------------------
RSquared Value on train :  0.8584898801885024
RSquared Value on test :  0.8566397955895578


In [101]:
DT=DecisionTreeRegressor()
DT.fit(x_train,y_train)

#predicting_train
train_preds=DT.predict(x_train)
#predicting on test
test_preds=DT.predict(x_test)

RMSE_train=(np.sqrt(metrics.mean_squared_error(y_train,train_preds)))
RMSE_test=(np.sqrt(metrics.mean_squared_error(y_test,test_preds)))
print("RMSE TainingData = ",str(RMSE_train))
print("RMSE TestData = ",str(RMSE_test))
print('-'*50)
print('RSquared Value on train : ',DT.score(x_train,y_train))
print('RSquared Value on test : ',DT.score(x_test,y_test))

RMSE TainingData =  5.358930771394717e-11
RMSE TestData =  2.05576530617744
--------------------------------------------------
RSquared Value on train :  1.0
RSquared Value on test :  0.9997313435565617


In [102]:
RF=RandomForestRegressor().fit(x_train,y_train)

#predicting_train
train_preds1=RF.predict(x_train)
#predicting on test
test_preds1=RF.predict(x_test)

RMSE_train=(np.sqrt(metrics.mean_squared_error(y_train,train_preds1)))
RMSE_test=(np.sqrt(metrics.mean_squared_error(y_test,test_preds1)))
print("RMSE TainingData = ",str(RMSE_train))
print("RMSE TestData = ",str(RMSE_test))
print('-'*50)
print('RSquared Value on train : ',RF.score(x_train,y_train))
print('RSquared Value on test : ',RF.score(x_test,y_test))

RMSE TainingData =  2.971363182573823
RMSE TestData =  1.5523994683026812
--------------------------------------------------
RSquared Value on train :  0.9994409396633082
RSquared Value on test :  0.9998468005638755


In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

x2=df[['SOi','NOi','PM25i','PM10i']]
y2=df['AQI_Range']

x_train2, x_test2, y_train2, y_test2 = train_test_split(x2,y2,test_size=0.30,random_state=70)

# Classification of the Model

In [104]:
#Fit the model on train data
log_reg=LogisticRegression().fit(x_train2,y_train2)

#predict on train
train_preds2=log_reg.predict(x_train2)
#accuracy on train
print("Model accuracy on train is : ",accuracy_score(y_train2,train_preds2))

#predict on test
test_preds2=log_reg.predict(x_test2)
#accuracty on test
print("Model accuracy on train is : ",accuracy_score(y_test2,test_preds2))
print("-"*50)

#Kappa Score
print("Kappa Score : ",metrics.cohen_kappa_score(y_test2,test_preds2))

Model accuracy on train is :  0.40408302433618887
Model accuracy on train is :  0.40403754503798106
--------------------------------------------------
Kappa Score :  0.23023176558069358


In [105]:
#Fit the model on train data
DT2=DecisionTreeClassifier().fit(x_train2,y_train2)

#predict on train
train_preds3=DT2.predict(x_train2)
#accuracy on train
print("Model accuracy on train is : ",accuracy_score(y_train2,train_preds3))

#predict on test
test_preds3=DT2.predict(x_test2)
#accuracty on test
print("Model accuracy on train is : ",accuracy_score(y_test2,test_preds3))
print("-"*50)

#Kappa Score
print("Kappa Score : ",metrics.cohen_kappa_score(y_test2,test_preds3))

Model accuracy on train is :  1.0
Model accuracy on train is :  0.9994568668099723
--------------------------------------------------
Kappa Score :  0.9993185077302613


In [106]:
#Fit the model on train data
RF=RandomForestClassifier().fit(x_train2,y_train2)

#predict on train
train_preds4=RF.predict(x_train2)
#accuracy on train
print("Model accuracy on train is : ",accuracy_score(y_train2,train_preds4))

#predict on test
test_preds4=RF.predict(x_test2)
#accuracty on test
print("Model accuracy on train is : ",accuracy_score(y_test2,test_preds4))
print("-"*50)

#Kappa Score
print("Kappa Score : ",metrics.cohen_kappa_score(y_test2,test_preds4))

Model accuracy on train is :  1.0
Model accuracy on train is :  0.9994339175202528
--------------------------------------------------
Kappa Score :  0.9992896372530368


In [107]:
#Fit the model on train data
KNN=KNeighborsClassifier().fit(x_train2,y_train2)

#predict on train
train_preds5=KNN.predict(x_train2)
#accuracy on train
print("Model accuracy on train is : ",accuracy_score(y_train2,train_preds5))

#predict on test
test_preds5=KNN.predict(x_test2)
#accuracty on test
print("Model accuracy on train is : ",accuracy_score(y_test2,test_preds5))
print("-"*50)

#Kappa Score
print("Kappa Score : ",metrics.cohen_kappa_score(y_test2,test_preds5))

Model accuracy on train is :  0.9945150957809186
Model accuracy on train is :  0.9907055376636094
--------------------------------------------------
Kappa Score :  0.9883328861450505


# Testing The Results

In [108]:
# Patna pm10 icon
# 552 (PM10)
 
# Patna sulphur sioxide s
# Patna nitrogen dioxide no2 icon
# 18 (NO2)

print("Logistic Regression prediction : ",log_reg.predict([[4,18,246,552]]))
print("Decision Tree prediction : ",DT2.predict([[4,18,246,552]]))
print("Random Forest prediction : ",RF.predict([[4,18,246,552]]))
print("KNN prediction : ",KNN.predict([[4,18,246,552]]))

Logistic Regression prediction :  ['Hazardous']
Decision Tree prediction :  ['Hazardous']
Random Forest prediction :  ['Hazardous']
KNN prediction :  ['Hazardous']


In [109]:
# Ghaziabad sulphur sioxide so2 icon
# 4 (SO2)
 
# Ghaziabad nitrogen dioxide no2 icon
# 20 (NO2)

# Ghaziabad pm2.5 icon
# 145 (PM2.5)
 
# Ghaziabad pm10 icon
# 263 (PM10)

print("Logistic Regression prediction : ",log_reg.predict([[4,20,145,263]]))
print("Decision Tree prediction : ",DT2.predict([[4,20,145,263]]))
print("Random Forest prediction : ",RF.predict([[4,20,145,263]]))
print("KNN prediction : ",KNN.predict([[4,20,145,263]]))

Logistic Regression prediction :  ['Hazardous']
Decision Tree prediction :  ['Unhealty']
Random Forest prediction :  ['Unhealty']
KNN prediction :  ['Unhealty']


In [110]:
# Imphal sulphur sioxide so2 icon
# 9 (SO2)
 
# Imphal nitrogen dioxide no2 icon
# 2 (NO2)

# Imphal pm2.5 icon
# 2 (PM2.5)
 
# Imphal pm10 icon
# 19 (PM10)
print("Logistic Regression prediction : ",log_reg.predict([[9,2,2,19]]))
print("Decision Tree prediction : ",DT2.predict([[9,2,2,19]]))
print("Random Forest prediction : ",RF.predict([[9,2,2,19]]))
print("KNN prediction : ",KNN.predict([[9,2,2,19]]))

Logistic Regression prediction :  ['Poor']
Decision Tree prediction :  ['Good']
Random Forest prediction :  ['Good']
KNN prediction :  ['Good']
