### Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import warnings

In [3]:
df = pd.read_csv('data/air_quality_metrics.csv')

In [4]:
df.sample(5)

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality,weighted_pollution,pm_fine_ratio,population_exposure
4055,49.6,91.0,7.6,26.6,42.3,5.7,2.22,3.0,748,Hazardous,20.05,0.285714,63146.16
4350,35.7,53.1,43.4,52.7,26.4,6.1,1.51,6.9,659,Moderate,39.06,0.823529,85742.49
3066,39.9,78.7,71.9,85.2,39.7,16.3,2.54,3.3,957,Hazardous,63.89,0.843897,206367.48
2581,31.4,69.8,20.4,32.0,16.3,9.7,1.49,8.0,643,Moderate,21.99,0.6375,51369.27
2929,38.2,82.1,8.4,26.5,24.6,7.4,2.6,5.2,696,Poor,16.97,0.316981,48372.0


In [5]:
X = df.drop(columns=['Air Quality'],axis=1)

In [6]:
X.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,weighted_pollution,pm_fine_ratio,population_exposure
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,12.15,0.290503,16881.48
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,11.71,0.188525,34607.04
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,26.96,0.789941,61361.47
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,7.56,0.968254,17824.85
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,12.5,0.43125,15577.23


In [7]:
y = df['Air Quality']

In [8]:
y.head()

0    Moderate
1    Moderate
2    Moderate
3        Good
4        Good
Name: Air Quality, dtype: object

In [9]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Separate features by type
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

# Create transformers
numeric_transformer = StandardScaler()
cat_transformer = LabelEncoder()

# Create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),  # Apply StandardScaler to numeric features
        ("cat", cat_transformer, cat_features)       # Apply custom LabelEncoder to categorical features
    ]
)

In [10]:
# Check for NaN values
if X.isnull().values.any():
    print("There are NaN values in the feature dataset.")
    X = X.fillna(X.mean())  # Fill NaN values with the mean of each column

# Check for infinite values in numeric columns of X
numeric_columns = X.select_dtypes(include=np.number).columns
if np.isinf(X[numeric_columns].values).any():
    print("There are infinite values in the numeric columns.")
    X[numeric_columns] = X[numeric_columns].replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN

# Fill NaN values in numeric columns (after replacing infinities) with the mean
X[numeric_columns] = X[numeric_columns].fillna(X[numeric_columns].mean())

There are infinite values in the numeric columns.


In [11]:
X = preprocessor.fit_transform(X)

In [12]:
X.shape

(5000, 12)

In [13]:
le = LabelEncoder()
y = le.fit_transform(y) 

In [14]:
y

array([2, 2, 2, ..., 2, 0, 2], shape=(5000,))

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    report = classification_report(true, predicted)
    matrix = confusion_matrix(true, predicted)
    return accuracy, report, matrix

In [17]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
}
model_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_accuracy, model_train_report, model_train_matrix = evaluate_model(y_train,y_train_pred)
    model_test_accuracy, model_test_report, model_test_matrix = evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model performance for training set: ")
    print("- Accuracy: {:.4}".format(model_train_accuracy))
    print("- Report: ", model_train_report)
    print("- Confusion Matrix: ", model_train_matrix)

    print("---------------------------------------------------------")

    print("Model performance for test set:")
    print("- Accuracy: {:.4}".format(model_test_accuracy))
    print("- Report: ", model_test_report)
    print("- Confusion Matrix: ", model_test_matrix)

    accuracy_list.append(model_test_accuracy)

    print("="*35)
    print("\n")


Logistic Regression
Model performance for training set: 
- Accuracy: 0.9445
- Report:                precision    recall  f1-score   support

           0       0.99      1.00      1.00      1591
           1       0.89      0.84      0.86       389
           2       0.95      0.96      0.95      1206
           3       0.87      0.87      0.87       814

    accuracy                           0.94      4000
   macro avg       0.92      0.92      0.92      4000
weighted avg       0.94      0.94      0.94      4000

- Confusion Matrix:  [[1588    0    3    0]
 [   0  326    0   63]
 [  11    0 1155   40]
 [   0   41   64  709]]
---------------------------------------------------------
Model performance for test set:
- Accuracy: 0.945
- Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       409
           1       0.92      0.81      0.86       111
           2       0.96      0.95      0.96       294
           3       0.83     

In [19]:
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'Accuracy Score']).sort_values(by=["Accuracy Score"],ascending=False)

Unnamed: 0,Model Name,Accuracy Score
2,Random Forest,0.962
0,Logistic Regression,0.945
1,Decision Tree,0.908


In [20]:
rf = RandomForestClassifier()
rf = rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy of the model is: %.2f"%accuracy)

Accuracy of the model is: 95.70


In [22]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
0,1,1,0
1,0,0,0
2,2,2,0
3,1,1,0
4,0,0,0
...,...,...,...
995,2,2,0
996,0,0,0
997,3,3,0
998,2,2,0
