In [13]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import warnings
# Install a blanket "ignore" filter: with no category/module arguments it
# matches every warning raised from this point on.
# NOTE(review): this also hides deprecation and scikit-learn input-validation
# warnings that may point at real problems — consider narrowing it.
warnings.filterwarnings('ignore')

In [14]:
# Load the data
# The path is relative to the current working directory, so the CSV has to be
# reachable from wherever the kernel was started.
data = pd.read_csv('./TechBlitz DataScience Dataset.csv')

# Check dataset info
print("Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
data.info()
print("\nMissing Values:")
# Per-column count of missing cells.
print(data.isnull().sum())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Temperature                    5000 non-null   float64
 1   Humidity                       5000 non-null   float64
 2   PM2.5                          5000 non-null   float64
 3   PM10                           5000 non-null   float64
 4   NO2                            5000 non-null   float64
 5   SO2                            5000 non-null   float64
 6   CO                             5000 non-null   float64
 7   Proximity_to_Industrial_Areas  5000 non-null   float64
 8   Population_Density             5000 non-null   int64  
 9   Air Quality                    5000 non-null   object 
dtypes: float64(8), int64(1), object(1)
memory usage: 390.8+ KB
None

Missing Values:
Temperature                      0
Humidity                         0
PM2

In [15]:
# Split the frame into the label column and the predictor matrix
y = data['Air Quality']                # target: the air-quality class of each row
X = data.drop(columns='Air Quality')   # features: every remaining column

# Map the string class labels to integer codes; the fitted encoder is kept in
# `le` so the codes can be translated back into labels later
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle
# (and therefore the split) repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

In [17]:
# Build a 100-tree random forest with a fixed seed, then fit it on the
# training split
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf_model.fit(X=X_train, y=y_train)

In [18]:
# Predict class codes for the held-out rows
y_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1, labelled with the original class names
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=le.classes_),
    sep="\n",
)

# Confusion matrix of true class codes (first argument) against predicted
# class codes (second argument)
print(
    "\nConfusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

        Good       1.00      1.00      1.00       409
   Hazardous       0.92      0.88      0.90       111
    Moderate       0.97      0.97      0.97       294
        Poor       0.88      0.90      0.89       186

    accuracy                           0.96      1000
   macro avg       0.94      0.94      0.94      1000
weighted avg       0.96      0.96      0.96      1000


Confusion Matrix:
[[409   0   0   0]
 [  0  98   0  13]
 [  0   0 285   9]
 [  0   8  10 168]]


In [19]:
# Pair every training column with the importance score the fitted forest
# assigned to it
feature_importance = pd.DataFrame(
    data={
        'feature': X.columns,
        'importance': rf_model.feature_importances_,
    }
)

# Show the table with the most influential features first
print(
    "\nFeature Importance:",
    feature_importance.sort_values(by='importance', ascending=False),
    sep="\n",
)


Feature Importance:
                         feature  importance
6                             CO    0.334678
7  Proximity_to_Industrial_Areas    0.287712
4                            NO2    0.099142
5                            SO2    0.092587
0                    Temperature    0.073776
8             Population_Density    0.041929
1                       Humidity    0.034426
3                           PM10    0.022763
2                          PM2.5    0.012987


In [20]:
# Define prediction function
def predict_air_quality(temperature, humidity, pm25, pm10, no2, so2, co, proximity, population):
    """Predict the air-quality label for a single set of readings.

    The arguments are passed to the model in the order of the training
    columns: Temperature, Humidity, PM2.5, PM10, NO2, SO2, CO,
    Proximity_to_Industrial_Areas, Population_Density.

    Relies on the notebook-level ``rf_model`` (fitted classifier) and ``le``
    (fitted label encoder).

    Returns:
        The decoded class label for the sample, i.e. the first element of
        ``le.inverse_transform`` applied to the model's prediction.
    """
    row = [[temperature, humidity, pm25, pm10, no2, so2, co, proximity, population]]

    # The model is fitted on a DataFrame, so it records the training column
    # names. Handing it a bare array makes scikit-learn warn that the input has
    # no valid feature names and skips the name/order consistency check; a
    # one-row frame carrying the recorded names avoids both.
    feature_names = getattr(rf_model, "feature_names_in_", None)
    if feature_names is not None and len(feature_names) == len(row[0]):
        input_data = pd.DataFrame(row, columns=list(feature_names))
    else:
        # No recorded names (e.g. model fitted on plain arrays): keep the
        # positional array input.
        input_data = np.array(row)

    prediction = rf_model.predict(input_data)
    return le.inverse_transform(prediction)[0]

In [21]:
# Try the helper on one hand-written sample, spelling out which reading is which
example = predict_air_quality(
    temperature=29.8,
    humidity=59.1,
    pm25=5.2,
    pm10=17.9,
    no2=18.9,
    so2=9.2,
    co=1.72,
    proximity=6.3,
    population=319,
)
print(f"\nExample Prediction: {example}")


Example Prediction: Moderate


In [22]:
# Save the model and label encoder
# Both objects are written to files in the current working directory; the
# encoder is stored alongside the classifier because predict_air_quality needs
# it to turn predicted class codes back into labels.
# NOTE(review): this import sits in the last cell rather than with the other
# imports at the top of the notebook — consider moving it there.
import joblib
joblib.dump(rf_model, 'air_quality_model.pkl')
# Last bare expression of the cell: its return value (the list of written
# filenames) is what the notebook displays as the cell output.
joblib.dump(le, 'label_encoder.pkl')

['label_encoder.pkl']