<a href="https://colab.research.google.com/github/PrajjwalNakarmi/Artificial-Intelligence/blob/main/AirQualityIndex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import requests
import seaborn as sns

In [None]:
# Endpoint and resource to download. The URL shape looks like a CKAN
# DataStore `datastore_search` action -- TODO confirm against the portal docs.
url = "https://admin.opendatanepal.com/api/action/datastore_search"

resource_id = "ad8d1b4d-7667-455d-ab74-d5966e8ba4c3"

all_records = []
offset = 0
limit = 1000

# Page through the resource `limit` rows at a time, ordered by `_id` so that
# consecutive pages do not overlap, until a short (final) page is returned.
while True:
    params = {
        "resource_id": resource_id,
        "sort": "_id asc",
        "limit": limit,
        "offset": offset
    }

    # The timeout stops the notebook from hanging on a stalled connection, and
    # raise_for_status() reports HTTP errors directly instead of letting them
    # surface later as a confusing JSON or KeyError failure.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Only an explicit failure flag is treated as an error, so a payload
    # without a "success" field is still accepted as before.
    if data.get("success") is False:
        raise RuntimeError(
            f"datastore_search failed at offset {offset}: {data.get('error')}"
        )

    records = data["result"]["records"]
    all_records.extend(records)

    # Fewer rows than requested means this was the last page.
    if len(records) < limit:
        break

    offset += limit

df = pd.DataFrame(all_records)

print("Total records fetched:", len(df))
print(df.head())


In [None]:
# Show the column names of the fetched DataFrame.
print(df.columns)

In [None]:
# Keep only the three columns used by the later pivot step.
# .copy() makes df1 an independent frame, so assigning to its columns later
# does not trigger pandas' SettingWithCopyWarning.
df1 = df[['utc', 'parameter', 'value']].copy()

print(df1.head())


In [None]:
# Write the full fetched DataFrame (all columns, no index column) to a CSV
# file in the current working directory.
df.to_csv("airquality.csv", index=False)
print("Dataset saved as airquality.csv")


In [None]:
# Convert the 'value' column to numeric; anything that cannot be parsed
# becomes NaN (errors='coerce'). assign() returns a new frame rather than
# writing into df1 in place, so this works without a SettingWithCopyWarning
# regardless of how df1 was derived from df.
df1 = df1.assign(value=pd.to_numeric(df1['value'], errors='coerce'))

# Drop rows where conversion failed
df1 = df1.dropna(subset=['value'])

print(df1.dtypes)



In [None]:
# Reshape long -> wide: one row per 'utc' value and one column per
# 'parameter', averaging 'value' wherever a (utc, parameter) pair repeats.
df2 = pd.pivot_table(
    df1,
    values='value',
    index='utc',
    columns='parameter',
    aggfunc='mean',
).reset_index()

print(df2.head())
print(df2.columns)

In [None]:
# Work on a copy so that adding the label column leaves df2 unchanged.
df_labeled = df2.copy()

def classify_air_quality(pm25):
    """Map a single PM2.5 reading to a coarse air-quality label.

    Args:
        pm25: One numeric PM2.5 value, or a missing value (None / NaN).

    Returns:
        'Good' for values <= 50, 'Moderate' for values <= 100, 'Poor' for
        anything higher, and NaN when the reading itself is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing reading would fall through to the last branch and be labelled
    # 'Poor'. Returning NaN lets a later dropna() discard such rows instead
    # of training on a fabricated label.
    if pm25 is None or pm25 != pm25:
        return float('nan')

    if pm25 <= 50:
        return 'Good'
    if pm25 <= 100:
        return 'Moderate'
    return 'Poor'

# Label every row from its 'pm25' value, then show how many rows fall into
# each class.
df_labeled['air_quality'] = df_labeled['pm25'].map(classify_air_quality)

print(df_labeled['air_quality'].value_counts())


In [None]:
# This code was first written using both PM2.5 and O3 as features; the models reached 1.0 and 0.99 accuracy.
# The class labels were generated from PM2.5 threshold rules, and PM2.5 itself was included as a feature.
# Decision Tree and Random Forest models can learn these rules perfectly, which produced the 100% accuracy.
# This indicates rule learning (the label leaking into the features) rather than genuine model generalization.

# Select features that actually exist
#features = [col for col in ['pm25', 'pm10', 'o3'] if col in df2.columns]

#df_model = df2[features].dropna()

#print("Features used:", features)
#print(df_model.head())



In [None]:
# Only 'o3' is used as a predictor. pm25 and pm10 are deliberately excluded:
# the labels are derived from pm25, so including it would leak the target.
features = ['o3']

# Keep the feature column(s) plus the target, and drop incomplete rows.
df_model = df_labeled.loc[:, features + ['air_quality']].dropna()

print(df_model.head())


In [None]:
# Separate target and predictors, then hold out 20% of the rows for testing.
# The split is stratified on the target and uses a fixed random_state.
y = df_model['air_quality']
X = df_model[features]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train a decision tree on the scaled training data and score it on the
# scaled test split.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
dt_pred = dt.predict(X_test_scaled)

print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print(classification_report(y_test, dt_pred))

In [None]:
def plot_confusion_matrix(y_true, dt_pred, title, labels=None):
    """Draw a confusion-matrix heatmap of predictions against true labels.

    Args:
        y_true: Actual class labels.
        dt_pred: Predicted class labels, in the same order as ``y_true``.
        title: Title shown above the plot.
        labels: Optional class order for the rows and columns. When omitted,
            the sorted set of labels seen in ``y_true`` and ``dt_pred`` is used.
    """
    # No module-level `labels` is defined in this notebook, so relying on a
    # global raised NameError. Derive the class list from the data instead.
    if labels is None:
        labels = sorted(set(y_true) | set(dt_pred))

    cm = confusion_matrix(y_true, dt_pred, labels=labels)

    plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels
    )

    plt.xlabel("Predicted Label")
    plt.ylabel("Actual Label")
    plt.title(title)
    plt.show()
# Plot the confusion matrix for the Decision Tree predictions on the test split.
plot_confusion_matrix(y_test,dt_pred,"Confusion Matrix – Decision Tree Classifier")

In [None]:
# Train a 100-tree random forest on the scaled training data and score it on
# the scaled test split.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

rf_pred = rf.predict(X_test_scaled)

print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

In [None]:
def plot_confusion_matrix(y_true, rf_pred, title, labels=None):
    """Draw a confusion-matrix heatmap of predictions against true labels.

    Args:
        y_true: Actual class labels.
        rf_pred: Predicted class labels, in the same order as ``y_true``.
        title: Title shown above the plot.
        labels: Optional class order for the rows and columns. When omitted,
            the sorted set of labels seen in ``y_true`` and ``rf_pred`` is used.
    """
    # No module-level `labels` is defined in this notebook, so relying on a
    # global raised NameError. Derive the class list from the data instead.
    if labels is None:
        labels = sorted(set(y_true) | set(rf_pred))

    cm = confusion_matrix(y_true, rf_pred, labels=labels)

    plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels
    )

    plt.xlabel("Predicted Label")
    plt.ylabel("Actual Label")
    plt.title(title)
    plt.show()
# Plot the confusion matrix for the Random Forest predictions on the test split.
plot_confusion_matrix(y_test,rf_pred,"Confusion Matrix – Random Forest Classifier")

In [None]:
# Train a 5-nearest-neighbours classifier on the scaled training data and
# score it on the scaled test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
knn_pred = knn.predict(X_test_scaled)

print("KNN Accuracy:", accuracy_score(y_test, knn_pred))
print(classification_report(y_test, knn_pred))

In [None]:
def plot_confusion_matrix(y_true, knn_pred, title, labels=None):
    """Draw a confusion-matrix heatmap of predictions against true labels.

    Args:
        y_true: Actual class labels.
        knn_pred: Predicted class labels, in the same order as ``y_true``.
        title: Title shown above the plot.
        labels: Optional class order for the rows and columns. When omitted,
            the sorted set of labels seen in ``y_true`` and ``knn_pred`` is used.
    """
    # No module-level `labels` is defined in this notebook, so relying on a
    # global raised NameError. Derive the class list from the data instead.
    if labels is None:
        labels = sorted(set(y_true) | set(knn_pred))

    cm = confusion_matrix(y_true, knn_pred, labels=labels)

    plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels
    )

    plt.xlabel("Predicted Label")
    plt.ylabel("Actual Label")
    plt.title(title)
    plt.show()
# Plot the confusion matrix for the KNN predictions on the test split.
plot_confusion_matrix(y_test,knn_pred,"Confusion Matrix – KNN Classifier")

In [None]:
# Compare the three classifiers by their accuracy on the held-out test split.
models = ['Decision Tree', 'Random Forest', 'KNN']
accuracies = [
    accuracy_score(y_test, predictions)
    for predictions in (dt_pred, rf_pred, knn_pred)
]

plt.figure(figsize=(6, 4))
plt.bar(models, accuracies)
plt.title("Air Quality Classification Model Comparison")
plt.ylabel("Accuracy")
plt.ylim(0, 1)  # accuracy is a fraction, so pin the axis to the full 0-1 range
plt.show()

# Model in action

In [None]:
# Build the sample as a DataFrame carrying the training column name(s).
# The scaler was fitted on a DataFrame, and passing a bare array makes
# scikit-learn warn that the input has no valid feature names.
new_sample = pd.DataFrame([[0.055]], columns=features)

# Scale the input using trained scaler
new_sample_scaled = scaler.transform(new_sample)

# Predict air quality using the trained KNN model. The previous comment said
# Random Forest, but the call has always gone to `knn`.
prediction = knn.predict(new_sample_scaled)

print("Predicted Air Quality:", prediction[0])
