In [1]:
import pandas as pd
# Lift pandas' display truncation so wide/long frames are rendered in full.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

In [None]:
import kagglehub

# Fetch the latest version of the Kaggle dataset and report where it was stored.
dataset_handle = "mujtabamatin/air-quality-and-pollution-assessment"
path = kagglehub.dataset_download(dataset_handle)
print(f"Path to dataset files: {path}")

In [None]:
from pathlib import Path

# Read the CSV from the download directory and preview the first rows.
dataset_path = Path(path) / "updated_pollution_dataset.csv"
dataset = pd.read_csv(dataset_path)
dataset.head()

In [4]:
# Use the short name "y" for the target column from here on.
dataset = dataset.rename(columns={"Air Quality": "y"})

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

In [None]:
import seaborn as sns

# Plot how many samples fall into each target class and annotate every bar
# with its count.
ax = sns.countplot(x="y", data=dataset)
for bar in ax.patches:
    count = int(bar.get_height())
    # Only the vertical position is lifted by 20 so the label sits above the
    # bar; the text itself must be the true count.
    ax.text(bar.get_x() + bar.get_width() / 2, count + 20, str(count), ha="center")

In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the target and replace the labels with its integer codes.
encoder = LabelEncoder()
encoded_target = encoder.fit_transform(dataset["y"])
dataset["y"] = encoded_target

In [8]:
import cloudpickle as cp

# Every artefact of this run is stored below ./saved-runs/run-<run>.
run = 1
root_path = Path(f"./saved-runs/run-{run}")
root_path.mkdir(parents=True, exist_ok=True)

# Persist the fitted label encoder.
encoder_file = root_path / "encoders" / "label_encoder.pkl"
encoder_file.parent.mkdir(parents=True, exist_ok=True)
with encoder_file.open("wb") as handle:
    cp.dump(encoder, handle)

In [None]:
# Summary statistics of the dataset before the features are standardised.
dataset.describe()

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric feature columns in place.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed later in the notebook.
scaled_columns = [
    'Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
    'Proximity_to_Industrial_Areas', 'Population_Density',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dataset[scaled_columns])
dataset[scaled_columns] = scaled_values

In [11]:
# Persist the fitted StandardScaler so the same scaling can be re-applied later.
# The object written to standard_scaler.pkl must be `scaler`, not the label
# encoder.
scaler_file = root_path.joinpath("scalers", "standard_scaler.pkl")
scaler_file.parent.mkdir(parents=True, exist_ok=True)
with open(scaler_file, "wb") as f:
    cp.dump(scaler, f)

In [None]:
# Summary statistics again, now that the feature columns have been standardised.
dataset.describe()

In [None]:
from matplotlib import pyplot as plt
# Bar chart of every column's correlation with the encoded target "y",
# with each bar labelled by its value rounded to four decimals.
plt.figure(figsize=(10, 5), dpi=100)

corr_map = dataset.corr()["y"]
ax = sns.barplot(data=corr_map)
for bar in ax.patches:
    value = round(bar.get_height(), 4)
    centre_x = bar.get_x() + bar.get_width() / 2.
    ax.text(centre_x, value, value, ha="center")

# Trailing semicolon suppresses the tick-object repr in the cell output.
plt.xticks(rotation=45, ha="right");

In [None]:
# Show the column names as a plain Python list.
dataset.columns.tolist()

In [None]:
# Histogram (with KDE) of every feature column, laid out on a 3-column grid.
feature_columns = [name for name in dataset.columns if name != "y"]
n_grid_cols = 3
# Ceiling division so the grid always has room for every feature.
n_grid_rows = (len(feature_columns) + n_grid_cols - 1) // n_grid_cols

fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(15, 10), dpi=100, squeeze=False
)
flat_axes = axes.ravel()

for hist_ax, column in zip(flat_axes, feature_columns):
    sns.histplot(x=dataset[column], kde=True, ax=hist_ax)
    # Title the subplot that was just drawn; plt.title() would only ever
    # touch the current (last created) axes.
    hist_ax.set_title(column)

# Hide grid cells that did not receive a feature.
for unused_ax in flat_axes[len(feature_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()

In [16]:
from sklearn.preprocessing import PowerTransformer

# Apply a Yeo-Johnson power transform (without re-standardising) to the
# particulate-matter columns. An earlier, now disabled, variant of this list
# also contained "CO" and "Proximity_to_Industrial_Areas".
columns_to_transform = ["PM2.5", "PM10"]
transformer = PowerTransformer(method="yeo-johnson", standardize=False)
transformed_values = transformer.fit_transform(dataset[columns_to_transform])
dataset[columns_to_transform] = transformed_values

In [None]:
# Histogram (with KDE) of every feature column after the power transform,
# laid out on a 3-column grid.
feature_columns = [name for name in dataset.columns if name != "y"]
n_grid_cols = 3
# Ceiling division so the grid always has room for every feature.
n_grid_rows = (len(feature_columns) + n_grid_cols - 1) // n_grid_cols

fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(15, 10), dpi=100, squeeze=False
)
flat_axes = axes.ravel()

for hist_ax, column in zip(flat_axes, feature_columns):
    sns.histplot(x=dataset[column], kde=True, ax=hist_ax)
    # Title the subplot that was just drawn; plt.title() would only ever
    # touch the current (last created) axes.
    hist_ax.set_title(column)

# Hide grid cells that did not receive a feature.
for unused_ax in flat_axes[len(feature_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()

In [18]:
# Persist the fitted Yeo-Johnson transformer next to the other run artefacts.
transformer_file = root_path / "transformer" / "yeo_johnson_transformer.pkl"
transformer_file.parent.mkdir(parents=True, exist_ok=True)

with transformer_file.open("wb") as handle:
    cp.dump(transformer, handle)

In [None]:
# Show the dataset's column index before separating features from the target.
dataset.columns

In [None]:
from sklearn.model_selection import train_test_split

# Separate features from the target and make a stratified, shuffled 80/20 split.
X = dataset.drop("y", axis=1)
y = dataset[["y"]]

split_options = dict(test_size=0.2, random_state=42, stratify=y, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train dataset size: {X_train.shape[0]}")
print(f"Test dataset size: {X_test.shape[0]}")

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with a single hidden layer of 30 ReLU units, trained
# with Adam. Early stopping holds out 20% of the data passed to fit() as a
# validation set and stops after 20 epochs without sufficient improvement.
model = MLPClassifier(
    hidden_layer_sizes=(30,),
    activation="relu",
    solver="adam",
    alpha=0.003,
    batch_size=64,
    learning_rate="constant",
    learning_rate_init=0.001,
    max_iter=200,
    shuffle=True,
    random_state=42,
    tol=0.001,
    verbose=True,
    beta_1=0.9,
    beta_2=0.999,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=20,
)
# Fit on the training split only. Fitting on the full X / y would let the model
# see the test rows and invalidate the evaluation on X_test / y_test below.
# y_train is a one-column DataFrame, so flatten it to the 1-D array that
# scikit-learn expects for classification targets.
model.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Training loss per iteration as recorded by the fitted model.
loss_ax = sns.lineplot(data=model.loss_curve_, legend=True)
loss_ax.set_title("Loss")

In [None]:
# Score on the early-stopping validation set per iteration.
validation_ax = sns.lineplot(data=model.validation_scores_)
validation_ax.set_title("Validation accuracy")

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix on the test split, with the axes labelled by the original
# class names recovered through the label encoder.
test_predictions = model.predict(X_test)
test_confusion = confusion_matrix(y_test, test_predictions)
class_names = encoder.inverse_transform(model.classes_)
ConfusionMatrixDisplay(test_confusion, display_labels=class_names).plot()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split.
report_predictions = model.predict(X_test)
report_text = classification_report(y_test, report_predictions)
print(report_text)

In [26]:
# Persist the trained classifier alongside the other run artefacts.
model_path = root_path / "saved-model" / "model.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as handle:
    cp.dump(model, handle)

In [None]:
import shutil

# Keep a copy of this notebook inside the run directory.
notebook_source = "./air-quality-pollution.ipynb"
notebook_target = root_path / "air-quality-pollution.ipynb"
shutil.copyfile(notebook_source, notebook_target)