In [24]:
import pandas as pd
df = pd.read_csv('../../data/preprocessed_AQI_data.csv')
df.head()

Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category,lat,lng
0,130,10126,51,2,1,0,36,0,0,0,51,2,44.7444,44.2031
1,22,10140,41,0,1,0,5,0,1,0,41,0,-5.29,-44.49
2,75,10163,66,2,1,0,39,0,2,0,66,2,37.1667,15.1833
3,123,10185,34,0,1,0,34,0,0,0,20,0,53.0167,20.8833
4,166,10243,54,2,1,0,14,0,11,0,54,2,16.1005,-88.8074


In [25]:
from sklearn.model_selection import train_test_split

# shuffle the data before splitting
df_shuffled = df.sample(frac=1, random_state=19)
X = df_shuffled.drop(["AQI Category"], axis=1)
y = df_shuffled["AQI Category"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=22
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


(2845, 13) (11384, 13)
(2845,) (11384,)


In [26]:
%pip install torch



## First architecture of nn

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from torch.utils.data import DataLoader, TensorDataset

X_train_tensor = torch.tensor(X_train.values.astype(np.float32))
X_test_tensor = torch.tensor(X_test.values.astype(np.float32))
y_train_tensor = torch.tensor(y_train.values)
y_test_tensor = torch.tensor(y_test.values)

# defining the first architecture of our nn
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out


input_size = X_train.shape[1]
hidden_size = 64
output_size = len(np.unique(y_train))
model = NeuralNet(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# training the model
num_epochs = 20
batch_size = 32
for epoch in range(num_epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        inputs = X_train_tensor[i:i+batch_size]
        labels = y_train_tensor[i:i+batch_size]

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# evaluating the model
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test_tensor.numpy(), predicted.numpy(), average='weighted')
    accuracy = accuracy_score(y_test_tensor.numpy(), predicted.numpy())
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)


Accuracy: 0.7061665495432186
Precision: 0.7821155801753695
Recall: 0.7061665495432186
F1-score: 0.6987773350926756


  _warn_prf(average, modifier, msg_start, len(result))


## Second attempt, slightly more complex architecture

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from torch.utils.data import DataLoader, TensorDataset

X_train_tensor = torch.tensor(X_train.values.astype(np.float32))
X_test_tensor = torch.tensor(X_test.values.astype(np.float32))
y_train_tensor = torch.tensor(y_train.values)
y_test_tensor = torch.tensor(y_test.values)

# we add more layers
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return out

input_size = X_train.shape[1]
hidden_size1 = 128  #here we increased the nb of neurones in the first hidden layer
hidden_size2 = 64   # same for the second layer
output_size = len(np.unique(y_train))
model = NeuralNet(input_size, hidden_size1, hidden_size2, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
batch_size = 32
for epoch in range(num_epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        inputs = X_train_tensor[i:i+batch_size]
        labels = y_train_tensor[i:i+batch_size]

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test_tensor.numpy(), predicted.numpy(), average='weighted')
    accuracy = accuracy_score(y_test_tensor.numpy(), predicted.numpy())
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)


Accuracy: 0.7717849613492621
Precision: 0.7801181711649056
Recall: 0.7717849613492621
F1-score: 0.7532220016228942


## Third attempt, we can still do better, let's try adding cross validation to the second architecture

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, TensorDataset

X_tensor = torch.tensor(X.values.astype(np.float32))
y_tensor = torch.tensor(y.values)

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return out

# applying k-fold cross validation
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=19)

accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

torch.manual_seed(19)
for fold, (train_index, test_index) in enumerate(skf.split(X_tensor, y)):
    print(f"Fold {fold + 1}/{k_folds}")

    X_train, X_test = X_tensor[train_index], X_tensor[test_index]
    y_train, y_test = y_tensor[train_index], y_tensor[test_index]

    input_size = X_train.shape[1]
    hidden_size1 = 128
    hidden_size2 = 64
    output_size = len(np.unique(y))
    model = NeuralNet(input_size, hidden_size1, hidden_size2, output_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # model training
    num_epochs = 20
    batch_size = 32
    for epoch in range(num_epochs):
        for i in range(0, len(X_train), batch_size):
            inputs = X_train[i:i+batch_size]
            labels = y_train[i:i+batch_size]

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # evaluating the test set on a fold
    with torch.no_grad():
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)

        # get the classification metrics for one fold
        accuracy = accuracy_score(y_test.numpy(), predicted.numpy())
        precision, recall, f1, _ = precision_recall_fscore_support(y_test.numpy(), predicted.numpy(), average='weighted')
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-score:", f1)



Fold 1/5
Accuracy: 0.9037245256500351
Precision: 0.9025191690106313
Recall: 0.9037245256500351
F1-score: 0.8932017178659687
Fold 2/5
Accuracy: 0.9012649332396345
Precision: 0.8910671215401803
Recall: 0.9012649332396345
F1-score: 0.893016174741547
Fold 3/5
Accuracy: 0.8338018271257905
Precision: 0.8591136806363895
Recall: 0.8338018271257905
F1-score: 0.8356926509203146
Fold 4/5
Accuracy: 0.7297962052002811
Precision: 0.8051705965060307
Recall: 0.7297962052002811
F1-score: 0.7188058539276487
Fold 5/5
Accuracy: 0.8847100175746925
Precision: 0.9053017339503165
Recall: 0.8847100175746925
F1-score: 0.8802632702132116


In [31]:
# get avg metrics
print("\nAverage Metrics Across Folds:")
print("Mean Accuracy:", np.mean(accuracy_list))
print("Mean Precision:", np.mean(precision_list))
print("Mean Recall:", np.mean(recall_list))
print("Mean F1-score:", np.mean(f1_list))


Average Metrics Across Folds:
Mean Accuracy: 0.8506595017580867
Mean Precision: 0.8726344603287096
Mean Recall: 0.8506595017580867
Mean F1-score: 0.8441959335337381
