In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,accuracy_score,classification_report
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pandas

Firstly, the dataset is loaded and the features (X) and target (y) are extracted as arrays. Next, the data is split into 80% training and 20% testing to evaluate the model performance. Features are then scaled to have mean 0 and a standard deviation equal to 1, which helps models to converge faster and perform better.

In [None]:
# load dataset
diabetes = fetch_openml("diabetes",version=1,as_frame=True)
X = diabetes.data.values
y = diabetes.target.values
X.shape
y = (y=='tested_positive').astype(np.int64)

# train test spliting
test_size=0.2
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=test_size, random_state=42)

# Standardize features
scaler=StandardScaler()
Xtr= scaler.fit_transform(Xtr)
Xte= scaler.transform(Xte)

In this part of the code, the MLP is defined with an input layer, four hidden layers and a last output layer. A dropout probability of 50% is also added as default: some neurons are randomly dropped during training to prevent overfitting. As specified in the forward definition, the activation functions used are ReLU.

In [3]:
class MLP(nn.Module):
    def __init__(self, input_size, output_size=1, dropout_prob=0.5):
        super(MLP, self).__init__()
        
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 64)
        self.out = nn.Linear(64, output_size)
        
        self.dropout = nn.Dropout(p=dropout_prob)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc4(x))
        x = self.dropout(x)
        
        x = self.out(x)
        return x

The next section specifies the hyperparameters of this MLP model. 

- epochs: The epochs are the number of times the training algorithm will iterate over the entire training dataset. Choosing a number too large might cause overfitting, while too small can result in underfitting. In this case, 100 were used. 

- learning rate: This hyperparameter controls how much the model weights are updated in response to the computed gradient during training. In this case, larger numbers can accelerate training but may cause divergence, while smaller values may slow down convergence. A learning rate of 0.0005 is used to ensure small, stable weight updates.

- dropout: a value of 0.1 was chosen, meaning 10% of neurons are ignored at each training step

- batch size: the batch size is the number of samples the model looks at before updating its weights. Smaller batch sizes can provide noisier gradient estimates, which may help the model generalize better. When a larger number is considered, it can provide more stable gradients, but requires more memory. In this case, a batch size of 64 was considered.


In [4]:
num_epochs=100
lr=0.0005
dropout=0.1
batch_size=64

text here

In [5]:
Xtr = torch.tensor(Xtr, dtype=torch.float32)
ytr = torch.tensor(ytr, dtype=torch.float32)
Xte = torch.tensor(Xte, dtype=torch.float32)
yte = torch.tensor(yte, dtype=torch.float32)

# Wrap Xtr and ytr into a dataset
train_dataset = TensorDataset(Xtr, ytr)

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

text here

In [6]:
# Model, Loss, Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MLP(input_size=Xtr.shape[1], dropout_prob=dropout).to(device)
criterion = nn.BCEWithLogitsLoss()  # for binary classification
criterion = nn.MSELoss() #for regression
optimizer = optim.Adam(model.parameters(), lr=lr)

text here

In [7]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for batch_x, batch_y in train_dataloader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        logits = model(batch_x)
        loss = criterion(logits, batch_y.view(-1, 1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

Epoch [1/100], Loss: 0.2565
Epoch [2/100], Loss: 0.2301
Epoch [3/100], Loss: 0.2116
Epoch [4/100], Loss: 0.1987
Epoch [5/100], Loss: 0.1819
Epoch [6/100], Loss: 0.1726
Epoch [7/100], Loss: 0.1696
Epoch [8/100], Loss: 0.1623
Epoch [9/100], Loss: 0.1524
Epoch [10/100], Loss: 0.1585
Epoch [11/100], Loss: 0.1564
Epoch [12/100], Loss: 0.1468
Epoch [13/100], Loss: 0.1478
Epoch [14/100], Loss: 0.1513
Epoch [15/100], Loss: 0.1469
Epoch [16/100], Loss: 0.1503
Epoch [17/100], Loss: 0.1494
Epoch [18/100], Loss: 0.1438
Epoch [19/100], Loss: 0.1448
Epoch [20/100], Loss: 0.1439
Epoch [21/100], Loss: 0.1422
Epoch [22/100], Loss: 0.1406
Epoch [23/100], Loss: 0.1404
Epoch [24/100], Loss: 0.1412
Epoch [25/100], Loss: 0.1429
Epoch [26/100], Loss: 0.1404
Epoch [27/100], Loss: 0.1388
Epoch [28/100], Loss: 0.1400
Epoch [29/100], Loss: 0.1346
Epoch [30/100], Loss: 0.1348
Epoch [31/100], Loss: 0.1339
Epoch [32/100], Loss: 0.1396
Epoch [33/100], Loss: 0.1342
Epoch [34/100], Loss: 0.1394
Epoch [35/100], Loss: 0

text here

In [8]:
y_pred=model(Xte)
#print(f'ACC:{accuracy_score(yte.detach().numpy(),y_pred.detach().numpy()>0.5)}') #classification
print(f'MSE:{mean_squared_error(yte.detach().numpy(),y_pred.detach().numpy())}') #regression

MSE:0.22470130026340485
