In [249]:
import torch 
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import random_split
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

from tqdm.auto import tqdm


from torchmetrics import Accuracy

import matplotlib.pyplot as plt

In [250]:
data = pd.read_csv(r"data\FashionMNIST\raw\diabetes_prediction_dataset.csv")
data = data.sample(frac=1)
data


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
58596,Female,21.0,0,0,No Info,22.21,6.5,90,0
18164,Female,41.0,0,0,not current,27.32,6.1,140,0
44174,Male,60.0,1,0,No Info,30.70,6.1,155,0
36760,Male,20.0,0,0,never,26.40,5.8,80,0
53937,Female,52.0,0,0,never,26.90,6.0,140,0
...,...,...,...,...,...,...,...,...,...
87939,Female,30.0,0,0,never,27.32,4.8,145,0
38173,Female,79.0,0,0,never,24.65,4.0,80,0
90636,Female,49.0,0,0,No Info,16.65,6.6,145,0
1793,Female,47.0,0,0,never,25.69,8.2,160,1


In [251]:
gender_dict = {"Male": 0, "Female": 1, "Other": 2}
smoking_dict = {"No Info": 0, "never": 1, "current": 2, "former": 3, "ever": 4, "not current": 5}
data=data.replace({"gender": gender_dict, "smoking_history": smoking_dict})

for columns in data:
    print(is_numeric_dtype(data[columns]))

print(len(data))

True
True
True
True
True
True
True
True
True
100000


In [252]:
data.drop(columns=["diabetes"]).values

array([[  1.  ,  21.  ,   0.  , ...,  22.21,   6.5 ,  90.  ],
       [  1.  ,  41.  ,   0.  , ...,  27.32,   6.1 , 140.  ],
       [  0.  ,  60.  ,   1.  , ...,  30.7 ,   6.1 , 155.  ],
       ...,
       [  1.  ,  49.  ,   0.  , ...,  16.65,   6.6 , 145.  ],
       [  1.  ,  47.  ,   0.  , ...,  25.69,   8.2 , 160.  ],
       [  1.  ,  51.  ,   0.  , ...,  32.61,   6.5 , 240.  ]])

In [260]:
class DiabetesDataset(Dataset):
    def __init__(self, data, transform=None, target_transform=None):
        self.data = torch.tensor(data.drop(columns=["diabetes"]).values, dtype=torch.float32)
        self.label = torch.tensor(data["diabetes"].values, dtype=torch.int32)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.data[idx], self.label[idx]

BATCH_SIZE = 64


train_data, test_data = np.split(data, [int(0.8*len(data))])

train_dataset = DiabetesDataset(data=train_data)
test_dataset = DiabetesDataset(data=test_data)

train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

print(test_dataloader.dataset.shap)

<__main__.DiabetesDataset object at 0x0000020E0EFBBD10>


In [254]:
def eval_model(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               acc_fn,
               device="cuda"):
    loss, acc = 0, 0
    model.eval()
    with torch.inference_mode():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)
            
            y_pred = model(X)
            loss += loss_fn(y_pred, y)
            acc += acc_fn(y_pred, y)

        loss /= len(data_loader)
        acc /= len(data_loader)

    return {
        "model_name": model.__class__.__name__,
        "model_loss": loss.item(),
        "model_acc": acc.item()
    }

In [255]:
def train_step(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               acc_fn: torch.nn.Module,
               device: torch.device = "cuda"):

    train_loss, train_acc = 0, 0
    model.train()

    for batch, (X, y) in enumerate(data_loader):
        X, y = X.to(device), y.to(device)

        y_pred = model(X)

        loss = loss_fn(y_pred, y)
        train_loss += loss
        train_acc += acc_fn(y_pred.argmax(dim=1), y)

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

    train_loss /= len(data_loader)
    train_acc /= len(data_loader)

    print(f"Train loss: {train_loss:.5f} | Train acc: {100 * train_acc:.5f}%")

In [256]:
def test_step(model: torch.nn.Module,
              data_loader: torch.utils.data.DataLoader,
              loss_fn: torch.nn.Module,
              acc_fn,
              device: torch.device = "cuda"):


    test_loss, test_acc = 0, 0

    model.eval()
    with torch.inference_mode():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)

            test_pred = model(X)

            test_loss += loss_fn(test_pred, y)

            test_acc += acc_fn(test_pred.argmax(dim=1), y)

        test_loss /= len(test_dataloader)

        test_acc /= len(test_dataloader)

    print(f"Test loss: {test_loss:.4f}, Test acc: {100 * test_acc:.4f}")

In [257]:
device = "cuda" if torch.cuda.is_available() else "cpu"

class DiabetesModelV0(nn.Module):
    def __init__(self, input_shape: int, hidden_units: int, output_shape: int):
        super().__init__()
        self.layer_block = nn.Sequential(
            nn.Linear(in_features=input_shape,
                      out_features=hidden_units),
            nn.Linear(in_features=hidden_units,
                      out_features=hidden_units),
            nn.Linear(in_features=hidden_units,
                      out_features=output_shape)
        )


    def forward(self, x):
        return self.layer_block(x)

In [258]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_0 = DiabetesModelV0(input_shape=9, hidden_units=16, output_shape=1).to(device)

loss_fn = nn.BCELoss
optimizer = torch.optim.SGD(params=model_0.parameters(), lr=0.01)
acc_fn = Accuracy(task="binary").to(device)

In [259]:
epochs = 3

for epoch in tqdm(range(epochs)):
    train_step(model=model_0,
               data_loader=train_dataloader,
               loss_fn=loss_fn,
               optimizer=optimizer,
               acc_fn=acc_fn,
               device=device)
    test_step(model=model_0,
               data_loader=test_dataloader,
               loss_fn=loss_fn,
               acc_fn=acc_fn,
               device=device)

  0%|          | 0/3 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x8 and 9x16)