In [18]:
import torch
import sklearn.datasets as skds
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
import torchvision
from torch import nn
import numpy
import pandas
from typing import Optional

# Creating custom dataset from CSV file

In [38]:
class MyDiabetesDataset(torch.utils.data.Dataset):
	def __init__(self, csv_file, batch_size: Optional[int] = None):
		self.data = pandas.read_csv(csv_file, sep="\t")
		if batch_size:
			assert batch_size > 0
			self.batch_size = batch_size

	def __len__(self):
		return len(self.data)

	def __getitem__(self, index):
		#return self.data[index]
		data = self.data.iloc[index]
		return data.to_numpy()

num_batches = 10
dataset = MyDiabetesDataset("diabetes.csv", batch_size=num_batches)
print(dataset[0], type(dataset[0]))
dataset.data

[ 59.       2.      32.1    101.     157.      93.2     38.       4.
   4.8598  87.     151.    ] <class 'numpy.ndarray'>


Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.00,157,93.2,38.0,4.00,4.8598,87,151
1,48,1,21.6,87.00,183,103.2,70.0,3.00,3.8918,69,75
2,72,2,30.5,93.00,156,93.6,41.0,4.00,4.6728,85,141
3,24,1,25.3,84.00,198,131.4,40.0,5.00,4.8903,89,206
4,50,1,23.0,101.00,192,125.4,52.0,4.00,4.2905,80,135
...,...,...,...,...,...,...,...,...,...,...,...
437,60,2,28.2,112.00,185,113.8,42.0,4.00,4.9836,93,178
438,47,2,24.9,75.00,225,166.0,42.0,5.00,4.4427,102,104
439,60,2,24.9,99.67,162,106.6,43.0,3.77,4.1271,95,132
440,36,1,30.0,95.00,201,125.2,42.0,4.79,5.1299,85,220


# Creating custom loader for our custom dataset

This allows:
- Batching
- Shuffling the data
- Load the data in parallel using `multiprocess` workers

In [30]:
dataloader = DataLoader(dataset, batch_size=10)

In [32]:
for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch, sample_batched)

0 tensor([[ 59.0000,   2.0000,  32.1000, 101.0000, 157.0000,  93.2000,  38.0000,
           4.0000,   4.8598,  87.0000, 151.0000],
        [ 48.0000,   1.0000,  21.6000,  87.0000, 183.0000, 103.2000,  70.0000,
           3.0000,   3.8918,  69.0000,  75.0000],
        [ 72.0000,   2.0000,  30.5000,  93.0000, 156.0000,  93.6000,  41.0000,
           4.0000,   4.6728,  85.0000, 141.0000],
        [ 24.0000,   1.0000,  25.3000,  84.0000, 198.0000, 131.4000,  40.0000,
           5.0000,   4.8903,  89.0000, 206.0000],
        [ 50.0000,   1.0000,  23.0000, 101.0000, 192.0000, 125.4000,  52.0000,
           4.0000,   4.2905,  80.0000, 135.0000],
        [ 23.0000,   1.0000,  22.6000,  89.0000, 139.0000,  64.8000,  61.0000,
           2.0000,   4.1897,  68.0000,  97.0000],
        [ 36.0000,   2.0000,  22.0000,  90.0000, 160.0000,  99.6000,  50.0000,
           3.0000,   3.9512,  82.0000, 138.0000],
        [ 66.0000,   2.0000,  26.2000, 114.0000, 255.0000, 185.0000,  56.0000,
           4.550

# Creating model

In [36]:
model = nn.Sequential(
	nn.Linear(10, 100),
	nn.ReLU(),
	nn.Linear(100, 500),
	nn.ReLU(),
	nn.Linear(500, 1),
	nn.Sigmoid()
)
model

Sequential(
  (0): Linear(in_features=10, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=500, bias=True)
  (3): ReLU()
  (4): Linear(in_features=500, out_features=1, bias=True)
  (5): Sigmoid()
)

# Train

Target is last column (column 11 is measure of diabetes, is a number, not classical linear regression!)

### Set cost function, optimizer function

In [42]:
CE_loss=nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

### Set batch processing function

In [43]:
def iterate_batch():
	batch = next(iter(dataloader))

	optimizer.zero_grad()
	y_model = model(batch)

	loss = CE_loss(y_model, labels)  # TODO: How to train given the expected column 11 value?

	loss.backward()
	optimizer.step()

	predicted_labels = y_model.argmax(dim=1)
	acc = (predicted_labels == labels).sum()/len(labels)
	return loss.detach(), acc.detach()

### Put it all together

In [45]:
batch_loss = torch.zeros(num_batches)
batch_acc = torch.zeros(num_batches)

for idx in range(num_batches): 
    batch_loss[idx], batch_acc[idx] = iterate_batch()

ValueError: too many values to unpack (expected 2)

# Graph results