## Importing the Dependencies

In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split

## Data extraction and preprocessing

### PIMA diabetes dataset is being used

In [10]:
diabetes_df = pd.read_csv('diabetes.csv')
diabetes_df - diabetes_df.sample(n = len(diabetes_df)) # shuffling the data
diabetes_df.head() # How the data looks

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
# Checking for N/A values
diabetes_df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [12]:
diabetes_df.describe() # statistical measures of the data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [13]:
# Lets count and group the dataframe rows by the targets to get a sense of feature influence and data distributions
print("Outcomes distributions 1-diabetic, 0-non-diabetic: \n",diabetes_df['Outcome'].value_counts())

print('-'*50)

diabetes_df.groupby('Outcome').mean()

Outcomes distributions 1-diabetic, 0-non-diabetic: 
 Outcome
0    500
1    268
Name: count, dtype: int64
--------------------------------------------------


Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [14]:
# Preparing values and target
X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

X.shape, y.shape

((768, 8), (768,))

In [15]:
# Normalization to prevent unwanted influence of a particular feature
X_norm=(X-X.mean())/X.std() # this remains a pd dataframe
X_norm.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,-7.864080000000001e-17,-1.1564820000000001e-17,1.2721310000000002e-17,3.8163920000000003e-17,-4.972874e-17,2.659909e-16,2.451743e-16,2.035409e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.141108,-3.78119,-3.570271,-1.287373,-0.6924393,-4.057829,-1.188778,-1.040871
25%,-0.8443348,-0.6847901,-0.3670975,-1.287373,-0.6924393,-0.5951906,-0.6885198,-0.7857741
50%,-0.2507887,-0.1218083,0.1495433,0.1544326,-0.4277835,0.0009413653,-0.2999328,-0.3606124
75%,0.6395305,0.6053764,0.562856,0.7186174,0.4117396,0.5843897,0.4659233,0.6597757
max,3.904034,2.442886,2.732747,4.91866,6.648507,4.452906,5.879733,4.061069


## Preparing Train and Test splits

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.2, stratify=y, random_state=2)

print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


In [29]:
# Transforming dataset to tensors suitable for pytorch
X_train_tensor = torch.from_numpy(X_train.to_numpy()).float()
X_test_tensor = torch.from_numpy(X_test.to_numpy()).float()
Y_train_tensor = torch.from_numpy(Y_train.to_numpy()).reshape(-1, 1).float()
Y_test_tensor = torch.from_numpy(Y_test.to_numpy()).reshape(-1, 1).float()

X_train_tensor.shape, Y_train_tensor.shape, X_test_tensor.shape, Y_test_tensor.shape

(torch.Size([614, 8]),
 torch.Size([614, 1]),
 torch.Size([154, 8]),
 torch.Size([154, 1]))

## Model preparation and training

In [32]:
class DiabetesClassifier(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.hidden1 = nn.Linear(8, 4)
        self.act1 = nn.ReLU()
        self.output = nn.Linear(4, 1)
        self.act_output = nn.Sigmoid()
    def forward(self, x):
        x = self.act1(self.hidden1(x))
        x = self.act_output(self.output(x))
        return x

model = DiabetesClassifier()
print(model)

DiabetesClassifier(
  (hidden1): Linear(in_features=8, out_features=4, bias=True)
  (act1): ReLU()
  (output): Linear(in_features=4, out_features=1, bias=True)
  (act_output): Sigmoid()
)


## Loss and Optimizer

In [33]:
loss_fn = nn.BCELoss()  # binary cross entropy
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Training

In [34]:
n_epochs = 100
batch_size = 10
 
for epoch in range(n_epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        Xbatch = X_train_tensor[i:i+batch_size]
        y_pred = model(Xbatch)
        ybatch = Y_train_tensor[i:i+batch_size]
        loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Finished epoch {epoch}, latest loss {loss}')

Finished epoch 0, latest loss 1.7258367538452148
Finished epoch 1, latest loss 1.2726655006408691
Finished epoch 2, latest loss 0.7512059211730957
Finished epoch 3, latest loss 0.5935401320457458
Finished epoch 4, latest loss 0.5311040282249451
Finished epoch 5, latest loss 0.5224899649620056
Finished epoch 6, latest loss 0.5137693285942078
Finished epoch 7, latest loss 0.5060229897499084
Finished epoch 8, latest loss 0.4910108745098114
Finished epoch 9, latest loss 0.4901670813560486
Finished epoch 10, latest loss 0.4917377829551697
Finished epoch 11, latest loss 0.4922412037849426
Finished epoch 12, latest loss 0.4925665557384491
Finished epoch 13, latest loss 0.49437588453292847
Finished epoch 14, latest loss 0.49244004487991333
Finished epoch 15, latest loss 0.493877649307251
Finished epoch 16, latest loss 0.4957502484321594
Finished epoch 17, latest loss 0.49515262246131897
Finished epoch 18, latest loss 0.4965129792690277
Finished epoch 19, latest loss 0.4975947141647339
Finished

## Model Evaluation based on accuracy

In [35]:
# compute accuracy (no_grad is optional)
with torch.no_grad():
    y_pred = model(X_train_tensor)
accuracy = (y_pred.round() == Y_train_tensor).float().mean()
print(f"Accuracy {accuracy}")

Accuracy 0.7442996501922607


In [36]:
# compute accuracy (no_grad is optional)
with torch.no_grad():
    y_pred = model(X_test_tensor)
accuracy = (y_pred.round() == Y_test_tensor).float().mean()
print(f"Accuracy {accuracy}")

Accuracy 0.7272727489471436


In [40]:
input_data = (5,166,72,19,175,25.8,0.587,51)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = (input_data_reshaped - input_data_reshaped.mean())/input_data_reshaped.std()
print(std_data)

prediction = model(torch.tensor(std_data).float())
print(prediction)

if (prediction[0] <= .5):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[[-0.91034909  1.5613241   0.1182354  -0.69542099  1.69949217 -0.59102734
  -0.9780975  -0.20415675]]
tensor([[0.1993]], grad_fn=<SigmoidBackward0>)
The person is not diabetic
