In [172]:
import pandas as pd
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

from sklearn.model_selection import train_test_split

# Данные 

In [173]:
df = pd.read_csv('diabetes.csv')

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [175]:
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(columns='Outcome'), 
    df['Outcome'], 
    stratify=df['Outcome'],
    test_size=0.25)

In [204]:
class Data(Dataset):
    def __init__(self, X_train, y_train):
        self.X = torch.from_numpy(X_train.values.astype(np.float32))
        self.y = torch.from_numpy(y_train.values).type(torch.LongTensor)
        self.len = self.X.shape[0]
  
    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
    def __len__(self):
        return self.len

In [205]:
traindata = Data(X_train, Y_train)

In [206]:
print(traindata[25])

(tensor([ 1.0000, 95.0000, 60.0000, 18.0000, 58.0000, 23.9000,  0.2600, 22.0000]), tensor(0))


In [207]:
batch_size = 20
trainloader = DataLoader(traindata, 
                         batch_size=batch_size, 
                         shuffle=True, 
                         num_workers=0)

In [209]:
testdata = Data(X_test, Y_test)
testloader = DataLoader(testdata, batch_size=1, 
                        shuffle=True, num_workers=0)

In [210]:
def get_test_accuracy_score(clf):
    correct, total = 0, 0
    with torch.no_grad():
        for data in testloader:
            inputs, labels = data
            outputs = clf(inputs)
            __, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
    return round(correct / total,2)

In [211]:
input_dim = len(X_test.columns)
hidden_layers = 20
output_dim = 2

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.linear1 = nn.Linear(input_dim, hidden_layers)
        self.linear2 = nn.Linear(hidden_layers, output_dim)
    def forward(self, x):
        x = torch.sigmoid(self.linear1(x))
        x = self.linear2(x)
        return x

In [212]:
clf = Network()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(clf.parameters(), lr=0.1)

In [214]:
epochs = 200
for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        # обнуляем градиент, чтобы удалить градиенты предыдущей эпохи
        optimizer.zero_grad()
        # forward propagation
        outputs = clf(inputs)
        loss = criterion(outputs, labels)
        # backward propagation
        loss.backward()
        # optimize
        optimizer.step()
        running_loss += loss.item()
        
    if epoch % 5 == 0:
        acc = get_test_accuracy_score(clf)
        print(f'Epoch {epoch + 1} | loss: {round(running_loss, 2)} | accuracy on test: {acc}')
        

Epoch 1 | loss: 16.33 | accuracy on test: 0.65
Epoch 6 | loss: 16.61 | accuracy on test: 0.65
Epoch 11 | loss: 16.25 | accuracy on test: 0.65
Epoch 16 | loss: 16.36 | accuracy on test: 0.63
Epoch 21 | loss: 16.31 | accuracy on test: 0.65
Epoch 26 | loss: 16.5 | accuracy on test: 0.59
Epoch 31 | loss: 16.53 | accuracy on test: 0.65
Epoch 36 | loss: 16.71 | accuracy on test: 0.62
Epoch 41 | loss: 16.47 | accuracy on test: 0.59
Epoch 46 | loss: 16.66 | accuracy on test: 0.62
Epoch 51 | loss: 16.38 | accuracy on test: 0.61
Epoch 56 | loss: 17.13 | accuracy on test: 0.62
Epoch 61 | loss: 16.49 | accuracy on test: 0.63
Epoch 66 | loss: 16.6 | accuracy on test: 0.6
Epoch 71 | loss: 16.63 | accuracy on test: 0.66
Epoch 76 | loss: 16.43 | accuracy on test: 0.63
Epoch 81 | loss: 17.54 | accuracy on test: 0.61
Epoch 86 | loss: 17.48 | accuracy on test: 0.61
Epoch 91 | loss: 17.38 | accuracy on test: 0.61
Epoch 96 | loss: 17.54 | accuracy on test: 0.62
Epoch 101 | loss: 17.24 | accuracy on test: 0

Видим, что модель быстро переобучается, поэтому сделаем меньше эпох

In [220]:
clf = Network()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(clf.parameters(), lr=0.1)

epochs = 30
for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        # обнуляем градиент, чтобы удалить градиенты предыдущей эпохи
        optimizer.zero_grad()
        # forward propagation
        outputs = clf(inputs)
        loss = criterion(outputs, labels)
        # backward propagation
        loss.backward()
        # optimize
        optimizer.step()
        running_loss += loss.item()
        
    if epoch % 5 == 0:
        acc = get_test_accuracy_score(clf)
        print(f'Epoch {epoch + 1} | loss: {round(running_loss, 2)} | accuracy on test: {acc}')
        

Epoch 1 | loss: 17.72 | accuracy on test: 0.65
Epoch 6 | loss: 17.05 | accuracy on test: 0.65
Epoch 11 | loss: 17.04 | accuracy on test: 0.62
Epoch 16 | loss: 16.91 | accuracy on test: 0.65
Epoch 21 | loss: 17.01 | accuracy on test: 0.65
Epoch 26 | loss: 16.7 | accuracy on test: 0.67


In [221]:
TP = 0
FP = 0
FN = 0
TN = 0
with torch.no_grad():
    for data in testloader:
        inputs, labels = data
        outputs = clf(inputs)
        __, predicted = torch.max(outputs.data, 1)
        
        if predicted == 1:
            if labels == 1:
                TP += 1
            else:
                FP += 1
        else:
            if labels == 1:
                FN += 1
            else:
                TN += 1

In [222]:
accuracy = (TP+TN)/(TP+FP+FN+TN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f_score = 2*(precision*recall)/(precision+recall)

print(f'Accuracy: {round(accuracy, 2)}')
print(f'Precision: {round(precision, 2)}')
print(f'Recall: {round(recall, 2)}')
print(f'F-score: {round(f_score, 2)}')

Accuracy: 0.64
Precision: 0.44
Recall: 0.1
F-score: 0.16
