# Binary Classification
## 1. Load Dataset from sklearn

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

In [2]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [4]:
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['class'] = cancer.target
df.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


## 2. Convert to PyTorch Tensor

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
data = torch.from_numpy(df.values).float()
data.shape

torch.Size([569, 31])

In [9]:
x = data[:, :-1]
y = data[:, -1:]

print(x.shape, y.shape)

torch.Size([569, 30]) torch.Size([569, 1])


In [10]:
# train, valid, test ratio
ratios = [.6, .2, .2]

In [11]:
train_cnt = int(data.size(0)*ratios[0]) # data.size(0)=569, ratios[0]=.6
valid_cnt = int(data.size(0)*ratios[1]) # data.size(0)=569, ratios[1]=.2
test_cnt = data.size(0) - train_cnt - valid_cnt
cnts = [train_cnt, valid_cnt, test_cnt]

print("train %d / valid %d / test %d samples." % (train_cnt, valid_cnt, test_cnt))

train 341 / valid 113 / test 115 samples.


In [12]:
# 데이터 섞고 분리
indices = torch.randperm(data.size(0))

x = torch.index_select(x, dim=0, index=indices)
y = torch.index_select(y, dim=0, index=indices)

x = x.split(cnts, dim=0)
y = y.split(cnts, dim=0)

for x_i, y_i in zip(x, y):
    print(x_i.size(), y_i.size())

torch.Size([341, 30]) torch.Size([341, 1])
torch.Size([113, 30]) torch.Size([113, 1])
torch.Size([115, 30]) torch.Size([115, 1])


## 3. Preprocessing

In [19]:
x[0].size(-1)

30

In [18]:
scaler = StandardScaler()
scaler.fit(x[0].numpy()) # x[0] = x_train

x = [torch.from_numpy(scaler.transform(x[0].numpy())).float(),
     torch.from_numpy(scaler.transform(x[1].numpy())).float(),
     torch.from_numpy(scaler.transform(x[2].numpy())).float()]

df = pd.DataFrame(x[0].numpy(), columns=cancer.feature_names)
df.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
336,-0.59804,-1.134751,-0.581492,-0.59386,0.487012,-0.236446,-0.286961,-0.559392,-0.482309,-0.328681,...,-0.524974,-0.843474,-0.51911,-0.527439,0.739495,-0.231534,0.173341,-0.740064,-0.249585,-0.052583
337,0.116243,-1.297664,0.080138,-0.029882,0.986523,-0.206325,-0.248276,0.424989,-0.577048,-0.216287,...,-0.097328,-1.496144,-0.120461,-0.230124,0.084105,-0.669117,-0.652055,-0.105647,-0.490419,-0.322731
338,1.518888,-0.244997,1.474497,1.419273,0.531087,0.833453,0.947382,1.205189,0.343817,-0.546266,...,1.577619,0.532845,1.366181,1.330375,0.846497,0.796594,0.796448,1.706317,1.190296,-0.335386
339,1.573611,1.464332,1.495408,1.552382,0.494358,-0.080018,0.984407,1.099814,-0.527784,-1.268184,...,1.084987,0.932756,0.986228,0.962508,0.891081,-0.432254,0.629597,0.502425,-1.025037,-1.268524
340,-0.488593,-0.540747,-0.558072,-0.512837,-1.535272,-1.368596,-0.957245,-0.835313,-1.251591,-1.023221,...,-0.705256,-0.928304,-0.763969,-0.645973,-2.172309,-1.31891,-1.217724,-1.309232,-1.69459,-1.356006


## 4. Build Model & Optimizer

In [20]:
model = nn.Sequential(
    nn.Linear(x[0].size(-1), 25), # x[0].size(-1) = x_train의 컬럼수
    nn.LeakyReLU(),
    nn.Linear(25, 20),
    nn.LeakyReLU(),
    nn.Linear(20, 15),
    nn.LeakyReLU(),
    nn.Linear(15, 10),
    nn.LeakyReLU(),
    nn.Linear(10, 5),
    nn.LeakyReLU(),
    nn.Linear(5, y[0].size(-1)),
    nn.Sigmoid(), # 이진분류
)

model

Sequential(
  (0): Linear(in_features=30, out_features=25, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=25, out_features=20, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=20, out_features=15, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=15, out_features=10, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=10, out_features=5, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=5, out_features=1, bias=True)
  (11): Sigmoid()
)

In [21]:
optimizer = optim.Adam(model.parameters())

## 5. Train

In [22]:
n_epochs = 10000
batch_size = 32
print_interval = 100
early_stop = 1000

In [23]:
from copy import deepcopy

lowest_loss = np.inf # 이후 최저loss 찾는데 사용할 변수
best_model = None 

lowest_epoch = np.inf # 이후 최저loss를 가지는 epoch 찾는데 사용할 변수

In [25]:
train_history, valid_history = [], []

for i in range(n_epochs):
    # 랜덤하게 데이터 섞기
    indices = torch.randperm(x[0].size(0))
    x_ = torch.index_select(x[0], dim=0, index=indices)
    y_ = torch.index_select(y[0], dim=0, index=indices)
    
    x_ = x_.split(batch_size, dim=0)
    y_ = y_.split(batch_size, dim=0)
    
    #####
    
    train_loss, valid_loss = 0, 0
    y_hat = []
    
    # loss 찾기
    for x_i, y_i in zip(x_, y_):
        y_hat_i = model(x_i)
        loss = F.binary_cross_entropy(y_hat_i, y_i)

        optimizer.zero_grad()
        loss.backward() # 미분

        optimizer.step()   
        train_loss += float(loss) # This is very important to prevent memory leak.

    train_loss = train_loss / len(x_) # 평균 train_loss
    
    # no_grad() 안하면 속도가 엄청 느려짐
    with torch.no_grad():
        x_ = x[1].split(batch_size, dim=0)
        y_ = y[1].split(batch_size, dim=0)
        
        valid_loss = 0
        
        for x_i, y_i in zip(x_, y_):
            y_hat_i = model(x_i)
            loss = F.binary_cross_entropy(y_hat_i, y_i)
            
            valid_loss += float(loss)
            
            y_hat += [y_hat_i]
            
    valid_loss = valid_loss / len(x_)
    
    train_history += [train_loss]
    valid_history += [valid_loss]
    
    if (i + 1) % print_interval == 0:
        print('Epoch %d: train loss=%.4e  valid_loss=%.4e  lowest_loss=%.4e' % (
            i + 1,
            train_loss,
            valid_loss,
            lowest_loss,
        ))
        
    # valid loss 갱신 여부 best model 찾기
    if valid_loss <= lowest_loss:
        lowest_loss = valid_loss
        lowest_epoch = i
        
        best_model = deepcopy(model.state_dict()) # model.state_dict() 모델을 딕셔너리형으로 저장
    else:
        if early_stop > 0 and lowest_epoch + early_stop < i + 1:
            print("There is no improvement during last %d epochs." % early_stop)
            break

print("The best validation loss from epoch %d: %.4e" % (lowest_epoch + 1, lowest_loss))
model.load_state_dict(best_model)

Epoch 100: train loss=1.6807e-04  valid_loss=4.7265e-02  lowest_loss=2.5532e-02
Epoch 200: train loss=1.9054e-05  valid_loss=6.1857e-02  lowest_loss=2.5532e-02
Epoch 300: train loss=6.3405e-06  valid_loss=6.4396e-02  lowest_loss=2.5532e-02
Epoch 400: train loss=2.5916e-06  valid_loss=6.6352e-02  lowest_loss=2.5532e-02
Epoch 500: train loss=1.1828e-06  valid_loss=6.8140e-02  lowest_loss=2.5532e-02
Epoch 600: train loss=5.8275e-07  valid_loss=6.9665e-02  lowest_loss=2.5532e-02
Epoch 700: train loss=2.7873e-07  valid_loss=7.0754e-02  lowest_loss=2.5532e-02
Epoch 800: train loss=1.4072e-07  valid_loss=7.1800e-02  lowest_loss=2.5532e-02
Epoch 900: train loss=7.3152e-08  valid_loss=7.2446e-02  lowest_loss=2.5532e-02
Epoch 1000: train loss=3.8761e-08  valid_loss=7.3143e-02  lowest_loss=2.5532e-02
There is no improvement during last 1000 epochs.
The best validation loss from epoch 39: 2.5532e-02


<All keys matched successfully>