# Dataset, Dataloader, BatchNorm, Dropout, Оптимизация

## Используя датасет недвижимости (sklearn.datasets.fetch_california_housing)
Необходимо:
* Создать Dataset для загрузки данных
* Обернуть его в Dataloader
* Написать архитектуру сети, которая предсказывает стоимость недвижимости. Сеть должна включать BatchNorm слои и Dropout (или НЕ включать, но нужно обосновать)
* Сравните сходимость Adam, RMSProp и SGD, сделайте вывод по качеству работы модели

При этом train-test разделение нужно сделать с помощью sklearn random_state=13, test_size = 0.25.

### Подключаем необходимые библиотеки

In [1]:
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.optim import Adam, RMSprop, SGD
from torch.utils.data import DataLoader, Dataset

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Глобальные переменные

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHES = 8
LR = 0.01

### Создадим Dataset

In [3]:
class MyDataset(Dataset):
  def __init__(self, *init_datasets):
    assert all(init_datasets[0].size(0) == init_dataset.size(0) for init_dataset in init_datasets), "Несоотвутствует размерность среди dataset"
    self._base_datasets = init_datasets
  
  def __len__(self):
    return self._base_datasets[0].size(0)
  
  def __getitem__(self, idx):
      return tuple(base_dataset[idx] for base_dataset in self._base_datasets)

### Опишем нейронную сеть

In [4]:
class MyNet(nn.Module):
  def __init__(self, variant = 1) -> None:
      super(MyNet, self).__init__()
      self.block_1 = self.variant_sequential(variant, 8, 100, 0.1)
      self.block_2 = self.variant_sequential(variant, 100, 100, 0.2)
      self.block_3 = self.variant_sequential(variant, 100, 60, 0.2)
      self.block_4 = self.variant_sequential(variant, 60 ,30 ,0.2)
      self.predict = self.variant_sequential(4, 30, 1)
  
  def forward(self, inp):
    out = self.block_1(inp)
    out = self.block_2(out)
    out = self.block_3(out)
    out = self.block_4(out)
    out = self.predict(out)
    return out[:, 0]

  def variant_sequential(self, variant = 4, in_features = 0, out_features = 0, d_out = 0.1, bias = True):
    result = None
    #BatchNorm1d,ReLU,Dropout 
    if variant == 1:
          result = nn.Sequential(nn.Linear(in_features, out_features, bias=True),
          nn.BatchNorm1d(out_features),         
          nn.ReLU(),
          nn.Dropout(d_out))

    #ReLU,BatchNorm1d,Dropout
    if variant == 2:
          result = nn.Sequential(nn.Linear(in_features, out_features, bias=True),
          nn.ReLU(),
          nn.BatchNorm1d(out_features),
          nn.Dropout(d_out))

    #Dropout,BatchNorm1d,ReLU
    if variant == 3:
          result = nn.Sequential(nn.Linear(in_features, out_features, bias=True),
          nn.Dropout(d_out),
          nn.BatchNorm1d(out_features),
          nn.ReLU())

    #BatchNorm1d,ReLU      
    if variant == 4:
          result = nn.Sequential(nn.Linear(in_features, out_features, bias=True),
          nn.BatchNorm1d(out_features),
          nn.ReLU())

    return result   


### Функция для оценки оптимизации

In [5]:
def train_loop(train_loader, test_loader, net, optimizer):
  loss_fn = nn.MSELoss()
  best_acc = {'train': None, 'test': None}
  net.train()
  for epoch in range(EPOCHES):
    running_loss, running_items, running_right = 0.0, 0.0, 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        
        outputs = net(inputs)
        loss = loss_fn(outputs, labels)

        # обнуляем градиент
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # выводим статистику о процессе обучения
        running_loss += loss.item()
        running_items += len(labels)
        
        # выводим статистику о процессе обучения
        if i % 100 == 0 or (i + 1) == len(train_loader):    # печатаем каждые 100 mini-batches
            net.eval()

            test_loss, test_running_total, test_loss  = 0.0, 0.0, 0.0
            for y, (out_test, lbl_test) in enumerate(test_loader):
                test_outputs = net(out_test)
                test_loss += loss_fn(test_outputs, lbl_test)
                test_running_total += len(lbl_test)
            
            res_loss_train = running_loss / running_items
            res_loss_test = test_loss / test_running_total
            
            if best_acc['train'] is None or res_loss_train < best_acc['train']:
              best_acc['train'] = res_loss_train
            
            if best_acc['test'] is None or res_loss_test < best_acc['test']:
              best_acc['test'] = res_loss_train

            #print(f'Epoch [{epoch + 1}/{EPOCHES}]. ' \
            #      f'Step [{i + 1}/{len(train_loader)}]. ' \
            #      f'Loss: {res_loss_train:.3f}. '\
            #      f'Test acc: {res_loss_test:.3f}.')
            
            running_loss, running_items = 0.0, 0.0
            net.train()
  print(f"Best acc train: {best_acc['train']:.3f}. Best acc test: {best_acc['test']:.3f}")
  print('Training is finished!')
  %%time

### Начало анализа

In [6]:
california_housing = fetch_california_housing()
# Разделим на тестовые и тренеровочные данные
X_train, X_test, y_train, y_test = train_test_split(california_housing.data, california_housing.target, test_size=0.25, random_state=13)

In [7]:
# Нормализуем данные и подготовим их для дальнейшего использования в нашем dstaset
scale = StandardScaler()
X_train_s = scale.fit_transform(X_train)
X_test_s = scale.transform(X_test)

In [8]:
train_xt = torch.from_numpy(X_train_s.astype(np.float32)).to(DEVICE)
train_yt = torch.from_numpy(y_train.astype(np.float32)).to(DEVICE)

test_xt = torch.from_numpy(X_test_s.astype(np.float32)).to(DEVICE)
test_yt = torch.from_numpy(y_test.astype(np.float32)).to(DEVICE)

In [9]:
train_dataset = MyDataset(train_xt, train_yt)
test_dataset = MyDataset(test_xt, test_yt)

In [10]:
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True, num_workers=2, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=True, num_workers=2, drop_last=True)

### Дальшей начнем анализ по таким оптимизаторам как Adam, RMSProp, SGD и SGD + Momentum

In [11]:
variants = {1:'BatchNorm1d,ReLU,Dropout',2:'ReLU,BatchNorm1d,Dropout',3:'Dropout,BatchNorm1d,ReLU'}

### Adam

In [12]:
for i, label in variants.items():
  net = MyNet(i).to(DEVICE)
  optimizer = Adam(net.parameters(), lr=LR)  
  print(f'********************\nВариант:{i} {label}\n{optimizer}\n')  
  train_loop(train_loader, test_loader, net, optimizer)


********************
Вариант:1 BatchNorm1d,ReLU,Dropout
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    maximize: False
    weight_decay: 0
)

Best acc train: 0.003. Best acc test: 0.004
Training is finished!
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.2 µs
********************
Вариант:2 ReLU,BatchNorm1d,Dropout
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    maximize: False
    weight_decay: 0
)

Best acc train: 0.004. Best acc test: 0.004
Training is finished!
CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 10 µs
********************
Вариант:3 Dropout,BatchNorm1d,ReLU
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    maximize: False
    weight_decay: 0
)

Best acc train: 0.003. Best acc test: 0.003
Training is finished!
CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.39 µs


### RMSProp

In [13]:
for i, label in variants.items():
  net = MyNet(i).to(DEVICE)
  optimizer = RMSprop(net.parameters(), lr=LR)
  print(f'********************\nВариант:{i} {label}\n{optimizer}\n')  
  train_loop(train_loader, test_loader, net, optimizer)


********************
Вариант:1 BatchNorm1d,ReLU,Dropout
RMSprop (
Parameter Group 0
    alpha: 0.99
    centered: False
    eps: 1e-08
    lr: 0.01
    momentum: 0
    weight_decay: 0
)

Best acc train: 0.003. Best acc test: 0.005
Training is finished!
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.11 µs
********************
Вариант:2 ReLU,BatchNorm1d,Dropout
RMSprop (
Parameter Group 0
    alpha: 0.99
    centered: False
    eps: 1e-08
    lr: 0.01
    momentum: 0
    weight_decay: 0
)

Best acc train: 0.003. Best acc test: 0.003
Training is finished!
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.91 µs
********************
Вариант:3 Dropout,BatchNorm1d,ReLU
RMSprop (
Parameter Group 0
    alpha: 0.99
    centered: False
    eps: 1e-08
    lr: 0.01
    momentum: 0
    weight_decay: 0
)

Best acc train: 0.003. Best acc test: 0.004
Training is finished!
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.58 µs


### SGD

In [14]:
for i, label in variants.items():
  net = MyNet(i).to(DEVICE)
  optimizer = SGD(net.parameters(), lr=LR)
  print(f'********************\nВариант:{i} {label}\n{optimizer}\n')  
  train_loop(train_loader, test_loader, net, optimizer)


********************
Вариант:1 BatchNorm1d,ReLU,Dropout
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

Best acc train: 0.003. Best acc test: 0.003
Training is finished!
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
********************
Вариант:2 ReLU,BatchNorm1d,Dropout
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

Best acc train: 0.004. Best acc test: 0.004
Training is finished!
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.68 µs
********************
Вариант:3 Dropout,BatchNorm1d,ReLU
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

Best acc train: 0.004. Best acc test: 0.004
Training is finished!
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.91 µs


### SGD + Momentum

In [15]:
for i, label in variants.items():
  net = MyNet(i).to(DEVICE)
  optimizer = SGD(net.parameters(), lr=LR, momentum=0.8)
  print(f'********************\nВариант:{i} {label}\n{optimizer}\n')  
  train_loop(train_loader, test_loader, net, optimizer)


********************
Вариант:1 BatchNorm1d,ReLU,Dropout
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    maximize: False
    momentum: 0.8
    nesterov: False
    weight_decay: 0
)

Best acc train: 0.004. Best acc test: 0.004
Training is finished!
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs
********************
Вариант:2 ReLU,BatchNorm1d,Dropout
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    maximize: False
    momentum: 0.8
    nesterov: False
    weight_decay: 0
)

Best acc train: 0.003. Best acc test: 0.003
Training is finished!
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs
********************
Вариант:3 Dropout,BatchNorm1d,ReLU
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    maximize: False
    momentum: 0.8
    nesterov: False
    weight_decay: 0
)

Best acc train: 0.004. Best acc test: 0.004
Training is finished!
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 8.34 µs


### Вывод

Поставим ограничение (выбирались рандомно): 
* Количество эпох - EPOCHES=8
* Размер батча в DataLoader - batch_size=100
* Скорость обучения - lr=0.01

Вариации:
1.   BatchNorm1d, ReLU, Dropout
2.   ReLU, BatchNorm1d, Dropout
3.   Dropout, BatchNorm1d, ReLU

Для сравнения используем смену последовательности применения нормализации, функции активации и регуляризации в 3 вариантах для различных оптимизаторов.
Очевидно данный порядок применения влияет на точность и скорость сходимости.

**ADAM** значительно снижает свои показатели скорости и точности при 2 варианции. Высокий показатель точности сохранился при вариации 3 (Dropout,BatchNorm1d,ReLU)

**RMSProp** вариация 2 оказалась более быстрой и точной (ReLU,BatchNorm1d,Dropout)

**SGD** вариант 1 оказался более точным по сравнению с остальными (BatchNorm1d,ReLU,Dropout). Скорость при этом остается соизмеримым с остальными вариациями

**SGD + Momentum** вариант 2 (ReLU,BatchNorm1d,Dropout) оказался более точным, но скорость снижается, хотя и не самая низкая





