In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable

# Set seed
torch.manual_seed(0)

# Scheduler import
from torch.optim.lr_scheduler import StepLR


In [None]:
'''
STEP 1: LOADING DATASET
'''

# MNIST training split; download=True fetches the files into ./data when they
# are not there yet. ToTensor() converts each image to a float tensor.
train_dataset = dsets.MNIST(
    './data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# MNIST test split, read from the same ./data root (no download requested here).
test_dataset = dsets.MNIST(
    './data',
    train=False,
    transform=transforms.ToTensor(),
)

In [None]:
'''
STEP 2: MAKING DATASET ITERABLE
'''

# Training budget: mini-batch size and total number of optimizer iterations.
batch_size = 100
n_iters = 3000

# Convert the iteration budget into whole epochs: one epoch takes
# len(train_dataset) / batch_size iterations; int() truncates the quotient.
num_epochs = int(n_iters / (len(train_dataset) / batch_size))

# Training batches are reshuffled every epoch; test batches keep dataset order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

 Веса инициализируются в момент описания архитектуры сети.
 Все возможные варианты инициализации весов приведены в официальной [документации](https://pytorch.org/docs/stable/nn.init.html). В примере ниже веса инициализируются методом Ксавье, что соответствует использованию `tanh` в качестве функции активации.

In [None]:
# Colab-specific setup: mount Google Drive at /content/gdrive.
# NOTE(review): this import sits outside the top import cell and ties the
# notebook to the google.colab package.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Change the working directory to a folder on the mounted Drive.
# NOTE(review): hard-coded, user-specific path; the relative path
# 'images/xavier.png' used in the next cell presumably resolves against it.
%cd /content/gdrive/My Drive/HSE_DL_2021/04_week

In [None]:
# Show images/xavier.png inline (presumably an illustration of Xavier
# initialisation, judging by the file name). The Image object is the last
# expression of the cell, so Jupyter renders it as the cell output.
from IPython import display
display.Image('images/xavier.png')

In [None]:
'''
STEP 3: CREATE MODEL CLASS
'''
class FeedforwardNeuralNetModel(nn.Module):
    """One-hidden-layer feed-forward classifier with selectable activation and init.

    Architecture: ``Linear(input_dim, hidden_dim) -> activation ->
    Linear(hidden_dim, output_dim)``. ``forward`` returns raw logits
    (no softmax is applied).

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output logits.
        act_type: Hidden activation: ``'tanh'``, ``'relu'`` or ``'sigmoid'``.
        init_type: Weight initialisation applied to both linear layers:
            ``'xavier'`` (``nn.init.xavier_normal_``) or ``'he'``
            (``nn.init.kaiming_normal_``). Biases keep the ``nn.Linear``
            default initialisation.

    Raises:
        ValueError: If ``act_type`` or ``init_type`` is not a supported name.
    """

    # Option name -> factory / in-place initialiser.
    _ACTIVATIONS = {'tanh': nn.Tanh, 'relu': nn.ReLU, 'sigmoid': nn.Sigmoid}
    _INITIALIZERS = {'xavier': nn.init.xavier_normal_, 'he': nn.init.kaiming_normal_}

    def __init__(self, input_dim, hidden_dim, output_dim, act_type='relu', init_type='he'):
        super().__init__()

        # Fail fast on bad options. Previously an unknown activation was only
        # printed, leaving `self.activation` undefined until forward() crashed,
        # and an unknown init silently kept the default initialisation.
        if act_type not in self._ACTIVATIONS:
            raise ValueError(
                'This kind of activation is not supported in this net: {!r} (expected one of {})'.format(
                    act_type, sorted(self._ACTIVATIONS)))
        if init_type not in self._INITIALIZERS:
            raise ValueError(
                'This kind of initialization is not supported in this net: {!r} (expected one of {})'.format(
                    init_type, sorted(self._INITIALIZERS)))

        # Linear function
        self.fc1 = nn.Linear(input_dim, hidden_dim)

        # Non-linearity
        self.activation = self._ACTIVATIONS[act_type]()

        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

        # Re-initialise the weights W of Y = WX + B in place. Both layers are
        # created before either is re-initialised so that random numbers are
        # drawn in the same order as before for a given seed.
        init_fn = self._INITIALIZERS[init_type]
        init_fn(self.fc1.weight)
        init_fn(self.fc2.weight)

    def forward(self, x):
        """Map a batch of flattened inputs ``(batch, input_dim)`` to logits ``(batch, output_dim)``."""
        # Linear function
        out = self.fc1(x)
        # Non-linearity
        out = self.activation(out)
        # Linear function (readout)
        out = self.fc2(out)
        return out

In [None]:
# Scratch demo of an in-place initialiser applied to a bare tensor.
a = torch.empty(3,3)  # allocated but uninitialised: the printed values are arbitrary
print(a)
# xavier_normal_ overwrites `a` in place and returns it; being the last
# expression of the cell, the re-initialised tensor is shown as the output.
nn.init.xavier_normal_(a)

In [None]:
'''
STEP 4: INSTANTIATE BASE PARAMETERS
'''
# Layer sizes. The training loop flattens every image to 28*28 values,
# so that is the input width of the network.
input_dim = 28*28
hidden_dim = 100
output_dim = 10

# Model built with the class defaults (act_type='relu', init_type='he').
# The experiment cells further down rebind `model`, so this instance is not
# the one they train.
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)

In [None]:
'''
STEP 5: INSTANTIATE LOSS CLASS
'''
# Cross-entropy computed on raw logits (nn.CrossEntropyLoss applies
# log-softmax internally), which matches a model whose forward pass ends with
# a plain linear layer.
criterion = nn.CrossEntropyLoss()

In [None]:
'''
STEP 6: INSTANTIATE OPTIMIZER CLASS
'''
# Initial learning rate; also read by the experiment cells further down when
# they build their own optimizers.
learning_rate = 0.1

# SGD with Nesterov momentum over the parameters of the `model` bound at this
# point. The optimizer holds references to those parameters, so a new model
# needs a new optimizer.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)

In [None]:
'''
STEP 7: INSTANTIATE STEP LEARNING SCHEDULER CLASS
'''
# step_size: number of scheduler.step() calls between two decays.
# model_train calls scheduler.step() once per epoch, therefore:
# step_size = 1 -> new_lr = lr * gamma after every epoch
# step_size = 2 -> new_lr = lr * gamma after every 2 epochs

# gamma: multiplicative decay factor applied at each decay
scheduler = StepLR(optimizer, step_size=1, gamma=0.96)


In [None]:
'''
STEP 8: TRAIN THE MODEL
'''
def _evaluate_accuracy(model, data_loader):
    """Return the classification accuracy of ``model`` on ``data_loader`` in percent.

    Runs in evaluation mode with autograd disabled, then restores the mode the
    model was in. Returns ``nan`` when the loader yields no samples.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for images, labels in data_loader:
                # Flatten every sample to a vector; forward pass gives logits.
                logits = model(images.view(images.size(0), -1))
                # Predicted class = index of the largest logit.
                _, predicted = torch.max(logits, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        model.train(was_training)
    return 100.0 * correct / total if total else float('nan')


def model_train(model, criterion, optimizer, scheduler, train_loader, test_loader,
                n_epochs=None, eval_every=500):
    """Train ``model`` and periodically print the loss and test accuracy.

    Args:
        model: Network mapping a flattened batch ``(batch, features)`` to logits.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer built over ``model``'s parameters.
        scheduler: Learning-rate scheduler; stepped once at the end of every epoch.
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Iterable of ``(images, labels)`` batches used for accuracy.
        n_epochs: Number of passes over ``train_loader``. ``None`` (default)
            falls back to the notebook-level ``num_epochs``.
        eval_every: Positive int; test accuracy is computed and printed every
            ``eval_every`` training iterations.

    Returns:
        None. Progress is reported through ``print``.
    """
    if n_epochs is None:
        # Backward-compatible default: the value computed in the config cell.
        n_epochs = num_epochs

    iteration = 0
    for epoch in range(n_epochs):
        # Print Learning Rate
        print('Epoch:', epoch,'LR:', scheduler.get_last_lr()[0])
        for images, labels in train_loader:
            # Flatten every image to a vector. Inputs do not need gradients:
            # only the parameters are optimised.
            images = images.view(images.size(0), -1)

            # Clear gradients w.r.t. parameters
            optimizer.zero_grad()

            # Forward pass to get output/logits
            outputs = model(images)

            # Calculate Loss: softmax --> cross entropy loss
            loss = criterion(outputs, labels)

            # Getting gradients w.r.t. parameters
            loss.backward()

            # Updating parameters
            optimizer.step()

            iteration += 1

            if iteration % eval_every == 0:
                accuracy = _evaluate_accuracy(model, test_loader)

                # `loss` is the loss of the latest training batch.
                print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iteration, loss.item(), accuracy))

        # Decay Learning Rate
        scheduler.step()

## ReLU + Xavier

In [None]:
# Experiment: ReLU activation + Xavier initialisation.
# Re-seed so the result does not depend on which cells ran before this one and
# every activation/init combination starts from the same random state.
torch.manual_seed(0)
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim, act_type='relu', init_type='xavier')

# Fresh loss, optimizer and scheduler: momentum buffers and the decayed
# learning rate must not carry over from another experiment.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

model_train(model, criterion, optimizer, scheduler, train_loader, test_loader)

## Sigmoid + Xavier


In [None]:
# Experiment: Sigmoid activation + Xavier initialisation.
# Re-seed so the result does not depend on which cells ran before this one and
# every activation/init combination starts from the same random state.
torch.manual_seed(0)
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim, act_type='sigmoid', init_type='xavier')

# Fresh loss, optimizer and scheduler: momentum buffers and the decayed
# learning rate must not carry over from another experiment.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

model_train(model, criterion, optimizer, scheduler, train_loader, test_loader)

## Tanh + Xavier


In [None]:
# Experiment: Tanh activation + Xavier initialisation.
# Re-seed so the result does not depend on which cells ran before this one and
# every activation/init combination starts from the same random state.
torch.manual_seed(0)
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim, act_type='tanh', init_type='xavier')

# Fresh loss, optimizer and scheduler: momentum buffers and the decayed
# learning rate must not carry over from another experiment.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

model_train(model, criterion, optimizer, scheduler, train_loader, test_loader)

## ReLU + He


In [None]:
# Experiment: ReLU activation + He (Kaiming) initialisation.
# Re-seed so the result does not depend on which cells ran before this one and
# every activation/init combination starts from the same random state.
torch.manual_seed(0)
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim, act_type='relu', init_type='he')

# Fresh loss, optimizer and scheduler: momentum buffers and the decayed
# learning rate must not carry over from another experiment.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

model_train(model, criterion, optimizer, scheduler, train_loader, test_loader)


## Sigmoid + He


In [None]:
# Experiment: Sigmoid activation + He (Kaiming) initialisation.
# Re-seed so the result does not depend on which cells ran before this one and
# every activation/init combination starts from the same random state.
torch.manual_seed(0)
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim, act_type='sigmoid', init_type='he')

# Fresh loss, optimizer and scheduler: momentum buffers and the decayed
# learning rate must not carry over from another experiment.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

model_train(model, criterion, optimizer, scheduler, train_loader, test_loader)


## Tanh + He


In [None]:
# Experiment: Tanh activation + He (Kaiming) initialisation.
# Re-seed so the result does not depend on which cells ran before this one and
# every activation/init combination starts from the same random state.
torch.manual_seed(0)
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim, act_type='tanh', init_type='he')

# Fresh loss, optimizer and scheduler: momentum buffers and the decayed
# learning rate must not carry over from another experiment.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

model_train(model, criterion, optimizer, scheduler, train_loader, test_loader)


[Источник](https://www.deeplearningwizard.com/deep_learning/boosting_models_pytorch/weight_initialization_activation_functions/) ноутбука