# <font color = 'indianred'>**Import Libraries**</font>

In [2]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split


# <font color = 'indianred'>**Understanding Dropout, BatchNorm1d, model.state_dict**</font>



## <font color = 'indianred'>**Dropout**</font>

<img src ="https://drive.google.com/uc?export=view&id=1f7KmsmF1TXZFUNOJpWBH2P4WkawnVH3Z" width =500>


In [3]:
# Seed the generator so the random dropout mask is the same on every run.
torch.manual_seed(42)

inp = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
print(inp)

# A freshly built module is in training mode, so dropout is active:
# elements are zeroed at random and the survivors are scaled by 1 / (1 - p).
model = nn.Dropout(0.3)
output = model(inp)
print(output)


tensor([1., 2., 3., 4., 5.])
tensor([0.0000, 0.0000, 4.2857, 5.7143, 7.1429])


In [4]:
# Scale factor 1 / (1 - p) for p = 0.3: the kept elements are multiplied by this value.
1/0.7


1.4285714285714286

In [6]:
# Every input element rescaled by 1 / (1 - p); compare with the non-zero entries of the dropout output.
inp * (1/0.7)


tensor([1.4286, 2.8571, 4.2857, 5.7143, 7.1429])

In [5]:
# Sum of the input next to the sum of the dropout output.
inp.sum(), output.sum()


(tensor(15.), tensor(17.1429))

### <font color = 'indianred'>**Dropout with model.train()**</font>

In [7]:
# Same seed as before so the dropout mask is reproducible.
torch.manual_seed(42)

inp = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
print(inp)

# Explicitly switch to training mode: dropout is applied, and with p = 0.5
# every surviving element is multiplied by 1 / (1 - 0.5) = 2.
model = nn.Dropout(p=0.5).train()
output = model(inp)
print(output)


tensor([1., 2., 3., 4., 5.])
tensor([ 0.,  0.,  6.,  8., 10.])


### <font color = 'indianred'>**Dropout with model.eval()**</font>

In [8]:
# In eval mode nn.Dropout is a no-op: nothing is zeroed and nothing is rescaled,
# so the output equals the input. (Batch-norm layers are not skipped in eval
# mode; they normalise with their stored running statistics instead.)

inp = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
print(inp)

model = nn.Dropout(p=0.4).eval()
output = model(inp)
print(output)

tensor([1., 2., 3., 4., 5.])
tensor([1., 2., 3., 4., 5.])


## <font color = 'indianred'>**model.eval() vs torch.no_grad()**</font>

In [9]:
N = 10

# Seed NumPy's global generator so the synthetic data, and therefore the
# losses and printed tensors further down, are identical on every
# Restart & Run All. Without this the features and noise change on each run.
np.random.seed(42)

# N points with two features, each drawn uniformly from [-5, 5)
X = np.random.random((N, 2))*10-5

# a linear function of the two features plus standard-normal noise
Y = 0.5*X[:, 0] + 0.2*X[:, 1]-1 + np.random.randn(N)

# hold out 33% of the points for testing (the split itself is seeded via random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=41)

# convert to float32 tensors; targets are reshaped to column vectors (n, 1)
# so they line up with the (n, 1) output of the model
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32).reshape(-1, 1))
y_test = torch.from_numpy(y_test.astype(np.float32).reshape(-1, 1))


In [10]:
# Dropout applied to the two input features, followed by a single linear unit.
model = nn.Sequential(
    nn.Dropout(p=0.4),
    nn.Linear(in_features=2, out_features=1),
)
criterion = nn.MSELoss()  # mean squared error between prediction and target
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.05)


In [11]:
n_epochs = 1
train_losses = np.zeros(n_epochs)
test_losses = np.zeros(n_epochs)

for epoch in range(n_epochs):
    # ---- training step: dropout is active ----
    model.train()
    optimizer.zero_grad()  # clear gradients left over from the previous step

    # full model: dropout followed by the linear layer
    out_train = model(X_train)
    # dropout layer on its own, kept only for inspection; this is a separate
    # call to the layer, independent of the forward pass above
    out_train_drop = model[0](X_train)
    loss_train = criterion(out_train, y_train)

    loss_train.backward()
    optimizer.step()

    # ---- evaluation: eval mode, and no gradient tracking ----
    model.eval()
    with torch.no_grad():
        out_test = model(X_test)
        out_test_drop = model[0](X_test)
        loss_test = criterion(out_test, y_test)

    # record the scalar losses for this epoch
    train_losses[epoch] = loss_train.item()
    test_losses[epoch] = loss_test.item()


In [12]:
# Tensors from the training pass track gradients; the ones computed inside
# torch.no_grad() do not.
for tensor in (out_train, loss_train, out_test, loss_test):
    print(tensor.requires_grad)


True
True
False
False


In [13]:
# Show what the dropout layer emitted next to the tensor it was given,
# first for training mode and then for eval mode.
for label, value in (
    ('\nOutput after Dropout in Train', out_train_drop),
    ('\nX_train', X_train),
    ('\nOutput after Dropout in Test', out_test_drop),
    ('\nX_test', X_test),
):
    print(label, value)



Output after Dropout in Train tensor([[-4.9717,  0.0000],
        [-0.5494, -3.7556],
        [ 0.0000, -0.0000],
        [-6.5637,  0.0000],
        [-0.0000, -4.2955],
        [-6.2759, -0.0000]])

X_train tensor([[-2.9830,  1.1757],
        [-0.3297, -2.2534],
        [ 4.9481, -3.8866],
        [-3.9382,  2.6983],
        [-4.9646, -2.5773],
        [-3.7655, -1.7608]])

Output after Dropout in Test tensor([[ 4.0119,  0.2275],
        [-4.7192,  2.4383],
        [-0.5311, -4.4541],
        [-2.9625,  3.3738]])

X_test tensor([[ 4.0119,  0.2275],
        [-4.7192,  2.4383],
        [-0.5311, -4.4541],
        [-2.9625,  3.3738]])


In [14]:
# Only the Linear layer (index 1) contributes entries; the Dropout layer (index 0) has none.
model.state_dict()

OrderedDict([('1.weight', tensor([[0.4305, 0.5575]])),
             ('1.bias', tensor([0.5274]))])

In [15]:
# The model's parameters: the Linear layer's weight and bias.
list(model.parameters())

[Parameter containing:
 tensor([[0.4305, 0.5575]], requires_grad=True),
 Parameter containing:
 tensor([0.5274], requires_grad=True)]

## <font color = 'indianred'>**BatchNorm1d**</font>

<img src ="https://drive.google.com/uc?export=view&id=1f6TJdYfRJdQ10GVO6Q2ZX7biejwwykkq" width =300>


In [16]:
# A batch of three samples with two features each, shifted and scaled away from N(0, 1).
X = torch.randn(3, 2) * 5 + 10

# Batch norm without the learnable scale/shift, so it only normalises.
B = nn.BatchNorm1d(num_features=2, affine=False)
y = B(X)

# The same normalisation by hand: per-feature batch mean and biased variance,
# with the small constant 1e-5 added to the variance before the square root.
mu = X.mean(dim=0)
var_ = X.var(dim=0, unbiased=False)
sigma = (var_ + 1e-5).sqrt()
z = (X - mu) / sigma

# the ratio below should be equal to one
print(z / y)


tensor([[1.0000, 1.0000],
        [1.0000, 1.0000],
        [1.0000, 1.0000]])


### <font color = 'indianred'>**BatchNorm with model.train() and model.eval()**</font>
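- During training, this layer keeps a running estimate of the mean and variance it computes on each batch. The estimate is an exponential moving average (not a sum), updated with a default momentum of 0.1: `running = (1 - momentum) * running + momentum * batch_statistic`.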

- During evaluation, the running mean and variance are used for normalization instead of the current batch's statistics, and they are no longer updated.

In [17]:
# Fixed seed so X1 and every number printed below are reproducible.
torch.manual_seed(0)
X1 = torch.randn(3, 2) * 5 + 10
print('X1', X1, sep='\n', end='\n\n')

# A one-layer model whose only module is a batch-norm layer named 'batchnorm'.
model = nn.Sequential()
model.add_module('batchnorm', nn.BatchNorm1d(num_features=2, momentum=0.1))

# Per-feature mean of the batch, for comparison with the running mean below.
print('X1.mean()', X1.mean(dim=0), sep='\n', end='\n\n')

# First forward pass in training mode (the default for a new module).
y = model(X1)
print(y)
print('Running Mean', model[0].running_mean, sep='\n', end='\n\n')


X1
tensor([[17.7050,  8.5329],
        [-0.8939, 12.8422],
        [ 4.5774,  3.0070]])

X1.mean()
tensor([7.1295, 8.1273])

tensor([[ 1.3551,  0.1007],
        [-1.0281,  1.1713],
        [-0.3270, -1.2720]], grad_fn=<NativeBatchNormBackward0>)
Running Mean
tensor([0.7129, 0.8127])



In [18]:
# Expected running mean after ONE more training pass over X1:
#     new_running_mean = momentum * batch_mean + (1 - momentum) * old_running_mean
# The values are read from the live layer rather than copied by hand from the
# printed output, so the check stays correct if the seed or the data change.
# Run this before the next training pass: it reads the layer's current running_mean.
bn = model[0]
bn.momentum * X1.mean(dim=0) + (1 - bn.momentum) * bn.running_mean


tensor([1.3546, 1.5442])

In [19]:
# Three more training-mode passes over the same batch, printing the running
# mean after each pass.
model.train()
for step in range(3):
    y1 = model(X1)
    print(model[0].running_mean)


tensor([1.3546, 1.5442])
tensor([1.9321, 2.2025])
tensor([2.4518, 2.7950])


In [20]:
# 100 further training-mode passes over the same batch; the running mean is
# printed on every tenth pass.
model.train()
for step in range(100):
    y1 = model(X1)
    if step % 10 == 0:
        print(model[0].running_mean)


tensor([2.9196, 3.3282])
tensor([5.6616, 6.4540])
tensor([6.6176, 7.5439])
tensor([6.9510, 7.9239])
tensor([7.0672, 8.0564])
tensor([7.1078, 8.1026])
tensor([7.1219, 8.1187])
tensor([7.1268, 8.1243])
tensor([7.1286, 8.1263])
tensor([7.1292, 8.1270])


In [21]:
# The same 100 passes in eval mode, again printing the running mean on every
# tenth pass.
model.eval()
for step in range(100):
    y1 = model(X1)
    if step % 10 == 0:
        print(model[0].running_mean)


tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])
tensor([7.1293, 8.1272])


In [22]:
# Besides weight and bias, the state dict holds the batch-norm buffers:
# running_mean, running_var and num_batches_tracked.
model.state_dict()

OrderedDict([('batchnorm.weight', tensor([1., 1.])),
             ('batchnorm.bias', tensor([0., 0.])),
             ('batchnorm.running_mean', tensor([7.1293, 8.1272])),
             ('batchnorm.running_var', tensor([91.3633, 24.3054])),
             ('batchnorm.num_batches_tracked', tensor(104))])

In [23]:
# Only weight and bias are parameters; the running statistics are not listed here.
list(model.parameters())

[Parameter containing:
 tensor([1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0.], requires_grad=True)]