### Transformer Model: Understanding Dropout in Depth

Video walkthrough: https://youtu.be/C7Fb8kzMikQ

In [1]:
import torch
from torch import nn
from torch.distributions import Bernoulli

In [2]:
# Probability used throughout the notebook (also passed to nn.Dropout below).
p = 0.3
NUM_SAMPLES = 20

# Bernoulli(probs=p) yields 1 with probability p (30%) and 0 otherwise (70%).
bernoulli_sampler = Bernoulli(probs=p)
samples = bernoulli_sampler.sample((NUM_SAMPLES,))

# Fraction (not percentage) of draws that came out as 1.
# NOTE: no random seed is set, so this value changes from run to run and only
# approaches p as the number of samples grows.
ones_fraction = samples.count_nonzero() / samples.numel()
print(ones_fraction)


tensor(0.3000)


In [3]:
# Random input drawn from a standard normal, shape (4, 2, 10); the cells below
# feed this same tensor through the dropout layer.
tensor = torch.randn(4, 2, 10)

# Dropout layer with drop probability p. A freshly constructed nn.Module is in
# training mode, so dropout is active until .eval() is called.
dropout_model = nn.Dropout(p=p)

print(tensor)

tensor([[[ 0.1859, -0.1053, -0.2731, -0.7167, -0.5425,  1.2291,  0.3505,
          -1.2454,  1.1320, -0.4447],
         [ 0.0924, -0.3091,  1.0209, -0.8926, -0.7805,  0.2892, -1.0263,
           0.0436, -0.5956, -0.0222]],

        [[-0.4953,  0.7610, -0.9507,  0.6537, -1.9930,  1.0477,  1.4442,
           0.2745, -1.0905, -0.4964],
         [ 1.3553, -0.7693, -0.8706, -0.4816, -0.8423,  0.0844,  1.2860,
           0.9878, -0.7485, -0.2340]],

        [[-0.9195, -1.2510,  0.3589,  0.7687,  1.1305,  1.0399, -0.0942,
          -1.4100,  0.1734,  1.4067],
         [-0.4691, -0.9007, -1.2614, -0.7139, -1.2298, -0.2396,  2.1209,
           1.3498, -0.2547, -0.1009]],

        [[ 0.4403,  0.8939,  1.8371,  0.9443, -0.0700, -0.0083, -0.2612,
           0.5563,  0.3223, -2.0619],
         [-1.7610, -1.5139,  0.7899,  0.1353,  0.7773,  0.8537,  0.1333,
          -1.3760,  0.7191,  2.0048]]])


In [4]:
# Reference values: every element multiplied by 1/(1-p), the factor that
# nn.Dropout applies to the elements it keeps while in training mode.
keep_scale = 1 / (1 - p)
print(tensor * keep_scale)

tensor([[[ 0.2656, -0.1505, -0.3901, -1.0239, -0.7750,  1.7559,  0.5007,
          -1.7791,  1.6171, -0.6353],
         [ 0.1320, -0.4415,  1.4585, -1.2752, -1.1150,  0.4132, -1.4662,
           0.0623, -0.8508, -0.0317]],

        [[-0.7076,  1.0872, -1.3581,  0.9338, -2.8471,  1.4967,  2.0631,
           0.3922, -1.5579, -0.7091],
         [ 1.9361, -1.0990, -1.2438, -0.6879, -1.2032,  0.1206,  1.8372,
           1.4112, -1.0692, -0.3343]],

        [[-1.3135, -1.7871,  0.5127,  1.0981,  1.6149,  1.4855, -0.1346,
          -2.0142,  0.2476,  2.0095],
         [-0.6702, -1.2867, -1.8019, -1.0199, -1.7569, -0.3423,  3.0299,
           1.9283, -0.3638, -0.1441]],

        [[ 0.6290,  1.2770,  2.6244,  1.3490, -0.1001, -0.0119, -0.3731,
           0.7948,  0.4605, -2.9456],
         [-2.5158, -2.1627,  1.1285,  0.1933,  1.1104,  1.2196,  0.1904,
          -1.9657,  1.0273,  2.8639]]])


In [5]:
# Forward pass while the layer is still in training mode: each element is
# either zeroed (printed as 0.0000 / -0.0000) or kept and scaled by 1/(1-p).
output = dropout_model(tensor)
print(output)

tensor([[[ 0.0000, -0.0000, -0.0000, -1.0239, -0.7750,  0.0000,  0.5007,
          -1.7791,  1.6171, -0.6353],
         [ 0.1320, -0.0000,  1.4585, -1.2752, -1.1150,  0.0000, -1.4662,
           0.0623, -0.8508, -0.0317]],

        [[-0.7076,  1.0872, -0.0000,  0.0000, -2.8471,  1.4967,  0.0000,
           0.3922, -1.5579, -0.7091],
         [ 1.9361, -1.0990, -1.2438, -0.6879, -1.2032,  0.1206,  1.8372,
           1.4112, -1.0692, -0.3343]],

        [[-1.3135, -1.7871,  0.5127,  1.0981,  0.0000,  1.4855, -0.1346,
          -2.0142,  0.2476,  2.0095],
         [-0.6702, -0.0000, -1.8019, -1.0199, -1.7569, -0.3423,  3.0299,
           1.9283, -0.3638, -0.0000]],

        [[ 0.6290,  1.2770,  0.0000,  1.3490, -0.1001, -0.0119, -0.0000,
           0.0000,  0.4605, -2.9456],
         [-2.5158, -0.0000,  1.1285,  0.1933,  0.0000,  0.0000,  0.1904,
          -1.9657,  0.0000,  0.0000]]])


In [6]:
# Eval mode: dropout is disabled, so the layer passes its input through unchanged.
# Module.eval() switches the mode in place and returns the same module, so the
# result does not need to be assigned back.
dropout_model.eval()
output = dropout_model(tensor)

# Check the identity behaviour programmatically instead of comparing the
# printed values by eye.
assert torch.equal(output, tensor), "nn.Dropout must return its input unchanged in eval mode"
print(output)

tensor([[[ 0.1859, -0.1053, -0.2731, -0.7167, -0.5425,  1.2291,  0.3505,
          -1.2454,  1.1320, -0.4447],
         [ 0.0924, -0.3091,  1.0209, -0.8926, -0.7805,  0.2892, -1.0263,
           0.0436, -0.5956, -0.0222]],

        [[-0.4953,  0.7610, -0.9507,  0.6537, -1.9930,  1.0477,  1.4442,
           0.2745, -1.0905, -0.4964],
         [ 1.3553, -0.7693, -0.8706, -0.4816, -0.8423,  0.0844,  1.2860,
           0.9878, -0.7485, -0.2340]],

        [[-0.9195, -1.2510,  0.3589,  0.7687,  1.1305,  1.0399, -0.0942,
          -1.4100,  0.1734,  1.4067],
         [-0.4691, -0.9007, -1.2614, -0.7139, -1.2298, -0.2396,  2.1209,
           1.3498, -0.2547, -0.1009]],

        [[ 0.4403,  0.8939,  1.8371,  0.9443, -0.0700, -0.0083, -0.2612,
           0.5563,  0.3223, -2.0619],
         [-1.7610, -1.5139,  0.7899,  0.1353,  0.7773,  0.8537,  0.1333,
          -1.3760,  0.7191,  2.0048]]])


In [7]:
# Back to training mode: dropout is active again. A new random mask is drawn on
# every forward pass, so the zero pattern differs from the earlier
# training-mode output.
# Module.train() switches the mode in place and returns the same module.
dropout_model.train()
output = dropout_model(tensor)
print(output)

tensor([[[ 0.0000, -0.0000, -0.3901, -0.0000, -0.7750,  1.7559,  0.5007,
          -1.7791,  1.6171, -0.6353],
         [ 0.0000, -0.4415,  1.4585, -1.2752, -1.1150,  0.0000, -0.0000,
           0.0623, -0.8508, -0.0000]],

        [[-0.0000,  0.0000, -0.0000,  0.0000, -0.0000,  1.4967,  0.0000,
           0.3922, -0.0000, -0.7091],
         [ 1.9361, -0.0000, -0.0000, -0.6879, -1.2032,  0.1206,  1.8372,
           1.4112, -1.0692, -0.3343]],

        [[-1.3135, -1.7871,  0.5127,  1.0981,  1.6149,  1.4855, -0.1346,
          -0.0000,  0.2476,  0.0000],
         [-0.6702, -1.2867, -0.0000, -0.0000, -0.0000, -0.3423,  3.0299,
           1.9283, -0.3638, -0.1441]],

        [[ 0.6290,  1.2770,  0.0000,  1.3490, -0.1001, -0.0000, -0.0000,
           0.7948,  0.4605, -2.9456],
         [-0.0000, -2.1627,  1.1285,  0.1933,  0.0000,  1.2196,  0.1904,
          -0.0000,  1.0273,  2.8639]]])
