# Tensorflow vs Pytorch

In [1]:
import sys, os, random
import numpy as np

In [2]:
import tensorflow as tf
import torch
# Report the library versions this notebook was run with.
msg = "tensorflow: {}, torch: {}"
print(msg.format(tf.__version__, torch.__version__))

tensorflow: 2.0.0, torch: 0.4.1


TensorFlow can cap the amount of GPU memory it reserves (see the cells below). There is no way to do this in PyTorch. However, PyTorch doesn’t pre-occupy the GPU’s entire memory, so if your computation only uses 50% of the GPU memory, only that much is locked by PyTorch.

In [3]:
# Enumerate the physical CPU and GPU devices visible to TensorFlow.
cpus = tf.config.experimental.list_physical_devices('CPU')
gpus = tf.config.experimental.list_physical_devices('GPU')

In [4]:
# Limit GPU memory: cap the first GPU at 512 MB through a virtual device.
MEMORY_LIMIT_CONFIG = [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=512)]
msg = "limit option: {}"
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(gpus[0], MEMORY_LIMIT_CONFIG)
        print(msg.format(MEMORY_LIMIT_CONFIG))
    except RuntimeError as err:
        # Virtual devices must be configured before the GPU has been initialized;
        # report the problem instead of aborting the notebook.
        print(err)
else:
    # No GPU visible: gpus[0] would raise IndexError, so skip the cap.
    print("no GPU found; memory limit not applied")

limit option: [VirtualDeviceConfiguration(memory_limit=512)]


In [5]:
# # only use CPU
# os.environ["CUDA_VISIBLE_DEVICES"]="-1"

# Generate Dataset

In [6]:
V = 1000 # vocabulary size
# B: batch size, D: embedding dimension, T: (padded) sequence length, H: LSTM hidden units
B, D, T, H = 2, 3, 5, 2

In [7]:
# Draw token ids from [1, V): id 0 is reserved for padding, so a real token
# must never be 0 (otherwise `mask` below would disagree with `x_len`).
x = np.random.randint(1, V, size=(B, T), dtype=int)
# Lengths are drawn from [1, T]; a zero-length example triggers the errors
# discussed later in this notebook.
# x_len = np.random.randint(0, T + 1, size=(B, ), dtype=int) # This will cause Error!!
x_len = np.random.randint(1, T + 1, size=(B, ), dtype=int)
# Overwrite everything at or beyond each example's length with the padding id 0.
for row, length in zip(x, x_len):
    row[length:] = 0
mask = x != 0
msg = "x:\n{}\nx_len:\n{}\nmask:\n{}"
print(msg.format(x, x_len, mask))

x:
[[595 856 358 165   0]
 [705   0   0   0   0]]
x_len:
[4 1]
mask:
[[ True  True  True  True False]
 [ True False False False False]]


## Encoding: Embedding, LSTM

### 1. tensorflow

If `tf.test.is_gpu_available()` is executed, all of the GPU memory can be pre-occupied by TensorFlow.

In [8]:
# tf.test.is_gpu_available()

In [9]:
import tensorflow.keras.layers as L

In [10]:
# convert to tensor
inp = tf.convert_to_tensor(x, dtype=tf.int32)
inp_len  = tf.convert_to_tensor(x_len, dtype=tf.int32)
mask = tf.convert_to_tensor(mask, dtype=tf.bool)

In [11]:
inp, inp_len

(<tf.Tensor: id=0, shape=(2, 5), dtype=int32, numpy=
 array([[595, 856, 358, 165,   0],
        [705,   0,   0,   0,   0]], dtype=int32)>,
 <tf.Tensor: id=1, shape=(2,), dtype=int32, numpy=array([4, 1], dtype=int32)>)

In [12]:
# Keras layers under test. mask_zero is not set here, so the embedding
# layer does not produce a mask (the next cells print None for it).
# embed = L.Embedding(V, D, mask_zero=True)
embed = L.Embedding(V, D)
lstm = L.LSTM(units=H, return_sequences=True, return_state=True)
# merge_mode=None: forward and backward outputs are returned as separate tensors.
blstm = L.Bidirectional(layer=lstm, merge_mode=None)

In [13]:
# [B, T] ids -> [B, T, D] vectors. Note that padding id 0 still maps to a
# non-zero vector with the Keras embedding.
embed(inp)

<tf.Tensor: id=17, shape=(2, 5, 3), dtype=float32, numpy=
array([[[ 0.01795701, -0.03862273, -0.00558972],
        [ 0.01919926, -0.04094749,  0.01460603],
        [-0.01255976,  0.02137837, -0.02569915],
        [ 0.03439699, -0.00957326,  0.02156724],
        [-0.03864299,  0.00503808, -0.02458462]],

       [[ 0.00422397, -0.03477051,  0.00359398],
        [-0.03864299,  0.00503808, -0.02458462],
        [-0.03864299,  0.00503808, -0.02458462],
        [-0.03864299,  0.00503808, -0.02458462],
        [-0.03864299,  0.00503808, -0.02458462]]], dtype=float32)>

In [14]:
# If mask_zero=True, the mask can be obtained from the embedding layer.
# The layer above was built without mask_zero, so both lines print None.
print(embed.compute_mask(inp)) 
print(embed(inp)._keras_mask) # another way: the mask attached to the output tensor.

None
None


**In Tensorflow** ...

<font color=red> Please note:</font> An error can occur if <mark>all sequence values are zeros in an example</mark> (i.e. its length is 0). cuDNN cannot process this case when the LSTM module is used.  
The error message looks as follows.

<font color=red>UnknownError:</font> CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1424): 'cudnnSetRNNDataDescriptor( data_desc.get(), data_type, layout, max_seq_length, batch_size, data_size, seq_lengths_array, (void*)&padding_fill)' [Op:CudnnRNNV3]



In [15]:
# Both calls return [h, ht, ct]; only the result of the last one is displayed.
lstm(embed(inp)) # no explicit mask: one is applied automatically only if embed.mask_zero=True.
lstm(embed(inp), mask=mask) # manually plug in the mask values.

[<tf.Tensor: id=294, shape=(2, 5, 2), dtype=float32, numpy=
 array([[[ 0.00317005,  0.00056077],
         [ 0.0047301 ,  0.00393966],
         [ 0.00361286, -0.00128846],
         [-0.00114782, -0.00159627],
         [ 0.        ,  0.        ]],
 
        [[ 0.00390952,  0.00336001],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ]]], dtype=float32)>,
 <tf.Tensor: id=298, shape=(2, 2), dtype=float32, numpy=
 array([[-0.00114782, -0.00159627],
        [ 0.00390952,  0.00336001]], dtype=float32)>,
 <tf.Tensor: id=302, shape=(2, 2), dtype=float32, numpy=
 array([[-0.002287  , -0.0031701 ],
        [ 0.0077549 ,  0.00679565]], dtype=float32)>]

In [16]:
# Bidirectional hands the first half of `initial_state` to the forward LSTM and
# the second half to the backward LSTM, so the order is
# [ht_fw, ct_fw, ht_bw, ct_bw].
# Draw four independent tensors: `[t] * 4` would reuse one and the same tensor
# for every state.
init_states = [tf.random.normal(shape=[B, H]) for _ in range(4)]
blstm(embed(inp), mask=mask, initial_state=init_states)
# Only the result of this last call is displayed.
blstm(embed(inp), mask=mask) # outputs: [hf, hb, htf, ctf, htb, ctb]

[<tf.Tensor: id=762, shape=(2, 5, 2), dtype=float32, numpy=
 array([[[-0.00506626, -0.00420797],
         [-0.00882048, -0.0062254 ],
         [-0.00710497, -0.00252964],
         [-0.00567203, -0.00647691],
         [ 0.        ,  0.        ]],
 
        [[-0.0040854 , -0.00124973],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ]]], dtype=float32)>,
 <tf.Tensor: id=903, shape=(2, 5, 2), dtype=float32, numpy=
 array([[[-0.00546061, -0.01607447],
         [-0.00268578, -0.0077665 ],
         [ 0.00279986, -0.00162677],
         [-0.00161444, -0.00275692],
         [ 0.        ,  0.        ]],
 
        [[-0.00383861, -0.00539634],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ]]], dtype=float32)>,
 <tf.Tensor: id=766, shape=(2, 2), dtype=float32, numpy=
 array([[-0.00567203, -0.00647691],
   

### 2. pytorch

In [17]:
# Check that PyTorch can see a CUDA device; the PyTorch cells below move tensors and modules to the GPU.
torch.cuda.is_available()

True

In [18]:
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

In [19]:
# Convert to torch.Tensor (int64). Use the GPU when one is available and fall
# back to the CPU otherwise, instead of failing inside an unconditional .cuda().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inp = torch.tensor(x, dtype=torch.long, device=device)
inp_len = torch.tensor(x_len, dtype=torch.long, device=device)

In [20]:
inp, inp_len

(tensor([[595, 856, 358, 165,   0],
         [705,   0,   0,   0,   0]], device='cuda:0'),
 tensor([4, 1], device='cuda:0'))

In [21]:
# Build the PyTorch layers on the GPU when one is available, else on the CPU
# (an unconditional .cuda() raises on machines without CUDA).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# padding_idx=0: the embedding output for id 0 is a zero vector.
embed = nn.Embedding(num_embeddings=V, embedding_dim=D, padding_idx=0).to(device)
lstm = nn.LSTM(input_size=D, hidden_size=H, num_layers=1, batch_first=True).to(device)
blstm = nn.LSTM(input_size=D, hidden_size=H, num_layers=1, batch_first=True, bidirectional=True).to(device)

In [22]:
# With padding_idx=0 the rows for padding id 0 come out as zero vectors.
embed(inp)

tensor([[[-0.0108,  0.0929, -1.6743],
         [-0.4543,  0.8441, -0.5434],
         [-1.1308,  1.0828, -0.6759],
         [ 1.0714,  0.3594,  1.7790],
         [ 0.0000,  0.0000,  0.0000]],

        [[ 0.2507, -1.3260, -0.7707],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000]]],
       device='cuda:0', grad_fn=<EmbeddingBackward>)

In [23]:
# The default initial states are all zeros.
# h0 = torch.randn(1*1, B, H) # shape: (num_layers * num_directions, batch, hidden_size)
# c0 = torch.randn(1*1, B, H) # same shape as h0 (this LSTM is unidirectional)
# (inp, (h0, c0)) can also be passed as the input.
# No lengths are given here, so padded steps are processed like real ones:
# the outputs at padded positions are non-zero.
lstm(embed(inp)) # outputs (h, (ht, ct))

(tensor([[[-0.1659,  0.1207],
          [-0.1313,  0.1584],
          [-0.1283,  0.1270],
          [ 0.0030,  0.1742],
          [-0.0242,  0.2194]],
 
         [[ 0.1015,  0.2408],
          [-0.0062,  0.3109],
          [-0.0354,  0.3059],
          [-0.0461,  0.3046],
          [-0.0500,  0.3044]]], device='cuda:0', grad_fn=<CudnnRnnBackward>),
 (tensor([[[-0.0242,  0.2194],
           [-0.0500,  0.3044]]], device='cuda:0', grad_fn=<CudnnRnnBackward>),
  tensor([[[-0.0628,  0.4446],
           [-0.1354,  0.6597]]], device='cuda:0', grad_fn=<CudnnRnnBackward>)))

In [24]:
# Initial states, shape: (num_layers * num_directions, batch, hidden_size).
# The LSTM is bidirectional, hence 1*2. Place them on the same device as the
# input rather than hard-coding .cuda().
h0 = torch.randn(1*2, B, H).to(inp.device)
c0 = torch.randn(1*2, B, H).to(inp.device)
blstm(embed(inp), (h0, c0))

(tensor([[[-0.3166, -0.0375,  0.1704, -0.0853],
          [-0.0481,  0.2206,  0.4176, -0.0310],
          [ 0.0783,  0.3529,  0.4082, -0.0505],
          [ 0.0209, -0.0261,  0.3105, -0.5865],
          [-0.0894,  0.0179, -0.1782, -0.0385]],
 
         [[ 0.0031,  0.1702,  0.1792, -0.2081],
          [-0.0807,  0.0830,  0.4344, -0.3378],
          [-0.1149,  0.0826,  0.4302, -0.3379],
          [-0.1261,  0.0914,  0.4153, -0.3432],
          [-0.1296,  0.0972,  0.4747, -0.1887]]],
        device='cuda:0', grad_fn=<CudnnRnnBackward>),
 (tensor([[[-0.0894,  0.0179],
           [-0.1296,  0.0972]],
  
          [[ 0.1704, -0.0853],
           [ 0.1792, -0.2081]]], device='cuda:0', grad_fn=<CudnnRnnBackward>),
  tensor([[[-0.1532,  0.0319],
           [-0.2321,  0.1690]],
  
          [[ 0.3068, -0.3076],
           [ 0.4212, -1.1216]]], device='cuda:0', grad_fn=<CudnnRnnBackward>)))

pack, unpack techniques can be used easily in pytorch. [korean blog](https://simonjisu.github.io/nlp/2018/07/05/packedsequence.html)

In [25]:
# Sort the lengths in descending order; `indices` is the permutation that
# reorders the batch to match.
# NOTE(review): packing presumably requires length-sorted input in this torch version -- confirm.
sorted_inp_len, indices = torch.sort(inp_len, dim=0, descending=True)
sorted_inp_len, indices

(tensor([4, 1], device='cuda:0'), tensor([0, 1], device='cuda:0'))

In [26]:
# Embedded batch reordered by descending length.
embed(inp)[indices]

tensor([[[-0.0108,  0.0929, -1.6743],
         [-0.4543,  0.8441, -0.5434],
         [-1.1308,  1.0828, -0.6759],
         [ 1.0714,  0.3594,  1.7790],
         [ 0.0000,  0.0000,  0.0000]],

        [[ 0.2507, -1.3260, -0.7707],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000]]],
       device='cuda:0', grad_fn=<TakeBackward>)

If an example's sequence is all zero vectors (i.e. its length is 0), packing raises an `Error`.  
The message is shown as follows.  
<font color=red>ValueError</font>: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0

In [27]:
# Pack the length-sorted, padded embeddings; the lengths are passed as a plain
# Python list. Only the non-padded steps are kept in the packed data.
packed_embeddings = pack_padded_sequence(embed(inp)[indices], sorted_inp_len.data.tolist(), batch_first=True)
packed_embeddings

PackedSequence(data=tensor([[-0.0108,  0.0929, -1.6743],
        [ 0.2507, -1.3260, -0.7707],
        [-0.4543,  0.8441, -0.5434],
        [-1.1308,  1.0828, -0.6759],
        [ 1.0714,  0.3594,  1.7790]],
       device='cuda:0', grad_fn=<PackPaddedBackward>), batch_sizes=tensor([2, 1, 1, 1], grad_fn=<PackPaddedBackward>))

In [28]:
# Run the LSTM on the packed batch; the sequence output is packed as well.
# NOTE: despite their names, packed_ht_fw / packed_ht_bw hold the final hidden
# state (ht) and the final cell state (ct) -- `lstm` is not bidirectional.
packed_h, (packed_ht_fw, packed_ht_bw) = lstm(packed_embeddings) # outputs packed results.
packed_h, (packed_ht_fw, packed_ht_bw)

(PackedSequence(data=tensor([[-0.1659,  0.1207],
         [ 0.1015,  0.2408],
         [-0.1313,  0.1584],
         [-0.1283,  0.1270],
         [ 0.0030,  0.1742]], device='cuda:0', grad_fn=<CudnnRnnBackward>), batch_sizes=tensor([2, 1, 1, 1], grad_fn=<PackPaddedBackward>)),
 (tensor([[[0.0030, 0.1742],
           [0.1015, 0.2408]]], device='cuda:0', grad_fn=<CudnnRnnBackward>),
  tensor([[[0.0086, 0.2180],
           [0.1640, 0.7270]]], device='cuda:0', grad_fn=<CudnnRnnBackward>)))

In [29]:
# Unpack the result: returns (padded outputs, lengths). Padded positions are
# zero and the time axis is as long as the longest sequence in the batch.
pad_packed_sequence(packed_h, batch_first=True) # unpack the result.

(tensor([[[-0.1659,  0.1207],
          [-0.1313,  0.1584],
          [-0.1283,  0.1270],
          [ 0.0030,  0.1742]],
 
         [[ 0.1015,  0.2408],
          [ 0.0000,  0.0000],
          [ 0.0000,  0.0000],
          [ 0.0000,  0.0000]]], device='cuda:0', grad_fn=<TransposeBackward0>),
 tensor([4, 1]))

### Big-Difference tensorflow vs pytorch
1. **Embedding**  
    **In Tensorflow**, even with `mask_zero=True`, the embedding layer's output for `padding id=0` is not a zero vector.  
    <font color=red>On the other hand</font>, 
    **in Pytorch**, the embedding layer's `padding_idx` argument makes the output for that index a zero vector.
    
2. **LSTM**   
    * **In Tensorflow**, an example whose sequence is all zeros (length 0) causes an `Error` in GPU computing.  
      <font color=red>On the other hand</font>, **in Pytorch**, a sequence of all zeros does not cause an `Error` in GPU computing.
    * Tensorflow's LSTM can mask its outputs automatically, but Pytorch does not have this.
    * Pytorch provides the pack/unpack technique for efficient computation when processing LSTM sequences.  
      <font color=red>However</font>, with this technique, an example whose sequence is all zeros (length 0) causes an `Error`.
      
   <font color=skyblue> Solution</font>: To prevent the `Error` related to all-zero sequences, all input lengths (`x_len`) should be larger than 0.
    
    

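Create a class `Embedding` in Tensorflow which operates like Pytorch's `nn.Embedding` with `padding_idx`: positions holding the padding id are mapped to zero vectors.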

In [30]:
class Embedding(tf.keras.layers.Layer):
    """Embedding lookup that returns zero vectors at padding positions.

    Every position whose id equals ``padding_idx`` is multiplied by zero after
    the lookup, so the padding id always yields an all-zero vector.

    Args:
        input_dim: V (vocabulary size), the number of rows of the table.
        output_dim: D, the size of each embedding vector.
        padding_idx: id whose output is forced to zeros (default 0).
            ``None`` disables the zeroing.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.

    Call Args:
        inputs: integer ids, e.g. shape [B, T].

    Returns:
        Float tensor with one trailing axis of size D added, e.g. [B, T, D].
    """

    def __init__(self, input_dim, output_dim, padding_idx=0, **kwargs):
        super(Embedding, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.padding_idx = padding_idx

    def build(self, input_shape):
        """Create the [V, D] embedding table."""
        # Named so the variable can be identified in checkpoints / saved weights.
        self.embeddings = self.add_weight(
            name='embeddings',
            shape=(self.input_dim, self.output_dim),
            initializer='random_normal',
            dtype='float32')

    def call(self, inputs):
        """Look up `inputs` and zero the vectors at padding positions."""
        out = tf.nn.embedding_lookup(self.embeddings, inputs)  # [B, T, D]
        if self.padding_idx is None:
            return out
        keep = tf.not_equal(inputs, self.padding_idx)  # [B, T], bool
        # A trailing axis of size 1 broadcasts over D, so no tf.tile is needed;
        # casting to out.dtype keeps the multiply valid for any float dtype.
        keep = tf.cast(keep, out.dtype)[..., tf.newaxis]  # [B, T, 1]
        return tf.multiply(out, keep)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(Embedding, self).get_config()
        config.update({
            'input_dim': self.input_dim,
            'output_dim': self.output_dim,
            'padding_idx': self.padding_idx,
        })
        return config
  

In [31]:
# Rebind `embed` to the custom TensorFlow layer defined above (id 0 is the padding id).
embed = Embedding(V, D, padding_idx=0)

#### regenerate dataset

In [32]:
# Draw token ids from [1, V): id 0 is reserved for padding, so a real token
# must never be 0 (otherwise `mask` below would disagree with `x_len`).
x = np.random.randint(1, V, size=(B, T), dtype=int)
# Lengths are drawn from [1, T]; a zero-length example triggers the errors
# described above.
# x_len = np.random.randint(0, T + 1, size=(B, ), dtype=int) # This will cause Error!!
x_len = np.random.randint(1, T + 1, size=(B, ), dtype=int)
# Overwrite everything at or beyond each example's length with the padding id 0.
for row, length in zip(x, x_len):
    row[length:] = 0
mask = x != 0
msg = "x:\n{}\nx_len:\n{}\nmask:\n{}"
print(msg.format(x, x_len, mask))

x:
[[101  25  11   0   0]
 [757 978   0   0   0]]
x_len:
[3 2]
mask:
[[ True  True  True False False]
 [ True  True False False False]]


In [33]:
# convert to tensor
inp = tf.convert_to_tensor(x, dtype=tf.int32)
inp_len  = tf.convert_to_tensor(x_len, dtype=tf.int32)
mask = tf.convert_to_tensor(mask, dtype=tf.bool)

In [34]:
inp, mask

(<tf.Tensor: id=904, shape=(2, 5), dtype=int32, numpy=
 array([[101,  25,  11,   0,   0],
        [757, 978,   0,   0,   0]], dtype=int32)>,
 <tf.Tensor: id=906, shape=(2, 5), dtype=bool, numpy=
 array([[ True,  True,  True, False, False],
        [ True,  True, False, False, False]])>)

In [35]:
# test_mask = np.array([[True, False, False, False, False],
#         [ True,  True,  True, False, False]])
# test_mask = tf.convert_to_tensor(test_mask)

In [36]:
# With the custom layer, padding positions (id 0) now come out as zero vectors.
embed(inp)

<tf.Tensor: id=930, shape=(2, 5, 3), dtype=float32, numpy=
array([[[ 0.03622475, -0.05047853,  0.02699393],
        [-0.01007628, -0.01224462,  0.04993922],
        [ 0.0482391 , -0.04658696, -0.05432926],
        [-0.        ,  0.        ,  0.        ],
        [-0.        ,  0.        ,  0.        ]],

       [[-0.07953385,  0.04829338, -0.09294441],
        [-0.02257329,  0.03778542, -0.0711035 ],
        [-0.        ,  0.        ,  0.        ],
        [-0.        ,  0.        ,  0.        ],
        [-0.        ,  0.        ,  0.        ]]], dtype=float32)>

In [37]:
# Re-create the Keras layers: `lstm` / `blstm` were rebound to torch modules above.
lstm = L.LSTM(units=H, return_sequences=True, return_state=True)
# merge_mode=None: forward and backward outputs are returned as separate tensors.
blstm = L.Bidirectional(layer=lstm, merge_mode=None)

In [38]:
# Masked steps output zeros, and the final state ht equals the output at the
# last valid step of each example.
lstm(embed(inp), mask=mask) #  [h, ht, ct]

[<tf.Tensor: id=1112, shape=(2, 5, 2), dtype=float32, numpy=
 array([[[ 0.00167775,  0.00632184],
         [ 0.00311793,  0.00512636],
         [ 0.00476312,  0.00738899],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ]],
 
        [[ 0.00939463, -0.01418047],
         [ 0.00632864, -0.01470776],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ]]], dtype=float32)>,
 <tf.Tensor: id=1116, shape=(2, 2), dtype=float32, numpy=
 array([[ 0.00476312,  0.00738899],
        [ 0.00632864, -0.01470776]], dtype=float32)>,
 <tf.Tensor: id=1120, shape=(2, 2), dtype=float32, numpy=
 array([[ 0.00965497,  0.01484598],
        [ 0.01252178, -0.0303885 ]], dtype=float32)>]

In [39]:
# Outputs: [hf, hb, htf, ctf, htb, ctb] -- the forward states come first, then the backward ones.
blstm(embed(inp), mask=mask)

[<tf.Tensor: id=1356, shape=(2, 5, 2), dtype=float32, numpy=
 array([[[ 0.00474327,  0.00231616],
         [ 0.00752271,  0.00516678],
         [ 0.00160015,  0.00578936],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ]],
 
        [[-0.01166129,  0.00491333],
         [-0.01529764,  0.00018354],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ]]], dtype=float32)>,
 <tf.Tensor: id=1497, shape=(2, 5, 2), dtype=float32, numpy=
 array([[[-0.00333201, -0.00958159],
         [-0.00097156, -0.00666697],
         [-0.00971548,  0.00756083],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ]],
 
        [[ 0.00411886,  0.01158959],
         [-0.00268966,  0.01074356],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ],
         [ 0.        ,  0.        ]]], dtype=float32)>,
 <tf.Tensor: id=1360, shape=(2, 2), dtype=float32, numpy=
 array([[ 0.00160015,  0.00578936],
