# Tensorflow vs Pytorch

In [1]:
import sys, os, random
import numpy as np

In [2]:
import tensorflow as tf
import torch
msg = "tensorflow: {}, torch: {}"
print(msg.format(tf.__version__, torch.__version__))

tensorflow: 2.0.0, torch: 0.4.1


there is no way to do this in pytorch. However, PyTorch doesn’t pre-occupy the GPU’s entire memory, so if your computation only uses 50% of GPU, only that much is locked by PyTorch

In [3]:
cpus = tf.config.experimental.list_physical_devices('CPU')
gpus = tf.config.experimental.list_physical_devices('GPU')

In [4]:
# # GPU 메모리 제한하기
MEMORY_LIMIT_CONFIG = [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=512)]
tf.config.experimental.set_virtual_device_configuration(gpus[0], MEMORY_LIMIT_CONFIG)
msg = "limit option: {}"
print(msg.format(MEMORY_LIMIT_CONFIG))

limit option: [VirtualDeviceConfiguration(memory_limit=512)]


In [5]:
# # only use CPU
# os.environ["CUDA_VISIBLE_DEVICES"]="-1"

# Generate Dataset

In [10]:
V = 1000 # voca sizs
B, D, T, H = 2, 3, 5, 2

In [11]:
x = np.random.randint(0, 1000, size=(B, T), dtype=int)
x_len = np.random.randint(0, T + 1, size=(B, ), dtype=int)
for i in range(len(x)):
    x[i][x_len[i]:] = 0
mask = x!=0
msg = "x:\n{}\nx_len:\n{}\nmask:\n{}"
print(msg.format(x, x_len, mask))

x:
[[  0   0   0   0   0]
 [947   0   0   0   0]]
x_len:
[0 1]
mask:
[[False False False False False]
 [ True False False False False]]


## Encodeing: Embedding, LSTM

### 1. tensorflow

In [19]:
tf.test.is_gpu_available()

True

In [20]:
import tensorflow.keras.layers as L

In [21]:
# convert to tensor
inp = tf.convert_to_tensor(x, dtype=tf.int32)
inp_len  = tf.convert_to_tensor(x_len, dtype=tf.int32)
mask = tf.convert_to_tensor(mask, dtype=tf.bool)

In [22]:
inp, inp_len

(<tf.Tensor: id=0, shape=(2, 5), dtype=int32, numpy=
 array([[  0,   0,   0,   0,   0],
        [947,   0,   0,   0,   0]], dtype=int32)>,
 <tf.Tensor: id=1, shape=(2,), dtype=int32, numpy=array([0, 1], dtype=int32)>)

In [23]:
# embed = L.Embedding(V, D, mask_zero=True)
embed = L.Embedding(V, D)
lstm = L.LSTM(units=H, return_sequences=True, return_state=True)
blstm = L.Bidirectional(layer=lstm, merge_mode=None)

In [24]:
embed(inp)

<tf.Tensor: id=17, shape=(2, 5, 3), dtype=float32, numpy=
array([[[ 0.02016583, -0.0327732 ,  0.00697129],
        [ 0.02016583, -0.0327732 ,  0.00697129],
        [ 0.02016583, -0.0327732 ,  0.00697129],
        [ 0.02016583, -0.0327732 ,  0.00697129],
        [ 0.02016583, -0.0327732 ,  0.00697129]],

       [[ 0.00281926,  0.00876669,  0.00381393],
        [ 0.02016583, -0.0327732 ,  0.00697129],
        [ 0.02016583, -0.0327732 ,  0.00697129],
        [ 0.02016583, -0.0327732 ,  0.00697129],
        [ 0.02016583, -0.0327732 ,  0.00697129]]], dtype=float32)>

In [25]:
#if mask_zero==True, mask values can be compute using embedding methods.
print(embed.compute_mask(inp)) 
print(embed(inp)._keras_mask) # another way.

None
None


<font color=red> Please Note that :</font> Error can occurs if all sequence 
UnknownError: CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1424): 'cudnnSetRNNDataDescriptor( data_desc.get(), data_type, layout, max_seq_length, batch_size, data_size, seq_lengths_array, (void*)&padding_fill)' [Op:CudnnRNNV3]

In [26]:
lstm(embed(inp)) # [h, ht, ct], automatically applied if embed.mask_zero=True.
lstm(embed(inp), mask=mask) # manully plug-in mask values.

UnknownError: CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1424): 'cudnnSetRNNDataDescriptor( data_desc.get(), data_type, layout, max_seq_length, batch_size, data_size, seq_lengths_array, (void*)&padding_fill)' [Op:CudnnRNNV3]

In [None]:
blstm(embed(inp), mask=mask) # [hf, hb, htf, htb, ctf, ctb]

### 2. pytorch

In [6]:
torch.cuda.is_available()

True

In [12]:
import torch.nn as nn 

In [13]:
# conver to torch.Tensor
inp = torch.LongTensor(x)
inp_len = torch.LongTensor(x_len)
inp = inp.cuda()
inp_len = inp_len.cuda()

In [14]:
inp, inp_len

(tensor([[  0,   0,   0,   0,   0],
         [947,   0,   0,   0,   0]], device='cuda:0'),
 tensor([0, 1], device='cuda:0'))

In [15]:
embed = nn.Embedding(num_embeddings=V, embedding_dim=D, padding_idx=0).cuda()
lstm = nn.LSTM(input_size=D, hidden_size=H, num_layers=1, batch_first=True).cuda()
blstm = nn.LSTM(input_size=D, hidden_size=H, num_layers=1, batch_first=True, bidirectional=True).cuda()

In [16]:
embed(inp)

tensor([[[ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000]],

        [[-0.2215, -0.9797,  0.4111],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000]]],
       device='cuda:0', grad_fn=<EmbeddingBackward>)

In [17]:
# defaults initial states are all zeros.
# h0 = torch.randn(1*1, B, H) # shape: (num_layers * num_directions, batch, hidden_size)
# c0 = torch.randn(1*2, B, H)
# inp, (h0, c0) can be a input
lstm(embed(inp)) # outputs (h, (ht, ct))

(tensor([[[-0.0626, -0.0187],
          [-0.0904, -0.0289],
          [-0.1025, -0.0346],
          [-0.1079, -0.0379],
          [-0.1102, -0.0397]],
 
         [[-0.1203,  0.1109],
          [-0.1217,  0.0531],
          [-0.1176,  0.0164],
          [-0.1153, -0.0068],
          [-0.1139, -0.0211]]], device='cuda:0', grad_fn=<CudnnRnnBackward>),
 (tensor([[[-0.1102, -0.0397],
           [-0.1139, -0.0211]]], device='cuda:0', grad_fn=<CudnnRnnBackward>),
  tensor([[[-0.2407, -0.0895],
           [-0.2471, -0.0480]]], device='cuda:0', grad_fn=<CudnnRnnBackward>)))

In [18]:
h0 = torch.randn(1*2, B, H).cuda() # shape: (num_layers * num_directions, batch, hidden_size)
c0 = torch.randn(1*2, B, H).cuda()
blstm(embed(inp), (h0, c0))

(tensor([[[-0.2138,  0.2176,  0.0658,  0.2757],
          [-0.1058,  0.3534,  0.0386,  0.2552],
          [-0.0340,  0.3848, -0.0193,  0.2229],
          [ 0.0064,  0.3875, -0.1266,  0.1808],
          [ 0.0269,  0.3846, -0.2272,  0.1215]],
 
         [[-0.0735, -0.0709,  0.0770,  0.1998],
          [-0.0450,  0.2410,  0.1109,  0.2938],
          [-0.0141,  0.3454,  0.1409,  0.2843],
          [ 0.0122,  0.3746,  0.1962,  0.2476],
          [ 0.0283,  0.3806,  0.1528,  0.1089]]],
        device='cuda:0', grad_fn=<CudnnRnnBackward>),
 (tensor([[[0.0269, 0.3846],
           [0.0283, 0.3806]],
  
          [[0.0658, 0.2757],
           [0.0770, 0.1998]]], device='cuda:0', grad_fn=<CudnnRnnBackward>),
  tensor([[[0.0504, 0.7326],
           [0.0533, 0.7264]],
  
          [[0.1432, 0.7036],
           [0.1805, 0.4572]]], device='cuda:0', grad_fn=<CudnnRnnBackward>)))

### Difference tensorflow vs pytorch
1. Embedding  
    **In Tensorflow**, even though `mask_zero=True`, the outputs of embedding layer for `padding id=0` does not zero-vector.  
    <font color=red>On the other hand</font>, 
    **In Pytorch**, embedding layer's signiture `padding_idx` can determine the output to become zero-vector.