In [1]:
import tensorflow as tf
import numpy as np

### The tf.string data type

In [2]:
tf.constant(u"ॐ गं गणपतये नम:")

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe0\xa5\x90 \xe0\xa4\x97\xe0\xa4\x82 \xe0\xa4\x97\xe0\xa4\xa3\xe0\xa4\xaa\xe0\xa4\xa4\xe0\xa4\xaf\xe0\xa5\x87 \xe0\xa4\xa8\xe0\xa4\xae:'>

In [3]:
#s = '''\xe0\xa5\x90 \xe0\xa4\x97\xe0\xa4\x82 \xe0\xa4\x97\xe0\xa4\xa3\xe0\xa4\xaa\xe0\xa4\xa4\xe0\xa4\xaf\xe0\xa5\x87 \xe0\xa4\xa8\xe0\xa4\xae:'''
#s.encode('raw-unicode-escape').decode('utf8')
tf.constant(u"ॐ गं गणपतये नम:").numpy().decode()

'ॐ गं गणपतये नम:'

In [4]:
tf.constant(u"Thanks 😊")

<tf.Tensor: shape=(), dtype=string, numpy=b'Thanks \xf0\x9f\x98\x8a'>

In [5]:
tf.constant([u"You're", u"welcome!"])

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b"You're", b'welcome!'], dtype=object)>

In [6]:
tf.constant([u"You're", u"welcome!"]).shape

TensorShape([2])

## Representing Unicode

There are two standard ways to represent a Unicode string in TensorFlow:
- string scalar — where the sequence of code points is encoded using a known character encoding.
- int32 vector — where each position contains a single code point.

In [7]:
# Unicode string, represented as a UTF-8 encoded string scalar.
text_utf8 = tf.constant(u"语言处理")
text_utf8

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

In [8]:
text_utf8.numpy().decode()

'语言处理'

In [9]:
# Unicode string, represented as a UTF-16-BE encoded string scalar.
text_utf16be = tf.constant(u"语言处理".encode("UTF-16-BE"))
text_utf16be

<tf.Tensor: shape=(), dtype=string, numpy=b'\x8b\xed\x8a\x00Y\x04t\x06'>

In [10]:
# Unicode string, represented as a vector of Unicode code points.
text_chars = tf.constant([ord(char) for char in u"语言处理"])
text_chars

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([35821, 35328, 22788, 29702])>

### With Hindi & Sanskrit text

In [11]:
# Unicode string, represented as a UTF-8 encoded string scalar.
text_utf8 = tf.constant(u"ॐ")
print('UTF-8 encoded >>',text_utf8)

# Decode UTF-8 encoded string
print('Decode UTF-8 encoded value >>', text_utf8.numpy().decode())

# Unicode string, represented as a UTF-16-BE encoded string scalar.
text_utf16be = tf.constant(u"ॐ".encode("UTF-16-BE"))
print('UTF-16BE encoded >>',text_utf16be)

# Decode UTF-8 encoded string
print('Decode UTF-16BE encoded value >>', text_utf16be.numpy().decode('UTF-16-BE'))

# Unicode string, represented as a vector of Unicode code points.
text_chars = tf.constant([ord(char) for char in u"ॐ"])
print('Unicode code points >>',text_chars)

UTF-8 encoded >> tf.Tensor(b'\xe0\xa5\x90', shape=(), dtype=string)
Decode UTF-8 encoded value >> ॐ
UTF-16BE encoded >> tf.Tensor(b'\tP', shape=(), dtype=string)
Decode UTF-16BE encoded value >> ॐ
Unicode code points >> tf.Tensor([2384], shape=(1,), dtype=int32)


### Converting between representations
TensorFlow provides operations to convert between these different representations:
- `tf.strings.unicode_decode`: Converts an encoded string scalar to a vector of code points.
- `tf.strings.unicode_encode`: Converts a vector of code points to an encoded string scalar.
- `tf.strings.unicode_transcode`: Converts an encoded string scalar to a different encoding.

In [12]:
# Converts an encoded string scalar to a vector of code points.
text_utf8 = tf.constant(u"ॐ")

tf.strings.unicode_decode(text_utf8, 
                          input_encoding='UTF-8')

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([2384])>

In [13]:
# Converts a vector of code points to an encoded string scalar.
text_chars = tf.constant([ord(char) for char in u"ॐ"])

tf.strings.unicode_encode(text_chars, 
                          output_encoding='UTF-8')

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe0\xa5\x90'>

In [14]:
#  Converts an encoded string scalar to a different encoding.
tf.strings.unicode_transcode(text_utf8,
                             input_encoding='UTF8',
                             output_encoding='UTF-16-BE')

<tf.Tensor: shape=(), dtype=string, numpy=b'\tP'>

### Batch dimensions
When decoding multiple strings, the number of characters in each string may not be equal. The return result is a `tf.RaggedTensor`, where the innermost dimension length varies depending on the number of characters in each string.

In [15]:
# A batch of Unicode strings, each represented as a UTF8-encoded string.
batch_utf8 = [s.encode('UTF-8') for s in
              [u'hÃllo', u'What is the weather tomorrow', u'Göödnight', u'😊']]
batch_chars_ragged = tf.strings.unicode_decode(batch_utf8,
                                               input_encoding='UTF-8')
for sentence_chars in batch_chars_ragged.to_list():
    print(sentence_chars)

[104, 195, 108, 108, 111]
[87, 104, 97, 116, 32, 105, 115, 32, 116, 104, 101, 32, 119, 101, 97, 116, 104, 101, 114, 32, 116, 111, 109, 111, 114, 114, 111, 119]
[71, 246, 246, 100, 110, 105, 103, 104, 116]
[128522]


 Use this `tf.RaggedTensor` directly, or convert it to a dense `tf.Tensor` with padding or a `tf.sparse.SparseTensor` using the methods `tf.RaggedTensor.to_tensor` and `tf.RaggedTensor.to_sparse`.

In [16]:
batch_chars_padded = batch_chars_ragged.to_tensor(default_value=-1)
print(batch_chars_padded.numpy())

[[   104    195    108    108    111     -1     -1     -1     -1     -1
      -1     -1     -1     -1     -1     -1     -1     -1     -1     -1
      -1     -1     -1     -1     -1     -1     -1     -1]
 [    87    104     97    116     32    105    115     32    116    104
     101     32    119    101     97    116    104    101    114     32
     116    111    109    111    114    114    111    119]
 [    71    246    246    100    110    105    103    104    116     -1
      -1     -1     -1     -1     -1     -1     -1     -1     -1     -1
      -1     -1     -1     -1     -1     -1     -1     -1]
 [128522     -1     -1     -1     -1     -1     -1     -1     -1     -1
      -1     -1     -1     -1     -1     -1     -1     -1     -1     -1
      -1     -1     -1     -1     -1     -1     -1     -1]]


In [17]:
batch_chars_sparse = batch_chars_ragged.to_sparse()

nrows, ncols = batch_chars_sparse.dense_shape.numpy()
elements = [['_' for i in range(ncols)] for j in range(nrows)]
for (row, col), value in zip(batch_chars_sparse.indices.numpy(), batch_chars_sparse.values.numpy()):
    elements[row][col] = str(value)
# max_width = max(len(value) for row in elements for value in row)
value_lengths = []
for row in elements:
    for value in row:
        value_lengths.append(len(value))
max_width = max(value_lengths)

print('[%s]' % '\n '.join(
    '[%s]' % ', '.join(value.rjust(max_width) for value in row)
    for row in elements))

[[   104,    195,    108,    108,    111,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _]
 [    87,    104,     97,    116,     32,    105,    115,     32,    116,    104,    101,     32,    119,    101,     97,    116,    104,    101,    114,     32,    116,    111,    109,    111,    114,    114,    111,    119]
 [    71,    246,    246,    100,    110,    105,    103,    104,    116,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _]
 [128522,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _,      _]]


When encoding multiple strings with the same lengths, use a `tf.Tensor` as the input.

In [18]:
tf.strings.unicode_encode([[99, 97, 116], [100, 111, 103], [99, 111, 119]], output_encoding='UTF-8')

<tf.Tensor: shape=(3,), dtype=string, numpy=array([b'cat', b'dog', b'cow'], dtype=object)>

In [19]:
tf.strings.unicode_encode(batch_chars_ragged, output_encoding='UTF-8')

<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'h\xc3\x83llo', b'What is the weather tomorrow',
       b'G\xc3\xb6\xc3\xb6dnight', b'\xf0\x9f\x98\x8a'], dtype=object)>

If you have a tensor with multiple strings in padded or sparse format, convert it first into a `tf.RaggedTensor` before calling tf.strings.unicode_encode.

In [20]:
tf.strings.unicode_encode(
    tf.RaggedTensor.from_sparse(batch_chars_sparse),
    output_encoding='UTF-8')

<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'h\xc3\x83llo', b'What is the weather tomorrow',
       b'G\xc3\xb6\xc3\xb6dnight', b'\xf0\x9f\x98\x8a'], dtype=object)>

In [21]:
Tensor = tf.strings.unicode_encode(
    tf.RaggedTensor.from_tensor(batch_chars_padded, padding=-1),
    output_encoding='UTF-8')
Tensor

<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'h\xc3\x83llo', b'What is the weather tomorrow',
       b'G\xc3\xb6\xc3\xb6dnight', b'\xf0\x9f\x98\x8a'], dtype=object)>

In [22]:
for byt in Tensor.numpy():
    print(byt,' | ',byt.decode())

b'h\xc3\x83llo'  |  hÃllo
b'What is the weather tomorrow'  |  What is the weather tomorrow
b'G\xc3\xb6\xc3\xb6dnight'  |  Göödnight
b'\xf0\x9f\x98\x8a'  |  😊


## Unicode operations
### Character length
Use the unit parameter of the `tf.strings.length` op to indicate how character lengths should be computed. `unit` defaults to `"BYTE"`, but it can be set to other values, such as `"UTF8_CHAR"` or `"UTF16_CHAR"`, to determine the number of Unicode codepoints in each encoded string.

In [23]:
# Note that the final character takes up 4 bytes in UTF8.
thanks = u'Thanks 😊'.encode('UTF-16BE')
num_bytes = tf.strings.length(thanks).numpy()
num_chars = tf.strings.length(thanks, unit='UTF8_CHAR').numpy()
print('{} bytes; {} UTF-8 characters'.format(num_bytes, num_chars))

18 bytes; 18 UTF-8 characters


In [24]:
# Note that the final character takes up 4 bytes in UTF8.
thanks = u'Thanks 😊'.encode('UTF-8')
num_bytes = tf.strings.length(thanks).numpy()
num_chars = tf.strings.length(thanks, unit='UTF8_CHAR').numpy()
print('{} bytes; {} UTF-8 characters'.format(num_bytes, num_chars))

11 bytes; 8 UTF-8 characters


### Character substrings

In [25]:
#For each string in the input `Tensor`, creates a substring starting at index `pos` with a total length of `len`.
tf.strings.substr(thanks, pos=0, len=len(thanks)).numpy()

b'Thanks \xf0\x9f\x98\x8a'

In [26]:
tf.strings.substr(thanks, pos=0, len=len(thanks)).numpy().decode()

'Thanks 😊'

In [27]:
# Here, unit='BYTE' (default). Returns a single byte with len=1
tf.strings.substr(thanks, pos=7, len=4).numpy()

b'\xf0\x9f\x98\x8a'

In [28]:
tf.strings.substr(thanks, pos=7, len=4).numpy().decode()

'😊'

In [29]:
# Here, unit='BYTE' (default). Returns a single byte with len=1
tf.strings.substr(thanks, pos=7, len=1).numpy()

b'\xf0'

In [30]:
# Specifying unit='UTF8_CHAR', returns a single 4 byte character in this case
tf.strings.substr(thanks, pos=7, len=1, unit='UTF8_CHAR').numpy()

b'\xf0\x9f\x98\x8a'

In [31]:
tf.strings.substr(thanks, pos=7, len=1, unit='UTF8_CHAR').numpy().decode()

'😊'

### Split Unicode strings
The `tf.strings.unicode_split` operation splits unicode strings into substrings of individual characters.

In [32]:
tf.strings.unicode_split(thanks, 'UTF-8').numpy()

array([b'T', b'h', b'a', b'n', b'k', b's', b' ', b'\xf0\x9f\x98\x8a'],
      dtype=object)

### Byte offsets for characters
To align the character tensor generated by `tf.strings.unicode_decode` with the original string, it's useful to know the offset for where each character begins. The method `tf.strings.unicode_decode_with_offsets` is similar to `unicode_decode`, except that it returns a second tensor containing the start offset of each character.

In [33]:
codepoints, offsets = tf.strings.unicode_decode_with_offsets(u'🤣🎈🎉🎊😇', 'UTF-8')

for (codepoint, offset) in zip(codepoints.numpy(), offsets.numpy()):
    print('At byte offset : {} | codepoint : {} | string scalar : {} | Text : {}'.format(
        offset, 
        codepoint, 
        tf.strings.unicode_encode([codepoint], output_encoding='UTF-8'),
        tf.strings.unicode_encode([codepoint], output_encoding='UTF-8').numpy().decode()
    ))

At byte offset : 0 | codepoint : 129315 | string scalar : b'\xf0\x9f\xa4\xa3' | Text : 🤣
At byte offset : 4 | codepoint : 127880 | string scalar : b'\xf0\x9f\x8e\x88' | Text : 🎈
At byte offset : 8 | codepoint : 127881 | string scalar : b'\xf0\x9f\x8e\x89' | Text : 🎉
At byte offset : 12 | codepoint : 127882 | string scalar : b'\xf0\x9f\x8e\x8a' | Text : 🎊
At byte offset : 16 | codepoint : 128519 | string scalar : b'\xf0\x9f\x98\x87' | Text : 😇


## Unicode scripts

TensorFlow provides the `tf.strings.unicode_script` operation to determine which script a given codepoint uses. The script codes are `int32` values corresponding to [International Components for Unicode (ICU)](https://icu.unicode.org/home) `UScriptCode` values.

In [34]:
uscript = tf.strings.unicode_script([33464, 1041])  # ['芸', 'Б']

print(uscript.numpy())  # [17, 8] == [USCRIPT_HAN, USCRIPT_CYRILLIC]

[17  8]


The `tf.strings.unicode_script` operation can also be applied to multidimensional `tf.Tensors` or `tf.RaggedTensors` of codepoints:

In [35]:
batch_chars_ragged

<tf.RaggedTensor [[104, 195, 108, 108, 111],
 [87, 104, 97, 116, 32, 105, 115, 32, 116, 104, 101, 32, 119, 101, 97, 116,
  104, 101, 114, 32, 116, 111, 109, 111, 114, 114, 111, 119]               ,
 [71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]>

In [36]:
tf.strings.unicode_script(batch_chars_ragged)

<tf.RaggedTensor [[25, 25, 25, 25, 25],
 [25, 25, 25, 25, 0, 25, 25, 0, 25, 25, 25, 0, 25, 25, 25, 25, 25, 25, 25,
  0, 25, 25, 25, 25, 25, 25, 25, 25]                                      ,
 [25, 25, 25, 25, 25, 25, 25, 25, 25], [0]]>

In [37]:
uscript = tf.strings.unicode_script(batch_chars_ragged)
# Text vector and string
for sentence_chars in uscript.to_list():
    print('Vector : ',sentence_chars)
    
    #Converts vector of code points to an encoded string scalar.
    print('Text :',tf.strings.unicode_encode(sentence_chars, output_encoding='UTF-8').numpy().decode())
    print('-'*100)

Vector :  [25, 25, 25, 25, 25]
Text : 
----------------------------------------------------------------------------------------------------
Vector :  [25, 25, 25, 25, 0, 25, 25, 0, 25, 25, 25, 0, 25, 25, 25, 25, 25, 25, 25, 0, 25, 25, 25, 25, 25, 25, 25, 25]
Text :     
----------------------------------------------------------------------------------------------------
Vector :  [25, 25, 25, 25, 25, 25, 25, 25, 25]
Text : 
----------------------------------------------------------------------------------------------------
Vector :  [0]
Text :  
----------------------------------------------------------------------------------------------------


## Example: Simple segmentation
Segmentation is the task of splitting text into word-like units. This is often easy when space characters are used to separate words, but some languages (like Chinese and Japanese) do not use spaces, and some languages (like German) contain long compounds that must be split in order to analyze their meaning. In web text, different languages and scripts are frequently mixed together, as in "NY株価" (New York Stock Exchange)

In [38]:
# dtype: string; shape: [num_sentences]
#
# The sentences to process.  Edit this line to try out different inputs!
sentence_texts = [u'Hello, world.', u'世界こんにちは']

In [39]:
# dtype: int32; shape: [num_sentences, (num_chars_per_sentence)]
#
# sentence_char_codepoint[i, j] is the codepoint for the j'th character in
# the i'th sentence.
sentence_char_codepoint = tf.strings.unicode_decode(sentence_texts, 'UTF-8')
print(sentence_char_codepoint)

# dtype: int32; shape: [num_sentences, (num_chars_per_sentence)]
#
# sentence_char_scripts[i, j] is the Unicode script of the j'th character in
# the i'th sentence.
sentence_char_script = tf.strings.unicode_script(sentence_char_codepoint)
print(sentence_char_script)

<tf.RaggedTensor [[72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 46],
 [19990, 30028, 12371, 12435, 12395, 12385, 12399]]>
<tf.RaggedTensor [[25, 25, 25, 25, 25, 0, 0, 25, 25, 25, 25, 25, 0],
 [17, 17, 20, 20, 20, 20, 20]]>


Use the script identifiers to determine where word boundaries should be added. Add a word boundary at the beginning of each sentence, and for each character whose script differs from the previous character.

In [40]:
# dtype: bool; shape: [num_sentences, (num_chars_per_sentence)]
#
# sentence_char_starts_word[i, j] is True if the j'th character in the i'th
# sentence is the start of a word.
sentence_char_starts_word = tf.concat(
    [tf.fill([sentence_char_script.nrows(), 1], True),
     tf.not_equal(sentence_char_script[:, 1:], sentence_char_script[:, :-1])],
    axis=1)
sentence_char_starts_word

<tf.RaggedTensor [[True, False, False, False, False, True, False, True, False, False, False,
  False, True]                                                             ,
 [True, False, True, False, False, False, False]]>

In [41]:
# dtype: int64; shape: [num_words]
#
# word_starts[i] is the index of the character that starts the i'th word (in
# the flattened list of characters from all sentences).
word_starts = tf.squeeze(tf.where(sentence_char_starts_word.values), axis=1)
print(word_starts)

tf.Tensor([ 0  5  7 12 13 15], shape=(6,), dtype=int64)


Use those start offsets to build a `RaggedTensor` containing the list of words from all batches.

In [42]:
# dtype: int32; shape: [num_words, (num_chars_per_word)]
#
# word_char_codepoint[i, j] is the codepoint for the j'th character in the
# i'th word.
word_char_codepoint = tf.RaggedTensor.from_row_starts(
    values=sentence_char_codepoint.values,
    row_starts=word_starts)
print(word_char_codepoint)

<tf.RaggedTensor [[72, 101, 108, 108, 111], [44, 32], [119, 111, 114, 108, 100], [46],
 [19990, 30028], [12371, 12435, 12395, 12385, 12399]]>


To finish, segment the word codepoints RaggedTensor back into sentences and encode into UTF-8 strings for readability.

In [43]:
# dtype: int64; shape: [num_sentences]
#
# sentence_num_words[i] is the number of words in the i'th sentence.
sentence_num_words = tf.reduce_sum(
    tf.cast(sentence_char_starts_word, tf.int64),
    axis=1)

# dtype: int32; shape: [num_sentences, (num_words_per_sentence), (num_chars_per_word)]
#
# sentence_word_char_codepoint[i, j, k] is the codepoint for the k'th character
# in the j'th word in the i'th sentence.
sentence_word_char_codepoint = tf.RaggedTensor.from_row_lengths(
    values=word_char_codepoint,
    row_lengths=sentence_num_words)
print(sentence_word_char_codepoint)
print()

for sentence_chars in tf.strings.unicode_encode(sentence_word_char_codepoint, 'UTF-8').to_list():
    for char in sentence_chars:
        print(char.decode(), end="")
    print()

<tf.RaggedTensor [[[72, 101, 108, 108, 111], [44, 32], [119, 111, 114, 108, 100], [46]],
 [[19990, 30028], [12371, 12435, 12395, 12385, 12399]]]>

Hello, world.
世界こんにちは


## Example

In [44]:
# Canvert Text to UTF8-encoded string.

Test_batch_utf8 = [s.encode('UTF-8') for s in [u"ॐ गं गणपतये नम:",
                                               u'ॐ भूर्भव: स्व: तत्सवितुर्वरेण्यं भर्गो देवस्य धीमहि धियो यो न: प्रचोदयात्।',
                                               u'पवन कुमार गुंजन']]

# Converts UTF-8 encoded string scalar to a vector of code points.
Test_batch_chars_ragged = tf.strings.unicode_decode(Test_batch_utf8,
                                                    input_encoding='UTF-8')
# Text vector and string
for sentence_chars in Test_batch_chars_ragged.to_list():
    print('Vector : ',sentence_chars)
    
    #Converts vector of code points to an encoded string scalar.
    print('Text :',tf.strings.unicode_encode(sentence_chars, output_encoding='UTF-8').numpy().decode())
    print('-'*100)

Vector :  [2384, 32, 2327, 2306, 32, 2327, 2339, 2346, 2340, 2351, 2375, 32, 2344, 2350, 58]
Text : ॐ गं गणपतये नम:
----------------------------------------------------------------------------------------------------
Vector :  [2384, 32, 2349, 2370, 2352, 2381, 2349, 2357, 58, 32, 2360, 2381, 2357, 58, 32, 2340, 2340, 2381, 2360, 2357, 2367, 2340, 2369, 2352, 2381, 2357, 2352, 2375, 2339, 2381, 2351, 2306, 32, 2349, 2352, 2381, 2327, 2379, 32, 2342, 2375, 2357, 2360, 2381, 2351, 32, 2343, 2368, 2350, 2361, 2367, 32, 2343, 2367, 2351, 2379, 32, 2351, 2379, 32, 2344, 58, 32, 2346, 2381, 2352, 2330, 2379, 2342, 2351, 2366, 2340, 2381, 2404]
Text : ॐ भूर्भव: स्व: तत्सवितुर्वरेण्यं भर्गो देवस्य धीमहि धियो यो न: प्रचोदयात्।
----------------------------------------------------------------------------------------------------
Vector :  [2346, 2357, 2344, 32, 2325, 2369, 2350, 2366, 2352, 32, 2327, 2369, 2306, 2332, 2344]
Text : पवन कुमार गुंजन
---------------------------------------------------

In [45]:
uscript = tf.strings.unicode_script(Test_batch_chars_ragged)

sentence_char_starts_word = tf.concat(
    [tf.fill([uscript.nrows(), 1], True),
     tf.not_equal(uscript[:, 1:], uscript[:, :-1])],
    axis=1)

# word_starts[i] is the index of the character that starts the i'th word 
#(in the flattened list of characters from all sentences).
word_starts = tf.squeeze(tf.where(sentence_char_starts_word.values), axis=1)
#print(word_starts)

# word_char_codepoint[i, j] is the codepoint for the j'th character in the i'th word.
word_char_codepoint = tf.RaggedTensor.from_row_starts(
    values=Test_batch_chars_ragged.values,
    row_starts=word_starts)
#print(word_char_codepoint)

# sentence_num_words[i] is the number of words in the i'th sentence.
sentence_num_words = tf.reduce_sum(
    tf.cast(sentence_char_starts_word, tf.int64),
    axis=1)


# sentence_word_char_codepoint[i, j, k] is the codepoint for the k'th character
# in the j'th word in the i'th sentence.
sentence_word_char_codepoint = tf.RaggedTensor.from_row_lengths(
    values=word_char_codepoint,
    row_lengths=sentence_num_words)
#print(sentence_word_char_codepoint)

for sentence_chars in tf.strings.unicode_encode(sentence_word_char_codepoint, 'UTF-8').to_list():
    #print('Vector : ',sentence_chars)
    for char in sentence_chars:
        print(char.decode(), end="")
    print()

ॐ गं गणपतये नम:
ॐ भूर्भव: स्व: तत्सवितुर्वरेण्यं भर्गो देवस्य धीमहि धियो यो न: प्रचोदयात्।
पवन कुमार गुंजन


#### UScriptCode is useful for Hindi and sanskrit word