### Sub Tokenization: further splitting of tokens into parts and giving them a unique id is the goal.
#### eg: Tensorflow: Ten:1, sor:2, flow:3 This is how the data will split.
#### Remember this is case sensitive and it includes all the punctualtions and everything.

In [44]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [150]:
imdb,info=tfds.load('imdb_reviews/subwords8k',with_info=True,as_supervised=True)

### Previously we have the data for that we will create tokens and pad_sequences
### tokens are converted to sub-tokens and are already built in this dataset, related to IMDB
### When we tryto encode any text it will pick from the encoder list and assign those vector values

In [86]:
features=info.features

### Features are two keys Label and text
### If you can see the keys: label has num_classes as method, and text has encoder as method.
### within that encoder it has subwordtextencoder where we can tr encoding the text

In [87]:
features.keys

<bound method FeaturesDict.keys of FeaturesDict({
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
})>

### Lets take the text encoder and try to encode simple text

In [94]:
sub_tokenizer=features['text'].encoder

### Lets encode an example

In [95]:
sample='Hello! How are you doing my dear?'
sample_encoder=sub_tokenizer.encode(sample)

In [106]:
sample_encoder

[4025, 8040, 90, 693, 29, 37, 573, 82, 4946, 8043, 7992]

In [98]:
sub_tokenizer.subwords[:10]

['the_', ', ', '. ', 'a_', 'and_', 'of_', 'to_', 's_', 'is_', 'br']

In [104]:
sub_words_index=sub_tokenizer.subwords

## lets try to decode the encoder

In [99]:
sample_decoder=sub_tokenizer.decode(sample_encoder)

In [100]:
sample_decoder

'Hello! How are you doing my dear?'

### Now lets map the words and check the values

In [118]:
for i in sample_encoder:
    
    print(str(i) + '--->' + str(sub_tokenizer.decode([i])))

4025--->Hell
8040--->o
90--->! 
693--->How 
29--->are 
37--->you 
573--->doing 
82--->my 
4946--->dea
8043--->r
7992--->?


### If you observe the aobve sub-tokens are case sensitive.
### Individually they doesn't make any sense but together they make sense
### Lets train the model with these tokens

In [120]:
from tensorflow import keras
from tensorflow.keras.layers import Embedding,GlobalAveragePooling1D,Dense

In [279]:
model=keras.Sequential()
model.add(Embedding(input_dim=sub_tokenizer.vocab_size,output_dim=64))
model.add(GlobalAveragePooling1D())
model.add(Dense(units=6,activation=tf.nn.relu))
model.add(Dense(units=1,activation=tf.nn.sigmoid))
model.compile(optimizer='adam',loss=keras.losses.BinaryCrossentropy(),
             metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 64)          523840    
_________________________________________________________________
global_average_pooling1d_6 ( (None, 64)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 6)                 390       
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 7         
Total params: 524,237
Trainable params: 524,237
Non-trainable params: 0
_________________________________________________________________


#### Lets create train data and test data for model fitting

In [172]:
train_data,test_data=imdb['train'],imdb['test']

In [207]:
train_cat,train_label=[],[]
test_cat,test_label=[],[]
for cat,label in train_data:
    train_cat.append(cat)
    train_label.append(label.numpy())
    
for cat,label in test_data:
    test_cat.append(cat)
    test_label.append(label.numpy())

In [208]:
train_label=np.array(train_label)
test_label=np.array(test_label)
#train_cat=np.asarray(train_cat)
#test_cat=np.asarray(test_cat)

In [228]:
test_label[:10]

array([1, 1, 0, 0, 1, 1, 1, 1, 0, 1])

In [211]:
train_cat[0]

<tf.Tensor: shape=(163,), dtype=int64, numpy=
array([  62,   18,   41,  604,  927,   65,    3,  644, 7968,   21,   35,
       5096,   36,   11,   43, 2948, 5240,  102,   50,  681, 7862, 1244,
          3, 3266,   29,  122,  640,    2,   26,   14,  279,  438,   35,
         79,  349,  384,   11, 1991,    3,  492,   79,  122,  188,  117,
         33, 4047, 4531,   14,   65, 7968,    8, 1819, 3947,    3,   62,
         27,    9,   41,  577, 5044, 2629, 2552, 7193, 7961, 3642,    3,
         19,  107, 3903,  225,   85,  198,   72,    1, 1512,  738, 2347,
        102, 6245,    8,   85,  308,   79, 6936, 7961,   23, 4981, 8044,
          3, 6429, 7961, 1141, 1335, 1848, 4848,   55, 3601, 4217, 8050,
          2,    5,   59, 3831, 1484, 8040, 7974,  174, 5773,   22, 5240,
        102,   18,  247,   26,    4, 3903, 1612, 3902,  291,   11,    4,
         27,   13,   18, 4092, 4008, 7961,    6,  119,  213, 2774,    3,
         12,  258, 2306,   13,   91,   29,  171,   52,  229,    2, 1245,
     

In [264]:
(train_data, test_data), info = tfds.load(
    # Use the version pre-encoded with an ~8k vocabulary.
    'imdb_reviews/subwords8k', 
    # Return the train/test datasets as a tuple.
    split = (tfds.Split.TRAIN, tfds.Split.TEST),
    # Return (example, label) pairs from the dataset (instead of a dictionary).
    as_supervised=True,
    # Also return the `info` structure. 
    with_info=True)

In [268]:
train_data


<DatasetV1Adapter shapes: ((None,), ()), types: (tf.int64, tf.int64)>

### Padded Batch is similar to Sequencing and putting all the values in sync
##### Eg: if the value is 32 then all the length equals to 32

In [276]:
train_batch=train_data.shuffle(1000).padded_batch(batch_size=32,padded_shapes=([None], []))
#test_batch=test_data.padded_batch(32)
test_batches = (
    test_data
    .padded_batch(batch_size=32,padded_shapes=([None], [])))


In [277]:
for train_example, train_label in train_data.take(1):
    print('Encoded text:', train_example[:10].numpy())
    print('Label:', train_label.numpy())



Encoded text: [  62   18   41  604  927   65    3  644 7968   21]
Label: 0


In [278]:
for example_batch, label_batch in train_batch.take(2):
    print("Batch shape:", example_batch.shape)
    print("label shape:", label_batch.shape)


Batch shape: (32, 974)
label shape: (32,)
Batch shape: (32, 697)
label shape: (32,)


In [280]:
model.fit(train_batch,epochs=3,validation_data=test_batches)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f264764af10>