In [1]:
import numpy as np
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
%matplotlib notebook 

In [2]:
import tensorflow as tf
from tensorflow import keras

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Toy corpus used throughout the notebook: eight short English sentences with
# mixed case, punctuation and hyphenated words, so the effect of the
# tokenizer's lower-casing and character filtering is visible in the outputs.
sentences = [
    'My name is Anthony Gonzalves and I am alone in this world.',
    'How many people were there? Sir, two. And you three, even then you returned empty handed',
    'It is not only difficult to capture Don, it is impossible',
    'In big-big cities, such small-small incidences keep on happening',
    'Dog! I will drink your blood',
    'The good, the bad and the ugly',
    'Small is beautiful',
    # Adjacent string literals are joined at compile time, so this long
    # sentence can be wrapped in the source without embedding a newline and
    # indentation whitespace in the data (a triple-quoted string would).
    ('The illiterate of the twenty-first century will not be those who '
     'cannot learn but those who cannot learn, unlearn and relearn'),
]

In [5]:
# Word-level tokenizer for the toy corpus.
#   num_words=100     -> cap on the vocabulary size used when encoding text
#   lower=True        -> text is lower-cased before being split into words
#   char_level=False  -> tokens are whole words rather than single characters
#   oov_token="<UNK>" -> placeholder for words that were not seen during fitting
# `filters` and `split` are left at the Keras defaults here; pass them
# explicitly (e.g. filters='.', split=' ') to control which characters are
# stripped and how the text is split.
tokenizer = Tokenizer(
    num_words=100,
    lower=True,
    char_level=False,
    oov_token="<UNK>",
)

In [6]:
# Build the vocabulary from the corpus.
# NOTE(review): fit_on_texts appears to accumulate into the tokenizer's
# existing state, so re-run the cell that creates `tokenizer` before
# re-running this one — confirm against the Keras docs.
tokenizer.fit_on_texts(sentences)

# word -> integer id mapping. In the output below the OOV token gets id 1 and
# the remaining words follow in order of decreasing frequency.
word_index = tokenizer.word_index
print('word_index:\n', word_index)

word_index:
 {'<UNK>': 1, 'the': 2, 'is': 3, 'and': 4, 'small': 5, 'i': 6, 'in': 7, 'you': 8, 'it': 9, 'not': 10, 'big': 11, 'will': 12, 'those': 13, 'who': 14, 'cannot': 15, 'learn': 16, 'my': 17, 'name': 18, 'anthony': 19, 'gonzalves': 20, 'am': 21, 'alone': 22, 'this': 23, 'world': 24, 'how': 25, 'many': 26, 'people': 27, 'were': 28, 'there': 29, 'sir': 30, 'two': 31, 'three': 32, 'even': 33, 'then': 34, 'returned': 35, 'empty': 36, 'handed': 37, 'only': 38, 'difficult': 39, 'to': 40, 'capture': 41, 'don': 42, 'impossible': 43, 'cities': 44, 'such': 45, 'incidences': 46, 'keep': 47, 'on': 48, 'happening': 49, 'dog': 50, 'drink': 51, 'your': 52, 'blood': 53, 'good': 54, 'bad': 55, 'ugly': 56, 'beautiful': 57, 'illiterate': 58, 'of': 59, 'twenty': 60, 'first': 61, 'century': 62, 'be': 63, 'but': 64, 'unlearn': 65, 'relearn': 66}


In [7]:
# Occurrence count of every word seen while fitting, listed in the order the
# words were first encountered in the corpus.
n = tokenizer.word_counts
print('word_count:\n', n)

word_count:
 OrderedDict([('my', 1), ('name', 1), ('is', 4), ('anthony', 1), ('gonzalves', 1), ('and', 4), ('i', 2), ('am', 1), ('alone', 1), ('in', 2), ('this', 1), ('world', 1), ('how', 1), ('many', 1), ('people', 1), ('were', 1), ('there', 1), ('sir', 1), ('two', 1), ('you', 2), ('three', 1), ('even', 1), ('then', 1), ('returned', 1), ('empty', 1), ('handed', 1), ('it', 2), ('not', 2), ('only', 1), ('difficult', 1), ('to', 1), ('capture', 1), ('don', 1), ('impossible', 1), ('big', 2), ('cities', 1), ('such', 1), ('small', 3), ('incidences', 1), ('keep', 1), ('on', 1), ('happening', 1), ('dog', 1), ('will', 2), ('drink', 1), ('your', 1), ('blood', 1), ('the', 5), ('good', 1), ('bad', 1), ('ugly', 1), ('beautiful', 1), ('illiterate', 1), ('of', 1), ('twenty', 1), ('first', 1), ('century', 1), ('be', 1), ('those', 2), ('who', 2), ('cannot', 2), ('learn', 2), ('but', 1), ('unlearn', 1), ('relearn', 1)])


In [8]:
# Encode each sentence as the list of integer ids of its words
# (one variable-length list per sentence).
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[17, 18, 3, 19, 20, 4, 6, 21, 22, 7, 23, 24], [25, 26, 27, 28, 29, 30, 31, 4, 8, 32, 33, 34, 8, 35, 36, 37], [9, 3, 10, 38, 39, 40, 41, 42, 9, 3, 43], [7, 11, 11, 44, 45, 5, 5, 46, 47, 48, 49], [50, 6, 12, 51, 52, 53], [2, 54, 2, 55, 4, 2, 56], [5, 3, 57], [2, 58, 59, 2, 60, 61, 62, 12, 10, 63, 13, 14, 15, 16, 64, 13, 14, 15, 16, 65, 4, 66]]


In [9]:
# Decode the id lists back to text. The round trip is lossy: the result is
# lower-cased and the filtered punctuation is gone.
return_sentences = tokenizer.sequences_to_texts(sequences)
print(return_sentences)

['my name is anthony gonzalves and i am alone in this world', 'how many people were there sir two and you three even then you returned empty handed', 'it is not only difficult to capture don it is impossible', 'in big big cities such small small incidences keep on happening', 'dog i will drink your blood', 'the good the bad and the ugly', 'small is beautiful', 'the illiterate of the twenty first century will not be those who cannot learn but those who cannot learn unlearn and relearn']


In [12]:
# Unseen text: words that are not in the fitted vocabulary are encoded with
# the OOV id (1) and decode back to "<UNK>".
sentences1 = [
    'Anthony Gonzalves is Don',
    'Rahul! you must have heard this name',
]

sequences1 = tokenizer.texts_to_sequences(sentences1)
return_sentences1 = tokenizer.sequences_to_texts(sequences1)

print(sequences1)
print(return_sentences1)

[[19, 20, 3, 42], [1, 8, 1, 1, 1, 23, 18]]
['anthony gonzalves is don', '<UNK> you <UNK> <UNK> <UNK> this name']


In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
# With default arguments every row is padded with leading zeros up to the
# length of the longest sequence, turning the ragged lists into a 2-D array.
padded_sequences = pad_sequences(sequences)

print(sequences)
print(' ')  # spacer line between the ragged lists and the padded matrix
print(padded_sequences)

[[17, 18, 3, 19, 20, 4, 6, 21, 22, 7, 23, 24], [25, 26, 27, 28, 29, 30, 31, 4, 8, 32, 33, 34, 8, 35, 36, 37], [9, 3, 10, 38, 39, 40, 41, 42, 9, 3, 43], [7, 11, 11, 44, 45, 5, 5, 46, 47, 48, 49], [50, 6, 12, 51, 52, 53], [2, 54, 2, 55, 4, 2, 56], [5, 3, 57], [2, 58, 59, 2, 60, 61, 62, 12, 10, 63, 13, 14, 15, 16, 64, 13, 14, 15, 16, 65, 4, 66]]
 
[[ 0  0  0  0  0  0  0  0  0  0 17 18  3 19 20  4  6 21 22  7 23 24]
 [ 0  0  0  0  0  0 25 26 27 28 29 30 31  4  8 32 33 34  8 35 36 37]
 [ 0  0  0  0  0  0  0  0  0  0  0  9  3 10 38 39 40 41 42  9  3 43]
 [ 0  0  0  0  0  0  0  0  0  0  0  7 11 11 44 45  5  5 46 47 48 49]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 50  6 12 51 52 53]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2 54  2 55  4  2 56]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  5  3 57]
 [ 2 58 59  2 60 61 62 12 10 63 13 14 15 16 64 13 14 15 16 65  4 66]]


In [19]:
# padding='post' places the zeros after the tokens instead of before them.
padded_sequences = pad_sequences(sequences, padding='post')
print(padded_sequences)

[[17 18  3 19 20  4  6 21 22  7 23 24  0  0  0  0  0  0  0  0  0  0]
 [25 26 27 28 29 30 31  4  8 32 33 34  8 35 36 37  0  0  0  0  0  0]
 [ 9  3 10 38 39 40 41 42  9  3 43  0  0  0  0  0  0  0  0  0  0  0]
 [ 7 11 11 44 45  5  5 46 47 48 49  0  0  0  0  0  0  0  0  0  0  0]
 [50  6 12 51 52 53  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 54  2 55  4  2 56  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 5  3 57  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 58 59  2 60 61 62 12 10 63 13 14 15 16 64 13 14 15 16 65  4 66]]


In [16]:
# maxlen     -> fixes the length of every output row (10 here).
# padding    -> side on which zeros are added to rows shorter than maxlen;
#               'post' = after the tokens (the default is 'pre').
# truncating -> side from which tokens are removed from rows longer than
#               maxlen; 'pre' (the default) drops words from the beginning
#               and keeps the last maxlen tokens.
padded_sequences = pad_sequences(
    sequences,
    maxlen=10,
    padding='post',
    truncating='pre',
)
print(padded_sequences)

[[ 3 19 20  4  6 21 22  7 23 24]
 [31  4  8 32 33 34  8 35 36 37]
 [ 3 10 38 39 40 41 42  9  3 43]
 [11 11 44 45  5  5 46 47 48 49]
 [50  6 12 51 52 53  0  0  0  0]
 [ 2 54  2 55  4  2 56  0  0  0]
 [ 5  3 57  0  0  0  0  0  0  0]
 [15 16 64 13 14 15 16 65  4 66]]


In [17]:
# Decode the padded/truncated matrix back to text. The padding value 0 is not
# a vocabulary id, so with an oov_token configured each padding position is
# rendered as "<UNK>" in the decoded sentences.
decoded_padded = tokenizer.sequences_to_texts(padded_sequences)
decoded_padded

['is anthony gonzalves and i am alone in this world',
 'two and you three even then you returned empty handed',
 'is not only difficult to capture don it is impossible',
 'big big cities such small small incidences keep on happening',
 'dog i will drink your blood <UNK> <UNK> <UNK> <UNK>',
 'the good the bad and the ugly <UNK> <UNK> <UNK>',
 'small is beautiful <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>',
 'cannot learn but those who cannot learn unlearn and relearn']

# TensorFlow Datasets: IMDB movie reviews (Internet Movie Database)

In [20]:
# Requires the tensorflow-datasets package. It was installed with conda here;
# the pip equivalent is: %pip install -q tensorflow-datasets

In [None]:
import tensorflow_datasets as tfds
# Load the IMDB reviews dataset through TensorFlow Datasets.
# with_info=True makes the call return a (datasets, metadata) pair, unpacked
# here into `imdb` and `info`; as_supervised=True requests (input, label)
# pairs rather than feature dictionaries.
imdb, info = tfds.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)