In [35]:
import numpy as np
import tensorflow
from tensorflow import keras

# Toy corpus: ten short sentences that share a small vocabulary
# (dog / park / sun / run), used to demonstrate tokenization and padding.
docs = [
    "The dog runs in the park every morning.",
    "A small dog chased a ball across the park.",
    "Children love to play in the park when the sun is bright.",
    "I saw a dog sleeping under the old tree.",
    "The sun rises over the park and fills it with light.",
    "We decided to run together before the sun went down.",
    "A playful dog barked loudly at the gate.",
    "They like to run around the park on weekends.",
    "The park looks peaceful when the sun sets behind the hills.",
    "Running with my dog in the park makes me happy.",
]


In [36]:
# Create the tokenizer that will turn words into integer indices.
from tensorflow.keras.preprocessing.text import Tokenizer

# oov_token ("out of vocabulary" token): any word that is not in the fitted
# vocabulary is replaced by this placeholder when texts are encoded.
# NOTE(review): "nothing" is also an ordinary English word; a dedicated marker
# such as "<OOV>" would avoid a clash if that word ever appears in the corpus.
tokenizer = Tokenizer(oov_token="nothing")

In [37]:
# Learn the vocabulary (word indices and word counts) from the corpus.
tokenizer.fit_on_texts(texts=docs)

In [None]:
# Vocabulary lookup: each word mapped to its integer index
# (the OOV placeholder comes first, at index 1).
tokenizer.word_index

{'nothing': 1,
 'the': 2,
 'park': 3,
 'dog': 4,
 'a': 5,
 'sun': 6,
 'in': 7,
 'to': 8,
 'when': 9,
 'with': 10,
 'run': 11,
 'runs': 12,
 'every': 13,
 'morning': 14,
 'small': 15,
 'chased': 16,
 'ball': 17,
 'across': 18,
 'children': 19,
 'love': 20,
 'play': 21,
 'is': 22,
 'bright': 23,
 'i': 24,
 'saw': 25,
 'sleeping': 26,
 'under': 27,
 'old': 28,
 'tree': 29,
 'rises': 30,
 'over': 31,
 'and': 32,
 'fills': 33,
 'it': 34,
 'light': 35,
 'we': 36,
 'decided': 37,
 'together': 38,
 'before': 39,
 'went': 40,
 'down': 41,
 'playful': 42,
 'barked': 43,
 'loudly': 44,
 'at': 45,
 'gate': 46,
 'they': 47,
 'like': 48,
 'around': 49,
 'on': 50,
 'weekends': 51,
 'looks': 52,
 'peaceful': 53,
 'sets': 54,
 'behind': 55,
 'hills': 56,
 'running': 57,
 'my': 58,
 'makes': 59,
 'me': 60,
 'happy': 61}

In [39]:
# Word frequency: how many times each word occurs across the whole corpus.
tokenizer.word_counts

OrderedDict([('the', 15),
             ('dog', 5),
             ('runs', 1),
             ('in', 3),
             ('park', 7),
             ('every', 1),
             ('morning', 1),
             ('a', 4),
             ('small', 1),
             ('chased', 1),
             ('ball', 1),
             ('across', 1),
             ('children', 1),
             ('love', 1),
             ('to', 3),
             ('play', 1),
             ('when', 2),
             ('sun', 4),
             ('is', 1),
             ('bright', 1),
             ('i', 1),
             ('saw', 1),
             ('sleeping', 1),
             ('under', 1),
             ('old', 1),
             ('tree', 1),
             ('rises', 1),
             ('over', 1),
             ('and', 1),
             ('fills', 1),
             ('it', 1),
             ('with', 2),
             ('light', 1),
             ('we', 1),
             ('decided', 1),
             ('run', 2),
             ('together', 1),
             ('before', 1),
  

In [40]:
# Number of documents (sentences) the tokenizer was fitted on.
tokenizer.document_count

10

In [41]:
# Encode every sentence as the list of vocabulary indices of its words.
sequences = tokenizer.texts_to_sequences(texts=docs)
sequences

[[2, 4, 12, 7, 2, 3, 13, 14],
 [5, 15, 4, 16, 5, 17, 18, 2, 3],
 [19, 20, 8, 21, 7, 2, 3, 9, 2, 6, 22, 23],
 [24, 25, 5, 4, 26, 27, 2, 28, 29],
 [2, 6, 30, 31, 2, 3, 32, 33, 34, 10, 35],
 [36, 37, 8, 11, 38, 39, 2, 6, 40, 41],
 [5, 42, 4, 43, 44, 45, 2, 46],
 [47, 48, 8, 11, 49, 2, 3, 50, 51],
 [2, 3, 52, 53, 9, 2, 6, 54, 55, 2, 56],
 [57, 10, 58, 4, 7, 2, 3, 59, 60, 61]]

In [42]:
# Pad every encoded sentence with zeros up to the length of the longest one,
# so that all sequences have the same size and form a rectangular array.
# Imported from tensorflow.keras (not the standalone `keras` package) so that it
# comes from the same Keras build as the Tokenizer used earlier in the notebook.
from tensorflow.keras.utils import pad_sequences

# padding='pre'  -> zeros are added at the beginning of each sequence
# padding='post' -> zeros are added at the end of each sequence
sequences = pad_sequences(sequences, padding="post")
sequences

array([[ 2,  4, 12,  7,  2,  3, 13, 14,  0,  0,  0,  0],
       [ 5, 15,  4, 16,  5, 17, 18,  2,  3,  0,  0,  0],
       [19, 20,  8, 21,  7,  2,  3,  9,  2,  6, 22, 23],
       [24, 25,  5,  4, 26, 27,  2, 28, 29,  0,  0,  0],
       [ 2,  6, 30, 31,  2,  3, 32, 33, 34, 10, 35,  0],
       [36, 37,  8, 11, 38, 39,  2,  6, 40, 41,  0,  0],
       [ 5, 42,  4, 43, 44, 45,  2, 46,  0,  0,  0,  0],
       [47, 48,  8, 11, 49,  2,  3, 50, 51,  0,  0,  0],
       [ 2,  3, 52, 53,  9,  2,  6, 54, 55,  2, 56,  0],
       [57, 10, 58,  4,  7,  2,  3, 59, 60, 61,  0,  0]])