### Pad Sequences using NumPy and Keras

In [1]:
# Example 2: pad variable-length token lists into a fixed-width matrix of word ids.

import numpy as np

data = [['Thanks', 'for', 'an', 'excellent', 'report'],
        ['Your', 'service', 'is', 'very', 'quick', 'and', 'fast'],
        ['I', 'am', 'pleased', 'with', 'your', 'service']]

# Vocabulary: every distinct lower-cased token across all documents.
wrds = {w.lower() for doc in data for w in doc}

print(wrds,"\n")

# Ids start at 1 so that 0 stays free to mean "padding".
# Ids follow the set's iteration order, which can differ between interpreter
# sessions because string hashing is randomized by default.
word_id = {w: i for i, w in enumerate(wrds, 1)}

print(word_id,"\n")

# One row per document, one column per position in the longest document.
row = len(data)

column = max(len(doc) for doc in data)

print("row={0} and column={1}".format(row,column))

seq_embedding = np.zeros((row,column))

print("seq_embedding = \n {0}".format(seq_embedding))

print("\n")

# Write each document's ids into the front of its row; the tail keeps its zeros.
for i, doc in enumerate(data):
    seq_embedding[i, :len(doc)] = [word_id[w.lower()] for w in doc]

print(seq_embedding)

{'i', 'fast', 'pleased', 'excellent', 'for', 'is', 'very', 'and', 'your', 'am', 'report', 'thanks', 'quick', 'service', 'an', 'with'} 

{'i': 1, 'fast': 2, 'pleased': 3, 'excellent': 4, 'for': 5, 'is': 6, 'very': 7, 'and': 8, 'your': 9, 'am': 10, 'report': 11, 'thanks': 12, 'quick': 13, 'service': 14, 'an': 15, 'with': 16} 

row=3 and column=7
seq_embedding = 
 [[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]


[[12.  5. 15.  4. 11.  0.  0.]
 [ 9. 14.  6.  7. 13.  8.  2.]
 [ 1. 10.  3. 16.  9. 14.  0.]]


In [1]:
# Example 1 : using Keras
# With no `padding` argument, the recorded output shows zeros placed in front
# of every sequence shorter than `maxlen`. The tokens are digit strings and
# appear as integers in the padded array.

import numpy as np
from tensorflow.keras.preprocessing import sequence

data_vec = [
    ['3', '18', '9', '3', '11', '5', '20'],
    ['3', '8', '1', '12'],
    ['18', '1', '8', '1'],
    ['8', '1', '9', '14'],
    ['25', '1', '8', '1'],
    ['9'],
]

# Width of the padded matrix = length of the longest sequence.
maxl = np.max(list(map(len, data_vec)))

print('maxl:{0}'.format(maxl))

x_data_vec = sequence.pad_sequences(data_vec, maxlen=maxl)

x_data_vec

maxl:7


array([[ 3, 18,  9,  3, 11,  5, 20],
       [ 0,  0,  0,  3,  8,  1, 12],
       [ 0,  0,  0, 18,  1,  8,  1],
       [ 0,  0,  0,  8,  1,  9, 14],
       [ 0,  0,  0, 25,  1,  8,  1],
       [ 0,  0,  0,  0,  0,  0,  9]])

In [2]:
# padding='post': zeros are appended after each sequence rather than in front
# of it (compare this cell's recorded output with the previous cell's).

x_data_vec = sequence.pad_sequences(data_vec, maxlen=maxl,padding='post')

x_data_vec

array([[ 3, 18,  9,  3, 11,  5, 20],
       [ 3,  8,  1, 12,  0,  0,  0],
       [18,  1,  8,  1,  0,  0,  0],
       [ 8,  1,  9, 14,  0,  0,  0],
       [25,  1,  8,  1,  0,  0,  0],
       [ 9,  0,  0,  0,  0,  0,  0]])

In [3]:
# Max Length: maxlen=10 exceeds the longest sequence (7 tokens), so every row
# in the recorded output is zero-padded out to width 10.

x_data_vec = sequence.pad_sequences(data_vec, maxlen=10,padding='post')

x_data_vec

array([[ 3, 18,  9,  3, 11,  5, 20,  0,  0,  0],
       [ 3,  8,  1, 12,  0,  0,  0,  0,  0,  0],
       [18,  1,  8,  1,  0,  0,  0,  0,  0,  0],
       [ 8,  1,  9, 14,  0,  0,  0,  0,  0,  0],
       [25,  1,  8,  1,  0,  0,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

### Get Input and Padded Sequence for Model Training

In [11]:
import pandas as pd

# Single example sentence; the cells below use it as the entire corpus.
sent1 = "Located on the southern tip of Lake Union, the Hilton Garden Inn"

In [12]:
# Wrap the sentence in a list so it can become a one-row DataFrame column.
sent = [sent1]

In [13]:
# One-column DataFrame holding the text under 'Description'.
# NOTE(review): this rebinds `data`, which the first cell used for a list of
# token lists — consider a distinct name to avoid stale state on re-runs.
data = pd.DataFrame(sent,columns=['Description'])

data

Unnamed: 0,Description
0,"Located on the southern tip of Lake Union, the..."


In [14]:
# Pull the Description column back out as a plain Python list.
all_description = list(data.Description)

In [15]:
# Shallow copy of the descriptions; this list is what the tokenizer is fitted on.
corpus = list(all_description)

corpus

['Located on the southern tip of Lake Union, the Hilton Garden Inn']

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Word-level tokenizer with no vocabulary cap and no out-of-vocabulary token.
# Every option is spelled out so the configuration is visible in one place.
t = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
    document_count=0,
)

# Build the word index from the corpus.
t.fit_on_texts(corpus)

In [None]:
# Sentence: "Located on the southern tip of Lake Union, the Hilton Garden Inn"


# Word counts (t.word_counts)
# ('located', 1),
# ('on', 1), 
# ('the', 2), 
# ('southern', 1), 
# ('tip', 1), 
# ('of', 1), 
# ('lake', 1), 
# ('union', 1), 
# ('hilton', 1), 
# ('garden', 1), 
# ('inn', 1)

# Word Index
# 'the' : 1
# 'located': 2, 
# 'on': 3, 
# 'southern': 4, 
# 'tip': 5, 
# 'of': 6, 
# 'lake': 7, 
# 'union': 8, 
# 'hilton': 9, 
# 'garden': 10, 
# 'inn': 11

# texts_to_sequences
# [Located on the southern tip of Lake Union, the Hilton Garden Inn]
# [2,3,1,4,5,6,7,8,1,9,10,11]

In [17]:
# corpus = ["Located on the southern tip of Lake Union, the Hilton Garden Inn"]
# Map each text in the corpus to its list of word indices using the fitted tokenizer.

token_list = t.texts_to_sequences(corpus)

token_list

[[2, 3, 1, 4, 5, 6, 7, 8, 1, 9, 10, 11]]

In [18]:
def get_sequence_of_tokens(corpus, tokenizer=None):
    """Expand every text in ``corpus`` into its growing prefixes of token ids.

    A text tokenized as ``[a, b, c]`` contributes ``[a, b]`` and ``[a, b, c]``;
    texts with fewer than two tokens contribute nothing.

    Args:
        corpus: list of strings to tokenize.
        tokenizer: object providing ``fit_on_texts``, ``word_index`` and
            ``texts_to_sequences``. Defaults to the notebook-level tokenizer
            ``t`` so existing one-argument calls keep working.

    Returns:
        Tuple ``(input_sequences, total_words)``: the list of prefix sequences
        and ``len(tokenizer.word_index) + 1``.
    """
    if tokenizer is None:
        tokenizer = t  # fall back to the tokenizer created earlier in the notebook

    # NOTE(review): the notebook already fits `t` on the same corpus in an
    # earlier cell; fitting again here is kept for backward compatibility —
    # confirm whether the second fit is intended.
    tokenizer.fit_on_texts(corpus)

    # +1 leaves room for index 0, which the padded arrays use as filler.
    total_words = len(tokenizer.word_index) + 1

    # One batched conversion instead of one call per line.
    input_sequences = [
        token_list[:end]
        for token_list in tokenizer.texts_to_sequences(corpus)
        for end in range(2, len(token_list) + 1)
    ]

    return input_sequences, total_words

# Build the prefix sequences and the vocabulary size for the corpus.
input_sequences, total_words = get_sequence_of_tokens(corpus)

input_sequences

[[2, 3],
 [2, 3, 1],
 [2, 3, 1, 4],
 [2, 3, 1, 4, 5],
 [2, 3, 1, 4, 5, 6],
 [2, 3, 1, 4, 5, 6, 7],
 [2, 3, 1, 4, 5, 6, 7, 8],
 [2, 3, 1, 4, 5, 6, 7, 8, 1],
 [2, 3, 1, 4, 5, 6, 7, 8, 1, 9],
 [2, 3, 1, 4, 5, 6, 7, 8, 1, 9, 10],
 [2, 3, 1, 4, 5, 6, 7, 8, 1, 9, 10, 11]]

In [19]:
# len(t.word_index) + 1, as returned by get_sequence_of_tokens.
total_words

12

In [20]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku 

# pad sequences 
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the sequences to equal length and split them into inputs and labels.

    Every sequence is padded in front to the length of the longest one. The
    last column becomes the label and the remaining columns the predictors.

    Args:
        input_sequences: non-empty list of token-id lists.
        num_classes: width of the one-hot labels. Defaults to the
            notebook-level ``total_words`` so existing one-argument calls keep
            working.

    Returns:
        Tuple ``(predictors, label, max_sequence_len)``: the padded sequences
        without their last column, the one-hot encoded last column, and the
        length of the longest input sequence.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if num_classes is None:
        num_classes = total_words  # fall back to the vocabulary size computed earlier

    if len(input_sequences) == 0:
        raise ValueError("input_sequences must contain at least one sequence")

    max_sequence_len = max(len(x) for x in input_sequences)

    # pad_sequences already returns a NumPy array, so no extra np.array copy is needed.
    padded = pad_sequences(input_sequences, maxlen = max_sequence_len, padding = 'pre')

    predictors, label = padded[:,:-1], padded[:,-1]
    label = ku.to_categorical(label, num_classes = num_classes)

    return predictors, label, max_sequence_len

In [21]:
# Split the padded sequences into model inputs and one-hot labels.
predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)

In [22]:
# Each row is a front-padded prefix with its final token removed.
predictors

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  2,  3,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  2,  3,  1,  4],
       [ 0,  0,  0,  0,  0,  0,  2,  3,  1,  4,  5],
       [ 0,  0,  0,  0,  0,  2,  3,  1,  4,  5,  6],
       [ 0,  0,  0,  0,  2,  3,  1,  4,  5,  6,  7],
       [ 0,  0,  0,  2,  3,  1,  4,  5,  6,  7,  8],
       [ 0,  0,  2,  3,  1,  4,  5,  6,  7,  8,  1],
       [ 0,  2,  3,  1,  4,  5,  6,  7,  8,  1,  9],
       [ 2,  3,  1,  4,  5,  6,  7,  8,  1,  9, 10]])

In [23]:
# (number of prefix sequences, max_sequence_len - 1)
predictors.shape

(11, 11)

In [24]:
# One one-hot row per sequence; the hot column is the id of the removed final token.
label

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [25]:
# Length of the longest prefix sequence; predictors are one column narrower
# because the last token of each sequence was split off as the label.
max_sequence_len

12

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile an Embedding -> LSTM -> Dropout -> softmax model.

    Args:
        max_sequence_len: length of the longest padded sequence including its
            label token; the model input is one token shorter.
        total_words: vocabulary size, used for both the embedding input
            dimension and the width of the softmax output.
        embedding_dim: size of each embedding vector.
        lstm_units: number of units in the LSTM layer.
        dropout_rate: dropout fraction applied after the LSTM.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # Input embedding layer; the "- 1" accounts for the token held out as the label.
    model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len - 1))

    # Hidden layer: LSTM followed by dropout.
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: one probability per vocabulary index.
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the model for this corpus and print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 11, 10)            120       
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 12)                1212      
Total params: 45,732
Trainable params: 45,732
Non-trainable params: 0
_________________________________________________________________


In [29]:
# NOTE(review): verbose=5 is not one of the values Keras documents (0, 1, 2);
# the recorded log shows only epoch headers and no loss — confirm whether
# verbose=2 was intended. No random seed is set in the visible cells, so
# training results may differ between runs.
model.fit(predictors, label, epochs=100, verbose=5)

Train on 11 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
E

<tensorflow.python.keras.callbacks.History at 0x2f0794139c8>