In [None]:
'''

Steps:

1. Get the corpus text
2. Preprocess and tokenize the text
3. Convert the text to integer sequences
4. Build the n-gram sequences
5. Generate padded sequences
    Predictors: the padded n-gram sequence without its final token
    Labels: one-hot encoding of the next word in the sequence
6. Build the Keras model, starting with an 'Embedding' layer
'''

In [1]:
# Toy corpus of hotel-location blurbs used to train the next-word model.
# NOTE(review): "what?s" in the last entry looks like a mis-encoded apostrophe;
# it is kept verbatim because it affects the resulting tokens.
corpus = [
    "Located on the southern tip of Lake Union",
    "Located in the city's vibrant core",
    "Located in the heart of downtown Seattle",
    "Whats near our hotel downtown Seattle location? The better \nquestion might be what?s not nearby",
]

In [2]:
# Display the raw corpus list as a quick sanity check.
print(corpus)

['Located on the southern tip of Lake Union', "Located in the city's vibrant core", 'Located in the heart of downtown Seattle', 'Whats near our hotel downtown Seattle location? The better \nquestion might be what?s not nearby']


In [230]:
# Number of text entries in the corpus.
print(len(corpus))

4


In [231]:
# Rebuild the corpus as a fresh list (shallow copy, same elements), then display it.
corpus = list(corpus)
corpus

['Located on the southern tip of Lake Union',
 "Located in the city's vibrant core",
 'Located in the heart of downtown Seattle',
 'Whats near our hotel downtown Seattle location? The better \nquestion might be what?s not nearby']

In [232]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Word-level tokenizer: lowercases the text, splits on spaces and drops the
# characters listed in `filters`. No vocabulary cap and no out-of-vocabulary token.
t = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
    document_count=0,
)

# Learn the vocabulary from the corpus.
t.fit_on_texts(corpus)

In [233]:
# Tokenization: create a fresh, unfitted word-level tokenizer.
# Rebinding `t` here replaces whatever tokenizer `t` referred to before, so
# `fit_on_texts` has to be called again before `t` has a vocabulary.
t = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, \
              oov_token=None, document_count=0)

In [234]:
# Fit the tokenizer on the corpus so it builds its word -> integer index vocabulary.
t.fit_on_texts(corpus)

In [235]:
# Inspect the learned word -> index mapping.
t.word_index

{'the': 1,
 'located': 2,
 'of': 3,
 'in': 4,
 'downtown': 5,
 'seattle': 6,
 'on': 7,
 'southern': 8,
 'tip': 9,
 'lake': 10,
 'union': 11,
 "city's": 12,
 'vibrant': 13,
 'core': 14,
 'heart': 15,
 'whats': 16,
 'near': 17,
 'our': 18,
 'hotel': 19,
 'location': 20,
 'better': 21,
 'question': 22,
 'might': 23,
 'be': 24,
 'what': 25,
 's': 26,
 'not': 27,
 'nearby': 28}

In [236]:
# Alias for the tokenizer's word -> index dictionary (same object, not a copy).
word2id = t.word_index

word2id

{'the': 1,
 'located': 2,
 'of': 3,
 'in': 4,
 'downtown': 5,
 'seattle': 6,
 'on': 7,
 'southern': 8,
 'tip': 9,
 'lake': 10,
 'union': 11,
 "city's": 12,
 'vibrant': 13,
 'core': 14,
 'heart': 15,
 'whats': 16,
 'near': 17,
 'our': 18,
 'hotel': 19,
 'location': 20,
 'better': 21,
 'question': 22,
 'might': 23,
 'be': 24,
 'what': 25,
 's': 26,
 'not': 27,
 'nearby': 28}

In [237]:
# Invert the vocabulary so integer ids can be mapped back to their words.
id2word = {index: word for word, index in word2id.items()}

id2word

{1: 'the',
 2: 'located',
 3: 'of',
 4: 'in',
 5: 'downtown',
 6: 'seattle',
 7: 'on',
 8: 'southern',
 9: 'tip',
 10: 'lake',
 11: 'union',
 12: "city's",
 13: 'vibrant',
 14: 'core',
 15: 'heart',
 16: 'whats',
 17: 'near',
 18: 'our',
 19: 'hotel',
 20: 'location',
 21: 'better',
 22: 'question',
 23: 'might',
 24: 'be',
 25: 'what',
 26: 's',
 27: 'not',
 28: 'nearby'}

In [238]:
# Encode every corpus entry as a list of integer word ids (one list per entry).
token_list = t.texts_to_sequences(corpus)

token_list

[[2, 7, 1, 8, 9, 3, 10, 11],
 [2, 4, 1, 12, 13, 14],
 [2, 4, 1, 15, 3, 5, 6],
 [16, 17, 18, 19, 5, 6, 20, 1, 21, 22, 23, 24, 25, 26, 27, 28]]

In [240]:
# Decode each id sequence back into words to sanity-check the tokenization.
[[id2word[token_id] for token_id in sequence] for sequence in token_list]

[['located', 'on', 'the', 'southern', 'tip', 'of', 'lake', 'union'],
 ['located', 'in', 'the', "city's", 'vibrant', 'core'],
 ['located', 'in', 'the', 'heart', 'of', 'downtown', 'seattle'],
 ['whats',
  'near',
  'our',
  'hotel',
  'downtown',
  'seattle',
  'location',
  'the',
  'better',
  'question',
  'might',
  'be',
  'what',
  's',
  'not',
  'nearby']]

In [243]:
# Vocabulary size used for the model's input/output dimension.
# Word indices start at 1, so +1 leaves room for index 0, which the padded
# sequences use as filler.
total_words = len(t.word_index) + 1

total_words

29

In [244]:
# Encode each corpus entry separately: wrap it in a one-element batch and take
# the single resulting id sequence.
token_list = [t.texts_to_sequences([x])[0] for x in corpus]

token_list

[[2, 7, 1, 8, 9, 3, 10, 11],
 [2, 4, 1, 12, 13, 14],
 [2, 4, 1, 15, 3, 5, 6],
 [16, 17, 18, 19, 5, 6, 20, 1, 21, 22, 23, 24, 25, 26, 27, 28]]

In [143]:
def get_sequence_of_tokens(corpus, tokenizer=None):
    """Turn a text corpus into n-gram prefix sequences of word ids.

    Every text is encoded to integer ids, and each prefix of length >= 2 becomes
    one training sequence (e.g. ids ``[2, 7, 1]`` yield ``[2, 7]`` and
    ``[2, 7, 1]``). Texts that encode to fewer than two ids contribute nothing.

    Args:
        corpus: Iterable of text strings. One-shot iterables (generators) are
            supported because the corpus is materialised once up front.
        tokenizer: Object exposing ``fit_on_texts``, ``texts_to_sequences`` and
            ``word_index``. Defaults to the notebook-level tokenizer ``t``.

    Returns:
        Tuple ``(input_sequences, total_words)`` where ``input_sequences`` is a
        list of id lists and ``total_words`` is ``len(word_index) + 1``.

    Note:
        The tokenizer is (re)fitted on ``corpus`` on every call, so calling this
        repeatedly keeps updating the tokenizer's internal state.
    """
    if tokenizer is None:
        # Backward-compatible default: use the tokenizer defined in the notebook.
        tokenizer = t

    # The corpus is traversed twice (fit, then encode); a list makes that safe
    # even when the caller passes a generator.
    texts = list(corpus)

    tokenizer.fit_on_texts(texts)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(texts):
        # Prefixes of length 2..len(token_list): the last id of each prefix is
        # the word to predict from the ids before it.
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])

    return input_sequences, total_words

# Build the n-gram prefix sequences and the vocabulary size for the corpus.
input_sequences, total_words = get_sequence_of_tokens(corpus)



In [144]:
# Inspect the generated prefix sequences.
input_sequences

[[2, 7],
 [2, 7, 1],
 [2, 7, 1, 8],
 [2, 7, 1, 8, 9],
 [2, 7, 1, 8, 9, 3],
 [2, 7, 1, 8, 9, 3, 10],
 [2, 7, 1, 8, 9, 3, 10, 11],
 [2, 4],
 [2, 4, 1],
 [2, 4, 1, 12],
 [2, 4, 1, 12, 13],
 [2, 4, 1, 12, 13, 14],
 [2, 4],
 [2, 4, 1],
 [2, 4, 1, 15],
 [2, 4, 1, 15, 3],
 [2, 4, 1, 15, 3, 5],
 [2, 4, 1, 15, 3, 5, 6],
 [16, 17],
 [16, 17, 18],
 [16, 17, 18, 19],
 [16, 17, 18, 19, 5],
 [16, 17, 18, 19, 5, 6],
 [16, 17, 18, 19, 5, 6, 20],
 [16, 17, 18, 19, 5, 6, 20, 1],
 [16, 17, 18, 19, 5, 6, 20, 1, 21],
 [16, 17, 18, 19, 5, 6, 20, 1, 21, 22],
 [16, 17, 18, 19, 5, 6, 20, 1, 21, 22, 23],
 [16, 17, 18, 19, 5, 6, 20, 1, 21, 22, 23, 24],
 [16, 17, 18, 19, 5, 6, 20, 1, 21, 22, 23, 24, 25],
 [16, 17, 18, 19, 5, 6, 20, 1, 21, 22, 23, 24, 25, 26],
 [16, 17, 18, 19, 5, 6, 20, 1, 21, 22, 23, 24, 25, 26, 27],
 [16, 17, 18, 19, 5, 6, 20, 1, 21, 22, 23, 24, 25, 26, 27, 28]]

In [146]:
# Inspect the tokenizer's index -> word mapping.
t.index_word

{1: 'the',
 2: 'located',
 3: 'of',
 4: 'in',
 5: 'downtown',
 6: 'seattle',
 7: 'on',
 8: 'southern',
 9: 'tip',
 10: 'lake',
 11: 'union',
 12: "city's",
 13: 'vibrant',
 14: 'core',
 15: 'heart',
 16: 'whats',
 17: 'near',
 18: 'our',
 19: 'hotel',
 20: 'location',
 21: 'better',
 22: 'question',
 23: 'might',
 24: 'be',
 25: 'what',
 26: 's',
 27: 'not',
 28: 'nearby'}

In [156]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku 

# pad sequences
def generate_padded_sequences(input_sequences, num_classes=None):
    """Left-pad the n-gram sequences and split them into predictors and labels.

    All sequences are padded with leading zeros to the length of the longest
    one. The last id of each padded row becomes the label (one-hot encoded);
    the remaining ids form the predictor row.

    Args:
        input_sequences: Non-empty collection of integer id sequences.
        num_classes: Width of the one-hot labels. Defaults to the notebook-level
            ``total_words`` so existing one-argument calls keep working.

    Returns:
        Tuple ``(predictors, label, max_sequence_len)``: the predictor array with
        ``max_sequence_len - 1`` columns, the one-hot label array with
        ``num_classes`` columns, and the padded sequence length.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences must contain at least one sequence")

    if num_classes is None:
        # Backward-compatible default: vocabulary size computed in the notebook.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    print(max_sequence_len)

    # pad_sequences already returns a NumPy array, so no extra conversion is needed.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)

    return predictors, label, max_sequence_len

# Pad the sequences and split them into model inputs and one-hot targets.
# The call also prints the padded sequence length as a side effect.
predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)

16


In [224]:
# Inspect the left-padded predictor matrix (zeros are padding).
predictors

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  1,  8],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  1,  8,  9],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  1,  8,  9,  3],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  1,  8,  9,  3, 10],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  4],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  4,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  4,  1, 12],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  4,  1, 12, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  4],
       [ 0,  0,  0,  0,  0,  0,  0

In [225]:
# Inspect the one-hot encoded next-word labels.
label

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [151]:
# (number of training sequences, max_sequence_len - 1)
predictors.shape

(33, 15)

In [226]:
# (number of training sequences, total_words)
label.shape

(33, 29)

In [227]:
# Report the two dimensions that size the model: vocabulary size and padded length.
print("total_words:",total_words)
print("max_sequence_len:",max_sequence_len)

total_words: 29
max_sequence_len: 16


In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dropout -> Dense(softmax over the vocabulary).

    Args:
        max_sequence_len: Length of the padded n-gram sequences, label included.
            The model input is one token shorter because the last token is the target.
        total_words: Vocabulary size; used as the embedding input dimension and
            as the number of softmax output classes.
        embedding_dim: Size of each word embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Dropout fraction applied to the LSTM output.

    Returns:
        A compiled Keras ``Sequential`` model (categorical cross-entropy, Adam).

    Raises:
        ValueError: If ``max_sequence_len`` is smaller than 2, which would leave
            no predictor tokens.
    """
    if max_sequence_len < 2:
        raise ValueError("max_sequence_len must be at least 2 (one predictor token plus the label)")

    model = Sequential([
        # Input embedding layer; the input is one token shorter than the padded
        # sequences because the final token is held out as the label.
        Embedding(total_words, embedding_dim, input_length=max_sequence_len - 1),
        # Hidden layer: LSTM followed by light dropout.
        LSTM(lstm_units),
        Dropout(dropout_rate),
        # Output layer: probability distribution over the vocabulary.
        Dense(total_words, activation='softmax'),
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the model from the current sequence length and vocabulary size, then
# print its layer summary.
# NOTE(review): the summary stored with this cell reports an embedding output of
# (None, 18, 10) and a 79-unit dense layer, which does not match the
# max_sequence_len of 16 and total_words of 29 printed earlier in this notebook —
# it looks like output from a previous run; re-run the notebook to refresh it.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 18, 10)            790       
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 79)                7979      
Total params: 53,169
Trainable params: 53,169
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Train the model. `verbose` only has documented meanings for 'auto', 0, 1 and 2;
# 2 prints one line per epoch.
model.fit(predictors, label, epochs=10, verbose=2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1874001d160>

In [153]:
def generate_text(seed_text, next_words, model, max_seq_len):
    """Extend ``seed_text`` by repeatedly predicting the next word.

    On each step the current text is encoded with the notebook-level tokenizer
    ``t``, left-padded/truncated to ``max_seq_len - 1`` ids, and fed to ``model``;
    the most probable class is mapped back to a word and appended.

    Args:
        seed_text: Starting text.
        next_words: Maximum number of words to append.
        model: Trained Keras model whose ``predict`` returns one probability
            row per input sequence.
        max_seq_len: Padded sequence length used at training time, label included.

    Returns:
        The seed text plus the generated words, title-cased.
    """
    import numpy as np  # local import keeps this cell runnable on its own

    for _ in range(next_words):
        token_list = t.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')

        # `predict_classes` no longer exists on current Keras models; taking the
        # argmax of the predicted probabilities yields the same class index.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Direct index -> word lookup instead of scanning the whole vocabulary.
        output_word = t.index_word.get(predicted, '')
        if not output_word:
            # The predicted index has no word (e.g. the padding index 0). The
            # text would not change, so later steps would predict the same
            # thing; stop instead of appending dangling spaces.
            break

        seed_text = seed_text + " " + output_word

    return seed_text.title()

In [50]:
# Try the trained model on a few seed phrases: (seed text, number of words to add).
demo_prompts = [
    ("hilton seattle downtown", 5),
    ("best western seattle airport hotel", 10),
    ('located in the heart of downtown seattle', 20),
]

for position, (seed, n_words) in enumerate(demo_prompts):
    if position:
        # Blank line between consecutive samples.
        print()
    print(generate_text(seed, n_words, model, max_sequence_len))

Hilton Seattle Downtown The The The The The

Best Western Seattle Airport Hotel The The The The The The The The The The

Located In The Heart Of Downtown Seattle The The The The The The The The The The The The The The The The The The The The
