In [2]:
# Toy corpus: six short English sentences used to train the next-word model.
values = [
    'My name is Santhosh',
    'What is Your name',
    'Where are you come from',
    'Where are you going',
    'Where are you staying',
    'What are you doing',
]

In [38]:
import pandas as pd
import string

# Wrap the raw sentences in a DataFrame with a single column named 'values'.
dataset = pd.DataFrame(values,columns=['values'])

# `.values` on a DataFrame yields a 2-D array (one row per sentence, one column),
# so iterating over `v` produces 1-element rows rather than plain strings.
v = dataset.values

In [39]:
def clean_text(txt):
    """Normalise one sentence: strip punctuation, lower-case, drop non-ASCII.

    Parameters
    ----------
    txt : str or iterable of str
        The text to clean. A non-``str`` iterable (for example a row taken
        from ``DataFrame.values``, which is a 1-element array) is first joined
        into a single string with spaces.

    Returns
    -------
    str
        The text without any ``string.punctuation`` characters, lower-cased,
        with every character that cannot be represented in ASCII removed.
    """
    if not isinstance(txt, str):
        # Iterating a list/array yields whole sentences, not characters; a
        # per-item punctuation test would then never match and nothing would
        # be stripped. Collapse to a real string before filtering.
        txt = " ".join(str(part) for part in txt)

    # Delete every punctuation character in one pass.
    txt = txt.translate(str.maketrans("", "", string.punctuation)).lower()
    # Round-trip through ASCII with errors ignored to discard non-ASCII characters.
    txt = txt.encode("utf8").decode("ascii", 'ignore')
    return txt

In [41]:
# Iterate over the column's strings rather than the rows of the 2-D `v` array,
# so clean_text receives a plain str and filters punctuation character by character.
corpus = [clean_text(sentence) for sentence in dataset['values']]


['my name is santhosh',
 'what is your name',
 'where are you come from',
 'where are you going',
 'where are you staying',
 'what are you doing']

In [88]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Notebook-level tokenizer; later cells (sequence building, text generation) read it as a global.
tokenizer = Tokenizer()

# Learn the vocabulary from the cleaned sentences.
tokenizer.fit_on_texts(corpus)

# Display the learned word -> integer index mapping.
tokenizer.word_index

{'are': 1,
 'you': 2,
 'where': 3,
 'name': 4,
 'is': 5,
 'what': 6,
 'my': 7,
 'santhosh': 8,
 'your': 9,
 'come': 10,
 'from': 11,
 'going': 12,
 'staying': 13,
 'doing': 14}

In [None]:
def get_sequence_of_tokens(corpus):
    """Expand every line of ``corpus`` into its growing token-id prefixes.

    The notebook-level ``tokenizer`` is fitted on ``corpus`` and then used to
    encode each line. For a line encoded as ``[a, b, c]`` the prefixes
    ``[a, b]`` and ``[a, b, c]`` are emitted; lines with fewer than two tokens
    contribute nothing.

    Parameters
    ----------
    corpus : iterable of str
        The cleaned sentences.

    Returns
    -------
    (list of list of int, int)
        The prefix sequences, in corpus order, and ``total_words``
        (``len(tokenizer.word_index) + 1``).
    """
    # Tokenization: fit the shared tokenizer, then report the vocabulary size.
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    print("total_words =",total_words)

    # Encode each line lazily, and for each one collect every prefix of length >= 2.
    encoded_lines = (tokenizer.texts_to_sequences([line])[0] for line in corpus)
    input_sequences = [
        token_ids[:end]
        for token_ids in encoded_lines
        for end in range(2, len(token_ids) + 1)
    ]
    return input_sequences, total_words

In [71]:
# Build the n-gram prefix sequences and the vocabulary size (the helper also prints the size).
inp_sequences , total_words = get_sequence_of_tokens(corpus)

total_words = 15


In [96]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.utils import to_categorical

def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into predictors and labels.

    Every sequence is left-padded ('pre') to the length of the longest one.
    The last token of each padded row becomes the label (one-hot encoded);
    all preceding tokens are the predictors.

    Parameters
    ----------
    input_sequences : sequence of sequence of int
        Token-id sequences of varying length. Must not be empty.
    num_classes : int, optional
        Width of the one-hot label vectors. When omitted, the notebook-level
        ``total_words`` is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    (predictors, label, max_sequence_len)
        ``predictors`` is the padded array without its last column, ``label``
        is the one-hot encoded last column, and ``max_sequence_len`` is the
        padded row length.

    Raises
    ------
    ValueError
        If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        # max() of nothing would raise an opaque ValueError; fail with a clear one instead.
        raise ValueError("input_sequences is empty; nothing to pad")

    if num_classes is None:
        # Backward-compatible fallback to the global set by the sequence-building cell.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.asarray(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # Last column is the word to predict; everything before it is the context.
    predictors, label = padded[:, :-1], padded[:, -1]
    label = to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len


In [97]:
# Pad the n-grams, then split them into inputs (all but the last token) and one-hot next-word labels.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [102]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding,Dropout


def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dropout -> Dense(softmax), compiled
    with categorical cross-entropy loss and the Adam optimizer.

    Parameters
    ----------
    max_sequence_len : int
        Length of the padded sequences including the label token; the model
        input length is ``max_sequence_len - 1``.
    total_words : int
        Vocabulary size; used as the embedding input dimension and as the
        number of output classes.
    embedding_dim : int, optional
        Size of each embedding vector (default 10).
    lstm_units : int, optional
        Number of units in the LSTM layer (default 100).
    dropout_rate : float, optional
        Dropout rate applied after the LSTM (default 0.1).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # The final token of every padded row is the label, so inputs are one shorter.
    input_len = max_sequence_len - 1
    model = Sequential()

    # Input embedding layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Hidden layer: LSTM followed by dropout
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: one probability per vocabulary index
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

In [103]:
# Instantiate the network for this corpus and print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 10)             150       
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 15)                1515      
Total params: 46,065
Trainable params: 46,065
Non-trainable params: 0
_________________________________________________________________


In [107]:
# Train for 100 epochs on the padded n-grams; verbose=2 logs one line per epoch.
# NOTE(review): no random seed is set in the visible cells, so the loss values
# will presumably differ from run to run — confirm if reproducibility matters.
model.fit(predictors, label, epochs=100, verbose=2)

Epoch 1/100
 - 0s - loss: 1.3392
Epoch 2/100
 - 0s - loss: 1.2475
Epoch 3/100
 - 0s - loss: 1.2630
Epoch 4/100
 - 0s - loss: 1.3205
Epoch 5/100
 - 0s - loss: 1.1498
Epoch 6/100
 - 0s - loss: 1.2773
Epoch 7/100
 - 0s - loss: 1.1871
Epoch 8/100
 - 0s - loss: 1.1944
Epoch 9/100
 - 0s - loss: 1.1526
Epoch 10/100
 - 0s - loss: 1.0374
Epoch 11/100
 - 0s - loss: 1.1208
Epoch 12/100
 - 0s - loss: 1.1468
Epoch 13/100
 - 0s - loss: 1.0576
Epoch 14/100
 - 0s - loss: 1.0722
Epoch 15/100
 - 0s - loss: 1.0595
Epoch 16/100
 - 0s - loss: 1.0296
Epoch 17/100
 - 0s - loss: 1.0204
Epoch 18/100
 - 0s - loss: 1.0028
Epoch 19/100
 - 0s - loss: 1.0137
Epoch 20/100
 - 0s - loss: 1.0025
Epoch 21/100
 - 0s - loss: 0.9449
Epoch 22/100
 - 0s - loss: 0.9650
Epoch 23/100
 - 0s - loss: 0.9063
Epoch 24/100
 - 0s - loss: 0.9234
Epoch 25/100
 - 0s - loss: 0.8728
Epoch 26/100
 - 0s - loss: 0.9631
Epoch 27/100
 - 0s - loss: 0.8723
Epoch 28/100
 - 0s - loss: 0.9627
Epoch 29/100
 - 0s - loss: 0.8622
Epoch 30/100
 - 0s - lo

<tensorflow.python.keras.callbacks.History at 0x1f487710>

In [105]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by ``next_words`` greedily predicted words.

    On each step the current text is encoded with the notebook-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1`` tokens and fed to
    ``model``; the most probable vocabulary index is mapped back to its word
    and appended.

    Parameters
    ----------
    seed_text : str
        Starting text.
    next_words : int
        Number of words to append.
    model :
        Trained model whose ``predict`` returns one row of per-index scores
        for each input row.
    max_sequence_len : int
        Padded sequence length used during training (including the label).

    Returns
    -------
    str
        The seed followed by the generated words, title-cased. If a predicted
        index has no word in the vocabulary, an empty word is appended.
    """
    # Invert the vocabulary once instead of scanning it on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # `predict_classes` no longer exists in recent TensorFlow releases;
        # argmax over `predict` is equivalent for a softmax output and works
        # on old and new versions alike. Reduce to a plain int so the lookup
        # does not compare against an array.
        probabilities = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        output_word = index_to_word.get(predicted_index, "")
        seed_text += " "+output_word
    return seed_text.title()

In [112]:
# Extend the seed "you" by two predicted words and print the title-cased result.
print(generate_text("you", 2, model, max_sequence_len))

You Are You
