In [55]:
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, InputLayer

In [5]:
doc = """Python is an interpreted, object-oriented, high-level programming language with dynamic semantics. Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development, as well as for use as a scripting or glue language to connect existing components together. Python's simple, easy to learn syntax emphasizes readability and therefore reduces the cost of program maintenance. Python supports modules and packages, which encourages program modularity and code reuse. The Python interpreter and the extensive standard library are available in source or binary form without charge for all major platforms, and can be freely distributed.

Often, programmers fall in love with Python because of the increased productivity it provides. Since there is no compilation step, the edit-test-debug cycle is incredibly fast. Debugging Python programs is easy: a bug or bad input will never cause a segmentation fault. Instead, when the interpreter discovers an error, it raises an exception. When the program doesn't catch the exception, the interpreter prints a stack trace. A source level debugger allows inspection of local and global variables, evaluation of arbitrary expressions, setting breakpoints, stepping through the code a line at a time, and so on. The debugger is written in Python itself, testifying to Python's introspective power. On the other hand, often the quickest way to debug a program is to add a few print statements to the source: the fast edit-test-debug cycle makes this simple approach very effective."""

In [9]:
tokens = Tokenizer()
tokens.fit_on_texts([doc])
total_words = len(tokens.word_index)

In [10]:
total_words

157

In [14]:
tokens.word_index


{'the': 1,
 'a': 2,
 'and': 3,
 'python': 4,
 'is': 5,
 'to': 6,
 'in': 7,
 'of': 8,
 'program': 9,
 'an': 10,
 'level': 11,
 'with': 12,
 'dynamic': 13,
 'it': 14,
 'for': 15,
 'as': 16,
 'or': 17,
 'interpreter': 18,
 'source': 19,
 'debug': 20,
 'high': 21,
 'language': 22,
 'very': 23,
 "python's": 24,
 'simple': 25,
 'easy': 26,
 'code': 27,
 'often': 28,
 'edit': 29,
 'test': 30,
 'cycle': 31,
 'fast': 32,
 'when': 33,
 'exception': 34,
 'debugger': 35,
 'on': 36,
 'interpreted': 37,
 'object': 38,
 'oriented': 39,
 'programming': 40,
 'semantics': 41,
 'its': 42,
 'built': 43,
 'data': 44,
 'structures': 45,
 'combined': 46,
 'typing': 47,
 'binding': 48,
 'make': 49,
 'attractive': 50,
 'rapid': 51,
 'application': 52,
 'development': 53,
 'well': 54,
 'use': 55,
 'scripting': 56,
 'glue': 57,
 'connect': 58,
 'existing': 59,
 'components': 60,
 'together': 61,
 'learn': 62,
 'syntax': 63,
 'emphasizes': 64,
 'readability': 65,
 'therefore': 66,
 'reduces': 67,
 'cost': 68,
 'm

In [35]:
input_seq = []
for sentance in doc.split("."):
    #print(sentance)
    #print(tokens.texts_to_sequences([sentance])[0])
    sentance_token = tokens.texts_to_sequences([sentance])[0]
    for i in range(1, len(sentance_token)+1):
        n_gram_seq = sentance_token[:i+1]
        print(n_gram_seq)
        input_seq.append(n_gram_seq)

[4, 5]
[4, 5, 10]
[4, 5, 10, 37]
[4, 5, 10, 37, 38]
[4, 5, 10, 37, 38, 39]
[4, 5, 10, 37, 38, 39, 21]
[4, 5, 10, 37, 38, 39, 21, 11]
[4, 5, 10, 37, 38, 39, 21, 11, 40]
[4, 5, 10, 37, 38, 39, 21, 11, 40, 22]
[4, 5, 10, 37, 38, 39, 21, 11, 40, 22, 12]
[4, 5, 10, 37, 38, 39, 21, 11, 40, 22, 12, 13]
[4, 5, 10, 37, 38, 39, 21, 11, 40, 22, 12, 13, 41]
[4, 5, 10, 37, 38, 39, 21, 11, 40, 22, 12, 13, 41]
[42, 21]
[42, 21, 11]
[42, 21, 11, 43]
[42, 21, 11, 43, 7]
[42, 21, 11, 43, 7, 44]
[42, 21, 11, 43, 7, 44, 45]
[42, 21, 11, 43, 7, 44, 45, 46]
[42, 21, 11, 43, 7, 44, 45, 46, 12]
[42, 21, 11, 43, 7, 44, 45, 46, 12, 13]
[42, 21, 11, 43, 7, 44, 45, 46, 12, 13, 47]
[42, 21, 11, 43, 7, 44, 45, 46, 12, 13, 47, 3]
[42, 21, 11, 43, 7, 44, 45, 46, 12, 13, 47, 3, 13]
[42, 21, 11, 43, 7, 44, 45, 46, 12, 13, 47, 3, 13, 48]
[42, 21, 11, 43, 7, 44, 45, 46, 12, 13, 47, 3, 13, 48, 49]
[42, 21, 11, 43, 7, 44, 45, 46, 12, 13, 47, 3, 13, 48, 49, 14]
[42, 21, 11, 43, 7, 44, 45, 46, 12, 13, 47, 3, 13, 48, 49, 14, 

In [37]:
max_seqquence_len = max([len(i) for i in input_seq])
max_seqquence_len

38

In [38]:
my_pad_sequence = pad_sequences(input_seq,maxlen=max_seqquence_len, padding="pre")

In [40]:
my_pad_sequence[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5])

In [44]:
x = my_pad_sequence[:,:-1]

In [45]:
y = my_pad_sequence[:, -1]

In [82]:
x.shape

(248, 37)

In [47]:
y

array([  5,  10,  37,  38,  39,  21,  11,  40,  22,  12,  13,  41,  41,
        21,  11,  43,   7,  44,  45,  46,  12,  13,  47,   3,  13,  48,
        49,  14,  23,  50,  15,  51,  52,  53,  16,  54,  16,  15,  55,
        16,   2,  56,  17,  57,  22,   6,  58,  59,  60,  61,  61,  25,
        26,   6,  62,  63,  64,  65,   3,  66,  67,   1,  68,   8,   9,
        69,  69,  70,  71,   3,  72,  73,  74,   9,  75,   3,  27,  76,
        76,   4,  18,   3,   1,  77,  78,  79,  80,  81,   7,  19,  17,
        82,  83,  84,  85,  15,  86,  87,  88,   3,  89,  90,  91,  92,
        92,  93,  94,   7,  95,  12,   4,  96,   8,   1,  97,  98,  14,
        99,  99, 101,   5, 102, 103, 104,   1,  29,  30,  20,  31,   5,
       105,  32,  32,   4, 107,   5,  26,   2, 108,  17, 109, 110, 111,
       112, 113,   2, 114, 115, 115,  33,   1,  18, 117,  10, 118,  14,
       119,  10,  34,  34,   1,   9, 120, 121,   1,  34,   1,  18, 122,
         2, 123, 124, 124,  19,  11,  35, 125, 126,   8, 127,   

In [51]:
y = tensorflow.keras.utils.to_categorical(y,num_classes=total_words+1)

In [52]:
y[0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [65]:
total_words

157

In [68]:
max_seqquence_len

38

In [87]:
model = Sequential()
model.add(InputLayer(shape=(37,)))
model.add(Embedding(total_words+1, 100))

model.add(LSTM(150))
model.add(Dense(total_words+1, activation="softmax"))
print(model.summary())

None


In [88]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x, y, epochs=100, verbose=1)

Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.0114 - loss: 5.0607 
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.0352 - loss: 4.9896  
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.0415 - loss: 4.8593
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.0656 - loss: 4.8010
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.0724 - loss: 4.6949
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.0634 - loss: 4.6888
Epoch 7/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0690 - loss: 4.5934
Epoch 8/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0916 - loss: 4.4680
Epoch 9/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.9670 - loss: 0.6206
Epoch 70/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9652 - loss: 0.5893
Epoch 71/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9793 - loss: 0.5562
Epoch 72/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9707 - loss: 0.5609
Epoch 73/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9757 - loss: 0.5406
Epoch 74/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.9706 - loss: 0.5002
Epoch 75/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.9789 - loss: 0.4860
Epoch 76/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9805 - loss: 0.4668
Epoch 77/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x1a5f10b3ad0>

In [100]:
input_text = "ram kumar"
predict_next_words= 10

for _ in range(predict_next_words):
    token_list = tokens.texts_to_sequences([input_text])[0]
    print(token_list)
    token_list = pad_sequences([token_list], maxlen=max_seqquence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokens.word_index.items():
        if index == predicted:
            output_word = word
            break
    input_text += " " + output_word

print(input_text)

[]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1, 4]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1, 4, 18]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1, 4, 18, 3]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1, 4, 18, 3, 1]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1, 4, 18, 3, 1, 77]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1, 4, 18, 3, 1, 77, 78]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1, 4, 18, 3, 1, 77, 78, 79]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1, 4, 18, 3, 1, 77, 78, 79, 80]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
ram kumar the python interpreter and the extensive standard library are available
