In [1]:
import numpy as np
import tensorflow as tf
#tokeniser - converts text to integers
from tensorflow.keras.preprocessing.text import Tokenizer
#pad_sequences - pad sequences of different lengths with zeros or truncate them to a specified length
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import pickle

In [2]:
# Load the whole training corpus into memory as a single UTF-8 string.
with open("IndiaUS.txt", mode='r', encoding='utf-8') as myfile:
    mytext = myfile.read()

In [None]:
# Show only the opening of the corpus: printing the full text floods the
# notebook output and bloats the saved file.
print(mytext[:500])

In [4]:
# Keras Tokenizer with default settings; it is fitted on the corpus in the next cell.
mytokenizer = Tokenizer()


In [5]:
# Learn the vocabulary from the corpus (passed as a one-document list).
mytokenizer.fit_on_texts([mytext])
# Word ids start at 1, so reserve one extra slot for id 0 (the padding value).
total_words = 1 + len(mytokenizer.word_index)

In [6]:
# Display the learned word -> integer id mapping.
mytokenizer.word_index


{'the': 1,
 'to': 2,
 'in': 3,
 'a': 4,
 'and': 5,
 'india': 6,
 'of': 7,
 'mr': 8,
 'us': 9,
 'is': 10,
 'modi': 11,
 'has': 12,
 'that': 13,
 'biden': 14,
 'as': 15,
 'with': 16,
 'not': 17,
 "india's": 18,
 'for': 19,
 'but': 20,
 'on': 21,
 'washington': 22,
 'an': 23,
 'it': 24,
 'says': 25,
 'will': 26,
 'are': 27,
 'indian': 28,
 'prime': 29,
 'minister': 30,
 'this': 31,
 'have': 32,
 'trade': 33,
 'visit': 34,
 'by': 35,
 'president': 36,
 'relationship': 37,
 'been': 38,
 'about': 39,
 'strategic': 40,
 'more': 41,
 'up': 42,
 'during': 43,
 'also': 44,
 'make': 45,
 'sirohi': 46,
 'china': 47,
 'air': 48,
 'force': 49,
 'state': 50,
 'his': 51,
 'ties': 52,
 'two': 53,
 'at': 54,
 'white': 55,
 'house': 56,
 'be': 57,
 'lot': 58,
 'ms': 59,
 'technology': 60,
 'jet': 61,
 'semiconductor': 62,
 'now': 63,
 'first': 64,
 'was': 65,
 'global': 66,
 'one': 67,
 'he': 68,
 'without': 69,
 'they': 70,
 'narendra': 71,
 'joe': 72,
 'world': 73,
 'potential': 74,
 'how': 75,
 'becom

In [7]:
# Persist the fitted tokenizer so inference can reuse the exact same word ids.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(
        mytokenizer,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
'''
n-gram sequence:
 contiguous sequence of n items from a given sample of text 

"I love machine learning"

list = [1, 2, 3, 4]
bigram  : "i love","love machine","machine learning"
[1,2]
[1,2,3]
[1,2,3,4]

trigram : "i love machine","love machine learning"
[1,2,3]
[1,2,3,4]


'''

In [8]:
# Build the training sequences: for every line of the corpus, emit each
# prefix of its token ids that is at least two tokens long
# (tokens[:2], tokens[:3], ..., all tokens). The last id of each prefix later
# serves as the word to predict from the ids before it.
my_input_sequences = []
for line in mytext.split('\n'):
    # texts_to_sequences takes a list of texts; [0] selects the ids for this line
    token_list = mytokenizer.texts_to_sequences([line])[0]
    my_input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [9]:
# Length of the longest prefix sequence; used as the common length when padding.
max_sequence_len = max(map(len, my_input_sequences))


In [None]:
''' 
Padding
--------------------

-> max_sequence_len value can be useful when padding sequences to ensure they are all of the same length.
-> Padding involves adding dummy elements (usually zeros) to sequences that are shorter than a predefined length.
   This ensures that all sequences have the same length, which is often required when training neural networks or
   performing batch operations
-> To train a batch of sequences in (Neural networks), they need to have the same dimensions
-> LSTM or RNN, require input sequences of the same length.
   This is because the network's architecture assumes a fixed input size.

   my_input_sequences = [[1, 2, 3],
                        [4, 5],
                        [6, 7, 8, 9]]

   after padding with max_sequence_len = 4 (last list length)

   input_sequences    = [[0, 1, 2, 3],
                        [0, 0, 4, 5],
                        [6, 7, 8, 9]]
Now all sequences have the same length.
This padded array is now suitable for feeding into a neural network model for training.

'''


In [10]:

# padding='pre'  -> zeros are added at the beginning of each sequence, so the
#                   real tokens (and the target word) stay right-aligned
# padding='post' -> zeros would be added at the end instead
# pad_sequences already returns a NumPy array, so no extra np.array() copy is needed.
input_sequences = pad_sequences(my_input_sequences, maxlen=max_sequence_len, padding='pre')

In [11]:
# Inspect one padded row: zeros on the left, token ids on the right.
input_sequences[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,  99,   4, 177], dtype=int32)

In [12]:
# Split each padded row into model input and target:
#   X -> every column except the last (the context words)
#   y -> the last column only (the word to be predicted)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [None]:
'''
The data is now structured in a way that you can use it for supervised training.
X will be your input data, and y will be your corresponding target labels
'''

In [13]:
# Row 1 of the inputs: the padded sequence without its final token.
X[1]


array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 99,  4],
      dtype=int32)

In [14]:
# Targets as integer word ids, one per training sequence.
y

array([  4, 177,  50, ...,  25,  59,  46], dtype=int32)

In [15]:
#  converting the target values y into one-hot encoded vectors using the to_categorical
# Each value in y is transformed into a binary vector of length total_words,
# where the position corresponding to the target value is set to 1 and the rest are set to 0.
'''
In one-hot encoding:

Each category is represented by a unique binary vector.
The length of the vector is equal to the total number of unique categories (classes).
Only one element in the vector is set to 1 (hot), and the rest are set to 0 (cold).

the target values in y are being treated as categories, and you're converting them
to one-hot encoded vectors of length total_words. The position corresponding to the target 
value is set to 1, and the rest are set to 0.

'''


# Derive the one-hot targets from input_sequences rather than from y itself so
# that re-running this cell does not one-hot encode an already encoded matrix.
# to_categorical already returns a NumPy array, so no np.array() wrapper is needed.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)


In [None]:
# One-hot vector for the second target: a single 1 at that word's id.
y[1]

In [17]:
# Next-word prediction network:
#   Embedding -> maps each of the total_words ids to a 100-dimensional vector;
#                input_length is max_sequence_len-1 because the last token of
#                every padded sequence was split off as the target
#   LSTM      -> 150 units reading the embedded sequence
#   Dense     -> softmax over the vocabulary (one score per word id)
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_len-1),
    LSTM(150),
    Dense(total_words, activation='softmax'),
])

In [18]:
# One-hot targets -> categorical cross-entropy; track accuracy while training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train for 100 passes over the data, printing progress for each epoch.
model.fit(x=X, y=y, epochs=100, verbose=1)


In [20]:
# Save the trained model to disk; the testing section below reloads it from this file.
model.save('word_generation_model.h5')


Testing model

In [21]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the tokenizer saved during training.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('tokenizer.pickle', 'rb') as handle:
    mytokenizer = pickle.load(handle)

# Load the trained model
model = tf.keras.models.load_model('word_generation_model.h5')

input_text = "narendra modi with"
predict_next_words = 10

# The expected input length is fixed by the model, so look it up once.
input_len = model.input_shape[1]

for _ in range(predict_next_words):
    # Encode the text so far and left-pad/truncate it to the model's input length.
    token_list = mytokenizer.texts_to_sequences([input_text])[0]
    token_list = pad_sequences([token_list], maxlen=input_len, padding='pre')
    # verbose=0 keeps Keras from printing a progress bar for every generated word.
    predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
    # Id 0 is the padding slot and has no entry in index_word; stop generating
    # instead of raising KeyError if the model ever predicts it.
    output_word = mytokenizer.index_word.get(int(predicted[0]))
    if output_word is None:
        break
    input_text += " " + output_word

print(input_text)


narendra modi with india a semiconductor base us memory chip giant micron technology
