In [None]:
## Import the required libraries
import numpy as np ## numerical arrays; used for argmax over the predicted probabilities
import tensorflow as tf ## deep-learning framework that provides the Keras API used below
from tensorflow.keras.preprocessing.text import Tokenizer ## builds the word -> index vocabulary and converts text into integer sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences ## pads integer sequences to a common length
from tensorflow.keras.models import Sequential ## builds a model as a linear stack of layers
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense ## Embedding maps word indices to dense vectors; SimpleRNN/LSTM/GRU are the selectable recurrent layers; Dense is the output layer
from tensorflow.keras.utils import to_categorical ## converts integer class labels into one-hot encoded vectors

In [None]:
text = """ In the middle of the night, in the middle of the night
           Just call my name, I'm yours to tame
           In the middle of the night, in the middle of the night
           I'm wide awake, I crave your taste
           All night long 'til morning comes
           I'm getting what is mine, you gon' get yours, oh no, ooh
           In the middle of the night, in the middle of the night """

## Training corpus: three stanzas of the song "Middle of the Night" by Elley Duhé

Tokenization

In [None]:
tokenizer = Tokenizer() # create a tokenizer with its default settings
tokenizer.fit_on_texts([text]) # build the vocabulary from the lyrics (passed as a one-element list of texts)

In [None]:
print(tokenizer.word_index) ## word -> index dictionary built during fitting; every unique word gets its own integer index, starting at 1

{'the': 1, 'night': 2, 'in': 3, 'middle': 4, 'of': 5, "i'm": 6, 'yours': 7, 'just': 8, 'call': 9, 'my': 10, 'name': 11, 'to': 12, 'tame': 13, 'wide': 14, 'awake': 15, 'i': 16, 'crave': 17, 'your': 18, 'taste': 19, 'all': 20, 'long': 21, "'til": 22, 'morning': 23, 'comes': 24, 'getting': 25, 'what': 26, 'is': 27, 'mine': 28, 'you': 29, "gon'": 30, 'get': 31, 'oh': 32, 'no': 33, 'ooh': 34}


In [None]:
total_words = len(tokenizer.word_index) ## number of unique words in the vocabulary

In [None]:
total_words ## display the number of unique words

34

In [None]:
new_total_words = total_words + 1 ## size used for the Embedding input and the softmax output: word indices start at 1, so one extra slot covers index 0, which no word uses and which serves as the padding value

In [None]:
new_total_words ## display the vocabulary size including the padding index

35

Input Sequence

In [None]:
token_list = tokenizer.texts_to_sequences([text])[0] ## one list of word indices per input text; [0] picks the only one

## Every prefix of the lyrics that holds at least two tokens becomes one training
## sample: its last token is the word to predict, the tokens before it are the context.
input_sequences = [token_list[:end] for end in range(2, len(token_list) + 1)]


Padding

In [None]:
max_seq_len = max(map(len, input_sequences)) ## length of the longest n-gram sequence; every sequence is padded to this length

In [None]:
## Left-pad ('pre') every n-gram with zeros up to max_seq_len so all rows have the
## same length and the word to predict always sits in the last column.
## pad_sequences already returns a NumPy array, so no extra np.array(...) copy is needed.
input_sequences = pad_sequences(sequences = input_sequences, maxlen = max_seq_len, padding = 'pre')

Splitting the data into training inputs and labels

In [None]:
## Inputs: every column except the last one, i.e. the context words of each n-gram.
x = input_sequences[:, :-1]

## Labels: the last column holds the index of the word to predict; it is one-hot
## encoded over the whole vocabulary (including the padding index 0).
next_word_ids = input_sequences[:, -1]
y = to_categorical(next_word_ids, num_classes = new_total_words)

Using a function to choose the model type: RNN / LSTM / GRU

In [None]:
def build_model(cell_type="GRU"):
    """Build and compile a next-word prediction model.

    The model is a linear stack: Embedding -> one recurrent layer -> softmax Dense.

    Args:
        cell_type: Name of the recurrent layer to use: "RNN", "LSTM" or "GRU".
            Letter case and surrounding whitespace are ignored. ``None`` or a
            blank string selects the default, "GRU".

    Returns:
        The compiled Sequential model.

    Raises:
        ValueError: If ``cell_type`` names none of the supported layers. Without
            this check no recurrent layer would be added and the Dense layer
            would sit directly on the 3-D embedding output.
    """
    recurrent_layers = {"RNN": SimpleRNN, "LSTM": LSTM, "GRU": GRU}

    ## Normalise the requested name so that e.g. " lstm " is accepted, and fall
    ## back to GRU when nothing was given.
    cell_key = "" if cell_type is None else str(cell_type).strip().upper()
    if not cell_key:
        cell_key = "GRU"
    if cell_key not in recurrent_layers:
        raise ValueError(
            f"Unknown cell_type {cell_type!r}; expected one of: "
            + ", ".join(recurrent_layers)
        )

    model = Sequential() ## linear stack of layers

    ## Map each word index to a trainable 10-dimensional vector.
    ## NOTE(review): newer Keras releases report `input_length` as deprecated —
    ## confirm against the installed version before removing it.
    model.add(Embedding( new_total_words,10 , input_length = max_seq_len - 1 ))

    ## The selected recurrent layer with 100 hidden units.
    model.add(recurrent_layers[cell_key](100))

    ## One output unit per vocabulary entry; softmax turns them into probabilities.
    model.add(Dense(new_total_words, activation='softmax'))

    ## Cross-entropy against the one-hot labels, optimised with Adam; accuracy is reported during training.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
## Ask which recurrent layer to use (RNN, LSTM or GRU). Surrounding whitespace and
## letter case are ignored, and an empty answer selects the default GRU.
a = input('enter the model:').strip().upper() or "GRU"

model = build_model(cell_type = a )  # build and compile the model with the chosen recurrent layer


enter the model:LSTM




Train the model

In [None]:
model.fit(x, y, epochs=200, verbose=1) ## train for 200 epochs on the padded contexts (x) and one-hot next-word labels (y); verbose=1 prints the progress of every epoch

Epoch 1/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - accuracy: 0.0685 - loss: 3.5547
Epoch 2/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.2040 - loss: 3.5457
Epoch 3/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.1585 - loss: 3.5380
Epoch 4/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.1898 - loss: 3.5240
Epoch 5/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.1703 - loss: 3.4988
Epoch 6/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.2054 - loss: 3.4031
Epoch 7/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.1585 - loss: 3.2038
Epoch 8/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.1976 - loss: 3.1059
Epoch 9/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7afaba7bb090>

In [None]:
def predict_next_word(model, tokenizer, seed_text, max_seq_len):
    """Predict the word that most likely follows ``seed_text``.

    Args:
        model: Trained model whose ``predict`` returns one row of probabilities
            over the vocabulary indices for the given input.
        tokenizer: Fitted tokenizer that converts text to word indices and
            provides the ``index_word`` reverse mapping.
        seed_text: Context words to continue.
        max_seq_len: Length the training sequences were padded to (context plus
            target word); the model input therefore has ``max_seq_len - 1`` tokens.

    Returns:
        The predicted word, or an empty string when the predicted index has no
        word (for example the padding index 0).
    """
    token_list = tokenizer.texts_to_sequences([seed_text])[0] ## word indices of the seed text
    padded = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre') ## same fixed length and left-padding as the training inputs
    predicted_probs = model.predict(padded, verbose=0) ## one row of probabilities for the single input
    predicted_index = int(np.argmax(predicted_probs[0])) ## index with the highest predicted probability

    ## index_word is the tokenizer's index -> word mapping, so no manual search
    ## through word_index is needed.
    return tokenizer.index_word.get(predicted_index, "")

In [None]:
seed_text = "in the " ## context words for which the next word should be predicted
next_word = predict_next_word(model, tokenizer, seed_text, max_seq_len) ## most probable next word according to the trained model
print(f"Next word prediction: '{next_word}'")

Next word prediction: 'middle'
