# Language model for Text completion:
    
1. Predict the next word or sequence of words given a context.

2. In NLP, RNN-, LSTM-, and Transformer-based models are used to build such language models.


# Steps:
    
1. Data Preparation

2. Tokenization

3. Create input Sequences and Targets

4. Word Embeddings

5. Build models with an RNN/LSTM/Transformer-based architecture (TensorFlow/PyTorch/Keras)

6. Loss Function and Optimizer

7. Training

8. Evaluation

9. Inference

10. Fine-Tuning (Optional)

In [1]:
#import libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

In [3]:
#Sample Data
# Two short sentences used as the toy corpus for the rest of the notebook.
text_data = [
    "The quick brown fox jumps over the lazy dog.",
    "She sells seashells by the seashore",
]
text_data

['The quick brown fox jumps over the lazy dog.',
 'She sells seashells by the seashore']

In [4]:
#Tokenization
# Fit a word-level tokenizer on the sample sentences; this builds the
# word -> integer id mapping exposed as `tokenizer.word_index`.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text_data)

# The Keras Tokenizer never assigns id 0 to a word (it is reserved, and is the
# value pad_sequences pads with), so the model needs one slot more than the
# number of distinct words.
total_words = 1 + len(tokenizer.word_index)
print(total_words)

14


In [5]:
#Create input sequences and targets
# For every sentence, emit each prefix of its token ids that is at least two
# tokens long: the last id of a prefix is the word to predict, the ids before
# it are the context.
input_sequences = [
    token_ids[:end]
    for sentence in text_data
    for token_ids in tokenizer.texts_to_sequences([sentence])
    for end in range(2, len(token_ids) + 1)
]

# Pad every prefix to the length of the longest one so they stack into a
# single 2-D integer array.
max_sequence_length = max(map(len, input_sequences))
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    input_sequences,
    maxlen=max_sequence_length,
)

# Last column = next-word target, everything before it = model input.
x = input_sequences[:, :-1]
y = input_sequences[:, -1]
# One-hot encode the targets over the vocabulary.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [6]:
#Build the model
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation -- and therefore the training log and the generated text --
# is repeatable on a fresh kernel (same hardware/software stack assumed).
tf.keras.utils.set_random_seed(42)

model=Sequential()
# Each input row holds max_sequence_length-1 token ids, because the last id of
# every padded sequence was split off as the prediction target.
model.add(Embedding(total_words,100,input_length=max_sequence_length-1)) # token id -> 100-dim vector
model.add(LSTM(100)) #hidden layer
model.add(Dense(total_words,activation='softmax')) #output layer: one probability per vocabulary id

#Compile the model
# categorical_crossentropy pairs with the one-hot targets produced by to_categorical.
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#Train
model.fit(x,y,epochs=100,verbose=2)


Epoch 1/100
1/1 - 13s - loss: 2.6408 - accuracy: 0.0000e+00 - 13s/epoch - 13s/step
Epoch 2/100
1/1 - 0s - loss: 2.6315 - accuracy: 0.0000e+00 - 34ms/epoch - 34ms/step
Epoch 3/100
1/1 - 0s - loss: 2.6223 - accuracy: 0.0769 - 31ms/epoch - 31ms/step
Epoch 4/100
1/1 - 0s - loss: 2.6129 - accuracy: 0.3077 - 16ms/epoch - 16ms/step
Epoch 5/100
1/1 - 0s - loss: 2.6033 - accuracy: 0.3846 - 22ms/epoch - 22ms/step
Epoch 6/100
1/1 - 0s - loss: 2.5932 - accuracy: 0.3077 - 21ms/epoch - 21ms/step
Epoch 7/100
1/1 - 0s - loss: 2.5825 - accuracy: 0.3077 - 18ms/epoch - 18ms/step
Epoch 8/100
1/1 - 0s - loss: 2.5710 - accuracy: 0.3077 - 16ms/epoch - 16ms/step
Epoch 9/100
1/1 - 0s - loss: 2.5586 - accuracy: 0.3077 - 22ms/epoch - 22ms/step
Epoch 10/100
1/1 - 0s - loss: 2.5449 - accuracy: 0.3077 - 19ms/epoch - 19ms/step
Epoch 11/100
1/1 - 0s - loss: 2.5298 - accuracy: 0.2308 - 24ms/epoch - 24ms/step
Epoch 12/100
1/1 - 0s - loss: 2.5130 - accuracy: 0.2308 - 20ms/epoch - 20ms/step
Epoch 13/100
1/1 - 0s - loss: 

<keras.src.callbacks.History at 0x26c6df97d50>

In [15]:
#Generate text completion
# Greedy decoding: feed the running text back into the model and append the
# single most probable next word, up to `next_words` times.

seed_text="she sells seashells"
next_words=10
for _ in range(next_words):
    # Encode the text generated so far as token ids.
    token_list=tokenizer.texts_to_sequences([seed_text])[0]
    # Bring the ids to the fixed input length the model was trained with
    # (pad_sequences pads/truncates at the front by default).
    token_list=tf.keras.preprocessing.sequence.pad_sequences([token_list],maxlen=max_sequence_length-1)

    # predict() returns one probability row per input row; pick the most
    # probable vocabulary id from the only row.
    predicted=int(np.argmax(model.predict(token_list,verbose=2)[0]))
    # index_word is the tokenizer's id -> word mapping. Id 0 is the padding
    # id and has no word, so the lookup returns None for it.
    output_word=tokenizer.index_word.get(predicted)
    if output_word is None:
        # Stop here instead of appending empty words (stray spaces) to the text.
        break
    seed_text +=" "+output_word
print(seed_text)

1/1 - 0s - 48ms/epoch - 48ms/step
1/1 - 0s - 65ms/epoch - 65ms/step
1/1 - 0s - 47ms/epoch - 47ms/step
1/1 - 0s - 48ms/epoch - 48ms/step
1/1 - 0s - 47ms/epoch - 47ms/step
1/1 - 0s - 33ms/epoch - 33ms/step
1/1 - 0s - 33ms/epoch - 33ms/step
1/1 - 0s - 40ms/epoch - 40ms/step
1/1 - 0s - 46ms/epoch - 46ms/step
1/1 - 0s - 50ms/epoch - 50ms/step
she sells seashells by the seashore seashore the dog dog dog dog dog


In [16]:
#Generate text completion
# Greedy decoding: feed the running text back into the model and append the
# single most probable next word, up to `next_words` times.

seed_text="quick brown fox"
next_words=3
for _ in range(next_words):
    # Encode the text generated so far as token ids.
    token_list=tokenizer.texts_to_sequences([seed_text])[0]
    # Bring the ids to the fixed input length the model was trained with
    # (pad_sequences pads/truncates at the front by default).
    token_list=tf.keras.preprocessing.sequence.pad_sequences([token_list],maxlen=max_sequence_length-1)

    # predict() returns one probability row per input row; pick the most
    # probable vocabulary id from the only row.
    predicted=int(np.argmax(model.predict(token_list,verbose=2)[0]))
    # index_word is the tokenizer's id -> word mapping. Id 0 is the padding
    # id and has no word, so the lookup returns None for it.
    output_word=tokenizer.index_word.get(predicted)
    if output_word is None:
        # Stop here instead of appending empty words (stray spaces) to the text.
        break
    seed_text +=" "+output_word
print(seed_text)

1/1 - 0s - 49ms/epoch - 49ms/step
1/1 - 0s - 33ms/epoch - 33ms/step
1/1 - 0s - 47ms/epoch - 47ms/step
quick brown fox fox jumps over
