In [15]:
import string
import re
import numpy as np
from pickle import dump, load
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Input, Dropout
from random import randint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard

# **Data Preparation**

In [2]:
def load_doc(filename):
    file = open(filename, 'r', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
def clean_doc(doc):
    doc = doc.replace('--', ' ')
    tokens = doc.split()
    re_punc = re.compile(f"[{re.escape(string.punctuation)}]") 
    tokens = [re_punc.sub('', w) for w in tokens] # remove punctuations
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w.lower() for w in tokens]
    return tokens

In [4]:
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [5]:
in_filename = '/kaggle/input/text-file/mann_ki_baat_content.txt'
doc = load_doc(in_filename)
tokens = clean_doc(doc)
print(f"Total tokens: {len(tokens)}")
print(f"Unique tokens: {len(set(tokens))}")

Total tokens: 211166
Unique tokens: 12506


In [6]:
# organize into sequences of tokens (input: 50 words, output: 1 word)
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length : i]
    line = ' '.join(seq)
    sequences.append(line)
print(f"Total sequences: {len(sequences)}")

out_filename = 'mann_ki_baat_sequences.txt'
save_doc(sequences, out_filename)

Total sequences: 211115


# **Train Language Model**

In [7]:
def define_model(vocab_size, seq_length):
    model = Sequential()
    model.add(Input(shape = (seq_length,)))
    model.add(Embedding(vocab_size, 50))
    model.add(LSTM(100, return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dense(100, activation = 'relu'))
    model.add(Dense(vocab_size, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    model.summary()
    return model

In [8]:
in_filename = 'mann_ki_baat_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [10]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [11]:
dump(tokenizer, open('tokenizer.pkl', 'wb')) # save tokenizer 

In [12]:
tokenizer = load(open('tokenizer.pkl', 'rb'))
vocab_size = len(tokenizer.word_index) + 1
sequences = np.array(sequences)

# separate into input and output
x, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes = vocab_size)
seq_length = x.shape[1]

In [13]:
def data_generator(x, y, batch_size):
    while True:
        for start in range(0, len(x), batch_size):
            end = min(start + batch_size, len(x))
            yield np.array(x[start:end]), np.array(y[start:end])

In [17]:
model = define_model(vocab_size, seq_length)

batch_size = 128
steps_per_epoch = len(x) // batch_size

# use the generator
train_generator = data_generator(x, y, batch_size)

# define callbacks
checkpoint_callback = ModelCheckpoint('model_checkpoint.keras', monitor='loss', save_best_only=True)
early_stopping_callback = EarlyStopping(monitor='loss', patience=5)
tensorboard_callback = TensorBoard(log_dir='./logs')

# train the model
model.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=100, 
          callbacks=[checkpoint_callback, early_stopping_callback, tensorboard_callback])
model.save('model.keras')

Epoch 1/100
[1m1649/1649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 23ms/step - accuracy: 0.0590 - loss: 7.0198
Epoch 2/100
[1m1649/1649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 23ms/step - accuracy: 0.0888 - loss: 6.2233
Epoch 3/100
[1m1649/1649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 23ms/step - accuracy: 0.1075 - loss: 5.9708
Epoch 4/100
[1m1649/1649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 23ms/step - accuracy: 0.1160 - loss: 5.7950
Epoch 5/100
[1m1649/1649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 23ms/step - accuracy: 0.1269 - loss: 5.6132
Epoch 6/100
[1m1649/1649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 23ms/step - accuracy: 0.1359 - loss: 5.4413
Epoch 7/100
[1m1649/1649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 23ms/step - accuracy: 0.1442 - loss: 5.2928
Epoch 8/100
[1m1649/1649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 23ms/step - accuracy: 0.1501 - loss: 5.1658


In [18]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    for _ in range (n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen = seq_length, truncating = 'pre')
        yhat = model.predict(encoded, verbose = 0)
        yhat = np.argmax(yhat, axis = -1)
        out_word = ''
        for word, index in  tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [19]:
in_filename = 'mann_ki_baat_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
# input : line.size() - output_size
seq_length = len(lines[0].split()) - 1

In [21]:
model = load_model('model.keras')
tokenizer = load(open('tokenizer.pkl', 'rb'))

In [35]:
seed_text = lines[randint(0, len(lines))] # select a random line
print(seed_text + '\n')

generated = generate_seq(model, tokenizer, seq_length, seed_text, 75)
print(generated)

i thank you all very much namaskar my dear countrymen namaskar we are seeing how the country is fighting against with all her might this has been the biggest pandemic in the last hundred years and during this very pandemic india has confronted many a natural disaster with fortitude meanwhile there

is no dearth of people who have to depend forward with the people of the yogik sciences the other season the people of the country is also organized in the qs asia university rankings i had the privilege of encouragement and when i am happy that the people of the country is celebrating the marudu rule well the people of the country is also organized in the qs asia system sets at the grassroots level
