### Importing the Libraries

In [30]:
# Import all necessary libraries for the entire notebook
import re
import numpy as np
import itertools
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter, defaultdict
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt  # Optional: for plotting, if needed

# Ensure NLTK resources are downloaded (if not already done)
nltk.download('book')  # For tokenizing text into sentences and words
nltk.download('punkt')

# Check TensorFlow version, important for compatibility with certain functionalities
import tensorflow as tf
print("TensorFlow version:", tf.__version__)




TensorFlow version: 2.17.0


[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/kaushikkaranam/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/kaushikkaranam/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/kaushikkaranam/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/kaushikkaranam/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/kaushikkaranam/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/kaushikkaranam/nltk_data...
[nltk_data]    |   Package conll20

In [40]:
import nltk

In [42]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/kaushikkaranam/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### 1. Data Preprocessing

In [81]:
import re
from nltk import tokenize

#alphabets= "([A-Za-z])"
#prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
#suffixes = "(Inc|Ltd|Jr|Sr|Co)"
#starters = "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
#acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
#websites = "[.](com|net|org|io|gov|edu|me)"
#digits = "([0-9])"

# If you want to restrict the size of the voabulary
# Right now, we set it in the next cell to be the entire vocabular: vocabulary_size = len(word_freq.items())
#vocabulary_size = 3000

unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
text = ''
print( "Reading txt file...")
with open(r'data/Mahabharat.txt', 'r') as f:
    text = f.read()



Reading txt file...


In [446]:
# Load the text data from a file
text_file_path = 'data/Mahabharat.txt'
with open(text_file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Clean and preprocess the text
text = text.lower().replace('\n', ' ').replace('\r', '').replace('.', ' .')

# Initialize the tokenizer and fit it on the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Convert text to sequence of tokens
sequences = tokenizer.texts_to_sequences([text])[0]

# Create training sequences
train_length = 50
step = 3
sentences = []
next_words = []

for i in range(0, len(sequences) - train_length, step):
    sentences.append(sequences[i: i + train_length])
    next_words.append(sequences[i + train_length])

# Prepare the input and output data
X = pad_sequences(sentences, maxlen=train_length)
y = tf.keras.utils.to_categorical(next_words, num_classes=len(tokenizer.word_index) + 1)


In [83]:
text = text.lower()
text = text.replace('i ', 'I ')

In [85]:
# Tokenize the cleaned text into sentences
sentences = tokenize.sent_tokenize(text)

# Check the length of sentences
print(f"Number of sentences: {len(sentences)}")

# If there are at least 110 sentences, print some for inspection
if len(sentences) > 100:
    for i in range(100, min(110, len(sentences))):  # Ensure index doesn't go out of range
        print(f"Sentence {i}: {sentences[i]}")
else:
    print("There are fewer than 100 sentences in the text.")


Number of sentences: 2145
Sentence 100: one of the names of vishnu is purushottama.
Sentence 101: poor urvasi, when called upon to confess on whom her heart was set, forgetting the part she had to act, says "I love pururavas," instead of "I love purushottama."
Sentence 102: her teacher bharata, the author of the play, is so much exasperated by this mistake, that he pronounces a curse upon urvasi.
Sentence 103: "you must lose your divine knowledge."
Sentence 104: after the close of the performance, indra, observing her as she stood apart, ashamed and disconsolate, calls her and says:—

"the mortal, who engrosses your thoughts, has been my friend in the days of adversity; he has helped me in the conflict with the enemies of the gods, and is entitled to my acknowledgements.
Sentence 105: you must, accordingly, repair to him and remain with him till he beholds the offspring you shall bear him."
Sentence 106: the god thus permits her to marry the mortal hero.
Sentence 107: after transacting

### 2. creating word mappings

In [87]:
import nltk
import itertools

# Append SENTENCE_START and SENTENCE_END
sentences = ["%s %s %s" % (sentence_start_token, x[:-1].replace("&", ""), sentence_end_token) for x in sentences] 
print("Parsed %d sentences." % len(sentences))

# Tokenize the sentences into words, making sure to remove end-of-sentence period
tokenized_sentences = [nltk.word_tokenize(sent.replace('.', '')) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique word tokens." % len(word_freq.items()))

# Set vocabulary_size here, either to a specific value or based on word frequencies
vocabulary_size = 300000  

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]




Parsed 2145 sentences.
Found 6445 unique word tokens.
The least frequent word in our vocabulary is 'newsletter' and appeared 1 times.


In [89]:
vocab[0:20]

[('the', 3354),
 (',', 2912),
 ('SENTENCE_START', 2145),
 ('SENTENCE_END', 2145),
 ('of', 1708),
 ('and', 1251),
 ('to', 1225),
 ('a', 768),
 ('is', 675),
 ('in', 610),
 ('his', 582),
 ('her', 448),
 ('with', 407),
 ('by', 389),
 ('he', 357),
 ('king', 317),
 ('that', 259),
 ('or', 227),
 ('as', 220),
 ('for', 217)]

In [93]:
sentences[0:25]

['SENTENCE_START \nsakuntala or the lost ring SENTENCE_END',
 'SENTENCE_START in ancient days, there was a mighty king of the lunar dynasty by name dushyanta SENTENCE_END',
 'SENTENCE_START he was the king of hastinapur SENTENCE_END',
 'SENTENCE_START he once goes out a-hunting and in the pursuit of a deer comes near the hermitage of the sage kanwa, the chief of the hermits, where some anchorites request him not to kill the deer SENTENCE_END',
 'SENTENCE_START the king feels thirsty and was seeking water when he saw certain maidens of the hermits watering the favourite plants SENTENCE_END',
 'SENTENCE_START one of them, an exquisitely beautiful and bashful maiden, named sakuntala, received him SENTENCE_END',
 'SENTENCE_START she was the daughter of the celestial nymph menaka by the celebrated sage viswamitra and foster-child of the hermit kanwa SENTENCE_END',
 'SENTENCE_START she is smitten with love at the first sight of the king, standing confused at the change of her own feeling SEN

### 3. Preparing Trigrams and Sequences

In [95]:
%%time
from collections import Counter
from nltk import ngrams
bigram_counts = Counter(ngrams(text.split(), 2))
bigram_counts.most_common(10)

CPU times: user 26.8 ms, sys: 3.63 ms, total: 30.4 ms
Wall time: 28.9 ms


[(('of', 'the'), 449),
 (('the', 'king'), 220),
 (('to', 'the'), 211),
 (('in', 'the'), 167),
 (('and', 'the'), 112),
 (('by', 'the'), 97),
 (('with', 'the'), 89),
 (('of', 'his'), 74),
 (('the', 'queen'), 73),
 (('to', 'be'), 70)]

In [97]:
%%time
import collections
def ngrams(text, n=2):
    return zip(*[text[i:] for i in range(n)])
bigram_counts = collections.Counter(ngrams(text.split(), 2))
bigram_counts.most_common(10)

CPU times: user 22.1 ms, sys: 3.6 ms, total: 25.7 ms
Wall time: 25 ms


[(('of', 'the'), 449),
 (('the', 'king'), 220),
 (('to', 'the'), 211),
 (('in', 'the'), 167),
 (('and', 'the'), 112),
 (('by', 'the'), 97),
 (('with', 'the'), 89),
 (('of', 'his'), 74),
 (('the', 'queen'), 73),
 (('to', 'be'), 70)]

In [99]:
text[0:1000]

'\nsakuntala or the lost ring.\n\n\nin ancient days, there was a mighty king of the lunar dynasty by name dushyanta. he was the king of hastinapur. he once goes out a-hunting and in the pursuit of a deer comes near the hermitage of the sage kanwa, the chief of the hermits, where some anchorites request him not to kill the deer. the king feels thirsty and was seeking water when he saw certain maidens of the hermits watering the favourite plants. one of them, an exquisitely beautiful and bashful maiden, named sakuntala, received him. she was the daughter of the celestial nymph menaka by the celebrated sage viswamitra and foster-child of the hermit kanwa. she is smitten with love at the first sight of the king, standing confused at the change of her own feeling. the love at first sight which the king conceives for her is of too deep a nature to be momentary. struck by her beauty he exclaims:—\n\n"her lip is ruddy as an opening bud; her graceful arms resemble tender shoots; attractive as t

In [101]:
first_word_counts = Counter([ p.replace('. ', '') for p in re.findall('\..[^" "]*', text)])
first_word_counts.most_common(10)

[('the', 272),
 ('he', 122),
 ('."', 96),
 ('I', 47),
 ('."\n\nthe', 37),
 ('she', 34),
 ('his', 29),
 ('it', 26),
 ('rama', 26),
 ('a', 24)]

In [103]:
#X_train = [[sentence_start_token] for sent,times in first_word_counts if sent != 'o.']
#y_train = [sent for sent in first_word_counts if sent != 'o.']
X_train = [[sentence_start_token]*c for sent,c in first_word_counts.items() if sent != 'o.']
y_train = [[sent]*c for sent,c in first_word_counts.items() if sent != 'o.']

In [105]:
X_train = [item for sublist in X_train for item in sublist]
y_train = [item for sublist in y_train for item in sublist]

In [107]:
X_train[0:10]

['SENTENCE_START',
 'SENTENCE_START',
 'SENTENCE_START',
 'SENTENCE_START',
 'SENTENCE_START',
 'SENTENCE_START',
 'SENTENCE_START',
 'SENTENCE_START',
 'SENTENCE_START',
 'SENTENCE_START']

In [109]:
print(y_train)

['he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', '

In [111]:
len(X_train), len(y_train)

(1562, 1562)

In [113]:
import random

def fisher_yates (arr1, arr2):
     
    # We will Start from the last element
    # and swap one by one.
    n = len(arr1)
    if n != len(arr2):
        return None
    
    for i in range(n - 1, 0, -1):

        # Pick a random index from 0 to i
        j = random.randint(0, i)
        #print(i, j)

        # Swap arr[i] with the element at random index
        arr1[i], arr1[j] = arr1[j], arr1[i]
        arr2[i], arr2[j] = arr2[j], arr2[i]
        
    return arr1, arr2

In [115]:
import random as rd
one = ['a', 'b', 'c']
two = ['1', '2', '3']
one, two = fisher_yates(one, two)
one, two

(['a', 'b', 'c'], ['1', '2', '3'])

In [117]:
one = [['a'], ['b'], ['c']]
two = [['1'], ['2'], ['3']]
one, two = fisher_yates(one, two)
one, two

([['b'], ['c'], ['a']], [['2'], ['3'], ['1']])

In [119]:
X_train, y_train = fisher_yates(X_train, y_train)
len(X_train), len(y_train)

(1562, 1562)

In [121]:
X_tokens = [[word_to_index[symbol]] for symbol,word in zip(X_train, y_train) if word in word_to_index]
y_tokens = [[word_to_index[word]] for symbol,word in zip(X_train, y_train) if word in word_to_index]

In [123]:
X_train = X_tokens
y_train = y_tokens

In [125]:
len(X_train), len(y_train)

(1222, 1222)

In [127]:
X_train[0:5], y_train[0:5]

([[2], [2], [2], [2], [2]], [[0], [0], [46], [27], [14]])

In [129]:
ngrams_up_to_20 = []
for i in range(2, 21):
    ngram_counts = Counter(ngrams(text.split(), i))
    print('ngram-', i, 'length:', len(ngram_counts))
    ngrams_up_to_20.append(ngram_counts)

ngram- 2 length: 27343
ngram- 3 length: 36204
ngram- 4 length: 38104
ngram- 5 length: 38535
ngram- 6 length: 38666
ngram- 7 length: 38716
ngram- 8 length: 38736
ngram- 9 length: 38742
ngram- 10 length: 38744
ngram- 11 length: 38745
ngram- 12 length: 38746
ngram- 13 length: 38747
ngram- 14 length: 38746
ngram- 15 length: 38745
ngram- 16 length: 38744
ngram- 17 length: 38743
ngram- 18 length: 38742
ngram- 19 length: 38741
ngram- 20 length: 38740


In [131]:
def remove_periods(ngram):
    for wrd in ngram[0]:
        if '.' in wrd or "’" in wrd or "‘" in wrd:
            return False
    return True
    
def my_filter(ngrams):
    return filter(remove_periods, ngrams)

In [133]:
l = list(filter(lambda x: 1 < int(x[1]), ngrams_up_to_20[0].most_common()))
len(l), l

(3798,
 [(('of', 'the'), 449),
  (('the', 'king'), 220),
  (('to', 'the'), 211),
  (('in', 'the'), 167),
  (('and', 'the'), 112),
  (('by', 'the'), 97),
  (('with', 'the'), 89),
  (('of', 'his'), 74),
  (('the', 'queen'), 73),
  (('to', 'be'), 70),
  (('of', 'a'), 68),
  (('*', '*'), 66),
  (('from', 'the'), 64),
  (('on', 'the'), 63),
  (('at', 'the'), 61),
  (('of', 'her'), 56),
  (('project', 'gutenberg™'), 54),
  (('king', 'of'), 52),
  (('for', 'the'), 51),
  (('he', 'is'), 48),
  (('is', 'the'), 48),
  (('that', 'the'), 47),
  (('as', 'the'), 42),
  (('to', 'his'), 41),
  (('in', 'a'), 40),
  (('it', 'is'), 37),
  (('with', 'his'), 37),
  (('by', 'a'), 36),
  (('in', 'his'), 34),
  (('the', 'queen,'), 34),
  (('the', 'sage'), 31),
  (('the', 'project'), 30),
  (('and', 'his'), 29),
  (('is', 'a'), 29),
  (('or', 'the'), 28),
  (('as', 'a'), 28),
  (('do', 'not'), 28),
  (('of', 'this'), 28),
  (('son', 'of'), 27),
  (('him', 'to'), 26),
  (('to', 'her'), 26),
  (('is', 'now'), 25

In [135]:
def my_filter(ngrams):
    return filter(remove_periods, list(filter(lambda x: 1 < int(x[1]), ngrams)))

In [137]:
bigrams_to_learn = ngrams_up_to_20[0]
X_train_example = [[word_to_index[sent[0][0]]] for sent in my_filter(bigrams_to_learn.most_common())
                  if sent[0][0] in word_to_index and sent[0][1] in word_to_index]
y_train_example = [[word_to_index[sent[0][1]]] for sent in my_filter(bigrams_to_learn.most_common())
                  if sent[0][0] in word_to_index and sent[0][1] in word_to_index]

In [139]:
X_train_example[0:10], y_train_example[0:10]

([[4], [0], [6], [9], [5], [13], [12], [4], [0], [6]],
 [[0], [15], [0], [0], [0], [0], [0], [10], [36], [31]])

In [141]:
len(X_train_example), len(y_train_example)

(3127, 3127)

In [143]:
trigrams_to_learn = ngrams_up_to_20[1].copy()
[sent[0] for sent in my_filter(trigrams_to_learn.most_common())]

[('the', 'king', 'of'),
 ('*', '*', '*'),
 ('the', 'king', 'and'),
 ('project', 'gutenberg™', 'electronic'),
 ('the', 'son', 'of'),
 ('the', 'project', 'gutenberg'),
 ('the', 'king', 'is'),
 ('the', 'daughter', 'of'),
 ('the', 'project', 'gutenberg™'),
 ('project', 'gutenberg', 'literary'),
 ('gutenberg', 'literary', 'archive'),
 ('of', 'the', 'king'),
 ('the', 'terms', 'of'),
 ('is', 'about', 'to'),
 ('the', 'god', 'of'),
 ('king', 'of', 'the'),
 ('one', 'of', 'the'),
 ('terms', 'of', 'this'),
 ('as', 'well', 'as'),
 ('the', 'king', 'replies,'),
 ('of', 'the', 'king,'),
 ('to', 'the', 'king'),
 ('the', 'cause', 'of'),
 ('the', 'author', 'of'),
 ('of', 'the', 'queen,'),
 ('to', 'be', 'a'),
 ('of', 'the', 'project'),
 ('in', 'the', 'united'),
 ('set', 'forth', 'in'),
 ('gutenberg™', 'electronic', 'works'),
 ('of', 'project', 'gutenberg™'),
 ('literary', 'archive', 'foundation'),
 ('the', 'hermitage', 'of'),
 ('to', 'the', 'queen,'),
 ('the', 'hands', 'of'),
 ('the', 'brother', 'of'),
 (

In [145]:
X_train_example.extend([[word_to_index[w] for w in sent[0][:-1]] for sent in my_filter(trigrams_to_learn.most_common())
               if all([w in word_to_index for w in sent[0]])])
y_train_example.extend([[word_to_index[w] for w in sent[0][1:]] for sent in my_filter(trigrams_to_learn.most_common())
               if all([w in word_to_index for w in sent[0]])])

In [147]:
len(X_train_example), len(y_train_example)

(4434, 4434)

In [149]:
X_train_example[1575:1585], y_train_example[1575:1585]

([[10], [10], [1152], [90], [63], [28], [921], [5], [271], [1480]],
 [[1151], [75], [4], [7], [143], [34], [0], [2055], [174], [6]])

In [151]:
bigrams_to_learn = ngrams_up_to_20[0]
X_train_2 = [[word_to_index[sent[0][0]]] for sent in my_filter(bigrams_to_learn.most_common())
                  if sent[0][0] in word_to_index and sent[0][1] in word_to_index]
y_train_2 = [[word_to_index[sent[0][1]]] for sent in my_filter(bigrams_to_learn.most_common())
                  if sent[0][0] in word_to_index and sent[0][1] in word_to_index]
X_train_2, y_train_2 = fisher_yates(X_train_2, y_train_2)

In [153]:
len(X_train_2), len(y_train_2)

(3127, 3127)

In [155]:
X_train_2[0:10], y_train_2[0:10]

([[38], [42], [0], [2173], [91], [8], [2973], [7], [144], [2192]],
 [[0], [25], [719], [6], [89], [10], [62], [205], [40], [6]])

In [157]:
X_train.extend(X_train_2)
y_train.extend(y_train_2)

In [159]:
len(X_train), len(y_train)

(4349, 4349)

In [161]:
random.sample(list(zip(X_train, y_train)), 10)

[([990], [0]),
 ([391], [12]),
 ([8], [161]),
 ([7], [1074]),
 ([2], [75]),
 ([1617], [6]),
 ([263], [5]),
 ([6], [17]),
 ([2], [0]),
 ([7], [183])]

In [163]:
ngrams_to_learn = ngrams_up_to_20[1]
ngrams_to_learn.most_common(10)

[(('the', 'king', 'of'), 37),
 (('*', '*', '*'), 33),
 (('the', 'king', 'and'), 20),
 (('project', 'gutenberg™', 'electronic'), 18),
 (('the', 'son', 'of'), 17),
 (('the', 'project', 'gutenberg'), 17),
 (('the', 'king', 'is'), 15),
 (('the', 'daughter', 'of'), 14),
 (('the', 'project', 'gutenberg™'), 13),
 (('project', 'gutenberg', 'literary'), 13)]

In [165]:
[sent[0] for sent in my_filter(ngrams_to_learn.most_common(10))]

[('the', 'king', 'of'),
 ('*', '*', '*'),
 ('the', 'king', 'and'),
 ('project', 'gutenberg™', 'electronic'),
 ('the', 'son', 'of'),
 ('the', 'project', 'gutenberg'),
 ('the', 'king', 'is'),
 ('the', 'daughter', 'of'),
 ('the', 'project', 'gutenberg™'),
 ('project', 'gutenberg', 'literary')]

In [167]:
X_train_2 = [[word_to_index[w] for w in sent[0][:-1]] for sent in my_filter(ngrams_to_learn.most_common())
               if all([w in word_to_index for w in sent[0]])]
y_train_2 = [[word_to_index[w] for w in sent[0][1:]] for sent in my_filter(ngrams_to_learn.most_common())
               if all([w in word_to_index for w in sent[0]])]
X_train_2, y_train_2 = fisher_yates(X_train_2, y_train_2)
X_train_2[0:5], y_train_2[0:5], len(X_train_2), len(y_train_2)

([[334, 4], [0, 141], [6, 689], [46, 1256], [8, 488]],
 [[4, 0], [141, 4], [689, 0], [1256, 25], [488, 9]],
 1307,
 1307)

In [169]:
def my_filter(ngrams):
    return filter(remove_periods, ngrams)

In [171]:
X_train_2 = [[word_to_index[w] for w in sent[0][:-1]] for sent in my_filter(ngrams_to_learn.most_common())
               if all([w in word_to_index for w in sent[0]])]
y_train_2 = [[word_to_index[w] for w in sent[0][1:]] for sent in my_filter(ngrams_to_learn.most_common())
               if all([w in word_to_index for w in sent[0]])]
X_train_2 = X_train_2[:2000]
y_train_2 = y_train_2[:2000]
X_train_2, y_train_2 = fisher_yates(X_train_2, y_train_2)
X_train_2[0:5], y_train_2[0:5], len(X_train_2), len(y_train_2)

([[1115, 13], [0, 1361], [12, 75], [16, 24], [5, 40]],
 [[13, 11], [1361, 4], [75, 25], [24, 45], [40, 0]],
 2000,
 2000)

In [173]:
ngrams_to_learn = ngrams_up_to_20[1]
X_train_2 = [[word_to_index[w] for w in sent[0][:-1]] for sent in my_filter(ngrams_to_learn.most_common())
               if all([w in word_to_index for w in sent[0]])]
y_train_2 = [[word_to_index[w] for w in sent[0][1:]] for sent in my_filter(ngrams_to_learn.most_common())
               if all([w in word_to_index for w in sent[0]])]
print(X_train_2[0:5], y_train_2[0:5], len(X_train_2), len(y_train_2))

[[0, 15], [48, 48], [0, 15], [55, 72], [0, 60]] [[15, 4], [48, 48], [15, 5], [72, 158], [60, 4]] 20933 20933


In [175]:
word_to_index['SENTENCE_END']

3

In [177]:
def check_eos(trigram):
    if trigram[1] == word_to_index['SENTENCE_END']:
          return True  
    return False

trigrams_eos = list(filter(check_eos, y_train_2))
len(trigrams_eos), trigrams_eos[0:5]

(0, [])

In [179]:
from tqdm import tqdm
for i in tqdm(range(1, len(ngrams_up_to_20))):
    ngrams_to_learn = ngrams_up_to_20[i]
    X_train_2 = [[word_to_index[w] for w in sent[0][:-1]] for sent in my_filter(ngrams_to_learn.most_common())
                   if all([w in word_to_index for w in sent[0]])]
    y_train_2 = [[word_to_index[w] for w in sent[0][1:]] for sent in my_filter(ngrams_to_learn.most_common())
                   if all([w in word_to_index for w in sent[0]])]
    X_train_2 = X_train_2[:2000]
    y_train_2 = y_train_2[:2000]
    X_train_2, y_train_2 = fisher_yates(X_train_2, y_train_2)
    X_train.extend(X_train_2)
    y_train.extend(y_train_2)

100%|███████████████████████████████████████████| 18/18 [00:01<00:00, 17.13it/s]


In [181]:
len(X_train), len(y_train)

(34425, 34425)

In [183]:
print(random.sample(list(zip(X_train, y_train)), 10))

[([1971, 9, 0, 158, 74, 8, 448, 5, 1705, 6, 24, 702, 3034, 319], [9, 0, 158, 74, 8, 448, 5, 1705, 6, 24, 702, 3034, 319, 4]), ([3967, 6, 0, 532, 4, 1002, 5, 91, 99, 0, 124, 4], [6, 0, 532, 4, 1002, 5, 91, 99, 0, 124, 4, 85]), ([1360, 16, 24, 67, 77, 12, 130, 55, 72, 158, 113, 264, 149, 1953, 12, 0, 199], [16, 24, 67, 77, 12, 130, 55, 72, 158, 113, 264, 149, 1953, 12, 0, 199, 162]), ([0], [743]), ([54, 14, 876, 877, 1111, 4], [14, 876, 877, 1111, 4, 0]), ([1891], [4]), ([8, 1302, 146], [1302, 146, 0]), ([5, 592, 2520, 6, 2521, 47, 1184, 2522, 13], [592, 2520, 6, 2521, 47, 1184, 2522, 13, 1763]), ([7, 2040, 4, 908, 99, 52], [2040, 4, 908, 99, 52, 30]), ([12, 55, 72, 158, 113, 64, 24, 663, 0, 162, 4, 26, 243, 5], [55, 72, 158, 113, 64, 24, 663, 0, 162, 4, 26, 243, 5, 333])]


In [185]:
len(tokenized_sentences)

2145

In [187]:
tokenized_sentences[100]

['SENTENCE_START',
 'one',
 'of',
 'the',
 'names',
 'of',
 'vishnu',
 'is',
 'purushottama',
 'SENTENCE_END']

In [189]:
[[word_to_index[w] for w in sent] for sent in tokenized_sentences if all([w in word_to_index for w in sent])][100]

[2, 56, 4, 0, 2062, 4, 747, 8, 2063, 3]

In [191]:
X_train_full_sentences = [[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences
                         if all([w in word_to_index for w in sent])]
y_train_full_sentences = [[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences
                         if all([w in word_to_index for w in sent])]

In [193]:
print(X_train_full_sentences[0:5], y_train_full_sentences[0:5])

[[2, 132, 17, 0, 715, 442], [2, 9, 1105, 319, 1, 90, 40, 7, 557, 15, 4, 0, 1106, 1107, 13, 320, 3056], [2, 14, 40, 0, 15, 4, 716], [2, 14, 211, 91, 79, 3057, 5, 9, 0, 1108, 4, 7, 874, 92, 387, 0, 224, 4, 0, 86, 1109, 1, 0, 244, 4, 0, 875, 1, 110, 78, 3058, 321, 34, 29, 6, 717, 0, 874], [2, 0, 15, 613, 3059, 5, 40, 1110, 558, 54, 14, 876, 877, 1111, 4, 0, 875, 3060, 0, 388, 3061]] [[132, 17, 0, 715, 442, 3], [9, 1105, 319, 1, 90, 40, 7, 557, 15, 4, 0, 1106, 1107, 13, 320, 3056, 3], [14, 40, 0, 15, 4, 716, 3], [14, 211, 91, 79, 3057, 5, 9, 0, 1108, 4, 7, 874, 92, 387, 0, 224, 4, 0, 86, 1109, 1, 0, 244, 4, 0, 875, 1, 110, 78, 3058, 321, 34, 29, 6, 717, 0, 874, 3], [0, 15, 613, 3059, 5, 40, 1110, 558, 54, 14, 876, 877, 1111, 4, 0, 875, 3060, 0, 388, 3061, 3]]


In [195]:
import random
last_n_words = []
for i in range(3, 20):
    tokenized_sentences_400 = random.sample(list(tokenized_sentences), 400)
    for s in tokenized_sentences_400:
        last_n_words.append(s[::-1][:i][::-1])

print(random.sample(last_n_words, 10))

[['SENTENCE_START', '1e', 'SENTENCE_END'], ['SENTENCE_START', 'vibhishana', 'is', 'now', 'crowned', 'king', 'of', 'lanka', 'SENTENCE_END'], ['everything', 'about', 'the', 'matrimonial', 'promise', 'SENTENCE_END'], ['exclaims', ',', '``', 'where', 'is', 'she', 'SENTENCE_END'], ['alone', 'with', 'his', 'jester', 'SENTENCE_END'], ['SENTENCE_START', 'the', 'minister', 'is', 'glad', 'that', 'his', 'aims', 'are', 'fulfilled', 'SENTENCE_END'], ['who', ',', 'in', 'consideration', 'of', 'ganadasa', "'s", 'being', 'patronised', 'by', 'the', 'queen', ',', 'refers', 'the', 'dispute', 'to', 'her', 'SENTENCE_END'], ['the', 'dust', 'from', 'under', 'them', 'upon', 'his', 'head', 'SENTENCE_END'], ['the', 'performance', 'alone', ',', 'summons', 'the', 'queen', 'who', 'arrives', 'soon', 'SENTENCE_END'], ['yet', 'such', 'is', 'their', 'native', 'tenderness', 'that', 'they', 'can', 'not', 'assume', 'a', 'harsh', 'expression', 'SENTENCE_END']]


In [197]:
len(last_n_words)

6800

In [199]:
X_train_eos = [[word_to_index[w] for w in sent[:-1]] for sent in last_n_words
                         if all([w in word_to_index for w in sent])]
y_train_eos = [[word_to_index[w] for w in sent[1:]] for sent in last_n_words
                         if all([w in word_to_index for w in sent])]

In [201]:
len(X_train_eos), len(y_train_eos)

(6800, 6800)

In [203]:
X_train.extend(X_train_eos)
y_train.extend(y_train_eos)

In [205]:
len(X_train), len(y_train)

(41225, 41225)

In [207]:
import pickle
with open('data/X_train_Mahabharat.pkl', 'wb') as file:
    pickle.dump(X_train, file)

In [209]:
with open('data/y_train_Mahabharat.pkl', 'wb') as file:
    pickle.dump(y_train, file)

In [211]:
with open('data/tokenized_sentences_Mahabharat.pkl', 'wb') as file:
    pickle.dump(tokenized_sentences, file)

In [213]:
with open('data/word_to_index_Mahabharat.pkl', 'wb') as file:
    pickle.dump(word_to_index, file)

In [215]:
with open('data/index_to_word_Mahabharat.pkl', 'wb') as file:
    pickle.dump(index_to_word, file)

In [217]:
X_train2 = np.asarray(X_train,dtype=object)
y_train2 = np.asarray(y_train,dtype=object)

In [219]:
X_train2.shape, y_train2.shape

((41225,), (41225,))

In [221]:
print(random.sample(list(zip(X_train2, y_train2)), 10))

[([558, 4, 0], [4, 0, 503]), ([0, 2171, 4, 0, 253, 45, 638, 2172, 28, 0, 2125, 6, 39, 0, 769, 370], [2171, 4, 0, 253, 45, 638, 2172, 28, 0, 2125, 6, 39, 0, 769, 370, 73]), ([682, 4], [4, 0]), ([2, 5147, 100, 1, 0, 36, 274, 1, 33, 80, 5148], [5147, 100, 1, 0, 36, 274, 1, 33, 80, 5148, 3]), ([0, 764], [764, 3]), ([1653, 25, 0, 2324, 4, 10, 392, 5, 163, 808, 6, 327, 0, 3721, 392], [25, 0, 2324, 4, 10, 392, 5, 163, 808, 6, 327, 0, 3721, 392, 5]), ([9, 47, 677], [47, 677, 3]), ([8, 645, 22, 0, 565, 13, 202, 41, 14, 573, 6], [645, 22, 0, 565, 13, 202, 41, 14, 573, 6, 7]), ([54, 0, 4009, 240, 1295, 9, 4010, 0], [0, 4009, 240, 1295, 9, 4010, 0, 2201]), ([580, 202, 8, 2738, 13, 0, 1858, 4, 385, 50, 255, 4, 4938, 6, 0, 186, 4939, 12, 101], [202, 8, 2738, 13, 0, 1858, 4, 385, 50, 255, 4, 4938, 6, 0, 186, 4939, 12, 101, 14])]


In [223]:
embedding_dim = 100
vocabulary_size, embedding_dim

(300000, 100)

In [225]:
import os
import numpy as np

#glove_dir = 'data/glove'
glove_dir = "data"

embeddings_index = {} #initialize dictionary
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf8')
try:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
except:
    print(line)
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [227]:
vocabulary_size

300000

In [229]:
embedding_dim = 100

embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
for word, i in vocab:
    embedding_vector = embeddings_index.get(word)
    if i < vocabulary_size:
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [231]:
embedding_matrix.shape

(300000, 100)

In [233]:
vocab[200]

('people', 21)

In [235]:
embedding_matrix[12]

array([-0.29607001, -0.069697  , -0.18896   ,  1.02550006, -0.43494999,
        0.39840001,  0.42721   , -0.2018    ,  0.0083542 , -0.12468   ,
        0.031648  ,  0.11965   , -0.75585997,  0.058595  , -0.76255   ,
       -0.2359    ,  0.49783999, -0.26975   ,  0.06629   , -0.21338999,
       -0.96616   , -0.25887999, -0.44992   ,  0.050812  , -0.051264  ,
       -0.31377   , -0.11808   ,  0.56151998,  0.36386999,  0.013472  ,
        0.18610001,  0.47850001, -0.28551999,  0.55418998, -0.33179   ,
        0.22527   ,  0.29462999, -0.45513001,  0.019279  ,  0.45493999,
        0.062119  ,  0.50862998, -0.39831001,  0.24673   ,  0.29069   ,
       -0.11369   , -0.39943999,  0.34242001, -0.33991   ,  0.27941   ,
       -0.020151  ,  0.83740002,  0.3152    , -0.20747   , -0.34476   ,
        0.57929999,  0.14407   ,  0.24065   , -0.59442002, -0.14353999,
       -0.36322999, -0.08545   ,  0.32905   , -0.029023  , -0.14643   ,
        0.32894999,  0.050045  , -0.46294999,  0.32347   ,  0.26

In [237]:
from scipy import spatial

def find_closest_embeddings(embedding):
    return sorted(embeddings_index.keys(), key=lambda word: spatial.distance.euclidean(embeddings_index[word], embedding))

In [239]:
find_closest_embeddings(embeddings_index["king"])[1:6]

['prince', 'queen', 'monarch', 'brother', 'uncle']

In [241]:
print(find_closest_embeddings(
    embeddings_index["twig"] - embeddings_index["branch"] + embeddings_index["hand"]
)[:10])

['flashlight', 'twig', 'clipboard', 'shove', 'hand', 'fingers', 'clutching', 'clutched', 'tossing', 'stroking']


In [243]:
vocabulary_size, embedding_dim

(300000, 100)

### 4. Model Architecture

In [296]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

class KerasRNN:
    def __init__(self, word_dim, hidden_dim=100, glove_embedding_matrix=None):
        self.model = Sequential()
        
        # Embedding layer initialized with GloVe embeddings
        if glove_embedding_matrix is not None:
            self.model.add(Embedding(input_dim=word_dim, output_dim=glove_embedding_matrix.shape[1],
                                     weights=[glove_embedding_matrix], trainable=False))
        
        # Adding LSTM layers
        self.model.add(LSTM(hidden_dim, return_sequences=True))
        self.model.add(LSTM(hidden_dim))
        
        # Output layer
        self.model.add(Dense(word_dim, activation='softmax'))
        
        # Compile the model
        self.model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [300]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, LSTM, Layer

class CustomRNN(Model):
    def __init__(self, vocabulary_size, hidden_dim=100, embedding_matrix=None):
        super(CustomRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_layer = Embedding(input_dim=vocabulary_size, output_dim=embedding_matrix.shape[1],
                                         weights=[embedding_matrix], trainable=False)
        self.lstm_layer = LSTM(hidden_dim, return_sequences=True, return_state=True)
        self.dense_layer = Dense(vocabulary_size, activation='softmax')

    def call(self, inputs):
        # Get embeddings
        x = self.embedding_layer(inputs)
        
        # LSTM processing
        lstm_outputs, _, _ = self.lstm_layer(x)

        # Apply dense layer to each time step
        output = self.dense_layer(lstm_outputs)

        return output

# Example of how to use this model
vocabulary_size = 10000  # just an example size
embedding_dim = 100      # dimension of your GloVe embeddings
embedding_matrix = tf.random.normal([vocabulary_size, embedding_dim])  # Mocked embedding matrix

# Instantiate and use the RNN
model = CustomRNN(vocabulary_size, embedding_matrix=embedding_matrix)

# Prepare some mock input data: Batch of sequences
input_data = tf.random.uniform((32, 10), dtype=tf.int32, maxval=vocabulary_size)  # batch_size = 32, sequence_length = 10

# Forward propagation
outputs = model(input_data)
print(outputs.shape)  # Should print (32, 10, vocabulary_size) indicating batch_size, sequence_length, vocabulary_size


(32, 10, 10000)


In [314]:
def predict(self, x):
        # Perform forward propagation
        o, s = self.forward_propagation(x)
        # Return the index of the highest score from the last timestep output
        return tf.argmax(o[-1], axis=1)

In [316]:
def predict(self, x):
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # Return the index of the highest score from each output across the sequence
    return tf.argmax(o, axis=2)


In [318]:
print ("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in X_train2[1000]]), X_train2[1000]))

x:
SENTENCE_START
[2]


In [320]:
print ("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in X_train2[20000]]), X_train2[20000]))

x:
the king enters the forest of meditation and is
[0, 15, 165, 0, 182, 4, 2585, 5, 8]


In [322]:
vocabulary_size, X_train2[10000]     

(3000, [21, 25, 180, 1154])

In [336]:

class CustomRNN(tf.keras.Model):
    def __init__(self, vocabulary_size, embedding_dim=100, rnn_units=100):
        super(CustomRNN, self).__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim=vocabulary_size, output_dim=embedding_dim)
        self.rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True)
        self.dense = tf.keras.layers.Dense(vocabulary_size)

    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.rnn(x)
        return self.dense(x)  # logits returned here

def custom_loss(y_true, y_pred):
    # Use sparse categorical crossentropy which does softmax internally and calculates loss
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return scce(y_true, y_pred)

# Example usage:
model = CustomRNN(vocabulary_size=3000)
model.compile(optimizer='adam', loss=custom_loss, metrics=['accuracy'])

# Example training data
# Create dummy data
import numpy as np
np.random.seed(17)
X_train = np.random.randint(0, 3000, (32, 10))  # 32 sequences, each of length 10
y_train = np.random.randint(0, 3000, (32, 10))  # Corresponding targets

# Convert to tensors
X_train = tf.convert_to_tensor(X_train, dtype=tf.int32)
y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)

# Train the model
model.fit(X_train, y_train, epochs=1, batch_size=32)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 378ms/step - accuracy: 0.0000e+00 - loss: 8.0066


<keras.src.callbacks.history.History at 0x3a1b2dc30>

In [340]:
# Ensure model is compiled with a loss function
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Evaluate the model on a subset of the data
loss, accuracy = model.evaluate(X_train[:1000], y_train[:1000], verbose=0)
print("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print("Actual loss: %f" % loss)


Expected Loss for random predictions: 8.006368
Actual loss: 9.899039


In [342]:
class CustomRNN(tf.keras.Model):
    def __init__(self, vocabulary_size, embedding_dim=100, rnn_units=100):
        super(CustomRNN, self).__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim=vocabulary_size, output_dim=embedding_dim)
        self.rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True)
        self.dense = tf.keras.layers.Dense(vocabulary_size)

    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.rnn(x)
        return self.dense(x)

# Initialize and compile the model
vocabulary_size = 3000
model = CustomRNN(vocabulary_size)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Example training with minimal data
x = np.random.randint(0, vocabulary_size, (1, 10))  # Single training example
y = np.random.randint(0, vocabulary_size, (1, 10))  # Corresponding labels

# Train the model
model.fit(x, y, epochs=1)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 428ms/step - accuracy: 0.0000e+00 - loss: 16.5894


<keras.src.callbacks.history.History at 0x3a4decdf0>

In [348]:
class CustomRNN(tf.keras.Model):
    def __init__(self, vocabulary_size, embedding_dim=10, rnn_units=10, bptt_truncate=1000):
        super(CustomRNN, self).__init__()
        self.bptt_truncate = bptt_truncate
        self.embedding = tf.keras.layers.Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, trainable=True)
        self.rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True, trainable=True)
        self.dense = tf.keras.layers.Dense(vocabulary_size, trainable=True)

    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.rnn(x)
        return self.dense(x)

    def compute_loss(self, x, y):
        logits = self(x, training=True)
        loss = tf.keras.losses.sparse_categorical_crossentropy(y, logits, from_logits=True)
        return tf.reduce_mean(loss)

    def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
        # Convert x and y to tensors
        x = tf.convert_to_tensor(x, dtype=tf.int32)
        y = tf.convert_to_tensor(y, dtype=tf.int32)
        
        with tf.GradientTape() as tape:
            loss = self.compute_loss(x, y)
        gradients = tape.gradient(loss, self.trainable_variables)
        
        for var, grad in zip(self.trainable_variables, gradients):
            print(f"Performing gradient check for parameter {var.name} with size {var.shape}")
            
            it = np.nditer(var.numpy(), flags=['multi_index'], op_flags=['readwrite'])
            while not it.finished:
                ix = it.multi_index
                
                original_value = var.numpy()[ix]
                
                # Increase by h
                var.assign_add(tf.reshape(tf.constant(h, dtype=tf.float32), [1] * len(var.shape)))
                loss_plus_h = self.compute_loss(x, y)
                
                # Decrease by h
                var.assign(original_value)  # Reset to original
                var.assign_sub(tf.reshape(tf.constant(h, dtype=tf.float32), [1] * len(var.shape)))
                loss_minus_h = self.compute_loss(x, y)
                
                # Reset to original
                var.assign(original_value)
                
                # Calculate numerical gradient
                estimated_gradient = (loss_plus_h - loss_minus_h) / (2 * h)
                backprop_gradient = grad.numpy()[ix]
                
                relative_error = np.abs(backprop_gradient - estimated_gradient) / (np.abs(backprop_gradient) + np.abs(estimated_gradient))
                
                if relative_error > error_threshold:
                    print(f"Gradient Check ERROR: parameter={var.name} index={ix}")
                    print(f"+h Loss: {loss_plus_h.numpy()}")
                    print(f"-h Loss: {loss_minus_h.numpy()}")
                    print(f"Estimated_gradient: {estimated_gradient}")
                    print(f"Backpropagation gradient: {backprop_gradient}")
                    print(f"Relative Error: {relative_error}")
                    return False
                
                it.iternext()
            
            print(f"Gradient check for parameter {var.name} passed.")
        return True


In [354]:
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    losses = []
    num_examples_seen = 0

    for epoch in range(nepoch):
        print(f"Epoch {epoch+1}/{nepoch}")

        # Optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            preds = model(X_train, training=False)
            loss = loss_fn(y_train, preds)
            losses.append((num_examples_seen, loss.numpy()))
            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"{current_time}: Loss after num_examples_seen={num_examples_seen} epoch={epoch}: {loss.numpy()}")

            # Adjust the learning rate if loss increases
            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                learning_rate *= 0.5
                optimizer.learning_rate = learning_rate
                print(f"Setting learning rate to {learning_rate}")

        # Training loop
        for i in range(len(y_train)):
            with tf.GradientTape() as tape:
                predictions = model(X_train[i:i+1], training=True)  # Slice i:i+1 to keep batch dimension
                loss = loss_fn(y_train[i:i+1], predictions)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            num_examples_seen += 1

In [356]:
vocabulary_size

3000

In [362]:
len(index_to_word)

6446

In [364]:
def generate_sentence(model, word_to_index, index_to_word, sentence_start_token, sentence_end_token, unknown_token, senten_max_length):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    
    # Prepare the model for predicting next words
    while (new_sentence[-1] != word_to_index[sentence_end_token]) and len(new_sentence) < senten_max_length:
        # Prepare input for prediction
        current_input = tf.expand_dims(new_sentence, axis=0)  # Expand dims for batch size
        
        # Predict logits for next word
        logits = model(current_input)
        next_word_probs = tf.nn.softmax(logits[:, -1, :])  # Get softmax for the last time step

        # Sample the next word using logits
        sampled_word = tf.random.categorical(next_word_probs, num_samples=1)
        
        # Ensure not to choose unknown token
        while sampled_word.numpy()[0][0] == word_to_index[unknown_token]:
            sampled_word = tf.random.categorical(next_word_probs, num_samples=1)
        
        # Append the sampled word to the sentence
        new_sentence.append(sampled_word.numpy()[0][0])
        
        if len(new_sentence) >= senten_max_length or new_sentence[-1] == word_to_index[sentence_end_token]:
            break

    # Convert indices to words
    sentence_str = [index_to_word[idx] for idx in new_sentence if idx in index_to_word]
    
    return ' '.join(sentence_str)

### 5. Model Training 

In [426]:
class RNN(tf.keras.Model):
    def __init__(self, vocabulary_size, embedding_dim=100, rnn_units=100):
        super(RNN, self).__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim=vocabulary_size, output_dim=embedding_dim)
        self.rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True)  # Only returning the sequence output
        self.dense = tf.keras.layers.Dense(vocabulary_size, activation='softmax')  # Using softmax for categorical output

    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.rnn(x)  # No need to unpack, only one output since return_state=False
        return self.dense(x)

# Parameters
vocabulary_size = 300000  # Adjust according to your dataset

In [428]:
model = RNN(vocabulary_size)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [430]:
# Dummy data for demonstration (ensure you have real data in practice)
import numpy as np
np.random.seed(17)
X_train2 = np.random.randint(0, vocabulary_size, (100, 10))  # 100 sequences, each of length 10
y_train2 = np.random.randint(0, vocabulary_size, (100, 10))  # Corresponding labels for each sequence element

# Convert to tensors if not already
X_train2 = tf.convert_to_tensor(X_train2, dtype=tf.int32)
y_train2 = tf.convert_to_tensor(y_train2, dtype=tf.int32)

In [432]:
# Train the model
history = model.fit(X_train2, y_train2, batch_size=32, epochs=80)

Epoch 1/80
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 542ms/step - accuracy: 0.0000e+00 - loss: 12.6115
Epoch 2/80
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 507ms/step - accuracy: 0.8242 - loss: 12.5934
Epoch 3/80
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 521ms/step - accuracy: 0.9921 - loss: 12.5728
Epoch 4/80
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 512ms/step - accuracy: 0.9994 - loss: 12.5435
Epoch 5/80
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 531ms/step - accuracy: 0.9968 - loss: 12.4962
Epoch 6/80
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 519ms/step - accuracy: 0.7896 - loss: 12.4075
Epoch 7/80
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 513ms/step - accuracy: 0.4653 - loss: 12.2383
Epoch 8/80
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 514ms/step - accuracy: 0.3083 - loss: 11.9549
Epoch 9/80
[1m4/4[0m [32m━━━━━━━━━━━━━━━━

In [450]:
# Define the model architecture
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=train_length),
    LSTM(150, return_sequences=True),
    LSTM(150),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=100, batch_size=64)


Epoch 1/100
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 67ms/step - accuracy: 0.0758 - loss: 7.5922
Epoch 2/100
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 70ms/step - accuracy: 0.0800 - loss: 6.4926
Epoch 3/100
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 72ms/step - accuracy: 0.0877 - loss: 6.3333
Epoch 4/100
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 70ms/step - accuracy: 0.1011 - loss: 6.2176
Epoch 5/100
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 72ms/step - accuracy: 0.1126 - loss: 6.0869
Epoch 6/100
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 72ms/step - accuracy: 0.1212 - loss: 5.9782
Epoch 7/100
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 73ms/step - accuracy: 0.1273 - loss: 5.8680
Epoch 8/100
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 72ms/step - accuracy: 0.1302 - loss: 5.7793
Epoch 9/100
[1m

<keras.src.callbacks.history.History at 0x3a2d938b0>

### 6. Generating Text

In [452]:
def generate_text(seed_text, num_generate=100):
    for _ in range(num_generate):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=train_length-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted_probs, axis=-1)
        # Ensure that predicted_index is an integer, not an array
        if isinstance(predicted_index, np.ndarray):
            predicted_index = predicted_index[0]  # Take the first element if it's an array
        output_word = tokenizer.index_word.get(predicted_index, '')  # Safely get the word
        seed_text += " " + output_word
    return seed_text


In [466]:
# Example usage of the text generation function
seed_text = "In this world we rarely behold such characters as theirs. Their lofty rank is the abode of wisdom and of piety, of valour and of virtue. Their fame spreads white and spotless through the universe. A son has sprung from Devarata whose opening virtues early give occasion of rejoicing to the world. Now, in his bloom, this youth has been sent to our city to collect ripe stores of knowledge. His name is Madhava."
generated_text = generate_text(seed_text, num_generate=50)
print(generated_text)


In this world we rarely behold such characters as theirs. Their lofty rank is the abode of wisdom and of piety, of valour and of virtue. Their fame spreads white and spotless through the universe. A son has sprung from Devarata whose opening virtues early give occasion of rejoicing to the world. Now, in his bloom, this youth has been sent to our city to collect ripe stores of knowledge. His name is Madhava. if feels her maidens a majesty as my lord i am pledge and one charita or his hundred to well slain her purpose i resemble not be made it thinks and unprotected for this rai of matrimonial world his friend gotama is discovered the king of his worshippers of the


### Conclusion 

We have successfully developed and trained a model using TensorFlow with the Keras API, incorporating GloVe embeddings to enhance its output capabilities. By integrating GloVe embeddings, our model can leverage pre-trained word vectors, which improves its understanding of context and semantics in text generation. Utilizing TensorFlow and the Keras Sequential API has allowed us to streamline the model architecture, making it efficient and scalable for larger datasets.