In [3]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

In [4]:
from keras.models import Sequential
from keras.layers import Embedding,SimpleRNN,Dense

In [8]:
# Load the Shakespeare lines dataset (one row per spoken line).
# NOTE(review): the path is Colab-specific ("/content"); adjust when running elsewhere.
plays=pd.read_csv("/content/Shakespeare_data.csv")
# Only the last expression of a cell is echoed, so print the shape explicitly;
# a bare `plays.shape` in the middle of the cell is silently discarded.
print(plays.shape)
plays.columns


Index(['Dataline', 'Play', 'PlayerLinenumber', 'ActSceneLine', 'Player',
       'PlayerLine'],
      dtype='object')

In [9]:
# Draw a reproducible sample of 10,000 spoken lines (fixed random_state) ...
samples_lines=plays.loc[:,'PlayerLine'].sample(random_state=42,n=10000)
# ... and flatten it into one space-separated corpus string.
data=" ".join(samples_lines.tolist())

In [10]:
# Preview only the first 1000 characters; echoing the whole corpus string
# (10,000 joined lines) would flood the cell output.
data[:1000]

"That hath deprived me of your grace and favour, Their bodies, even to loathing, for they so stunk, Men at some time are masters of their fates: Disgorges such a tempest forth, That monster, custom, who all sense doth eat, To this chair bind him. Villain, thou shalt find-- Dexterity so obeying appetite Hector, in view of Trojans and of Greeks, Who know the world, see heaven, but, feeling woe, I should my tears let fall upon your cheek, Let not that doctor e'er come near my house: Arise, and say how thou camest here. Have I not heard these islanders shout out and one thing more, that you be never so hardy to And say I am Revenge, sent from below To furnish me upon my longing journey. All his revenue. And thus the native hue of resolution Why, art thou mad, old fellow? porringer fell off her head, for kindling such a That you shall stifle in your own report Is not this suit of mine, that thou declare It is as easy to count atomies as to resolve the But let this same be presently perform'

In [11]:
# Tokenization: learn a word -> integer index vocabulary from the corpus.
tokenizer=Tokenizer()
tokenizer.fit_on_texts([data])
word_index=tokenizer.word_index
# Vocabulary size used for the embedding/softmax layers: one slot per known
# word plus one extra slot.
total_words=1+len(word_index)

In [12]:
# Build the next-word training examples one sampled line at a time: every
# prefix of length >= 2 of a line becomes one example (context -> last token).
# Taking prefixes of the whole joined corpus instead creates one sequence per
# corpus token, each up to the full corpus length, so the padded matrix grows
# quadratically with corpus size and cannot be held in memory for this sample.
input_sequences=[]
for token_list in tokenizer.texts_to_sequences(samples_lines.tolist()):
  # Lines with fewer than two known tokens contribute no examples.
  for i in range(1,len(token_list)):
    n_gram_sequence=token_list[:i+1]
    input_sequences.append(n_gram_sequence)

In [13]:
# Length of the longest n-gram sequence; every sequence is left-padded
# (padding='pre') up to this length so they stack into one 2-D array.
max_sequence_len=max(map(len,input_sequences))
input_sequences=pad_sequences(
  input_sequences,
  padding='pre',
  maxlen=max_sequence_len,
)

In [14]:
# Every column except the last is the model input (context) ...
X=input_sequences[:,:-1]
# ... and the last column is the word index to predict, one-hot encoded
# over the vocabulary for categorical cross-entropy.
y=to_categorical(input_sequences[:,-1],num_classes=total_words)

In [15]:
# Next-word model: embedding -> single SimpleRNN layer -> softmax over the vocabulary.
# NOTE(review): the saved model.summary() output in this notebook reports an
# embedding width of 100 and 150 recurrent units, which does not match the
# 1000 / 200 configured here - re-run the notebook to refresh the outputs.
model=Sequential([
  Embedding(total_words,1000,input_length=max_sequence_len-1),
  SimpleRNN(200),
  Dense(total_words,activation='softmax'),
])

In [16]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
  optimizer='adam',
  loss="categorical_crossentropy",
  metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 751, 100)          38700     
                                                                 
 simple_rnn (SimpleRNN)      (None, 150)               37650     
                                                                 
 dense (Dense)               (None, 387)               58437     
                                                                 
Total params: 134787 (526.51 KB)
Trainable params: 134787 (526.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
# Train on the padded n-gram examples; the returned History object is the
# cell's displayed value.
model.fit(
  X,
  y,
  batch_size=64,
  epochs=100,
  verbose=1,
)

Epoch 1/26
Epoch 2/26
Epoch 3/26
Epoch 4/26
Epoch 5/26
Epoch 6/26
Epoch 7/26
Epoch 8/26
Epoch 9/26
Epoch 10/26
Epoch 11/26
Epoch 12/26
Epoch 13/26
Epoch 14/26
Epoch 15/26
Epoch 16/26
Epoch 17/26
Epoch 18/26
Epoch 19/26
Epoch 20/26
Epoch 21/26
Epoch 22/26
Epoch 23/26
Epoch 24/26
Epoch 25/26
Epoch 26/26


<keras.src.callbacks.History at 0x7d24780f7370>

In [22]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the most probable next word after ``text``.

    Args:
        model: Trained model whose ``predict`` returns one row of scores per
            input sequence, one score per vocabulary index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping.
        text: Seed text to continue.
        max_sequence_len: Length of the padded training sequences (context
            plus target); the context is padded to ``max_sequence_len - 1``.

    Returns:
        The predicted word, or ``None`` when the seed text contains no word
        known to the tokenizer or the predicted index maps to no word
        (e.g. index 0).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    if not token_list:
        # Nothing to condition on: an all-padding input would only yield an
        # arbitrary prediction, so report "no prediction" instead.
        return None
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    # A single sequence was submitted, so take the argmax over the vocabulary
    # axis of the first (and only) row.
    predicted_word_index = int(np.argmax(predicted, axis=-1)[0])
    # Direct reverse lookup instead of scanning every (word, index) pair.
    return tokenizer.index_word.get(predicted_word_index)

In [24]:
seed_text = "That hath deprived me of your"
next_word = predict_next_word(model, tokenizer, seed_text, max_sequence_len)
print(f"Next word prediction: {next_word}")

Next word prediction: and
