In [1]:
pip install numpy tensorflow pandas




In [34]:
import numpy as np
import pandas as pd
import tensorflow as tf
import kagglehub
import os
from tensorflow.keras.layers import LSTM,Embedding,Dense,Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
path = kagglehub.dataset_download("guslovesmath/shakespeare-plays-dataset")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/guslovesmath/shakespeare-plays-dataset?dataset_version_number=1...


100%|██████████| 2.62M/2.62M [00:00<00:00, 109MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/guslovesmath/shakespeare-plays-dataset/versions/1





In [4]:
data="/root/.cache/kagglehub/datasets/guslovesmath/shakespeare-plays-dataset/versions/1"

contents=os.listdir(data)
print(contents)

['shakespeare_plays.csv']


In [5]:
df=pd.read_csv("/root/.cache/kagglehub/datasets/guslovesmath/shakespeare-plays-dataset/versions/1/shakespeare_plays.csv",nrows=10000)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,play_name,genre,character,act,scene,sentence,text,sex
0,0,All's Well That Ends Well,Comedy,Countess,1,1,1,"In delivering my son from me, I bury a second ...",female
1,1,All's Well That Ends Well,Comedy,Bertram,1,1,2,"And I in going, madam, weep o'er my father's d...",male
2,2,All's Well That Ends Well,Comedy,Bertram,1,1,3,"anew: but I must attend his majesty's command, to",male
3,3,All's Well That Ends Well,Comedy,Bertram,1,1,4,"whom I am now in ward, evermore in subjection.",male
4,4,All's Well That Ends Well,Comedy,Lafeu,1,1,5,"You shall find of the king a husband, madam; you,",male


In [7]:
text=df['text']

In [8]:
text

Unnamed: 0,text
0,"In delivering my son from me, I bury a second ..."
1,"And I in going, madam, weep o'er my father's d..."
2,"anew: but I must attend his majesty's command, to"
3,"whom I am now in ward, evermore in subjection."
4,"You shall find of the king a husband, madam; you,"
...,...
9995,"Hark, Polydore, it sounds! But what occasion"
9996,Hath Cadwal now to give it motion? Hark!
9997,Is he at home?
9998,He went hence even now.


In [11]:
text=df['text'].str.lower()

In [12]:
text

Unnamed: 0,text
0,"in delivering my son from me, i bury a second ..."
1,"and i in going, madam, weep o'er my father's d..."
2,"anew: but i must attend his majesty's command, to"
3,"whom i am now in ward, evermore in subjection."
4,"you shall find of the king a husband, madam; you,"
...,...
9995,"hark, polydore, it sounds! but what occasion"
9996,hath cadwal now to give it motion? hark!
9997,is he at home?
9998,he went hence even now.


In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text.astype(str).tolist())
total_words = len(tokenizer.word_index) + 1


In [17]:
input_sequences = []
words = df['text'].str.split()


In [19]:
words

Unnamed: 0,text
0,"[In, delivering, my, son, from, me,, I, bury, ..."
1,"[And, I, in, going,, madam,, weep, o'er, my, f..."
2,"[anew:, but, I, must, attend, his, majesty's, ..."
3,"[whom, I, am, now, in, ward,, evermore, in, su..."
4,"[You, shall, find, of, the, king, a, husband,,..."
...,...
9995,"[Hark,, Polydore,, it, sounds!, But, what, occ..."
9996,"[Hath, Cadwal, now, to, give, it, motion?, Hark!]"
9997,"[Is, he, at, home?]"
9998,"[He, went, hence, even, now.]"


In [22]:
for line in words:
  for i in range(1,len(line)):
    seq = line[:i+1]
    input_sequences.append(tokenizer.texts_to_sequences([" ".join(seq)])[0])


In [23]:
max_seq_length = max([len(seq) for seq in input_sequences])

In [24]:
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding="pre")


In [25]:
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [32]:
model = tf.keras.models.Sequential([
    Embedding(total_words, 200, input_length=max_seq_length-1),
    LSTM(256, return_sequences=True),
    LSTM(256),
    Dense(256, activation="relu"),
    Dense(total_words, activation="softmax")
])

In [35]:
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])


In [36]:
model.fit(X, y, epochs=50, verbose=1)


Epoch 1/50
[1m2058/2058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 11ms/step - accuracy: 0.0297 - loss: 6.8080
Epoch 2/50
[1m2058/2058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 10ms/step - accuracy: 0.0440 - loss: 6.2815
Epoch 3/50
[1m2058/2058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 10ms/step - accuracy: 0.0641 - loss: 5.9820
Epoch 4/50
[1m2058/2058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 10ms/step - accuracy: 0.0791 - loss: 5.7949
Epoch 5/50
[1m2058/2058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.0882 - loss: 5.6264
Epoch 6/50
[1m2058/2058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 10ms/step - accuracy: 0.0980 - loss: 5.4594
Epoch 7/50
[1m2058/2058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 10ms/step - accuracy: 0.1025 - loss: 5.3120
Epoch 8/50
[1m2058/2058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 10ms/step - accuracy: 0.1100 - loss: 5.2025
Epoch 9/

<keras.src.callbacks.history.History at 0x7c25256d65d0>

In [39]:
def generate_text(seed_text, next_words=20):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_length-1, padding="pre")
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

print(generate_text("and i "))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29