In [23]:
# Data Collection 

import tensorflow as tf
import pandas as pd
import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg')

# Pull the raw text of Hamlet out of the NLTK Gutenberg corpus
data = gutenberg.raw('shakespeare-hamlet.txt')

# Keep a local UTF-8 copy on disk; the preprocessing cell reads it back from this path
with open('shakespeare_hamlet.txt', mode='w', encoding='utf-8') as corpus_file:
    corpus_file.write(data)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\raksh/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [24]:
# Data Preprocessing

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Read the saved play back in and lower-case it so the vocabulary is case-insensitive
with open('shakespeare_hamlet.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()

# Fit a word-level tokenizer on the whole play (passed as a single document)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size is one more than the number of distinct words: index 0 is
# never assigned to a word and is the value the sequences are padded with later.
total_words = 1 + len(tokenizer.word_index)
total_words


4818

In [25]:
# Build the training n-grams.
# Every line of the play is tokenized, and for a line with tokens [t0, t1, ..., tk]
# we emit each prefix of length >= 2: [t0, t1], [t0, t1, t2], ..., [t0, ..., tk].
# The last token of each prefix later becomes the label; the rest is the context.
# Lines with fewer than two known tokens contribute nothing.
input_sequences = [
    line_tokens[:prefix_end]
    for line_tokens in tokenizer.texts_to_sequences(text.split('\n'))
    for prefix_end in range(2, len(line_tokens) + 1)
]



In [26]:
# Give every n-gram the same length so they can be stacked into one 2-D array.
# The target length is that of the longest n-gram found above.
max_sequence_len = max(len(seq) for seq in input_sequences)

# padding='pre' puts the zeros on the left, so the real tokens (and in particular
# the final token, which becomes the label) stay right-aligned.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [27]:
# Separate each padded n-gram into its context and its target:
#   X -> every column but the last (the words seen so far)
#   Y -> the last column (the word to predict)
X, Y = input_sequences[:, :-1], input_sequences[:, -1]
print(X)

[[   0    0    0 ...    0    0    1]
 [   0    0    0 ...    0    1  687]
 [   0    0    0 ...    1  687    4]
 ...
 [   0    0    0 ...  687    4   45]
 [   0    0    0 ...    4   45 1047]
 [   0    0    0 ...   45 1047    4]]


In [28]:
# Inspect the label vector: the last token index of every padded n-gram
print(Y)

[ 687    4   45 ... 1047    4  193]


In [29]:
# One-hot encode the labels so they match the softmax output layer and the
# categorical cross-entropy loss: each integer word index in Y becomes a
# vector of length total_words with a single 1 at that index.
# The source must be the integer label vector `Y` built in the previous step;
# lower-case `y` does not exist before this line, so reading it here would
# raise NameError on a fresh kernel.
y = tf.keras.utils.to_categorical(Y, num_classes=total_words)
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [30]:
# Hold out 20% of the (context, one-hot label) pairs for evaluation.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Model Definition
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the model: embedding -> two stacked LSTMs -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1)) # Embedding layer: 100-d vector per word index; input is the n-gram minus its last (label) token
model.add(LSTM(150, return_sequences=True))                             # First LSTM layer: returns the full sequence so the second LSTM can consume it
model.add(Dropout(0.2))                                                 # Dropout layer to prevent overfitting
model.add(LSTM(100))                                                    # Second LSTM layer: returns only its final state
model.add(Dense(total_words, activation='softmax'))                     # Output layer: one probability per vocabulary index

# Compile the model (once). Categorical cross-entropy matches the one-hot labels
# produced by to_categorical; training itself happens in a later cell.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 13, 100)           481800    
                                                                 
 lstm (LSTM)                 (None, 13, 150)           150600    
                                                                 
 dropout (Dropout)           (None, 13, 150)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 4818)              486618    
                                                                 
Total params: 1219418 (4.65 MB)
Trainable params: 1219418 (4.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [35]:
# Fit the network for 50 epochs, reporting loss/accuracy on the held-out split
# after every epoch. The returned History object is kept for later inspection.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Function to predict the next words
def predict_next_words(model, tokenizer, text, num_words):
    """Greedily extend ``text`` by up to ``num_words`` predicted words.

    Each step asks ``predict_next_word`` for the most likely next word given
    the text so far, appends it (space-separated) and feeds the longer text
    back in for the following step.

    Args:
        model: Trained model whose ``predict`` returns one score per
            vocabulary index.
        tokenizer: Fitted tokenizer used to encode ``text`` and to map the
            predicted index back to a word.
        text: Seed phrase to continue.
        num_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words. Generation stops early
        if a prediction does not map to any word, because the text (and thus
        the next prediction) would no longer change.

    Note:
        Reads the notebook-level ``max_sequence_len`` to size the model input.
    """
    for _ in range(num_words):
        word = predict_next_word(model, tokenizer, text, max_sequence_len)
        if word is None:
            # e.g. the padding index was predicted; further steps would repeat it
            break
        text += " " + word
    return text


# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the single most likely next word for ``text``.

    Args:
        model: Trained model whose ``predict`` returns one score per
            vocabulary index for each input row.
        tokenizer: Fitted tokenizer used to encode ``text`` and to map the
            predicted index back to a word.
        text: Phrase to continue.
        max_sequence_len: Length of the padded training n-grams; the model
            input is one token shorter (``max_sequence_len - 1``).

    Returns:
        The predicted word, or ``None`` when the predicted index has no word
        in the tokenizer's vocabulary.
    """
    context_len = max_sequence_len - 1
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-context_len:]  # Keep only the most recent tokens that fit the model input
    token_list = pad_sequences([token_list], maxlen=context_len, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    # argmax over axis 1 yields one index per input row; there is exactly one
    # row here, so take element 0 to get a plain int instead of a 1-element array.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])
    # index_word is the inverse of word_index, so this replaces a full scan of
    # the vocabulary with a single dictionary lookup.
    return tokenizer.index_word.get(predicted_word_index)

In [38]:
# Seed phrase to continue
input_text = "To be, or not to be"
print(f"Input Text: {input_text}")

# Recover the n-gram length from the model itself: the network's input width
# is max_sequence_len - 1 (the n-gram without its label token).
max_sequence_len = 1 + model.input_shape[1]

# NOTE: max_sequence_len is passed as num_words here, i.e. it doubles as the
# number of words requested from predict_next_words.
next_word = predict_next_words(
    model,
    tokenizer,
    text=input_text,
    num_words=max_sequence_len,
)
print(f"Predicted Text: {next_word}")

Input Text: To be, or not to be
Predicted Text: To be, or not to be buried in't and in denmarke where it waste is all a rat a rat


In [None]:
# Persist the trained network to an HDF5 file
model.save('lstm_text_generation_model.h5')

# Persist the fitted tokenizer alongside it: the same word <-> index mapping is
# needed to encode input text and decode predictions when the model is reloaded.
import pickle
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
    