In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import regex as re

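# Load and clean a Project Gutenberg plain-text book (originally labelled "Shakespeare's Hamlet").
# NOTE(review): the script below loads 1661-0.txt -- confirm that file really is Hamlet.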
def file_to_sentence_list(file_path):
    """Read a Project Gutenberg plain-text file and return its cleaned sentences.

    When both the ``*** START OF`` and ``*** END OF`` markers are present, only
    the text between them is kept. The text is then lower-cased, stripped of
    every character other than ASCII letters, whitespace and ``. ! ?``, has its
    whitespace runs collapsed to single spaces, and is split after each
    sentence-ending punctuation mark.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A list of non-empty, whitespace-trimmed sentence strings.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Trim Gutenberg header/footer
    start_index = text.find("*** START OF")
    end_index = text.find("*** END OF")
    if start_index != -1 and end_index != -1:
        # The START marker is a banner line of its own; begin after the end of
        # that line so the banner text does not leak into the first sentence.
        marker_line_end = text.find("\n", start_index)
        if 0 <= marker_line_end < end_index:
            start_index = marker_line_end + 1
        text = text[start_index:end_index]

    # Basic cleaning
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s.!?]', '', text)  # keep only letters and sentence punctuation
    text = re.sub(r'\s+', ' ', text)

    # Split into sentences: break on whitespace that follows ., ! or ?
    # (the punctuation stays attached to the sentence it ends).
    sentences = [sentence.strip() for sentence in re.split(r'(?<=[.!?])\s+', text) if sentence.strip()]
    return sentences

# Use the uploaded file.
# The path lives under /content/drive, so Google Drive must already be mounted
# there (see the drive.mount('/content/drive') cell) before this cell runs.
file_path = '/content/drive/MyDrive/Colab Notebooks/archive (2)/1661-0.txt'
# Parse the book into a list of cleaned, lower-cased sentence strings.
text_data = file_to_sentence_list(file_path)

# Step 2: Tokenization
# Fit the vocabulary on the cleaned sentences. Word indices start at 1, so the
# extra +1 accounts for index 0, which pad_sequences uses as the padding value.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Create input sequences: for every sentence, emit each prefix of two or more
# tokens. The last token of a prefix is the word to predict from the others.
# All sentences are tokenized in one call instead of one call per sentence.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(text_data):
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

if not input_sequences:
    # Without this guard the max() below fails with an unhelpful message.
    raise ValueError(
        "No training sequences could be built: no sentence in the corpus "
        "contains at least two tokens."
    )

# Pad sequences on the left so the target word is always the last column.
# pad_sequences already returns a NumPy array, so no extra conversion is needed.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Inputs are all columns but the last; the label is the last column, one-hot
# encoded over the vocabulary to match the categorical_crossentropy loss.
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Build the model: embedding -> LSTM -> softmax over the vocabulary.
# The input length is declared with an explicit Input instead of Embedding's
# `input_length` argument, which newer Keras versions deprecate; this also
# keeps the model built so that summary() can report shapes.
model = Sequential([
    tf.keras.Input(shape=(max_sequence_len - 1,)),  # one padded prefix per sample
    Embedding(total_words, 10),                     # 10-dimensional word vectors
    LSTM(128),
    Dense(total_words, activation='softmax'),       # probability for every word index
])

# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(X, y, epochs=50, verbose=1)  # Reduce to 50 epochs for faster training

# Predict next words: greedily append the most likely next word to the seed
# text, `next_words` times, feeding the growing text back into the model.
seed_text = "To be or not"
next_words = 5

for _ in range(next_words):
    # Encode the current text and left-pad it to the model's input length
    # (max_sequence_len - 1, the same width as the training inputs X).
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)
    # Index 0 is the padding value and has no entry in tokenizer.index_word,
    # so choosing it would raise KeyError. Take the arg-max over the real
    # word indices (1..total_words-1) only and shift back by one.
    predicted_index = int(np.argmax(predicted_probs[0, 1:])) + 1
    predicted_word = tokenizer.index_word[predicted_index]
    seed_text += " " + predicted_word

print("Next predicted words:", seed_text)




Epoch 1/50
3036/3036 ━━━━━━━━━━━━━━━━━━━━ 480s 157ms/step - accuracy: 0.0597 - loss: 6.6903
Epoch 2/50
2492/3036 ━━━━━━━━━━━━━━━━━━━━ 1:25 157ms/step - accuracy: 0.0945 - loss: 5.8335

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the training cell reads its input text
# from a path under this mount point, so this cell has to run first.
drive.mount('/content/drive')

Mounted at /content/drive
