In [None]:
import os
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Load and preprocess all data
def load_and_preprocess(file_path):
    """
    Load the dialogue lines from a CSV file and normalise them.

    The 'PlayerLine' column is read, missing values are dropped, and every
    remaining line is cleaned by (in this order) deleting ASCII punctuation,
    lower-casing, and discarding any character that is not plain ASCII.

    Parameters:
    file_path (str): The path to the CSV file containing the data.

    Returns:
    list: A list of cleaned text lines.

    Raises:
    FileNotFoundError: If `file_path` does not exist.
    KeyError: If the CSV has no 'PlayerLine' column.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found.")

    df = pd.read_csv(file_path)
    lines = df['PlayerLine'].dropna().tolist()

    # Build the deletion table once; str.translate then removes all
    # punctuation in a single pass per line instead of testing each
    # character against string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Lower-case before dropping non-ASCII so characters whose lowercase
    # form is ASCII are kept, exactly as the cleaning order above states.
    return [
        line.translate(strip_punctuation).lower().encode("ascii", "ignore").decode("ascii")
        for line in lines
    ]

# Tokenize and create sequences
def tokenize_sequences(corpus):
    """
    Fit a tokenizer on the corpus and expand each line into n-gram prefixes.

    A Keras Tokenizer is fitted on `corpus`. Every line is then encoded to
    its list of token ids, and for a line with tokens [t1, t2, ..., tn] the
    prefixes [t1, t2], [t1, t2, t3], ..., [t1, ..., tn] are emitted. Lines
    that encode to fewer than two tokens contribute nothing.

    Parameters:
    corpus (list): A list of cleaned text lines.

    Returns:
    tuple: A tuple containing the list of sequences and the tokenizer instance.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)

    # Encode the whole corpus in one call, then slice out every prefix of
    # length >= 2 from each encoded line.
    sequences = [
        encoded[:end]
        for encoded in tokenizer.texts_to_sequences(corpus)
        for end in range(2, len(encoded) + 1)
    ]
    return sequences, tokenizer

# Prepare predictors and labels
def prepare_data(sequences, vocab_size):
    """
    Pad the token sequences to a common length.

    The longest sequence determines the target length; shorter sequences
    are left-padded ('pre') so that the final token of every row sits in
    the last column.

    Parameters:
    sequences (list): A list of tokenized sequences.
    vocab_size (int): The size of the vocabulary. Currently unused; kept
        so existing callers do not need to change.

    Returns:
    tuple: A tuple containing the padded sequences and the maximum length.

    Raises:
    ValueError: If `sequences` is empty.
    """
    # Fail with an explicit message instead of the opaque error max()
    # raises on an empty iterable.
    if len(sequences) == 0:
        raise ValueError("Cannot prepare data: `sequences` is empty.")

    max_len = max(map(len, sequences))
    padded = pad_sequences(sequences, maxlen=max_len, padding='pre')
    return padded, max_len

# Build LSTM model
def build_model(vocab_size, max_len):
    """
    Build and compile an LSTM model for next-word prediction.

    The model is a Sequential stack of an explicit input layer, an
    Embedding layer (100 dimensions), an LSTM layer (150 units) and a Dense
    softmax layer over the vocabulary. It is compiled with categorical
    crossentropy loss, the Adam optimizer and an accuracy metric.

    Parameters:
    vocab_size (int): The size of the vocabulary.
    max_len (int): The maximum length of the padded sequences; the model
        consumes `max_len - 1` tokens because the last one is the label.

    Returns:
    Model: A compiled Keras Sequential model.
    """
    model = Sequential()
    # Declare the input shape with an Input layer rather than the
    # Embedding `input_length` argument, which Keras 3 deprecates and
    # ignores; this also builds the model before training starts.
    model.add(tf.keras.Input(shape=(max_len - 1,), dtype="int32"))
    model.add(Embedding(input_dim=vocab_size, output_dim=100))
    model.add(LSTM(150))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Generate text
def generate_text(seed, model, tokenizer, max_len, num_words):
    """
    Generate text from a seed string using the trained model.

    For each word to generate, the current text is tokenized, left-padded
    (or truncated) to `max_len - 1` tokens and passed to the model. The
    highest-scoring index is mapped back to its word, which is appended to
    the text before the next prediction.

    Parameters:
    seed (str): The initial text to start generation.
    model (Model): The trained Keras model for text generation.
    tokenizer (Tokenizer): The Keras tokenizer used for encoding the text.
    max_len (int): The maximum length of input sequences.
    num_words (int): The number of words to generate.

    Returns:
    str: The seed followed by the generated words, title-cased.
    """
    for _ in range(num_words):
        tokens = tokenizer.texts_to_sequences([seed])[0]
        tokens = pad_sequences([tokens], maxlen=max_len-1, padding='pre')
        preds = model.predict(tokens, verbose=0)
        next_idx = int(np.argmax(preds))

        # Direct index -> word lookup instead of scanning word_index.
        word = tokenizer.index_word.get(next_idx)
        if word is None:
            # The predicted index has no word (e.g. the padding index 0).
            # The text would not change, so every remaining prediction
            # would be the same; stop instead of repeating it.
            break
        seed += ' ' + word
    return seed.title()

def main(data_file='Shakespeare_data.csv', max_samples=11000):
    """
    Train the word-completion model and run an interactive generation loop.

    The corpus is loaded and tokenized, summary statistics are printed, the
    model is trained on at most `max_samples` padded sequences, saved to
    'lstm_word_completion.keras', and then seed texts are read from
    standard input until the user types 'exit'.

    Parameters:
    data_file (str): Path to the CSV file with a 'PlayerLine' column.
    max_samples (int): Maximum number of sequences used for training.
    """
    corpus = load_and_preprocess(data_file)
    print(f"Loaded {len(corpus)} lines.")

    sequences, tokenizer = tokenize_sequences(corpus)
    vocab_size = len(tokenizer.word_index) + 1
    print(f"Vocabulary size: {vocab_size}")

    print("\nTokenizer's Word Index:")
    for word, index in tokenizer.word_index.items():
        print(f"{word}: {index}")

    print("\nSample Tokenized Sequences:")
    # Slice rather than index so a corpus shorter than five lines does not
    # raise IndexError.
    for line in corpus[:5]:
        print(f"Original: {line}")
        print(f"Tokenized: {tokenizer.texts_to_sequences([line])[0]}\n")

    padded, max_len = prepare_data(sequences, vocab_size)

    # Count tokens from the tokenizer's own frequency table; summing the
    # lengths of the n-gram prefix sequences would count each word many
    # times over.
    total_words = sum(tokenizer.word_counts.values())
    unique_words = vocab_size - 1
    print(f"Total words in the dataset: {total_words}")
    print(f"Unique words in the dataset: {unique_words}")

    # The last column of each padded row is the word to predict; everything
    # before it is the model input.
    train = padded[:max_samples]
    X = train[:, :-1]
    y = to_categorical(train[:, -1], num_classes=vocab_size)

    print(f"Using {X.shape[0]} sequences for training (limited to {max_samples}).")

    model = build_model(vocab_size, max_len)

    early_stop = EarlyStopping(monitor='loss', patience=3)
    model.fit(X, y, epochs=100, batch_size=512, callbacks=[early_stop], verbose=1)

    model.save('lstm_word_completion.keras')

    while True:
        seed = input("Enter seed text (or type 'exit' to quit): ").strip()
        if seed.lower() == 'exit':
            break
        try:
            num = int(input("Number of words to generate: "))
        except ValueError:
            print("Please enter a valid number.")
            continue
        generated = generate_text(seed, model, tokenizer, max_len, num)
        print(f"Generated Text:\n{generated}\n")

# Run the training and generation pipeline only when this file is executed
# as a script, not when it is imported.
if __name__ == "__main__":
    main()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
atonements: 22402
compremises: 22403
vizaments: 22404
deathsbedgot: 22405
resurrections: 22406
wellwillers: 22407
plessing: 22408
peradventures: 22409
pageand: 22410
cotsall: 22411
redressd: 22412
cabbage: 22413
banbury: 22414
mephostophilus: 22415
umpires: 22416
finally: 22417
prief: 22418
tevil: 22419
affectations: 22420
millsixpences: 22421
shovelboards: 22422
yead: 22423
mountainforeigner: 22424
latten: 22425
labras: 22426
nuthooks: 22427
fap: 22428
careires: 22429
udge: 22430
shortcake: 22431
allhallowmas: 22432
philosophers: 22433
possitable: 22434
plessed: 22435
veneys: 22436
sackerson: 22437
shrieked: 22438
illfavored: 22439
laundry: 22440
washer: 22441
wringer: 22442
altogethers: 22443
pippins: 22444
scholarly: 22445
cashier: 22446
keisar: 22447
pheezar: 22448
tap: 22449
hungarian: 22450
spigot: 22451
tinderbox: 22452
fico: 22453
conycatch: 22454
carves: 22455
invitation: 22456
hardest: 22457
englished: 22458
ext



Epoch 1/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 3s/step - accuracy: 0.0209 - loss: 9.9272
Epoch 2/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2s/step - accuracy: 0.0355 - loss: 6.8619
Epoch 3/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 3s/step - accuracy: 0.0391 - loss: 6.4138
Epoch 4/100
