In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
def read_text_file(file_path):
    """Read a UTF-8 encoded text file and return its entire contents.

    Args:
        file_path: Path of the file to open for reading.

    Returns:
        The file contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


In [3]:
def preprocess_text(text):
    """Return ``text`` converted to lowercase (the only normalisation applied)."""
    return text.lower()

In [4]:
def tokenize_text(text):
    """Fit a Keras ``Tokenizer`` on ``text`` and encode the text as word ids.

    Args:
        text: The full corpus as one string.

    Returns:
        A tuple ``(sequences, total_words, tokenizer)`` where ``sequences`` is
        the list of integer word ids for ``text``, ``total_words`` is the
        number of distinct words plus one, and ``tokenizer`` is the fitted
        ``Tokenizer`` instance.
    """
    word_tokenizer = Tokenizer()
    corpus = [text]  # the tokenizer API works on a list of documents
    word_tokenizer.fit_on_texts(corpus)

    token_ids = word_tokenizer.texts_to_sequences(corpus)[0]
    # One more than the number of known words, so the largest word id is a
    # valid class / embedding index.
    vocab_size = len(word_tokenizer.word_index) + 1
    return token_ids, vocab_size, word_tokenizer

In [5]:
def create_sequences(sequences, seq_length):
    """Cut a token sequence into sliding windows paired with the next token.

    Args:
        sequences: Ordered token ids for the whole corpus.
        seq_length: Number of tokens in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X[k]`` is the window
        ``sequences[k:k + seq_length]`` and ``y[k]`` is the token that
        immediately follows it, ``sequences[k + seq_length]``.
    """
    total = len(sequences)
    # A window starting at `start` is only usable while its target index
    # `start + seq_length` is still inside the sequence.
    window_count = min(total, max(total - seq_length, 0))

    inputs = [sequences[start:start + seq_length] for start in range(window_count)]
    targets = [sequences[start + seq_length] for start in range(window_count)]
    return np.array(inputs), np.array(targets)

In [6]:
def create_model(seq_length, total_words):
    """Build and compile the next-word prediction network.

    The network is an embedding layer (100 dimensions) followed by two
    256-unit LSTM layers, a 256-unit ReLU dense layer and a softmax output
    with one unit per vocabulary entry.

    Args:
        seq_length: Number of tokens in each input window.
        total_words: Vocabulary size; used as the embedding input dimension
            and as the width of the softmax output.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss,
        the Adam optimizer and an accuracy metric.
    """
    # NOTE(review): `input_length` is reported as deprecated by newer Keras
    # releases -- confirm against the installed version before upgrading.
    layer_stack = [
        Embedding(input_dim=total_words, output_dim=100, input_length=seq_length),
        # The first LSTM emits its full output sequence so the second LSTM
        # receives one vector per time step.
        LSTM(units=256, return_sequences=True),
        LSTM(units=256),
        Dense(units=256, activation='relu'),
        Dense(units=total_words, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [26]:
def generate_random_sentence(model, tokenizer, seq_length, num_words,
                             sequences=None, temperature=None):
    """Generate text by extending a randomly chosen seed window of the corpus.

    A window of up to ``seq_length`` token ids is picked from ``sequences``
    and the model is then asked ``num_words`` times for the next word; every
    predicted word is appended to the output and fed back as input.

    Args:
        model: Object with a Keras-style ``predict(x, verbose=...)`` method
            returning next-word probabilities of shape ``(1, vocab_size)``.
        tokenizer: Fitted tokenizer exposing an ``index_word`` mapping from
            word id to word.
        seq_length: Number of token ids fed to the model at each step.
        num_words: Number of words to generate after the seed.
        sequences: Token ids of the corpus to draw the seed from. When
            omitted, the module/notebook-level variable ``sequences`` is used,
            which is how this function originally obtained the corpus.
        temperature: ``None`` (default) picks the most probable word at every
            step. A positive float samples from the predicted distribution
            raised to the power ``1 / temperature``; this helps avoid the
            repetitive loops that greedy decoding tends to fall into.

    Returns:
        The seed words followed by the generated words, joined by spaces.

    Raises:
        NameError: If ``sequences`` is not given and no global ``sequences``
            exists.
        ValueError: If ``seq_length`` is smaller than 1, ``sequences`` is
            empty, or ``temperature`` is not positive.
    """
    if sequences is None:
        # Backward compatibility: fall back to the notebook-level corpus.
        if "sequences" not in globals():
            raise NameError(
                "name 'sequences' is not defined; pass sequences=... explicitly"
            )
        sequences = globals()["sequences"]
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")
    if len(sequences) == 0:
        raise ValueError("sequences must contain at least one token id")
    if temperature is not None and temperature <= 0:
        raise ValueError("temperature must be a positive number")

    # Same start range as before for long corpora; short corpora (where the
    # range would be empty and randint would raise) start at the beginning.
    max_start = len(sequences) - seq_length - 1
    start_index = np.random.randint(0, max_start) if max_start > 0 else 0

    token_ids = [int(idx) for idx in sequences[start_index:start_index + seq_length]]
    words = [tokenizer.index_word[idx] for idx in token_ids]

    for _ in range(num_words):
        # Left-pad with zeros when fewer than seq_length ids are available,
        # matching what pad_sequences(..., maxlen=seq_length) produced.
        window = np.zeros((1, seq_length), dtype="int32")
        window[0, seq_length - len(token_ids):] = token_ids

        # verbose=0 keeps Keras from printing a progress bar for every word.
        probs = np.array(model.predict(window, verbose=0)[0], dtype=np.float64)
        # Index 0 is the padding id and has no entry in index_word, so it
        # must never be selected as the next word.
        probs[0] = 0.0

        peak = probs.max()
        if temperature is None or peak <= 0:
            next_id = int(np.argmax(probs))
        else:
            # Dividing by the peak first keeps the power from underflowing
            # to an all-zero vector at low temperatures.
            weights = np.power(probs / peak, 1.0 / temperature)
            next_id = int(np.random.choice(weights.size, p=weights / weights.sum()))

        next_word = tokenizer.index_word.get(next_id)
        if next_word is None:
            # The model offered no usable word; stop instead of crashing.
            break

        words.append(next_word)
        # Keep only the ids the model will see on the next step rather than
        # re-tokenising the whole, ever-growing text.
        token_ids = (token_ids + [next_id])[-seq_length:]

    return ' '.join(words)

In [23]:
if __name__ == "__main__":
    # Load the corpus from disk and lowercase it.
    file_path = 'test_data_long.txt'
    text = preprocess_text(read_text_file(file_path))

    # Encode the corpus as integer word ids and keep the fitted tokenizer
    # so generated ids can be mapped back to words later.
    sequences, total_words, tokenizer = tokenize_text(text)

    # Build (window of seq_length ids -> next id) training pairs.
    seq_length = 50
    X, y = create_sequences(sequences, seq_length)

    # The model is compiled with categorical cross-entropy, so the integer
    # targets are one-hot encoded over the vocabulary before fitting.
    model = create_model(seq_length, total_words)
    model.fit(
        X,
        to_categorical(y, num_classes=total_words),
        epochs=10,
        batch_size=600,
    )



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# Extend a random seed window from the corpus by 20 predicted words.
generated_sentence = generate_random_sentence(
    model=model,
    tokenizer=tokenizer,
    seq_length=seq_length,
    num_words=20,
)
print(generated_sentence)

between the two that's the devil of it answered our host there was drury was one of the two he laid out the boy was walking out with little nellie seymour and one day they met benton and that night benton came into the bar and started making foul innuendoes to the house and the house and the house and the man and i had been a man of the
