In [19]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the cleaned corpus. The context manager closes the file handle
# right away instead of leaving it open until garbage collection.
with open('clean_data.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Fit the tokenizer on the whitespace-separated words of the corpus.
# Words that were not seen here are later mapped to the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(data.split())

# Persist the fitted tokenizer so inference cells can reload it from disk
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
import numpy as np
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Create input sequences from the data
def create_input_sequences(data, n_gram_size=6):
    """Build fixed-length n-gram sequences from the corpus.

    Args:
        data: The corpus, passed as a single text to the notebook-level
            ``tokenizer`` (the notebook passes a list of words).
        n_gram_size: Number of consecutive tokens per sequence. The last
            token of each sequence is later used as the label.

    Returns:
        A tuple ``(sequences, max_sequence_len)`` where ``sequences`` is a
        2-D NumPy array with one n-gram per row and ``max_sequence_len`` is
        the row length.

    Raises:
        ValueError: If the corpus has fewer than ``n_gram_size`` tokens, so
            no complete n-gram can be formed.
    """
    token_list = tokenizer.texts_to_sequences([data])[0]

    # A corpus of T tokens contains T - n + 1 full windows, starting at
    # offset 0. (Looping over range(1, T - n) would drop both the first and
    # the last full window.)
    window_count = len(token_list) - n_gram_size + 1
    if window_count < 1:
        raise ValueError(
            f"Need at least {n_gram_size} tokens to build an n-gram, got {len(token_list)}"
        )

    # Sliding window: every run of n_gram_size consecutive tokens is a sequence
    input_sequences = [
        token_list[start:start + n_gram_size] for start in range(window_count)
    ]

    # All windows have the same length, so padding is a no-op safeguard that
    # also converts the result into a rectangular array.
    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    return np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')), max_sequence_len

# Create the features and labels, and split the data into training and testing sets
def create_training_data(input_sequences, num_classes=None, test_size=0.1, random_state=None):
    """Split n-gram sequences into features/labels and train/test sets.

    Args:
        input_sequences: 2-D array with one n-gram per row; the last column
            is the label and the preceding columns are the features.
        num_classes: Width of the one-hot label vectors. When omitted, the
            notebook-level ``total_words`` is used, as in the original
            call pattern.
        test_size: Fraction of rows held out for testing.
        random_state: Optional seed for a reproducible shuffle; ``None``
            keeps the split non-deterministic.

    Returns:
        The ``train_test_split`` result for the features and one-hot labels:
        ``X_train, X_test, y_train, y_test``.
    """
    if num_classes is None:
        # Backward-compatible fallback to the global defined by the driver code
        num_classes = total_words

    # Features are all tokens but the last; the label is the last token
    xs, labels = input_sequences[:, :-1], input_sequences[:, -1]
    ys = to_categorical(labels, num_classes=num_classes)

    return train_test_split(
        xs, ys, test_size=test_size, shuffle=True, random_state=random_state
    )

# Train the model
def train_model(X_train, y_train, total_words, max_sequence_len,
                epochs=20, batch_size=2000, learning_rate=0.01):
    """Build, train and return the next-word prediction model.

    The network is Embedding -> Bidirectional LSTM -> softmax over the
    vocabulary. The epoch with the lowest training loss is written to
    'model.keras' by the checkpoint callback.

    Args:
        X_train: Integer token sequences with ``max_sequence_len - 1``
            tokens per row.
        y_train: One-hot labels with ``total_words`` columns.
        total_words: Vocabulary size (embedding rows and output units).
        max_sequence_len: Length of the full n-gram including the label.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``fit``.
        learning_rate: Initial Adam learning rate.

    Returns:
        The model as it stands after the final epoch. Callers that ignore
        the return value are unaffected.
    """
    # Imported locally so this function does not rely on changes to the
    # notebook's import cell.
    from keras.layers import Input

    # Both callbacks watch the training loss (no validation data is used)
    checkpoint = ModelCheckpoint("model.keras", monitor='loss', verbose=1, save_best_only=True, mode='auto')
    lr_on_plateau = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose=1)

    optimizer = Adam(learning_rate=learning_rate)

    # Declare the input shape with an explicit Input layer; the Embedding
    # `input_length` argument is deprecated in current Keras releases.
    model = Sequential()
    model.add(Input(shape=(max_sequence_len - 1,), dtype='int32'))
    model.add(Embedding(total_words, 100))
    model.add(Bidirectional(LSTM(512)))
    model.add(Dense(total_words, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.fit(
        X_train, y_train, epochs=epochs, batch_size=batch_size,
        callbacks=[checkpoint, lr_on_plateau]
    )
    return model

# Load the cleaned corpus as a list of words. Splitting on any whitespace
# mirrors how the tokenizer was fitted; splitting on ' ' alone would yield
# empty strings and newline-joined words that are not in the fitted
# vocabulary. The context manager also closes the file handle.
with open('clean_data.txt', 'r') as corpus_file:
    data = corpus_file.read().split()

# +1 because Keras word indices start at 1 (index 0 is reserved for padding)
total_words = len(tokenizer.word_index) + 1

# Create input sequences and training data
input_sequences, max_sequence_len = create_input_sequences(data)
X_train, X_test, y_train, y_test = create_training_data(input_sequences)

# Train the model
train_model(X_train, y_train, total_words, max_sequence_len)




Epoch 1/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.0337 - loss: 6.9172
Epoch 1: loss improved from inf to 6.36203, saving model to model.keras
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 1s/step - accuracy: 0.0340 - loss: 6.9075 - learning_rate: 0.0100
Epoch 2/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.1074 - loss: 5.4981
Epoch 2: loss improved from 6.36203 to 5.40065, saving model to model.keras
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 1s/step - accuracy: 0.1075 - loss: 5.4964 - learning_rate: 0.0100
Epoch 3/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.1439 - loss: 4.9015
Epoch 3: loss improved from 5.40065 to 4.89671, saving model to model.keras
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1s/step - accuracy: 0.1440 - loss: 4.9014 - learning_rate: 0.0100
Epoch 4/20
[1m56/56[0m [32m━━

In [21]:
import pickle
from keras.models import load_model

# Module-level caches, filled lazily by load_tokenizer() and load_model().
# Note: rebinding `tokenizer` here discards any tokenizer object an earlier
# cell left in this global; it is read back from disk on first use.
model, tokenizer = None, None

def load_tokenizer():
    """Return the tokenizer, unpickling it from disk on first use.

    The object is cached in the module-level ``tokenizer`` variable, so
    'tokenizer.pickle' is read only while that variable is ``None``.
    """
    global tokenizer
    if tokenizer is not None:
        # Cache hit: reuse the already-loaded object
        return tokenizer

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open('tokenizer.pickle', 'rb') as pickle_file:
        tokenizer = pickle.load(pickle_file)
    return tokenizer

def load_model():
    """Return the trained model, loading 'model.keras' on first use.

    The loaded model is cached in the module-level ``model`` variable so
    the file is read only while that variable is ``None``.
    """
    global model
    if model is None:
        # This function shares its name with keras.models.load_model, so the
        # Keras loader is imported under an alias. Calling `load_model(...)`
        # here would re-enter this zero-argument function and raise TypeError.
        from keras.models import load_model as keras_load_model
        model = keras_load_model('model.keras')
    return model


In [22]:
import numpy as np

# Predict text based on a set of seed text
def predict_text(seed_text, top_k=3, max_len=5):
    """Predict the most likely next words for ``seed_text``.

    Args:
        seed_text: Text whose continuation should be predicted.
        top_k: Number of candidate words to return.
        max_len: Number of tokens fed to the model; shorter seeds are
            left-padded. Presumably this must equal the training n-gram
            size minus one (5 for the default of 6) - confirm if the
            training setup changes.

    Returns:
        Up to ``top_k`` words, ordered from most to least probable. Indices
        without a vocabulary entry (such as the padding index) are skipped.
    """
    # Tokenize the seed with the same tokenizer that was used for training
    tok = load_tokenizer()
    token_list = tok.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')

    # Probability distribution over the vocabulary for the single input row
    m = load_model()
    probabilities = m.predict(token_list, batch_size=500, verbose=0)[0]

    # argsort yields a full ordering, so the slice really is ranked from
    # best to worst (argpartition leaves the top-k block in arbitrary order).
    ranked_indices = np.argsort(probabilities)[::-1][:top_k]

    # Invert the vocabulary once instead of scanning it for every prediction
    index_to_word = {index: word for word, index in tok.word_index.items()}
    return [
        index_to_word[int(index)]
        for index in ranked_indices
        if int(index) in index_to_word
    ]


In [28]:
import pickle
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the tokenizer
def load_tokenizer():
    """Unpickle and return the tokenizer stored in 'tokenizer.pickle'."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open('tokenizer.pickle', 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Load the model
def load_trained_model():
    """Load the saved model from 'model.keras' and return it."""
    # `load_model` is the loader imported at the top of this cell
    return load_model('model.keras')

# Function to predict text
def predict_text(seed_text, tokenizer, model, top_k=3, max_len=5):
    """Predict the most likely next words for ``seed_text``.

    Args:
        seed_text: Text whose continuation should be predicted.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        model: Trained model whose ``predict`` returns one probability row
            per input sequence.
        top_k: Number of candidate words to return.
        max_len: Number of tokens fed to the model; shorter seeds are
            left-padded. Presumably this must equal the training n-gram
            size minus one - confirm if the training setup changes.

    Returns:
        Up to ``top_k`` words, ordered from most to least probable. Indices
        without a vocabulary entry (such as the padding index) are skipped.
    """
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')

    # Probability distribution over the vocabulary for the single input row
    probabilities = model.predict(token_list, batch_size=500, verbose=0)[0]

    # argsort yields a full ordering, so the slice really is ranked from
    # best to worst (argpartition leaves the top-k block in arbitrary order).
    ranked_indices = np.argsort(probabilities)[::-1][:top_k]

    # Invert the vocabulary once instead of scanning it for every prediction
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    return [
        index_to_word[int(index)]
        for index in ranked_indices
        if int(index) in index_to_word
    ]

# Main function to run the test
def main(seed_text="where"):
    """Load the saved tokenizer and model, then print predictions for a seed.

    Args:
        seed_text: Text to predict a continuation for; defaults to the
            smoke-test seed "where".
    """
    tokenizer = load_tokenizer()
    model = load_trained_model()
    predictions = predict_text(seed_text, tokenizer, model)
    print(f"Predictions for '{seed_text}': {predictions}")

# Run the prediction smoke test when this code is executed directly
# (the condition is false when the code is imported as a module).
if __name__ == "__main__":
    main()


Predictions for 'where': ['he', 'prevented', 'their']
