In [53]:
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [54]:
with open('sherlock-holm.es_stories_plain-text_advs.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

# Preprocessing the text (lowercase and punctuation)
text_data = text_data.lower() 
text_data = re.sub(r'[^a-z0-9\s]', '', text_data) 

# Splitting text into lines
lines = text_data.split('\n')
lines = [line.strip() for line in lines if line.strip()]  # Removing empty lines


In [55]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_data])
total_words = len(tokenizer.word_index) + 1

In [57]:
input_sequences = []
for line in lines:
    token_list = tokenizer.texts_to_sequences([line])[0]
    print("Token List for line '{}': {}".format(line, token_list))  # Debugging print
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Padding sequences
if input_sequences:
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
else:
    raise ValueError("No input sequences generated. Check your text data and tokenization.")

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

Token List for line 'the adventures of sherlock holmes': [1, 1513, 5, 128, 33]
Token List for line 'arthur conan doyle': [600, 4413, 4414]
Token List for line 'table of contents': [256, 5, 1514]
Token List for line 'a scandal in bohemia': [6, 795, 7, 847]
Token List for line 'the redheaded league': [1, 527, 634]
Token List for line 'a case of identity': [6, 114, 5, 2028]
Token List for line 'the boscombe valley mystery': [1, 656, 1327, 480]
Token List for line 'the five orange pips': [1, 308, 916, 848]
Token List for line 'the man with the twisted lip': [1, 55, 18, 1, 980, 849]
Token List for line 'the adventure of the blue carbuncle': [1, 556, 5, 1, 442, 1328]
Token List for line 'the adventure of the speckled band': [1, 556, 5, 1, 1744, 739]
Token List for line 'the adventure of the engineers thumb': [1, 556, 5, 1, 2413, 657]
Token List for line 'the adventure of the noble bachelor': [1, 556, 5, 1, 740, 1202]
Token List for line 'the adventure of the beryl coronet': [1, 556, 5, 1, 20

In [58]:
print("Input Sequences: ", input_sequences)

Input Sequences:  [[   0    0    0 ...    0    1 1513]
 [   0    0    0 ...    1 1513    5]
 [   0    0    0 ... 1513    5  128]
 ...
 [   0    0    0 ...   29    1 8419]
 [   0    0    0 ...    1 8419 8420]
 [   0    0    0 ... 8419 8420 3551]]


In [35]:

# Creating predictors and label
predictors, label = input_sequences[:,:-1], input_sequences[:,-1]
label = tf.keras.utils.to_categorical(label, num_classes=total_words)

# Creating the model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(predictors, label, epochs=100, verbose=1)





Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoc

In [59]:
def predict_next_word(text):
    sequence = tokenizer.texts_to_sequences([text])[0]
    sequence = pad_sequences([sequence], maxlen=max_sequence_len-1, padding='pre')
    predictions = model.predict(sequence, verbose=0)
    predicted_index = np.argmax(predictions, axis=-1)  # Get the index of the max probability
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            return word
    return ""

In [60]:
print(predict_next_word("test"))

looked
