In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import PyPDF2
import re

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

# Load the PDF and extract text

text_corpus = extract_text_from_pdf('book.pdf')
text_corpus=text_corpus.replace('  ','')
text_corpus=text_corpus.replace('\n',' ').lower()
#text_corpus=re.sub('[^a-z]',' ',text_corpus)
text_corpus=text_corpus[2000:28000]

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_corpus])
total_words = len(tokenizer.word_index) + 1


In [3]:
len(text_corpus)

26000

In [27]:
total_words

1063

In [158]:
print(text_corpus[5000:6000])



In [160]:
a='''sh
oh'''.replace('\n',' ')

In [43]:
a

'sh oh'

In [29]:
input_sequences = []
for line in text_corpus[:].split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[i-1:i+3]
        input_sequences.append(n_gram_sequence)

# Pad sequences to ensure uniform input size

input_sequences = np.array(pad_sequences(input_sequences, maxlen=4, padding='post'))

# Create predictors and label
X, y = input_sequences[:,:-1], input_sequences[:,-1]

# One-hot encode the labels
#y = to_categorical(y, num_classes=total_words)

# Build the model

In [136]:
len(input_sequences)

4972

In [31]:
print(X[:10])
y[:10]

[[504  19   1]
 [ 19   1 310]
 [  1 310 505]
 [310 505   0]
 [506   6 507]
 [  6 507  29]
 [507  29  15]
 [ 29  15 151]
 [ 15 151 508]
 [151 508   1]]


array([310, 505,   0,   0,  29,  15, 151, 508,   1, 509])

In [138]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=4-1))
model.add(LSTM(300,return_sequences=True))
model.add(LSTM(200,return_sequences=True))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam)

# Train the model
model.fit(X, y, epochs=100, verbose=2)

Epoch 1/100




156/156 - 7s - 44ms/step - loss: 6.1485
Epoch 2/100
156/156 - 3s - 17ms/step - loss: 5.8571
Epoch 3/100
156/156 - 3s - 17ms/step - loss: 5.8254
Epoch 4/100
156/156 - 3s - 17ms/step - loss: 5.8008
Epoch 5/100
156/156 - 3s - 17ms/step - loss: 5.7594
Epoch 6/100
156/156 - 3s - 17ms/step - loss: 5.7068
Epoch 7/100
156/156 - 3s - 18ms/step - loss: 5.6604
Epoch 8/100
156/156 - 3s - 18ms/step - loss: 5.5944
Epoch 9/100
156/156 - 3s - 17ms/step - loss: 5.5255
Epoch 10/100
156/156 - 3s - 17ms/step - loss: 5.4325
Epoch 11/100
156/156 - 3s - 17ms/step - loss: 5.3402
Epoch 12/100
156/156 - 3s - 17ms/step - loss: 5.2551
Epoch 13/100
156/156 - 3s - 17ms/step - loss: 5.1669
Epoch 14/100
156/156 - 3s - 18ms/step - loss: 5.0692
Epoch 15/100
156/156 - 3s - 18ms/step - loss: 4.9718
Epoch 16/100
156/156 - 3s - 18ms/step - loss: 4.8809
Epoch 17/100
156/156 - 3s - 17ms/step - loss: 4.8436
Epoch 18/100
156/156 - 3s - 17ms/step - loss: 4.7310
Epoch 19/100
156/156 - 3s - 17ms/step - loss: 4.6578
Epoch 20/100
1

<keras.src.callbacks.history.History at 0x1dfac790a10>

In [140]:
def predict_next_word(model, tokenizer, text_sequence):
    token_list = tokenizer.texts_to_sequences([text_sequence])[0]
    token_list = pad_sequences([token_list], maxlen=4-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted)
    predicted_word = tokenizer.index_word[predicted_word_index]
    return predicted_word

In [144]:
seed_text = "at the dull"
next_word = predict_next_word(model, tokenizer, seed_text)
print('Next word after:',seed_text,':',next_word)


Next word after: at the dull : weather


In [152]:
seed_text = "what should he"
next_word = predict_next_word(model, tokenizer, seed_text)
print('Next word after:',seed_text,':',next_word)


Next word after: what should he : do


In [154]:
seed_text = "a long time"
next_word = predict_next_word(model, tokenizer, seed_text)
print('Next word after:',seed_text,':',next_word)


Next word after: a long time : ago


In [156]:
seed_text = "but was it"
next_word = predict_next_word(model, tokenizer, seed_text)
print('Next word after:',seed_text,':',next_word)

Next word after: but was it : possible


In [21]:
input_sequences = []
for line in text_corpus[:].split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    print(token_list,end='**')

[504, 19, 1, 310, 505]**[506, 6, 507, 29, 15, 151, 508, 1, 509, 83, 72, 20, 97, 84, 510]**[511, 73, 30, 25, 236, 11, 63, 311, 130, 4, 312, 24, 31, 512, 3, 85, 18, 8, 7, 109, 3, 7, 188, 2, 48, 74, 3, 7, 237, 2, 313, 35, 5, 110, 4, 9, 5, 314, 315, 238, 42, 111, 8, 239]**[240, 112, 3, 316, 22, 98, 5, 110, 3, 189, 513, 49, 2, 131, 3, 7]**[3, 113, 16, 132, 10, 11, 514, 241, 317, 5, 242, 33, 8, 3, 318, 16, 2, 319, 19, 1, 515, 99, 4, 56, 516, 57, 3, 320, 2, 97, 11, 517, 310, 133, 28, 8, 3, 12, 152, 134, 100]**[243, 518, 3, 85, 321, 11, 519, 520, 10, 64, 8, 244, 521, 153, 322, 9, 4, 322, 34]**[323, 86, 87, 31, 522, 135, 50, 154, 58, 323, 51, 324, 86, 19, 155, 4, 35, 325, 6, 8, 190, 1, 523, 6, 153, 524, 73, 191, 88, 525, 526, 4, 527, 528, 529, 14, 245, 156, 24, 1, 75, 33, 8, 26, 192, 152, 42, 2, 76, 530, 77, 193, 531, 14, 114]**[10, 192, 24, 78, 2, 532, 27, 3, 134, 11, 326, 327, 38, 35, 5, 533, 246, 22, 101, 38, 35, 5, 49, 328, 1, 534, 33, 8, 3, 29, 535, 5, 79, 157, 329, 131, 1, 327, 7, 4, 330,

In [25]:
input_sequences = []
for line in text_corpus[:].split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[i-1:i+3]
        input_sequences.append(n_gram_sequence)
    break

In [27]:
input_sequences

[[504, 19, 1, 310], [19, 1, 310, 505], [1, 310, 505], [310, 505]]

In [37]:
input_sequences[3]

array([310, 505,   0,   0])