This cell imports the necessary Python libraries for PDF handling, Natural Language Processing, and building deep learning models.

In [1]:
# Import libraries
import fitz  # PyMuPDF library for PDF handling
import spacy # spaCy library for NLP
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
import numpy as np

This cell defines a function to extract text content from a given PDF file.

In [2]:
# Define a function to extract text from a PDF file
# It takes the path to the PDF file as input
def extract_text_from_pdf(pdf_path):
    # Open the PDF document
    doc = fitz.open(pdf_path)
    text = ""
    # Iterate through each page and extract text
    for page in doc:
        text += page.get_text()
    return text

This cell defines a function to preprocess the extracted text by converting it to lowercase and removing punctuation.

In [3]:
# Define a function to preprocess the text for model training
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation (you might want a more sophisticated approach)
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

This cell defines a function to tokenize the text and create sequences suitable for training a language model. It also handles padding and separating input and output for training.

In [4]:
# Define a function to tokenize the text and create sequences for training
def create_sequences(text, sequence_length):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for line in text.split('\n'):
        encoded_sequence = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(encoded_sequence)):
            n_gram_sequence = encoded_sequence[:i+1]
            input_sequences.append(n_gram_sequence)

    # Pad sequences for uniform length
    max_sequence_length = max([len(seq) for seq in input_sequences])
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        input_sequences, maxlen=max_sequence_length, padding='pre'
    )

    # Separate input and output
    xs, labels = padded_sequences[:, :-1], padded_sequences[:, -1]
    ys = to_categorical(labels, num_classes=total_words)

    return xs, ys, tokenizer, max_sequence_length, total_words

This cell defines the architecture of the LSTM (Long Short-Term Memory) model used for next word prediction.

In [5]:
# Define a function to build the LSTM model for next word prediction
def build_model(vocab_size, max_sequence_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_sequence_length - 1))
    model.add(LSTM(150))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

This cell defines a function to predict the next word in a given text sequence using the trained LSTM model.

In [6]:
# Define a function to predict the next word using the trained model
def predict_next_word(model, tokenizer, max_sequence_length, text_sequence):
    # Preprocess and tokenize the input sequence
    preprocessed_sequence = preprocess_text(text_sequence)
    encoded_sequence = tokenizer.texts_to_sequences([preprocessed_sequence])[0]
    # Pad the sequence
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        [encoded_sequence], maxlen=max_sequence_length - 1, padding='pre'
    )
    # Get model prediction
    predicted_probabilities = model.predict(padded_sequence)
    # Get the index of the word with the highest probability
    predicted_word_index = np.argmax(predicted_probabilities, axis=-1)
    # Map the index back to a word
    predicted_word = tokenizer.index_word[predicted_word_index[0]]
    return predicted_word

This is the main execution block. It specifies the PDF file path, calls the functions to extract and preprocess text, creates training sequences, builds the model, and includes commented-out code for training and prediction.

In [None]:
# Main execution block

# Specify the path to the PDF file
pdf_file_path = "/content/john-stuart-mill_system-of-logic.pdf"

# Extract text from the PDF file
print(f"Extracting text from {pdf_file_path}...")
extracted_text = extract_text_from_pdf(pdf_file_path)
print("Text extraction complete.")

# Preprocess the extracted text
print("Preprocessing text...")
preprocessed_text = preprocess_text(extracted_text)
print("Text preprocessing complete.")

# Define the sequence length for training
sequence_length = 5  # You can adjust this based on your data and needs

# Create sequences for training
print("Creating sequences for model training...")
xs, ys, tokenizer, max_sequence_length, total_words = create_sequences(preprocessed_text, sequence_length)
print(f"Sequences created. Vocabulary size: {total_words}, Max sequence length: {max_sequence_length}")

# Build the model
print("Building the LSTM model...")
model = build_model(total_words, max_sequence_length)
print("Model built.")
model.summary() # Print a summary of the model architecture

# Train the model (you'll likely need more data and epochs for good results)
print("Starting model training...")
# Uncomment the line below to train the model
model.fit(xs, ys, epochs=100, verbose=1)
print("Model training complete (if uncommented and run).")

# --- Example of how to use the prediction function (after training) ---
# Note: You need to train the model first by uncommenting model.fit
print("\nExample Prediction (requires trained model):")
input_text = "this is an example"
if 'model' in locals() and len(xs) > 0: # Check if model is built and data exists
    predicted_word = predict_next_word(model, tokenizer, max_sequence_length, input_text)
    print(f"The next predicted word after '{input_text}' is: {predicted_word}")
else:
     print("Model is not trained or data not available for prediction.")

Extracting text from /content/john-stuart-mill_system-of-logic.pdf...
Text extraction complete.
Preprocessing text...
Text preprocessing complete.
Creating sequences for model training...
Sequences created. Vocabulary size: 13802, Max sequence length: 29
Building the LSTM model...
Model built.




Starting model training...
