<a href="https://colab.research.google.com/github/SURESHBEEKHANI/Long-Short-Term-Memory_LSTM/blob/main/Long_Short_Term_Memory_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the numpy library, which helps with mathematical operations
import numpy as np

# Import the tensorflow library, which is used for building and training machine learning models
import tensorflow as tf

# Import the Sequential class from tensorflow, which helps in building a sequence of layers for our model
from tensorflow.keras.models import Sequential

# Import the Tokenizer class from tensorflow, which helps to process and prepare text data for our model
from tensorflow.keras.preprocessing.text import Tokenizer

# Import the layers we will use in our model: Dense (fully connected layers), Embedding (for converting words into numerical data), and LSTM (a type of layer useful for understanding sequences)
from tensorflow.keras.layers import Dense, Embedding, LSTM

# Import the pad_sequences function from tensorflow, which helps to make sure all sequences (like sentences) are the same length by adding padding if needed
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Fixed a typo here


# Load the data

In [None]:
pip install PyPDF2

In [None]:
# Import the PdfReader class from the PyPDF2 library, which helps us read PDF files
from PyPDF2 import PdfReader

# Open the PDF file in read-binary mode ('rb'); the `with` block closes the
# file handle automatically once the text has been extracted
with open("/content/thebook.pdf", "rb") as file:
    # Create a PdfReader object to read the PDF file
    reader = PdfReader(file)

    # Collect the text of every page, in page order (one list entry per page).
    # Iterating over `reader.pages` directly avoids manual page-number indexing.
    # `or ""` substitutes an empty string if a page yields no text (e.g. None),
    # so every entry in `list_text` is a string the tokenizer can process.
    list_text = [page.extract_text() or "" for page in reader.pages]

# Print the list that contains the text from all pages
print(list_text)


# Tokenization and Preprocessing Data

Tokenization is the process of breaking text into smaller units (words, sentences, etc.). Preprocessing involves cleaning and normalizing text by steps such as lowercasing, removing punctuation and stop words, and stemming or lemmatizing words. Together, these steps prepare text data for analysis or machine learning.

In [None]:

# Initialize the Tokenizer object to preprocess and vectorize text data
tokenizer = Tokenizer()

# Fit the Tokenizer on the list of page texts to build the vocabulary
tokenizer.fit_on_texts(list_text)

# Vocabulary size used by the model: number of unique words + 1, because the
# Tokenizer's word indices start at 1 and index 0 is used for padding below
total_words = len(tokenizer.word_index) + 1

# Print the vocabulary size
print(total_words)

# Print the word index dictionary that maps words to their integer index
print(tokenizer.word_index)


# Build the training sequences: for every text, emit each prefix
# (n-gram) of its token list. The last token of a prefix later becomes the
# target word and the tokens before it become the input context.
input_sequence = []

for line in list_text:
    # 'texts_to_sequences' converts words to their corresponding integer indices
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Start at length 2: a length-1 prefix has a target word but no context,
    # so after pre-padding it would turn into an all-zero input row.
    # The upper bound len(token_list) + 1 keeps the full-length sequence.
    for i in range(2, len(token_list) + 1):
        input_sequence.append(token_list[:i])

# Fail with a clear message instead of an opaque "max() arg is an empty sequence"
if not input_sequence:
    raise ValueError(
        "No training sequences could be built: every text in list_text "
        "has fewer than two known tokens."
    )

# Find the maximum sequence length
max_sequence_len = max(len(seq) for seq in input_sequence)

# Pad sequences to the same length (max_sequence_len) by adding zeros at the
# front ('pre'), so the target word always sits in the last column
padded_sequences = pad_sequences(input_sequence, maxlen=max_sequence_len, padding='pre')
print(padded_sequences)

# Summary:
# This code initializes a Tokenizer, fits it on the list of page texts to build the vocabulary,
# prints the vocabulary size and the word index, expands every text into its n-gram prefixes,
# and pre-pads those prefixes with zeros into a single 2-D array of equal-length rows.

# Split the data into features (x) and targets (y)

In [None]:
# Separate every padded row into model input and expected output:
#   x_train -> all columns except the last one (the context words)
#   y_train -> the last column only (the word to be predicted)
x_train, y_train = padded_sequences[:, :-1], padded_sequences[:, -1]

# Show both arrays, each preceded by its label, to verify their content
for label, values in (("Features (x):", x_train), ("Targets (y):", y_train)):
    print(label)
    print(values)

# Convert the target data (y) to one-hot encoded format

In [None]:

# Turn each integer target into a one-hot row with `total_words` columns
# (one column per class, i.e. per vocabulary index).
# NOTE: one-hot targets go together with a categorical (non-sparse)
# cross-entropy loss when the model is compiled.
to_one_hot = tf.keras.utils.to_categorical
y_train = to_one_hot(y_train, num_classes=total_words)

# Display the encoded targets under their label to verify the result
print("One-hot encoded targets (y):", y_train, sep="\n")

# Define the LSTM (Long Short-Term Memory) Model Architecture

In [None]:
# Embedding layer: maps each integer word index to a dense 10-dimensional vector.
# The input length is max_sequence_len - 1 because the last column of every
# padded sequence was split off as the target.
embedding_layer = Embedding(
    input_dim=total_words,
    output_dim=10,
    input_length=max_sequence_len-1,
)

# LSTM layer with 100 units: reads the embedded sequence step by step
recurrent_layer = LSTM(100)

# Output layer: one unit per vocabulary index; softmax turns the scores into
# a probability distribution over the next word
output_layer = Dense(total_words, activation='softmax')

# Stack the three layers in order: Embedding -> LSTM -> Dense
model = Sequential([embedding_layer, recurrent_layer, output_layer])


In [None]:
# Configure training. The targets are one-hot encoded (tf.keras.utils.to_categorical),
# so the matching loss is 'categorical_crossentropy'; the 'sparse_' variant expects
# integer class indices and does not fit one-hot targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Train the model and keep the object returned by fit() in `history`
training_options = {
    "epochs": 50,             # number of training epochs
    "validation_split": 0.2,  # fraction of the data to be used for validation
    "verbose": 1,             # verbosity mode: 1 for progress bar
}

# x_train holds the input sequences, y_train the target labels
history = model.fit(x_train, y_train, **training_options)

In [None]:

# This function predicts the next word(s) given a starting piece of text (seed_text).
def predict_next_word(seed_text, next_words=1):
    """Extend *seed_text* with words predicted by the trained model.

    Each iteration tokenizes the current text, pads it to the model's input
    length, picks the most probable next word (greedy argmax) and appends it,
    so every prediction is fed back in as context for the next one.

    Args:
        seed_text: Starting text to continue.
        next_words: Number of words to generate and append.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        An index with no entry in the tokenizer's ``index_word`` mapping
        (e.g. the padding index 0) contributes an empty string.
    """
    # The model was trained on inputs of length max_sequence_len - 1: the last
    # column of each padded sequence was split off as the target. Padding the
    # seed to max_sequence_len would feed it a sequence one step too long.
    input_len = max_sequence_len - 1

    # Loop to predict the next word 'next_words' times
    for _ in range(next_words):
        # Convert the seed text into a sequence of word indices
        token_list = tokenizer.texts_to_sequences([seed_text])

        # Pad with leading zeros up to the model's input length
        token_list = pad_sequences(token_list, maxlen=input_len, padding='pre')

        # Use the model to predict the probabilities of the next word
        predicted_probs = model.predict(token_list, verbose=0)

        # Find the index of the word with the highest probability
        predicted_index = int(np.argmax(predicted_probs, axis=-1)[0])

        # Convert the index back into the actual word
        predicted_word = tokenizer.index_word.get(predicted_index, '')

        # Add the predicted word to the end of the seed text
        seed_text += ' ' + predicted_word

    # Return the updated seed text with the predicted words added
    return seed_text

# Example usage: continue a seed phrase with words generated by the model.
# NOTE(review): "learining" looks like a typo for "learning"; left unchanged - confirm.
seed_text = "what is machine learining  "

# How many words to append to the seed text
words_to_generate = 50

# Generate the continuation and print the result
predicted_text = predict_next_word(seed_text, next_words=words_to_generate)
print(predicted_text)