<a href="https://colab.research.google.com/github/SURESHBEEKHANI/Long-Short-Term-Memory_LSTM/blob/main/Long_Short_Term_Memory_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
# Import the numpy library, which helps with mathematical operations
import numpy as np

# Import the tensorflow library, which is used for building and training machine learning models
import tensorflow as tf

# Import the Sequential class from tensorflow, which helps in building a sequence of layers for our model
from tensorflow.keras.models import Sequential

# Import the Tokenizer class from tensorflow, which helps to process and prepare text data for our model
from tensorflow.keras.preprocessing.text import Tokenizer

# Import the layers we will use in our model: Dense (fully connected layers), Embedding (for converting words into numerical data), and LSTM (a type of layer useful for understanding sequences)
from tensorflow.keras.layers import Dense, Embedding, LSTM

# Import the pad_sequences function from tensorflow, which helps to make sure all sequences (like sentences) are the same length by adding padding if needed
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Fixed a typo here

# Load the data

In [50]:
!pip install pypdf




In [51]:
# Import the PdfReader class from the pypdf library, which helps us read PDF files
from  pypdf import PdfReader

# Read the source PDF in binary mode and collect the extracted text of every page.
# list_text ends up with one entry per page, in page order.
with open("/kaggle/input/machinelearining/UnitIIntroductiontoAIML.pdf","rb") as pdf_file:
    reader = PdfReader(pdf_file)

    # Iterate over the pages directly instead of indexing by page number
    list_text = [pdf_page.extract_text() for pdf_page in reader.pages]

In [52]:
!pip install nltk



In [53]:
import string
from nltk.corpus import stopwords

def remove_punctuation_and_stopwords(text, stop_words=None):
    """Lowercase ``text``, drop stop words, and strip punctuation.

    Each whitespace-separated token is checked against the stop-word list
    twice: once with only its leading/trailing punctuation removed (so
    contractions such as "don't" match their stop-word entry) and once after
    all punctuation has been removed.

    Args:
        text: The raw text to clean. ``None`` or an empty string yields ``''``.
        stop_words: Optional iterable of stop words. When omitted, the NLTK
            English stop-word list is used (and downloaded on first use if
            the corpus is not available locally).

    Returns:
        The remaining words, lowercased and joined by single spaces.
    """
    if stop_words is None:
        try:
            stop_words = stopwords.words('english')
        except LookupError:
            # The corpus is not bundled with nltk; fetch it once so the
            # notebook also runs on a fresh kernel / fresh machine.
            import nltk
            nltk.download('stopwords', quiet=True)
            stop_words = stopwords.words('english')

    stop_set = {word.lower() for word in stop_words}

    # Translation table that deletes every ASCII punctuation character
    translator = str.maketrans('', '', string.punctuation)

    kept_words = []
    for token in (text or '').lower().split():
        # Compare with inner apostrophes intact so "don't" is recognised
        core = token.strip(string.punctuation)
        if core in stop_set:
            continue

        # Remove the remaining punctuation and re-check, e.g. "t.h.e" -> "the"
        cleaned = core.translate(translator)
        if cleaned and cleaned not in stop_set:
            kept_words.append(cleaned)

    return ' '.join(kept_words)

In [54]:
# Keep a handle on the raw per-page text extracted from the PDF
texts = list_text

# Replace list_text with the cleaned version of every page
list_text = list(map(remove_punctuation_and_stopwords, texts))

# Tokenization and Preprocessing data

Tokenization is the process of breaking text into smaller units (words, sentences, etc.). Preprocessing involves cleaning and normalizing text by steps such as lowercasing, removing punctuation and stop words, and stemming or lemmatizing words. Together, these steps prepare text data for analysis or machine learning.

In [55]:

# Build the vocabulary from the cleaned page texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list_text)

# Vocabulary size; the +1 accounts for index 0, which the word index never assigns
total_words = len(tokenizer.word_index) + 1
print(total_words)

# Show the word -> integer index mapping
print(tokenizer.word_index)

# Turn every page into its integer sequence and collect all of its prefixes
# (length 1 up to the full page) as individual training sequences
input_sequence = []
for page_tokens in tokenizer.texts_to_sequences(list_text):
    input_sequence.extend(
        page_tokens[:end] for end in range(1, len(page_tokens) + 1)
    )

# Length of the longest prefix sequence
max_sequence_len = max(map(len, input_sequence))

# Left-pad every sequence so they all have length max_sequence_len
padded_sequences = pad_sequences(input_sequence, maxlen=max_sequence_len, padding='pre')
print(padded_sequences)

# Summary:
# This cell fits a Tokenizer on the cleaned texts, prints the vocabulary size and word index,
# expands each page into prefix (n-gram) sequences, and pads them to a common length.

2526
{'engineering': 1, 'learning': 2, 'mechanical': 3, 'research': 4, 'vishal': 5, 'guru': 6, 'gobind': 7, 'singh': 8, 'j': 9, 'dhore': 10, 'assistant': 11, 'professor': 12, 'college': 13, 'centre': 14, 'nash': 15, 'intelligence': 16, 'data': 17, 'knowledge': 18, 'artificial': 19, 'ai': 20, 'problem': 21, 'human': 22, 'ik': 23, 'representation': 24, 'machine': 25, 'algorithms': 26, 'problems': 27, 'based': 28, 'science': 29, 'solving': 30, 'information': 31, 'approaches': 32, 'reinforcement': 33, '\uf076the': 34, 'process': 35, 'using': 36, 'systems': 37, 'planning': 38, 'world': 39, 'behavior': 40, 'supervised': 41, 'ofartificial': 42, 'reasoning': 43, 'symbolic': 44, 'tolearn': 45, 'simulation': 46, 'unsupervised': 47, 'also': 48, 'networks': 49, 'brain': 50, 'without': 51, 'themachine': 52, 'history': 53, 'need': 54, 'oflearning': 55, 'performance': 56, 'possible': 57, 'model': 58, 'given': 59, 'ml': 60, 'way': 61, 'language': 62, 'researchers': 63, 'processes': 64, 'used': 65, 'ap

# Split the data into features (x) and targets (y)

In [56]:
# Every padded sequence is split into its context and the word to predict:
# x_train holds all columns but the last, y_train holds the last column
x_train, y_train = padded_sequences[:, :-1], padded_sequences[:, -1]

# Show the features to verify their content
print("Features (x):")
print(x_train)

# Show the targets to verify their content
print("Targets (y):")
print(y_train)

Features (x):
[[   0    0    0 ...    0    0    0]
 [   0    0    0 ...    0    0  241]
 [   0    0    0 ...    0  241  549]
 ...
 [   0    0    0 ...   14   15 2522]
 [   0    0    0 ...   15 2522 2523]
 [   0    0    0 ... 2522 2523 2524]]
Targets (y):
[ 241  549  550 ... 2523 2524 2525]


# Convert the target data (y) to one-hot encoded format

In [57]:

# Convert the target word indices to one-hot encoded vectors.
# The targets are taken from the last column of padded_sequences rather than from
# y_train itself, so re-running this cell does not one-hot encode an already
# encoded array.
# 'num_classes' is the vocabulary size (total_words).
y_train = tf.keras.utils.to_categorical(padded_sequences[:, -1], num_classes=total_words)

# Print the one-hot encoded target data to verify the result
print("One-hot encoded targets (y):")
print(y_train)

One-hot encoded targets (y):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


# Define the LSTM (Long Short-Term Memory) network architecture

In [58]:
import numpy as np

# np.squeeze only drops axes of length 1; it does not flatten the array
y_train = np.squeeze(y_train)

# Confirm the resulting shape of the targets
print(np.shape(y_train))

(5334, 2526)


In [59]:
model = Sequential([
    # Explicit input spec: each sample is a sequence of max_sequence_len - 1 word indices
    # (the padded sequences minus the target column). This replaces the deprecated
    # `input_length` argument of Embedding.
    tf.keras.Input(shape=(max_sequence_len - 1,)),

    # Embedding layer: maps each word index to a dense 10-dimensional vector
    Embedding(total_words, 10),

    # LSTM layer with 100 units: processes the embedded sequence
    LSTM(100),

    # Dense output layer: one unit per vocabulary entry, softmax to produce
    # a probability distribution over the next word
    Dense(total_words, activation='softmax'),
])

In [60]:
# Multi-class setup: categorical cross-entropy against the one-hot targets,
# optimised with Adam and tracked with accuracy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [61]:
# Fit the network on the prefix sequences: 50 passes over the data,
# mini-batches of 32 samples, with the per-epoch progress bar enabled
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    verbose=1,
)

Epoch 1/50
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0156 - loss: 7.4823
Epoch 2/50
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.0356 - loss: 6.5479
Epoch 3/50
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.0595 - loss: 6.3016
Epoch 4/50
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.0834 - loss: 6.0449
Epoch 5/50
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.1337 - loss: 5.7343
Epoch 6/50
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.1873 - loss: 5.4115
Epoch 7/50
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.2201 - loss: 5.0964
Epoch 8/50
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.2489 - loss: 4.8282
Epoch 9/50
[1m167/167[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7e99c745a410>

In [64]:

# This function predicts the next word(s) given a starting piece of text (seed_text).
def predict_next_word(seed_text, next_words=1):
    """Extend ``seed_text`` with greedily predicted words.

    On every step the current text is tokenized, padded to the input length
    the model was trained on, and the highest-probability word is appended.

    Args:
        seed_text: Starting text for the generation.
        next_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the predicted words, separated by spaces.
        Generation stops early if the model predicts an index that has no
        word in the tokenizer's vocabulary (e.g. the padding index 0).
    """
    # The model was trained on inputs of length max_sequence_len - 1
    # (the padded sequences minus the target column), so pad to that length.
    input_len = max_sequence_len - 1

    for _ in range(next_words):
        # Convert the current text into a sequence of word indices
        token_list = tokenizer.texts_to_sequences([seed_text])

        # Left-pad (or truncate from the front) to the model's input length
        token_list = pad_sequences(token_list, maxlen=input_len, padding='pre')

        # Probability distribution over the vocabulary for the next word
        predicted_probs = model.predict(token_list, verbose=0)

        # Greedy choice: index of the most probable word
        predicted_index = int(np.argmax(predicted_probs, axis=-1)[0])

        # Map the index back to its word
        predicted_word = tokenizer.index_word.get(predicted_index, '')
        if not predicted_word:
            # No word for this index; appending blanks would not change the
            # next prediction, so stop instead of padding the text with spaces.
            break

        seed_text += ' ' + predicted_word

    return seed_text

# Demo: start from a short seed phrase
seed_text = "Artificial Intelligence (AI)?"
# Generate the next 50 words after the seed
predicted_text = predict_next_word(seed_text, 50)
# Show the generated text
print(predicted_text)

Artificial Intelligence (AI)? vishal j dhore assistant professor mechanical engineering guru gobind singh college engineering research centre nash ik learning isthefield learning isbased onthumb level itprovides theinformation readable inthis rule itmakes aprogrammer aprogrammer writes aprogram togive instructions toperform atask tothecomputer itislearned ie programmed thesystem willbeable todonew things also canbeseveral sources fortaking advice ashumansexperts
