<a href="https://colab.research.google.com/github/SURESHBEEKHANI/Long-Short-Term-Memory_LSTM/blob/main/Long_Short_Term_Memory_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [120]:
# Import the numpy library, which helps with mathematical operations
import numpy as np

# Import the tensorflow library, which is used for building and training machine learning models
import tensorflow as tf

import pandas as pd

# Import the Sequential class from tensorflow, which helps in building a sequence of layers for our model
from tensorflow.keras.models import Sequential

# Import the Tokenizer class from tensorflow, which helps to process and prepare text data for our model
from tensorflow.keras.preprocessing.text import Tokenizer

# Import the layers we will use in our model: Dense (fully connected layers), Embedding (for converting words into numerical data), and LSTM (a type of layer useful for understanding sequences)
from tensorflow.keras.layers import Dense, Embedding, LSTM

# Import the pad_sequences function from tensorflow, which helps to make sure all sequences (like sentences) are the same length by adding padding if needed
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Fixed a typo here

#  Generate a Large Text Dataset

In [121]:
# Define templates for generating text
templates = [
    "machine learning is used for {task}",
    "data science involves {technique} and {technique}",
    "deep learning models are trained using {method}",
    "natural language processing helps in {application}",
    "reinforcement learning trains agents using {reward}",
    "supervised learning requires {data_type} data",
    "unsupervised learning finds {pattern} in data",
    "overfitting happens when the model learns {noise}",
    "cross-validation helps to evaluate {model}",
    "feature engineering is important for {task}"
]

# Define placeholders for generating sentences
tasks = ["classification", "regression", "clustering", "prediction"]
techniques = ["data cleaning", "feature selection", "model tuning", "data visualization"]
methods = ["gradient descent", "backpropagation", "stochastic gradient descent"]
applications = ["text classification", "sentiment analysis", "language translation"]
rewards = ["positive feedback", "negative feedback", "rewards"]
data_types = ["labeled", "unlabeled"]
patterns = ["clusters", "structures", "anomalies"]
noise = ["irrelevant patterns", "outliers"]
models = ["neural networks", "decision trees", "support vector machines"]

# Generate sentences
def generate_sentences(n):
    sentences = []
    for _ in range(n):
        template = np.random.choice(templates)
        sentence = template.format(
            task=np.random.choice(tasks),
            technique=np.random.choice(techniques),
            method=np.random.choice(methods),
            application=np.random.choice(applications),
            reward=np.random.choice(rewards),
            data_type=np.random.choice(data_types),
            pattern=np.random.choice(patterns),
            noise=np.random.choice(noise),
            model=np.random.choice(models)
        )
        sentences.append(sentence)
    return sentences

list_text = generate_sentences(1000)



In [122]:
print(list_text)

['supervised learning requires labeled data', 'unsupervised learning finds anomalies in data', 'feature engineering is important for classification', 'feature engineering is important for classification', 'machine learning is used for clustering', 'reinforcement learning trains agents using negative feedback', 'data science involves data cleaning and data cleaning', 'cross-validation helps to evaluate support vector machines', 'deep learning models are trained using stochastic gradient descent', 'deep learning models are trained using backpropagation', 'feature engineering is important for clustering', 'unsupervised learning finds structures in data', 'natural language processing helps in text classification', 'feature engineering is important for prediction', 'cross-validation helps to evaluate decision trees', 'feature engineering is important for prediction', 'reinforcement learning trains agents using negative feedback', 'feature engineering is important for prediction', 'natural l

# Tokenization and Preprocessing data

Tokenization is the process of breaking text into smaller units (words, sentences, etc.). Preprocessing involves cleaning and normalizing text by steps such as lowercasing, removing punctuation and stop words, and stemming or lemmatizing words. Together, these steps prepare text data for analysis or machine learning.

In [123]:

# Initialize the Tokenizer object to preprocess and vectorize text data
tokenizer = Tokenizer()

# Fit the Tokenizer on the list of sentences (or texts) to build the vocabulary
tokenizer.fit_on_texts(list_text)

# Get the total number of unique words in the vocabulary, and assign and index value
total_words = len(tokenizer.word_index) + 1

# Print the total number of unique words in the vocabulary
print(total_words)

# Print the word index dictionary that maps words to their integer index
print(tokenizer.word_index)


# Initialize an empty list to hold the input sequences
input_sequence = []

# Iterate through each sentence in the list of sentences
for line in list_text:
    # Convert each sentence into a sequence of integers using the tokenizer
    # The 'texts_to_sequences' method converts words to their corresponding integer indices
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list) + 1):  # Use len(token_list) + 1 to include the last element
        n_gram_sequence = token_list[:i]
        input_sequence.append(n_gram_sequence)

# Find the maximum sequence length
max_sequence_len = max(len(seq) for seq in input_sequence)

# Pad sequences to
padded_sequences = pad_sequences(input_sequence, maxlen=max_sequence_len, padding='pre')
print(padded_sequences)

# Summary:
# This code initializes a Tokenizer, fits it on a list of sentences to build the vocabulary,
# and then prints the total number of unique words and their corresponding indices in the word index.

75
{'learning': 1, 'data': 2, 'helps': 3, 'is': 4, 'for': 5, 'in': 6, 'using': 7, 'feature': 8, 'model': 9, 'language': 10, 'cross': 11, 'validation': 12, 'to': 13, 'evaluate': 14, 'supervised': 15, 'requires': 16, 'science': 17, 'involves': 18, 'and': 19, 'machine': 20, 'used': 21, 'engineering': 22, 'important': 23, 'deep': 24, 'models': 25, 'are': 26, 'trained': 27, 'unsupervised': 28, 'finds': 29, 'natural': 30, 'processing': 31, 'overfitting': 32, 'happens': 33, 'when': 34, 'the': 35, 'learns': 36, 'classification': 37, 'reinforcement': 38, 'trains': 39, 'agents': 40, 'gradient': 41, 'descent': 42, 'selection': 43, 'tuning': 44, 'labeled': 45, 'visualization': 46, 'feedback': 47, 'outliers': 48, 'clustering': 49, 'unlabeled': 50, 'regression': 51, 'structures': 52, 'prediction': 53, 'irrelevant': 54, 'patterns': 55, 'support': 56, 'vector': 57, 'machines': 58, 'decision': 59, 'trees': 60, 'neural': 61, 'networks': 62, 'cleaning': 63, 'stochastic': 64, 'text': 65, 'translation': 66

#Split the data into features (x) and targets (y)

In [124]:
# Split the data into features (x) and targets (y)
# Features (x) include all columns except the last one
x_train = padded_sequences[:, :-1]

# Targets (y) include only the last column
y_train = padded_sequences[:, -1]

# Print the features to verify their content
print("Features (x):")
print(x_train)

# Print the targets to verify their content
print("Targets (y):")
print(y_train)

Features (x):
[[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0 15]
 [ 0  0  0 ...  0 15  1]
 ...
 [ 0  0  0 ... 12  3 13]
 [ 0  0  0 ...  3 13 14]
 [ 0  0 11 ... 13 14 61]]
Targets (y):
[15  1 16 ... 14 61 62]


# Convert the target data (y) to one-hot encoded format

In [125]:

# Convert the target data (y) to one-hot encoded format
# 'num_classes' specifies the total number of classes (e.g., total_words)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=total_words)

# Print the one-hot encoded target data to verify the result
print("One-hot encoded targets (y):")
print(y_train)

One-hot encoded targets (y):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Define Architecture RNN (Recurrent Neural Network)

In [126]:
import numpy as np

# Assuming y_train is 2D, reshape it to 1D if needed
y_train = np.squeeze(y_train)

# Now check the shape
print(y_train.shape)

(6750, 75)


In [127]:
model = Sequential([
    # Embedding layer: converts input sequences of integers into dense vectors of fixed size
    Embedding(total_words, 10,           # Dimension of the dense embedding vectors
              input_length=max_sequence_len-1),  # Length of input sequences

    # Simple RNN layer: processes the sequence data and retains the temporal information
    LSTM(100),  # Number of units in the RNN layer

    # Dense layer: outputs the probability distribution over the vocabulary
    Dense(total_words,            # Size of the output layer (same as vocabulary size)
          activation='softmax')   # Softmax activation to produce probability distribution
])

In [128]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # Use this for multi-class classification
    metrics=['accuracy']
)

In [129]:
# Train the model with a specified batch size
model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=32,  # Specify the batch size here
    verbose=1  # Progress bar
)

Epoch 1/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0990 - loss: 3.9826
Epoch 2/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3416 - loss: 2.4759
Epoch 3/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5838 - loss: 1.4378
Epoch 4/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6762 - loss: 0.9895
Epoch 5/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7467 - loss: 0.7701
Epoch 6/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7556 - loss: 0.6730
Epoch 7/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7673 - loss: 0.6006
Epoch 8/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7770 - loss: 0.5673
Epoch 9/50
[1m211/211[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7e9a9eff9210>

In [132]:

# This function predicts the next word(s) given a starting piece of text (seed_text).
def predict_next_word(seed_text, next_words=1):
    # Loop to predict the next word 'next_words' times
    for _ in range(next_words):
        # Convert the seed text into a format that the model can understand (a sequence of numbers)
        token_list = tokenizer.texts_to_sequences([seed_text])

        # Ensure the sequence is the right length for the model by padding it with zeros if needed
        token_list = pad_sequences(token_list, maxlen=max_sequence_len, padding='pre')

        # Use the model to predict the probabilities of the next word
        predicted_probs = model.predict(token_list, verbose=0)
        # Find the index of the word with the highest probability
        predicted_index = np.argmax(predicted_probs, axis=-1)[0]

        # Convert the index back into the actual word
        predicted_word = tokenizer.index_word.get(predicted_index, '')

        # Add the predicted word to the end of the seed text
        seed_text += ' ' + predicted_word

    # Return the updated seed text with the predicted words added
    return seed_text

# Example usage of the function:
seed_text = "machine learning is used for?"
# Predict the next 4 words based on the seed text
predicted_text = predict_next_word(seed_text, next_words=10)
# Print the result
print(predicted_text)

machine learning is used for? clustering classification feature gradient descent descent descent descent patterns feature
