<a href="https://colab.research.google.com/github/Sreejith-nair511/Summer_course_Ai/blob/main/LSTM_using_Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re
import random

# 1. Prepare the dataset
# For simplicity, let's create a dummy dataset.
# In a real-world scenario, you would load your dataset (e.g., IMDB reviews).
data = [
    ("This is a great movie!", 1),
    ("I really enjoyed this film.", 1),
    ("The acting was superb.", 1),
    ("What a terrible experience.", 0),
    ("I hated every minute of it.", 0),
    ("This movie was awful.", 0),
    ("It was okay, nothing special.", 0),
    ("Definitely recommend it.", 1),
    ("Waste of my time.", 0),
    ("So happy I watched this.", 1),
]

texts = [item[0] for item in data]
labels = [item[1] for item in data]

# Preprocessing
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, np.array(labels), test_size=0.2, random_state=42)

# 2. Build the LSTM Model
vocab_size = 5000
embedding_dim = 100
lstm_units = 150

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(lstm_units),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(X_train, y_train, epochs=20, batch_size=8, validation_data=(X_test, y_test))

# 3. Sentiment Classification (Downstream Task)
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Now, let's demonstrate text generation *using sampling* which could hypothetically be
# used for other tasks (e.g., generating synthetic text data, although not for sentiment classification directly
# with this model unless the generation is sentiment-conditioned).

# Get the trained model to generate text. For a classification model,
# this isn't its primary purpose, but we can use its internal state (if it were designed for it)
# or adapt it slightly for generation. A standard classification LSTM won't
# have an output layer designed for generating the next token in a sequence.
# A typical generative LSTM has a Dense layer with a softmax activation over the vocab size.

# To demonstrate sampling methods for text generation, we need a model that outputs probabilities
# over the vocabulary for the next token. The current model is a binary classifier.
# Let's simulate a generative process using the trained model's internal representations or
# if we were to adapt the model slightly for generation.
# **NOTE:** This simulation is for demonstration purposes of sampling *methods*.
# The generated text will not be meaningful for sentiment classification as the model
# was trained for binary classification, not text generation.

# To truly show text generation and sampling, we'd ideally train a separate language model.
# However, sticking to the prompt's structure (using the *trained model* for a downstream task),
# let's assume we want to generate text *related* to the input data (though again, the classifier output isn't ideal for this).

# Let's create a simple function that *simulates* generating the next token's probability distribution
# based on the final state of the LSTM when fed a sequence. This is a hacky way to demonstrate sampling
# and not how a real generative model works.

# A proper text generation setup would have:
# 1. Train a language model (LSTM with softmax output over vocab)
# 2. Implement sampling strategies to pick the next token based on the probability distribution.

# Let's build a simple generative layer on top for demonstration.
# This requires changing the model architecture slightly, which deviates from using the *trained* classifier directly.
# The prompt is slightly ambiguous here. Let's interpret it as: train an LSTM (could be a language model),
# use sampling methods for generation, and *then* use the *generated text* for sentiment classification.
# This requires a separate sentiment classifier.

# Let's train a simple language model first, then use sampling, then classify the generated text.

# --- Revised Approach: Train Language Model -> Generate Text with Sampling -> Classify Generated Text ---

# We need more text data to train a decent language model.
# Let's use the same small dataset but treat it as a sequence for language modeling.
# This will result in a very poor language model, but it serves to demonstrate the process.

all_text = " ".join(texts)
sequences = []
# Create sequences of tokens
for i in range(1, len(all_text.split())):
    sequence = all_text.split()[:i+1]
    sequences.append(sequence)

# Tokenize
tokenizer_lm = Tokenizer()
tokenizer_lm.fit_on_texts(sequences)
sequences_lm = tokenizer_lm.texts_to_sequences(sequences)

# Prepare input and output for language model
X_lm, y_lm = [], []
for seq in sequences_lm:
    X_lm.append(seq[:-1])
    y_lm.append(seq[-1])

X_lm = pad_sequences(X_lm, maxlen=max_len, padding='pre') # Pad at the beginning for LM
y_lm = tf.keras.utils.to_categorical(y_lm, num_classes=len(tokenizer_lm.word_index) + 1)

# Build Language Model
vocab_size_lm = len(tokenizer_lm.word_index) + 1
model_lm = Sequential([
    Embedding(vocab_size_lm, embedding_dim, input_length=max_len),
    LSTM(lstm_units, return_sequences=False), # We predict the next word
    Dense(vocab_size_lm, activation='softmax')
])

model_lm.compile(loss='categorical_crossentropy', optimizer='adam')
model_lm.summary()

# Train Language Model
# With this small dataset, training will be quick but the model will be weak.
model_lm.fit(X_lm, y_lm, epochs=50, verbose=0) # Train for more epochs on tiny data

# 4. Text Generation with Sampling Methods
def generate_text(model, tokenizer, seed_text, max_sequence_length, num_generate, sampling_method, temperature=1.0, top_p=0.9):
    generated_text = seed_text
    for _ in range(num_generate):
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        # Pad the sequence to the expected input length
        token_list = pad_sequences([token_list], maxlen=max_sequence_length, padding='pre')

        # Get the next word probability distribution
        predicted_probs = model.predict(token_list, verbose=0)[0]

        # Apply temperature
        predicted_probs = np.log(predicted_probs + 1e-10) / temperature
        exp_preds = np.exp(predicted_probs)
        predicted_probs = exp_preds / np.sum(exp_preds) # Softmax after temperature

        next_token_index = -1

        if sampling_method == 'random':
            next_token_index = np.random.choice(len(predicted_probs), p=predicted_probs)
        elif sampling_method == 'nucleus':
            # Sort probabilities in descending order
            sorted_preds = np.sort(predicted_probs)[::-1]
            sorted_indices = np.argsort(predicted_probs)[::-1]

            # Calculate cumulative probabilities
            cumulative_probs = np.cumsum(sorted_preds)

            # Find the smallest set of tokens whose cumulative probability exceeds top_p
            nucleus = sorted_indices[cumulative_probs < top_p]
            # Add the next token to ensure nucleus is not empty (edge case)
            if len(nucleus) == 0:
                nucleus = sorted_indices[:1] # Take the most probable token if nucleus is empty
            elif cumulative_probs[len(nucleus)-1] < top_p: # Add the last token if needed to cross top_p
                 nucleus = sorted_indices[:len(nucleus)+1]


            # Filter probabilities to the nucleus
            nucleus_probs = predicted_probs[nucleus]
            nucleus_probs = nucleus_probs / np.sum(nucleus_probs) # Re-normalize

            # Sample from the nucleus
            next_token_index = np.random.choice(nucleus, p=nucleus_probs)

        elif sampling_method == 'beam':
             # Beam search is more complex and usually generates multiple sequences in parallel,
             # keeping the top 'k' most probable sequences at each step.
             # Implementing full beam search here would make the function much more complex.
             # A simplified "beam-like" approach might just pick the top-k, but that's not true beam search.
             # Let's stick to random and nucleus for a single-token generation demonstration.
             # If you need full beam search, it requires tracking multiple candidate sequences.
             # For a single token generation, it reduces to just picking the top 1 if beam width is 1,
             # or considering top k if width is k (but still picking one based on some criteria).
             # Let's simulate by just picking the single most probable word for simplicity
             # to represent a deterministic "beam" of width 1. This isn't true beam search.
             next_token_index = np.argmax(predicted_probs)
        else:
            raise ValueError("Invalid sampling method")

        # Find the word from the index
        for word, index in tokenizer.word_index.items():
            if index == next_token_index:
                next_word = word
                break
        generated_text += " " + next_word

    return generated_text

# Example usage of sampling methods
seed_text = "This is"
num_generate_words = 10
max_seq_len_lm = max_len # Use the same max length for padding

print("--- Text Generation ---")
print(f"Seed: '{seed_text}'")

# Random Sampling
generated_random = generate_text(model_lm, tokenizer_lm, seed_text, max_seq_len_lm, num_generate_words, 'random', temperature=0.8)
print(f"Random Sampling (T=0.8): {generated_random}")

# Nucleus Sampling (Top-p)
generated_nucleus = generate_text(model_lm, tokenizer_lm, seed_text, max_seq_len_lm, num_generate_words, 'nucleus', top_p=0.9)
print(f"Nucleus Sampling (p=0.9): {generated_nucleus}")

# Beam "Sampling" (simplified - essentially greedy for a single token)
# Note: This is NOT proper beam search.
generated_beam = generate_text(model_lm, tokenizer_lm, seed_text, max_seq_len_lm, num_generate_words, 'beam')
print(f"Beam (Simplified Greedy): {generated_beam}")


# 5. Sentiment Classification of Generated Text
# To classify the generated text, we need to use the *original* sentiment classification model.
# This assumes the generated text is relevant to the sentiment task.
# The generated text from our simple LM trained on sentence fragments won't be meaningful for sentiment.
# However, to fulfill the prompt, we will classify the generated strings as if they were reviews.

# Re-use the original tokenizer and classification model
tokenizer_clf = Tokenizer(num_words=5000) # Recreate/load the original tokenizer
tokenizer_clf.fit_on_texts(texts) # Fit on the original training data

def classify_sentiment(text, model, tokenizer, max_sequence_length):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')
    prediction = model.predict(padded_sequence, verbose=0)[0][0]
    sentiment = 'Positive' if prediction > 0.4 else 'Negative'
    confidence = prediction if sentiment == 'Positive' else 1 - prediction
    return sentiment, confidence

print("\n--- Sentiment Classification of Generated Text ---")

sentiment_random, confidence_random = classify_sentiment(generated_random, model, tokenizer_clf, max_len)
print(f"Generated (Random): '{generated_random}' -> Sentiment: {sentiment_random} (Confidence: {confidence_random:.2f})")

sentiment_nucleus, confidence_nucleus = classify_sentiment(generated_nucleus, model, tokenizer_clf, max_len)
print(f"Generated (Nucleus): '{generated_nucleus}' -> Sentiment: {sentiment_nucleus} (Confidence: {confidence_nucleus:.2f})")

sentiment_beam, confidence_beam = classify_sentiment(generated_beam, model, tokenizer_clf, max_len)
print(f"Generated (Beam-like): '{generated_beam}' -> Sentiment: {sentiment_beam} (Confidence: {confidence_beam:.2f})")

# Note: The sentiment classification results for the generated text will likely be random or consistently biased
# because the generated text itself is not coherent or sentiment-rich due to the tiny training data and simple LM.
# This part demonstrates the *process* of using a separate model to classify generated text, not that the results are meaningful here.





Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - accuracy: 0.7500 - loss: 0.6920 - val_accuracy: 0.0000e+00 - val_loss: 0.6948
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 0.8750 - loss: 0.6896 - val_accuracy: 0.5000 - val_loss: 0.6948
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.7500 - loss: 0.6870 - val_accuracy: 0.5000 - val_loss: 0.6948
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step - accuracy: 0.7500 - loss: 0.6871 - val_accuracy: 0.5000 - val_loss: 0.6947
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - accuracy: 0.7500 - loss: 0.6805 - val_accuracy: 0.5000 - val_loss: 0.6946
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 1.0000 - loss: 0.6821 - val_accuracy: 0.5000 - val_loss: 0.6944
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━

--- Text Generation ---
Seed: 'This is'
Random Sampling (T=0.8): This is a great movie! i really enjoyed this the was the
Nucleus Sampling (p=0.9): This is is is great i really enjoyed this enjoyed the movie
Beam (Simplified Greedy): This is a great movie! i really enjoyed this film. film. film.

--- Sentiment Classification of Generated Text ---
Generated (Random): 'This is a great movie! i really enjoyed this the was the' -> Sentiment: Positive (Confidence: 0.55)
Generated (Nucleus): 'This is is is great i really enjoyed this enjoyed the movie' -> Sentiment: Positive (Confidence: 0.54)
Generated (Beam-like): 'This is a great movie! i really enjoyed this film. film. film.' -> Sentiment: Positive (Confidence: 0.52)
