# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, GRU, LSTM, SimpleRNN, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
import nltk
from nltk.tokenize import word_tokenize
import os
import time
import random
import tokenizers
from tokenizers import ByteLevelBPETokenizer
from keras import backend as K
import heapq
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's and TensorFlow's global random generators so repeated runs
# draw the same random numbers. Python's built-in `random` module (imported
# above) is not seeded here.
np.random.seed(42)
tf.random.set_seed(42)

class ShakespeareTextGeneration:
    def __init__(self, file_path="shakespeare.txt", seq_length=40, batch_size=64):
        """
        Initialize the text generation class
        
        Args:
            file_path: Path to the Shakespeare dataset
            seq_length: Length of input sequences
            batch_size: Batch size for training
        """
        self.file_path = file_path
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.raw_text = None
        self.tokenizer = None
        self.total_words = None
        self.input_sequences = None
        self.X = None
        self.y = None
        self.model = None
    
    def load_data(self):
        """Load Shakespeare text data"""
        try:
            with open(self.file_path, 'r', encoding='utf-8') as file:
                self.raw_text = file.read()
                print(f"Data loaded successfully. Total characters: {len(self.raw_text)}")
                print(f"First 500 characters: {self.raw_text[:500]}...")
                return True
        except FileNotFoundError:
            print(f"File not found: {self.file_path}")
            # Download Shakespeare data if not available
            import requests
            print("Attempting to download Shakespeare text...")
            url = "https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt"
            response = requests.get(url)
            if response.status_code == 200:
                with open('shakespeare.txt', 'wb') as f:
                    f.write(response.content)
                print("Shakespeare text downloaded successfully.")
                self.file_path = 'shakespeare.txt'
                with open(self.file_path, 'r', encoding='utf-8') as file:
                    self.raw_text = file.read()
                return True
            else:
                print("Failed to download Shakespeare text.")
                return False
    
    def preprocess_text(self):
        """Basic text preprocessing"""
        # Remove special characters and numbers
        self.raw_text = re.sub(r'[^\w\s]', ' ', self.raw_text)
        self.raw_text = re.sub(r'\d+', ' ', self.raw_text)
        # Convert to lowercase
        self.raw_text = self.raw_text.lower()
        # Remove extra whitespaces
        self.raw_text = re.sub(r'\s+', ' ', self.raw_text).strip()
        print("Basic preprocessing complete.")
        print(f"Preprocessed sample: {self.raw_text[:500]}...")
        return self.raw_text
    
    def tokenize_wordpiece(self, vocab_size=10000):
        """Tokenize ``self.raw_text`` with a byte-level BPE tokenizer.

        Despite the method name, the tokenizer trained here is
        ``ByteLevelBPETokenizer`` (BPE), not WordPiece. The trained model is
        saved into the ``shakespeare_tokenizer`` directory. Results are stored
        on ``self.bpe_tokenizer``, ``self.bpe_tokens`` and
        ``self.bpe_vocab_size``.

        Args:
            vocab_size: Vocabulary size passed to the tokenizer trainer.

        Returns:
            The token ids of the whole corpus (same object as
            ``self.bpe_tokens``).
        """
        import tempfile

        # Initialize BPE tokenizer
        self.bpe_tokenizer = ByteLevelBPETokenizer()

        # The trainer is given file paths, so write the text to a uniquely
        # named temporary file. delete=False lets the file be closed before
        # training reads it.
        corpus_file = tempfile.NamedTemporaryFile(
            "w", encoding="utf-8", suffix=".txt", delete=False
        )
        try:
            with corpus_file:
                corpus_file.write(self.raw_text)

            # Train the tokenizer
            self.bpe_tokenizer.train(
                files=[corpus_file.name],
                vocab_size=vocab_size,
                min_frequency=2,
                special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
            )
        finally:
            # Clean up even when writing or training raises.
            os.remove(corpus_file.name)

        # Save the tokenizer
        os.makedirs("shakespeare_tokenizer", exist_ok=True)
        self.bpe_tokenizer.save_model("shakespeare_tokenizer")

        # Encode the text
        encoded = self.bpe_tokenizer.encode(self.raw_text)
        self.bpe_tokens = encoded.ids
        self.bpe_vocab_size = self.bpe_tokenizer.get_vocab_size()

        print(f"BPE Tokenization complete. Vocabulary size: {self.bpe_vocab_size}")
        print(f"Sample encoded tokens: {self.bpe_tokens[:20]}")

        return self.bpe_tokens
    
    def tokenize_words(self, vocab_size=10000):
        """Word-level tokenization and training-set construction.

        Fits a Keras ``Tokenizer`` on ``self.raw_text``, converts the text to
        one long id sequence and slices it into overlapping windows of
        ``seq_length + 1`` tokens: the first ``seq_length`` tokens are the
        model input and the final token is the prediction target.

        Sets ``self.tokenizer``, ``self.total_words``, ``self.input_sequences``,
        ``self.X`` and ``self.y``.

        Args:
            vocab_size: Upper bound on the vocabulary size.

        Returns:
            ``(X, y)`` where ``X`` has ``seq_length`` columns and ``y`` is the
            one-hot encoding of the target token with ``total_words`` classes.

        Raises:
            ValueError: If the corpus has fewer than ``seq_length + 1`` tokens,
                so not even one training window can be built.
        """
        # Split text into words
        words = self.raw_text.split()
        print(f"Total words in corpus: {len(words)}")

        # Create a word-level tokenizer
        self.tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
        self.tokenizer.fit_on_texts([self.raw_text])
        self.total_words = min(vocab_size, len(self.tokenizer.word_index) + 1)
        print(f"Vocabulary size: {self.total_words}")

        # Convert text to sequence of tokens
        sequences = self.tokenizer.texts_to_sequences([self.raw_text])[0]

        window = self.seq_length + 1
        if len(sequences) < window:
            raise ValueError(
                f"Corpus has {len(sequences)} tokens; at least {window} are "
                f"needed for seq_length={self.seq_length}."
            )

        # Start at index 0 so the very first window of the corpus is used too;
        # the last valid start is len(sequences) - window.
        self.input_sequences = [
            sequences[start:start + window]
            for start in range(len(sequences) - window + 1)
        ]

        print(f"Number of training sequences: {len(self.input_sequences)}")

        # Convert to numpy array
        self.input_sequences = np.array(self.input_sequences)

        # Split into training inputs and outputs
        self.X = self.input_sequences[:, :-1]
        self.y = self.input_sequences[:, -1]

        # One-hot encode the outputs.
        # NOTE(review): this allocates (num_sequences x total_words) values,
        # which can be very large for a full corpus; train_model() relies on
        # one-hot targets, so it is left as is.
        self.y = tf.keras.utils.to_categorical(self.y, num_classes=self.total_words)

        return self.X, self.y
    
    def generate_sequences_varying_lengths(self, min_length=10, max_length=100, num_samples=1000):
        """Generate sequences of varying lengths to test model robustness

        Draws ``num_samples`` random slices of the tokenized corpus. Each
        slice holds ``seq_len`` input tokens plus one target token, with
        ``seq_len`` drawn uniformly from ``[min_length, max_length)``. Slices
        are left-padded with zeros to ``max_length`` tokens.

        Requires ``tokenize_words`` to have been called (uses
        ``self.tokenizer`` and ``self.total_words``).

        Args:
            min_length: Smallest number of input tokens in a sample.
            max_length: Padded length of every sample, target included.
            num_samples: Number of samples to draw.

        Returns:
            ``(X_varying, y_varying)``: inputs with ``max_length - 1`` columns
            and one-hot targets. Note that this width equals the model input
            width only when ``max_length == seq_length + 1``.

        Raises:
            ValueError: If the corpus has fewer than ``max_length`` tokens, or
                (from NumPy) if ``min_length >= max_length``.
        """
        sequences = self.tokenizer.texts_to_sequences([self.raw_text])[0]

        # The longest possible slice is max_length tokens (inputs + target).
        if len(sequences) < max_length:
            raise ValueError(
                f"Corpus has {len(sequences)} tokens; at least {max_length} "
                f"are needed for max_length={max_length}."
            )

        varying_sequences = []
        for _ in range(num_samples):
            seq_len = np.random.randint(min_length, max_length)
            # randint's upper bound is exclusive, so the last valid start,
            # len(sequences) - seq_len - 1, is reachable and the final corpus
            # token can be sampled as a target.
            start_idx = np.random.randint(0, len(sequences) - seq_len)
            varying_sequences.append(sequences[start_idx:start_idx + seq_len + 1])

        # Pad sequences
        padded_sequences = pad_sequences(varying_sequences, maxlen=max_length, padding='pre')

        # Split into inputs and outputs; with pre-padding the target is always
        # the last column.
        X_varying = padded_sequences[:, :-1]
        y_varying = padded_sequences[:, -1]

        # One-hot encode outputs
        y_varying = tf.keras.utils.to_categorical(y_varying, num_classes=self.total_words)

        print(f"Generated {len(varying_sequences)} sequences of varying lengths")
        return X_varying, y_varying
    
    def build_vanilla_rnn(self, embedding_dim=100, rnn_units=256, bidirectional=False):
        """Build a vanilla RNN model

        Architecture: Embedding -> SimpleRNN (returns sequences) -> SimpleRNN
        -> Dense softmax over ``self.total_words``. The model is returned
        uncompiled.

        Args:
            embedding_dim: Size of the token embedding vectors.
            rnn_units: Units in each SimpleRNN layer.
            bidirectional: Wrap both recurrent layers in ``Bidirectional``.

        Returns:
            The uncompiled Keras ``Model``.
        """
        def recurrent(return_sequences=False):
            # Both stacked layers share the same configuration apart from
            # whether they emit the full sequence.
            layer = SimpleRNN(rnn_units, return_sequences=return_sequences)
            return Bidirectional(layer) if bidirectional else layer

        inputs = Input(shape=(self.seq_length,))
        hidden = Embedding(input_dim=self.total_words, output_dim=embedding_dim)(inputs)

        # The first layer must return the whole sequence so the second has a
        # sequence to consume.
        hidden = recurrent(return_sequences=True)(hidden)
        hidden = recurrent()(hidden)

        outputs = Dense(self.total_words, activation='softmax')(hidden)

        model = Model(inputs=inputs, outputs=outputs)
        print("Vanilla RNN model built:")
        # summary() prints the table itself and returns None, so it must not
        # be wrapped in print().
        model.summary()

        return model
    
    def build_lstm_model(self, embedding_dim=100, lstm_units=256, bidirectional=False):
        """Build an LSTM model

        Architecture: Embedding -> LSTM (returns sequences) -> LSTM -> Dense
        softmax over ``self.total_words``. The model is returned uncompiled.

        Args:
            embedding_dim: Size of the token embedding vectors.
            lstm_units: Units in each LSTM layer.
            bidirectional: Wrap both recurrent layers in ``Bidirectional``.

        Returns:
            The uncompiled Keras ``Model``.
        """
        def recurrent(return_sequences=False):
            # Both stacked layers share the same configuration apart from
            # whether they emit the full sequence.
            layer = LSTM(lstm_units, return_sequences=return_sequences)
            return Bidirectional(layer) if bidirectional else layer

        inputs = Input(shape=(self.seq_length,))
        hidden = Embedding(input_dim=self.total_words, output_dim=embedding_dim)(inputs)

        # The first layer must return the whole sequence so the second has a
        # sequence to consume.
        hidden = recurrent(return_sequences=True)(hidden)
        hidden = recurrent()(hidden)

        outputs = Dense(self.total_words, activation='softmax')(hidden)

        model = Model(inputs=inputs, outputs=outputs)
        print("LSTM model built:")
        # summary() prints the table itself and returns None, so it must not
        # be wrapped in print().
        model.summary()

        return model
    
    def build_gru_model(self, embedding_dim=100, gru_units=256, bidirectional=False):
        """Build a GRU model

        Architecture: Embedding -> GRU (returns sequences) -> GRU -> Dense
        softmax over ``self.total_words``. The model is returned uncompiled.

        Args:
            embedding_dim: Size of the token embedding vectors.
            gru_units: Units in each GRU layer.
            bidirectional: Wrap both recurrent layers in ``Bidirectional``.

        Returns:
            The uncompiled Keras ``Model``.
        """
        def recurrent(return_sequences=False):
            # Both stacked layers share the same configuration apart from
            # whether they emit the full sequence.
            layer = GRU(gru_units, return_sequences=return_sequences)
            return Bidirectional(layer) if bidirectional else layer

        inputs = Input(shape=(self.seq_length,))
        hidden = Embedding(input_dim=self.total_words, output_dim=embedding_dim)(inputs)

        # The first layer must return the whole sequence so the second has a
        # sequence to consume.
        hidden = recurrent(return_sequences=True)(hidden)
        hidden = recurrent()(hidden)

        outputs = Dense(self.total_words, activation='softmax')(hidden)

        model = Model(inputs=inputs, outputs=outputs)
        print("GRU model built:")
        # summary() prints the table itself and returns None, so it must not
        # be wrapped in print().
        model.summary()

        return model
    
    def train_model(self, model, optimizer_name='adam', learning_rate=0.001, epochs=50,
                   gradient_clipping=True, clip_value=1.0, teacher_forcing=True,
                   teacher_forcing_ratio=0.5):
        """Train the model with various optimization strategies

        Compiles ``model`` with categorical cross-entropy and an accuracy
        metric, trains it on ``self.X`` / ``self.y`` and stores it on
        ``self.model``. The best weights (by ``val_accuracy``) are
        checkpointed to ``shakespeare_<model.name>_checkpoint.h5``.

        Two training modes:

        * ``teacher_forcing=True``: the data is split 80/20 in order. Each
          epoch trains for one pass either on the true inputs (with
          probability ``teacher_forcing_ratio``) or on inputs whose tokens
          after the first were replaced by the model's own greedy predictions.
          Early stopping is not used in this mode.
        * ``teacher_forcing=False``: a single ``fit`` call with a 20 %
          validation split and early stopping, followed by loss/accuracy plots
          saved to ``training_history_<model.name>.png``.

        Args:
            model: Uncompiled Keras model taking ``seq_length`` tokens.
            optimizer_name: ``'adam'`` or ``'rmsprop'`` (case-insensitive);
                anything else falls back to Adam with a message.
            learning_rate: Optimizer learning rate.
            epochs: Number of epochs (upper bound when early stopping applies).
            gradient_clipping: Clip gradients by value when True.
            clip_value: Clipping threshold used when ``gradient_clipping``.
            teacher_forcing: Select the training mode described above.
            teacher_forcing_ratio: Per-epoch probability of training on the
                true inputs in teacher-forcing mode.

        Returns:
            The trained model (same object as ``model``).
        """
        # Optimizer selection
        optimizer_classes = {'adam': Adam, 'rmsprop': RMSprop}
        optimizer_key = optimizer_name.lower()
        if optimizer_key not in optimizer_classes:
            print(f"Unknown optimizer: {optimizer_name}. Using Adam instead.")
            optimizer_key = 'adam'

        optimizer_kwargs = {'learning_rate': learning_rate}
        if gradient_clipping:
            # Applied to the fallback optimizer too, so the caller's clipping
            # request is never silently dropped.
            optimizer_kwargs['clipvalue'] = clip_value
        optimizer = optimizer_classes[optimizer_key](**optimizer_kwargs)

        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        # Callbacks
        model_checkpoint = ModelCheckpoint(
            filepath=f'shakespeare_{model.name}_checkpoint.h5',
            save_best_only=True,
            monitor='val_accuracy',
            mode='max'
        )

        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        # Teacher forcing implementation (simplified for demonstration)
        if teacher_forcing:
            # Split the data for teacher forcing
            train_size = int(0.8 * len(self.X))
            X_train, X_val = self.X[:train_size], self.X[train_size:]
            y_train, y_val = self.y[:train_size], self.y[train_size:]

            for epoch in range(epochs):
                print(f"Epoch {epoch+1}/{epochs}")

                if np.random.random() < teacher_forcing_ratio:
                    # With teacher forcing: train on the true inputs
                    epoch_inputs = X_train
                else:
                    # Without teacher forcing: train on the model's own
                    # predictions
                    epoch_inputs = self._free_running_inputs(model, X_train)

                model.fit(
                    epoch_inputs, y_train[:len(epoch_inputs)],
                    batch_size=self.batch_size,
                    epochs=1,
                    validation_data=(X_val, y_val),
                    callbacks=[model_checkpoint],
                    verbose=1
                )

            # Final evaluation
            val_loss, val_acc = model.evaluate(X_val, y_val, verbose=1)
            print(f"Final validation loss: {val_loss}, accuracy: {val_acc}")

        else:
            # Regular training without teacher forcing
            history = model.fit(
                self.X, self.y,
                batch_size=self.batch_size,
                epochs=epochs,
                validation_split=0.2,
                callbacks=[model_checkpoint, early_stopping],
                verbose=1
            )

            # Plot training history
            plt.figure(figsize=(12, 4))

            plt.subplot(1, 2, 1)
            plt.plot(history.history['loss'], label='Train Loss')
            plt.plot(history.history['val_loss'], label='Validation Loss')
            plt.title('Loss')
            plt.xlabel('Epoch')
            plt.legend()

            plt.subplot(1, 2, 2)
            plt.plot(history.history['accuracy'], label='Train Accuracy')
            plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
            plt.title('Accuracy')
            plt.xlabel('Epoch')
            plt.legend()

            plt.tight_layout()
            plt.savefig(f'training_history_{model.name}.png')
            plt.show()

        self.model = model
        return model

    def _free_running_inputs(self, model, X_train):
        """Return a copy of ``X_train`` rewritten with the model's own predictions.

        For every batch, token ``j + 1`` of each row is replaced by the
        model's greedy prediction given tokens ``0..j`` of that (already
        partly rewritten) row. The first token of each row is kept.
        """
        if len(X_train) == 0:
            return X_train

        rewritten = []
        for start in range(0, len(X_train), self.batch_size):
            # A basic slice is a view: writing predictions into it would
            # overwrite the real training data in self.X, so work on a copy.
            batch = X_train[start:start + self.batch_size].copy()

            for j in range(self.seq_length - 1):
                # The models built by this class declare a fixed input width
                # of seq_length, so the (j + 1)-token prefix is left-padded
                # with 0, the same pre-padding used at generation time.
                context = np.zeros_like(batch)
                context[:, -(j + 1):] = batch[:, :j + 1]

                preds = model.predict(context, verbose=0)
                batch[:, j + 1] = np.argmax(preds, axis=1)

            rewritten.append(batch)

        return np.vstack(rewritten)
    
    def temperature_sampling(self, preds, temperature=1.0):
        """Sample with temperature control to adjust randomness"""
        if temperature == 0:  # Deterministic (greedy) sampling
            return np.argmax(preds)
        
        # Scale predictions by temperature
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        
        # Sample from the distribution
        probas = np.random.multinomial(1, preds, 1)
        return np.argmax(probas)
    
    def beam_search(self, seed_text, beam_width=3, max_length=50):
        """Implement beam search to generate higher-quality text

        Keeps the ``beam_width`` highest-scoring continuations at every step,
        where a continuation's score is the sum of the log-probabilities of
        its words.

        Args:
            seed_text: Text the generation starts from.
            beam_width: Number of candidates kept after each step, and number
                of expansions tried per candidate.
            max_length: Number of words to generate.

        Returns:
            The generated words of the best candidate joined by spaces. The
            seed text itself is not included in the result.
        """
        # Tokenize seed text
        seed_seq = self.tokenizer.texts_to_sequences([seed_text])[0]

        # Pad sequence to required length
        padded_seq = pad_sequences([seed_seq], maxlen=self.seq_length, padding='pre')

        # Initial beam
        beams = [(0, padded_seq, [])]  # (score, sequence, generated_words)

        # Smallest positive float64: keeps log() finite when the model assigns
        # a probability of exactly 0.
        min_prob = np.finfo('float64').tiny

        for _ in range(max_length):
            # Score all surviving beams with a single predict call instead of
            # one call per beam.
            contexts = np.vstack([beam_seq for _, beam_seq, _ in beams])
            all_preds = self.model.predict(contexts, verbose=0)

            new_beams = []
            for (score, seq, words), preds in zip(beams, all_preds):
                # Get top k predictions
                top_indices = np.argsort(preds)[-beam_width:]

                for idx in top_indices:
                    word = self.tokenizer.index_word.get(idx, "<OOV>")
                    new_score = score + np.log(max(preds[idx], min_prob))

                    # Create new sequence by shifting and adding new token
                    new_seq = np.copy(seq)
                    new_seq[0] = np.append(new_seq[0][1:], [idx])

                    new_beams.append((new_score, new_seq, words + [word]))

            # Keep only the best beams (returned best-first)
            beams = heapq.nlargest(beam_width, new_beams, key=lambda x: x[0])

        # Return the best sequence
        best_score, _, best_words = beams[0]
        return ' '.join(best_words)
    
    def generate_text(self, seed_text, max_length=100, temperature=1.0, use_beam_search=False, beam_width=3):
        """Generate text with temperature control or beam search

        Args:
            seed_text: Text the generation starts from.
            max_length: Number of words to generate.
            temperature: Sampling temperature (ignored for beam search).
            use_beam_search: Delegate to ``beam_search`` instead of sampling.
            beam_width: Beam width when ``use_beam_search`` is True.

        Returns:
            With sampling: ``seed_text`` followed by the generated words.
            With beam search: whatever ``beam_search`` returns, which is the
            generated words only, without the seed.
        """
        if use_beam_search:
            return self.beam_search(seed_text, beam_width=beam_width, max_length=max_length)

        # Tokenize the seed text
        context = self.tokenizer.texts_to_sequences([seed_text])[0]
        pieces = [seed_text]

        for _ in range(max_length):
            # Pad the sequence
            padded_seq = pad_sequences([context], maxlen=self.seq_length, padding='pre')

            # Get model predictions
            preds = self.model.predict(padded_seq, verbose=0)[0]

            # Sample with temperature
            next_index = int(self.temperature_sampling(preds, temperature))

            # Convert to word
            pieces.append(self.tokenizer.index_word.get(next_index, "<OOV>"))

            # Let the context grow until it fills the model window and only
            # then slide. Dropping the oldest token on every step would pin
            # the context at the seed's length, and an empty seed would stay
            # empty forever.
            context = (context + [next_index])[-self.seq_length:]

        return " ".join(pieces)
    
    def calculate_perplexity(self, text):
        """Calculate perplexity as an evaluation metric

        Tokenizes ``text``, slides a ``seq_length`` window over it, evaluates
        ``self.model`` on predicting the token after each window and returns
        ``exp(cross-entropy loss)``.

        Args:
            text: Text to score.

        Returns:
            The perplexity as a float.

        Raises:
            ValueError: If ``text`` has no more than ``seq_length`` tokens, so
                there is no window with a following token to predict.
        """
        sequences = self.tokenizer.texts_to_sequences([text])[0]

        num_windows = len(sequences) - self.seq_length
        if num_windows <= 0:
            raise ValueError(
                f"Text has {len(sequences)} tokens; more than "
                f"{self.seq_length} are needed to compute perplexity."
            )

        # Prepare sequences for evaluation: window i predicts the token at
        # position i + seq_length.
        X_eval = np.array(
            [sequences[i:i + self.seq_length] for i in range(num_windows)]
        )
        targets = sequences[self.seq_length:]
        y_eval = tf.keras.utils.to_categorical(targets, num_classes=self.total_words)

        # Calculate loss (cross-entropy)
        results = self.model.evaluate(X_eval, y_eval, verbose=0)

        # A model compiled with metrics returns [loss, metric, ...]; only the
        # loss belongs in the perplexity.
        loss = results[0] if isinstance(results, (list, tuple)) else results

        # Perplexity = exp(cross-entropy loss)
        return float(np.exp(loss))
    
    def compare_models(self, models_dict, seed_text, max_length=100, temperatures=[0.2, 0.5, 1.0]):
        """Compare text generated by different models"""
        results = {}
        
        for model_name, model in models_dict.items():
            self.model = model  # Set current model
            model_results = {}
            
            for temp in temperatures:
                generated_text = self.generate_text(
                    seed_text=seed_text,
                    max_length=max_length,
                    temperature=temp
                )
                
                # Calculate perplexity
                perplexity = self.calculate_perplexity(generated_text)
                
                model_results[f"temp_{temp}"] = {
                    "text": generated_text,
                    "perplexity": perplexity
                }
            
            # Generate with beam search
            beam_text = self.generate_text(
                seed_text=seed_text,
                max_length=max_length,
                use_beam_search=True,
                beam_width=3
            )
            
            beam_perplexity = self.calculate_perplexity(beam_text)
            model_results["beam_search"] = {
                "text": beam_text,
                "perplexity": beam_perplexity
            }
            
            results[model_name] = model_results
        
        return results
    
    def analyze_style(self, original_text, generated_text):
        """Analyze stylistic differences between original and generated text"""
        # Tokenize both texts
        original_words = original_text.split()
        generated_words = generated_text.split()
        
        # Calculate basic statistics
        avg_word_len_original = np.mean([len(w) for w in original_words])
        avg_word_len_generated = np.mean([len(w) for w in generated_words])
        
        # Vocabulary diversity (unique words / total words)
        vocab_diversity_original = len(set(original_words)) / len(original_words)
        vocab_diversity_generated = len(set(generated_words)) / len(generated_words)
        
        # Sentence length (approximate by splitting on periods)
        original_sentences = original_text.split('.')
        generated_sentences = generated_text.split('.')
        
        avg_sent_len_original = np.mean([len(s.split()) for s in original_sentences if s])
        avg_sent_len_generated = np.mean([len(s.split()) for s in generated_sentences if s])
        
        # Word frequency distribution
        from collections import Counter
        original_freq = Counter(original_words)
        generated_freq = Counter(generated_words)
        
        # Most common words
        original_common = original_freq.most_common(10)
        generated_common = generated_freq.most_common(10)
        
        # Calculate overlap in most common words
        original_common_words = set([w for w, _ in original_common])
        generated_common_words = set([w for w, _ in generated_common])
        common_overlap = len(original_common_words.intersection(generated_common_words))
        
        results = {
            "avg_word_length": {
                "original": avg_word_len_original,
                "generated": avg_word_len_generated,
                "difference": avg_word_len_original - avg_word_len_generated
            },
            "vocabulary_diversity": {
                "original": vocab_diversity_original,
                "generated": vocab_diversity_generated,
                "difference": vocab_diversity_original - vocab_diversity_generated
            },
            "avg_sentence_length": {
                "original": avg_sent_len_original,
                "generated": avg_sent_len_generated,
                "difference": avg_sent_len_original - avg_sent_len_generated
            },
            "common_words": {
                "original": original_common,
                "generated": generated_common,
                "overlap": common_overlap,
                "overlap_percentage": common_overlap / 10 * 100
            }
        }
        
        return results

# Run the entire pipeline
def main():
    """Run the whole pipeline end to end.

    Loads and preprocesses the corpus, builds the word-level training set,
    builds three bidirectional models, trains the LSTM one, prints samples at
    three temperatures and with beam search, then prints a style comparison
    and the perplexity of the temperature-1.0 sample.
    """
    shakespeare = ShakespeareTextGeneration(file_path="shakespeare.txt")

    # Nothing else can run without the corpus.
    if not shakespeare.load_data():
        print("Failed to load data. Exiting.")
        return

    shakespeare.preprocess_text()

    # Word-level tokenization; shakespeare.tokenize_wordpiece() is the BPE
    # alternative.
    shakespeare.tokenize_words()

    # Variable-length samples for robustness checks
    X_varying, y_varying = shakespeare.generate_sequences_varying_lengths()

    # All three architectures are built; only one is trained below.
    models = {
        "vanilla_rnn": shakespeare.build_vanilla_rnn(bidirectional=True),
        "lstm": shakespeare.build_lstm_model(bidirectional=True),
        "gru": shakespeare.build_gru_model(bidirectional=True),
    }
    model_to_train = models["lstm"]  # Pick another key to train a different model

    training_options = dict(
        optimizer_name='adam',
        learning_rate=0.001,
        epochs=20,  # Reduced for demonstration
        gradient_clipping=True,
        clip_value=1.0,
        teacher_forcing=True,
        teacher_forcing_ratio=0.5,
    )
    trained_model = shakespeare.train_model(model=model_to_train, **training_options)

    seed_text = "to be or not to be"

    # (header, temperature) in the order the samples are produced
    temperature_runs = [
        ("\n--- Temperature = 0.2 (more focused) ---", 0.2),
        ("\n--- Temperature = 1.0 (balanced) ---", 1.0),
        ("\n--- Temperature = 1.5 (more random) ---", 1.5),
    ]
    samples = {}
    for header, temperature in temperature_runs:
        print(header)
        samples[temperature] = shakespeare.generate_text(
            seed_text=seed_text,
            temperature=temperature
        )
        print(samples[temperature])

    print("\n--- Beam Search ---")
    generated_text_beam = shakespeare.generate_text(
        seed_text=seed_text,
        use_beam_search=True,
        beam_width=3
    )
    print(generated_text_beam)

    # The temperature-1.0 sample is the one analysed below.
    generated_text_med_temp = samples[1.0]

    # Slice of the preprocessed corpus used as the style reference
    original_sample = shakespeare.raw_text[1000:3000]

    style_analysis = shakespeare.analyze_style(original_sample, generated_text_med_temp)
    word_length = style_analysis['avg_word_length']
    diversity = style_analysis['vocabulary_diversity']
    sentence_length = style_analysis['avg_sentence_length']
    overlap_percentage = style_analysis['common_words']['overlap_percentage']

    print("\n--- Style Analysis ---")
    print(f"Average Word Length: Original={word_length['original']:.2f}, Generated={word_length['generated']:.2f}")
    print(f"Vocabulary Diversity: Original={diversity['original']:.2f}, Generated={diversity['generated']:.2f}")
    print(f"Average Sentence Length: Original={sentence_length['original']:.2f}, Generated={sentence_length['generated']:.2f}")
    print(f"Common Words Overlap: {overlap_percentage:.2f}%")

    # Perplexity of the same sample under the trained model
    perplexity = shakespeare.calculate_perplexity(generated_text_med_temp)
    print(f"\nPerplexity of generated text: {perplexity:.2f}")

    print("\nExecution complete!")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()