In [1]:
# --- CELL 1: SETUP & CONFIGURATION ---
import tensorflow as tf
import numpy as np
import requests
import string
import sys
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, Callback
from typing import Tuple, List, Optional

# Hyperparameters for the whole notebook, gathered in one dict so an
# experiment only ever needs to touch this cell.
CONFIG = dict(
    SEQ_LENGTH=50,         # max number of preceding words used as context
    MAX_DOC_LEN=150_000,   # corpus is cut to this many characters to bound training time
    EMBED_DIM=100,         # size of each word-embedding vector
    LSTM_UNITS=256,        # hidden units in each LSTM layer
    DROPOUT=0.2,           # dropout rate applied after each LSTM layer
    EPOCHS=60,             # upper bound on epochs (early stopping may end sooner)
    BATCH_SIZE=128,        # samples per gradient step
    LEARNING_RATE=1e-3,    # Adam learning rate
)

# Ensure reproducibility: seed every RNG this notebook imports (Python's
# `random`, NumPy, TensorFlow) from a single constant so they cannot drift apart.
# Note: seeding alone may not make GPU training bit-for-bit repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("Configuration loaded. GPU Available:", len(tf.config.list_physical_devices('GPU')) > 0)

Configuration loaded. GPU Available: True


In [2]:
# --- CELL 2: CORE CLASS DEFINITION ---

class ShakespeareGenerator:
    """
    Word-level LSTM text generator for Shakespearean text.

    Bundles the full pipeline: download and clean a corpus, turn it into
    padded training windows, build and train a stacked LSTM, and sample new
    text with temperature scaling.

    Attributes:
        config: Hyperparameter dict. Keys read by this class: 'SEQ_LENGTH',
            'MAX_DOC_LEN', 'EMBED_DIM', 'LSTM_UNITS', 'DROPOUT', 'EPOCHS',
            'BATCH_SIZE', 'LEARNING_RATE'.
        tokenizer: Keras ``Tokenizer``; fitted by ``prepare_data``.
        model: Compiled Keras model; ``None`` until ``build_model`` runs.
        history: Object returned by ``model.fit``; ``None`` until ``train`` runs.
        max_sequence_len: Length of a padded training window (context words
            plus the one target word); 0 until ``prepare_data`` runs.
        total_words: Number of output classes, i.e. vocabulary size plus one
            for the padding index 0; 0 until ``prepare_data`` runs.
    """

    # Seconds to wait on the dataset server before giving up.
    DOWNLOAD_TIMEOUT = 30

    def __init__(self, config: dict):
        self.config = config
        self.tokenizer = Tokenizer()
        self.model = None
        self.history = None
        self.max_sequence_len = 0
        self.total_words = 0

    # ------------------------------------------------------------------
    # Pure helpers (no TensorFlow / network access)
    # ------------------------------------------------------------------

    @staticmethod
    def _clean_text(raw_text: str) -> str:
        """Lowercase the text and delete every ``string.punctuation`` character."""
        # Whitespace is kept so word boundaries survive. Apostrophes are
        # punctuation too, so e.g. "banish'd" becomes "banishd".
        return raw_text.lower().translate(str.maketrans('', '', string.punctuation))

    @staticmethod
    def _truncate_at_word(text: str, max_len: int) -> str:
        """Cut ``text`` to at most ``max_len`` characters without splitting a word."""
        if len(text) <= max_len:
            return text
        clipped = text[:max_len]
        # The cut already sits on a word boundary: nothing to repair.
        if text[max_len].isspace() or clipped[-1:].isspace():
            return clipped
        # The cut landed inside a word; drop that fragment so it does not
        # become a bogus vocabulary entry.
        pieces = clipped.rsplit(None, 1)
        return pieces[0] if len(pieces) == 2 else clipped

    @staticmethod
    def _build_windows(token_list: List[int], seq_length: int) -> List[List[int]]:
        """Return one window per target token: up to ``seq_length`` context tokens plus the target."""
        return [
            token_list[max(0, i - seq_length):i + 1]
            for i in range(1, len(token_list))
        ]

    @staticmethod
    def _sample_index(predictions, temperature: float) -> int:
        """
        Draw a word index from ``predictions`` after temperature scaling.

        Index 0 is excluded: it is the padding slot and maps to no word.
        """
        # float64 avoids np.random.choice rejecting probabilities that do not
        # sum to 1 within tolerance because of float32 rounding.
        probs = np.asarray(predictions, dtype=np.float64)
        if probs.size < 2:
            raise ValueError("predictions must cover at least one real word index.")
        logits = np.log(probs + 1e-7) / temperature
        logits[0] = -np.inf
        # Subtracting the max keeps exp() from underflowing every entry to 0
        # at low temperatures; the normalised result is unchanged.
        logits -= np.max(logits)
        weights = np.exp(logits)
        weights /= weights.sum()
        return int(np.random.choice(len(weights), p=weights))

    # ------------------------------------------------------------------
    # Pipeline
    # ------------------------------------------------------------------

    def load_and_clean_data(self, url: str) -> str:
        """
        Download the corpus from ``url`` and return the cleaned, truncated text.

        The text is lowercased, stripped of punctuation, and cut to at most
        ``config['MAX_DOC_LEN']`` characters (on a word boundary) so training
        fits the time budget.

        Exits via ``sys.exit`` if the download fails or the server returns an
        HTTP error status.
        """
        print(f"Downloading data from {url}...")
        try:
            response = requests.get(url, timeout=self.DOWNLOAD_TIMEOUT)
            # Without this, an HTTP error page would be used as the corpus.
            response.raise_for_status()
            raw_text = response.text
        except Exception as e:
            sys.exit(f"Error downloading dataset: {e}")

        clean_text = self._clean_text(raw_text)

        # Length reported here is before truncation.
        print(f"Original Text Length: {len(clean_text)}")
        return self._truncate_at_word(clean_text, self.config['MAX_DOC_LEN'])

    def prepare_data(self, corpus: str) -> Tuple[np.ndarray, np.ndarray]:
        """
        Tokenize ``corpus`` and build next-word training pairs.

        Sets ``self.total_words`` and ``self.max_sequence_len``.

        Note: the tokenizer is fitted in place, so calling this again on the
        same instance adds to the existing vocabulary rather than replacing it.

        Returns:
            ``(X, y)`` where each row of ``X`` is a left-padded context of
            ``max_sequence_len - 1`` word indices and ``y`` is the one-hot
            encoded next word over ``total_words`` classes.

        Raises:
            ValueError: if the corpus yields fewer than two tokens.
        """
        self.tokenizer.fit_on_texts([corpus])
        self.total_words = len(self.tokenizer.word_index) + 1

        token_list = self.tokenizer.texts_to_sequences([corpus])[0]
        input_sequences = self._build_windows(token_list, self.config['SEQ_LENGTH'])
        if not input_sequences:
            raise ValueError(
                "Corpus must contain at least two words to build training sequences."
            )

        # Early windows are shorter than the rest; left-pad them with 0.
        self.max_sequence_len = max(len(seq) for seq in input_sequences)
        input_sequences = np.array(pad_sequences(input_sequences,
                                                 maxlen=self.max_sequence_len,
                                                 padding='pre'))

        # Last column is the word to predict; everything before it is context.
        X, y = input_sequences[:, :-1], input_sequences[:, -1]
        y = tf.keras.utils.to_categorical(y, num_classes=self.total_words)

        print(f"Vocab Size: {self.total_words}")
        print(f"Training Sequences: {X.shape[0]}")
        return X, y

    def build_model(self):
        """
        Build and compile the network: Embedding -> LSTM -> Dropout -> LSTM ->
        Dropout -> softmax over the vocabulary.

        Raises:
            RuntimeError: if ``prepare_data`` has not populated the vocabulary
                size and window length yet.
        """
        if not self.total_words or self.max_sequence_len < 2:
            raise RuntimeError("Call prepare_data() before build_model().")

        self.model = Sequential([
            # An explicit Input replaces Embedding's deprecated `input_length`
            # argument and lets summary() show real shapes.
            tf.keras.Input(shape=(self.max_sequence_len - 1,)),
            Embedding(self.total_words, self.config['EMBED_DIM']),

            # Layer 1: returns the full sequence so a second LSTM can stack on it
            LSTM(self.config['LSTM_UNITS'], return_sequences=True),
            Dropout(self.config['DROPOUT']),

            # Layer 2: returns only the final state
            LSTM(self.config['LSTM_UNITS']),
            Dropout(self.config['DROPOUT']),

            Dense(self.total_words, activation='softmax')
        ])

        optimizer = tf.keras.optimizers.Adam(learning_rate=self.config['LEARNING_RATE'])
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=optimizer,
                           metrics=['accuracy'])
        # summary() prints by itself and returns None, so it is not wrapped in print().
        self.model.summary()

    def train(self, X, y):
        """
        Fit the model on ``(X, y)`` and store the result in ``self.history``.

        Early stopping watches the training loss (no validation data is passed
        to ``fit``) and restores the best weights after 4 epochs without
        improvement.

        Raises:
            RuntimeError: if ``build_model`` has not been called.
        """
        if self.model is None:
            raise RuntimeError("Call build_model() before train().")

        early_stop = EarlyStopping(monitor='loss', patience=4, restore_best_weights=True)

        self.history = self.model.fit(
            X, y,
            epochs=self.config['EPOCHS'],
            batch_size=self.config['BATCH_SIZE'],
            verbose=1,
            callbacks=[early_stop]
        )

    def generate_text(self, seed_text: str, next_words: int, temperature: float = 1.0) -> str:
        """
        Extend ``seed_text`` by ``next_words`` sampled words.

        Args:
            seed_text: Starting text; it is included in the returned string.
            next_words: Number of words to append.
            temperature: Sampling temperature, must be > 0. Higher values give
                more random output, lower values more predictable output.

        Returns:
            ``seed_text`` followed by the generated words, space-separated.

        Raises:
            ValueError: if ``temperature`` is not positive.
            RuntimeError: if no model has been built.
        """
        if temperature <= 0:
            raise ValueError("temperature must be greater than 0.")
        if self.model is None:
            raise RuntimeError("Call build_model() and train() before generate_text().")

        # Reverse vocabulary built once, instead of scanning word_index for
        # every generated word.
        index_to_word = {index: word for word, index in self.tokenizer.word_index.items()}
        output_text = seed_text

        for _ in range(next_words):
            token_list = self.tokenizer.texts_to_sequences([output_text])[0]
            # pad_sequences also truncates from the front, so only the most
            # recent max_sequence_len - 1 words are fed to the model.
            token_list = pad_sequences([token_list],
                                       maxlen=self.max_sequence_len - 1,
                                       padding='pre')

            predictions = self.model.predict(token_list, verbose=0)[0]
            predicted_index = self._sample_index(predictions, temperature)

            output_text += " " + index_to_word.get(predicted_index, "")

        return output_text

# Initialize System: create the single generator instance, configured from
# CONFIG, that the execution and generation cells below operate on.
bot = ShakespeareGenerator(CONFIG)

In [3]:
# --- CELL 3: EXECUTION ---

# Step 1 - download the Tiny Shakespeare corpus and clean it
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
corpus = bot.load_and_clean_data(url=url)

# Step 2 - tokenize the corpus into (context, next-word) training pairs
X, y = bot.prepare_data(corpus=corpus)

# Step 3 - build the network, then fit it on the prepared pairs
bot.build_model()
bot.train(X=X, y=y)

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt...
Original Text Length: 1060997
Vocab Size: 4132
Training Sequences: 28318




None
Epoch 1/60
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 21ms/step - accuracy: 0.0357 - loss: 7.0357
Epoch 2/60
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.0387 - loss: 6.3719
Epoch 3/60
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.0434 - loss: 6.2759
Epoch 4/60
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.0468 - loss: 6.1438
Epoch 5/60
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.0470 - loss: 6.0560
Epoch 6/60
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.0479 - loss: 5.9784
Epoch 7/60
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.0486 - loss: 5.8888
Epoch 8/60
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.0493 - loss: 5.8088
Epoch 9/60
[1m222/222[0m

In [4]:
# --- CELL 4: GENERATION DEMO ---

print("\n--- GENERATION RESULTS (Problem Solving: Temperature Sampling) ---\n")

seeds = ["the king said", "shall i compare", "to be or not"]

# Temperatures to compare for every seed, lowest first:
#   0.5 -> low temp (safe, repetitive)
#   0.8 -> medium temp (balanced - best for Shakespeare)
temperatures = (0.5, 0.8)

# Number of words appended to each seed.
words_per_sample = 15

for seed in seeds:
    print(f"Seed: '{seed}'")
    for temperature in temperatures:
        # The label is derived from the value actually passed to the model,
        # so the printed temperature cannot drift from the one used.
        sample = bot.generate_text(seed, next_words=words_per_sample, temperature=temperature)
        print(f"  [Temp {temperature}]: {sample}")

    print("-" * 50)


--- GENERATION RESULTS (Problem Solving: Temperature Sampling) ---

Seed: 'the king said'
  [Temp 0.5]: the king said you would miss a noble man and appointed have wealsmen to him first servingman you
  [Temp 0.8]: the king said you have shut you be banishd to be been prithee virgilia you are no not
--------------------------------------------------
Seed: 'shall i compare'
  [Temp 0.5]: shall i compare am third servingman why twere me in a crackd heart and claim to set and
  [Temp 0.8]: shall i compare think him one so sicinius prithee tis reverend can faults where have you a highest
--------------------------------------------------
Seed: 'to be or not'
  [Temp 0.5]: to be or not a scourge to the capitol which i know the remove keep the good tongue that
  [Temp 0.8]: to be or not hear them to his general does many so a brave man brutus then at this
--------------------------------------------------
