In [2]:
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

# Download the NLTK tokenizer
nltk.download('punkt')

# Function to clean and preprocess the text
def preprocess_text(file_path):
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Remove special characters and digits
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Tokenize text
    tokens = word_tokenize(text)

    return tokens

# Path to your Gutenberg text file
gutenberg_text_path = 'siddhartha.txt'
tokens = preprocess_text(gutenberg_text_path)

# Example: Print first 50 tokens
print(tokens[:50])

# Create a vocabulary and mapping from words to indices
vocab = sorted(set(tokens))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for idx, word in enumerate(vocab)}

# Convert tokens to integers
token_indices = [word_to_idx[word] for word in tokens]

# Print vocabulary size and token indices example
print(f"Vocabulary Size: {len(vocab)}")
print(f"Example token indices: {token_indices[:50]}")


['the', 'project', 'gutenberg', 'ebook', 'of', 'siddhartha', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the']
Vocabulary Size: 4028
Example token indices: [3518, 2684, 1572, 1067, 2360, 3145, 3548, 1067, 1871, 1396, 3518, 3740, 2360, 138, 140, 1772, 3518, 3695, 3333, 111, 2228, 2416, 2470, 2360, 3518, 3967, 207, 2300, 724, 111, 3937, 92, 2300, 2885, 3891, 4015, 2138, 713, 1875, 1498, 1875, 236, 2404, 2895, 1875, 3685, 3518, 3503, 2360, 3518]


[nltk_data] Downloading package punkt to /Users/sudarshan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Hyperparameters
vocab_size = len(vocab)
embedding_dim = 64
rnn_units = 256
batch_size = 64

# Define the RNN model
def build_rnn_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = Sequential([
        Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        SimpleRNN(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        Dense(vocab_size)
    ])
    return model

# Build the model
model = build_rnn_model(vocab_size, embedding_dim, rnn_units, batch_size)

# Compile the model with a loss function and optimizer
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

# Print the model summary
model.summary()


ValueError: Unrecognized keyword arguments passed to Embedding: {'batch_input_shape': [64, None]}

In [10]:
# Sequence length for input
seq_length = 100

# Create sequences of fixed length
def create_sequences(token_indices, seq_length):
    sequences = []
    next_words = []
    for i in range(0, len(token_indices) - seq_length, 1):
        sequences.append(token_indices[i:i + seq_length])
        next_words.append(token_indices[i + seq_length])
    
    return np.array(sequences), np.array(next_words)

sequences, next_words = create_sequences(token_indices, seq_length)

# Reshape and prepare for model
dataset = tf.data.Dataset.from_tensor_slices((sequences, next_words))
dataset = dataset.shuffle(buffer_size=len(sequences)).batch(batch_size, drop_remainder=True)

# Example: Print shape of dataset
for input_seq, target_seq in dataset.take(1):
    print(f"Input shape: {input_seq.shape}, Target shape: {target_seq.shape}")


Input shape: (64, 100), Target shape: (64,)


2024-10-16 02:45:56.339172: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
# # Training the model with BPTT
# epochs = 10

# # Train the model
# history = model.fit(dataset, epochs=epochs)

# # Save the trained model
# model.save('trained_rnn_model.h5')


Glove embeddings

In [11]:
import numpy as np

# Path to your GloVe embeddings (ensure the file 'glove.6B.100d.txt' is inside this directory)
glove_dir = "data/glove.6B.100d.txt"
embedding_dim = 100  # GloVe embedding dimension

# Load the GloVe embeddings into a dictionary
def load_glove_embeddings(glove_dir):
    embeddings_index = {}
    with open(glove_dir, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load GloVe vectors
embeddings_index = load_glove_embeddings(glove_dir)
print(f"Found {len(embeddings_index)} word vectors.")


Found 400000 word vectors.


In [12]:
# Create an embedding matrix for our vocabulary
def create_embedding_matrix(vocab, embeddings_index, embedding_dim):
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for word, i in word_to_idx.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words found in GloVe are added to the embedding matrix
            embedding_matrix[i] = embedding_vector
        else:
            # Words not found in GloVe are initialized randomly
            embedding_matrix[i] = np.random.uniform(-0.1, 0.1, embedding_dim)
    return embedding_matrix

embedding_matrix = create_embedding_matrix(vocab, embeddings_index, embedding_dim)

# Print shape of the embedding matrix
print(f"Embedding matrix shape: {embedding_matrix.shape}")


Embedding matrix shape: (4028, 100)


In [15]:
# Define the RNN model with GloVe embeddings
def build_rnn_model_with_glove(vocab_size, embedding_dim, rnn_units, embedding_matrix):
    inputs = Input(shape=(None,), batch_size=batch_size)

    # Embedding layer
    embedding_layer = Embedding(vocab_size, embedding_dim,
                                embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                trainable=False)(inputs)

    # RNN layer
    rnn_layer = SimpleRNN(rnn_units, stateful=True, recurrent_initializer='glorot_uniform')(embedding_layer)

    # Output layer
    outputs = Dense(vocab_size)(rnn_layer)

    # Create the model
    model = tf.keras.Model(inputs, outputs)
    return model

# Build the model using GloVe embeddings
model = build_rnn_model_with_glove(vocab_size, embedding_dim, rnn_units, embedding_matrix)

# Compile the model
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

# Print the model summary
model.summary()


In [16]:
# Create sequences of fixed length
def create_sequences(token_indices, seq_length):
    sequences = []
    next_words = []
    for i in range(0, len(token_indices) - seq_length):
        sequences.append(token_indices[i:i + seq_length])
        next_words.append(token_indices[i + seq_length])  # This is the target (next word)
    
    return np.array(sequences), np.array(next_words)

sequences, next_words = create_sequences(token_indices, seq_length)

# Reshape and prepare for model
dataset = tf.data.Dataset.from_tensor_slices((sequences, next_words))
dataset = dataset.shuffle(buffer_size=len(sequences)).batch(batch_size, drop_remainder=True)

# Example: Print shape of dataset
for input_seq, target_seq in dataset.take(1):
    print(f"Input shape: {input_seq.shape}, Target shape: {target_seq.shape}")


Input shape: (64, 100), Target shape: (64,)


2024-10-16 02:47:03.269838: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [17]:
# Training the model
epochs = 10

# Train the model
history = model.fit(dataset, epochs=epochs)

# Save the trained model
model.save('rnn_model_with_glove.h5')


Epoch 1/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 50ms/step - loss: 6.4779
Epoch 2/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 51ms/step - loss: 5.2396
Epoch 3/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 51ms/step - loss: 4.6226
Epoch 4/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 51ms/step - loss: 4.0837
Epoch 5/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 53ms/step - loss: 3.5752
Epoch 6/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 59ms/step - loss: 3.1523
Epoch 7/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 58ms/step - loss: 2.8223
Epoch 8/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 61ms/step - loss: 2.4332
Epoch 9/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 60ms/step - loss: 2.1427
Epoch 10/10
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38



In [19]:
import numpy as np

def generate_text(model, start_string, num_generate=100):
    # Convert start string to numbers (tokenization)
    input_eval = [word_to_idx[s] for s in start_string.split()]
    input_eval = np.array(input_eval)[np.newaxis, :]  # Add batch dimension

    # Generate text
    text_generated = []

    # Temperature parameter for randomness
    temperature = 1.0

    model.reset_states()
    predictions = model(input_eval)
    predictions = tf.squeeze(predictions, 0)

    # Sample a word from the predictions
    predicted_id = tf.random.categorical(predictions[-1, :], num_samples=1)[-1, 0].numpy()
    text_generated.append(idx_to_word[predicted_id])

    for _ in range(num_generate):
        input_eval = tf.expand_dims([predicted_id], 0)  # Reshape for the model
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)

        # Sample a new word
        predicted_id = tf.random.categorical(predictions[-1, :], num_samples=1)[-1, 0].numpy()
        text_generated.append(idx_to_word[predicted_id])

    return ' '.join(text_generated)

# Generate text based on a question
question = "who are the key characters in the story?"
response = generate_text(model, question, num_generate=50)
print(f"Model Response: {response}")


KeyError: 'key'

In [21]:
# Create word-to-index and index-to-word mappings
words = list(set(token_indices))  # Assuming token_indices is your list of words
word_to_idx = {word: index for index, word in enumerate(words)}
word_to_idx['<UNK>'] = len(word_to_idx)  # Add an unknown token
idx_to_word = {index: word for word, index in word_to_idx.items()}


In [24]:
import numpy as np
import tensorflow as tf

def generate_text(model, start_string, num_generate=100):
    # Convert start string to numbers (tokenization), handle unknown words
    input_eval = []
    for word in start_string.split():
        input_eval.append(word_to_idx.get(word, word_to_idx['<UNK>']))  # Use <UNK> index for unknown words

    input_eval = np.array(input_eval)[np.newaxis, :]  # Add batch dimension

    # Generate text
    text_generated = []

    # Temperature parameter for randomness
    temperature = 1.0

    # Do not reset states if the model is stateless
    # model.reset_states()  # Only if using stateful LSTM

    # Use model to predict
    predictions = model(input_eval)

    # Squeeze only if the output has size 1 in that dimension
    predictions = tf.squeeze(predictions, axis=0)  # This will work if you have a batch size of 1 or no batch dimension

    # Sample a word from the predictions
    predicted_id = tf.random.categorical(predictions[-1, :], num_samples=1)[-1, 0].numpy()
    text_generated.append(idx_to_word[predicted_id])

    for _ in range(num_generate):
        input_eval = tf.expand_dims([predicted_id], 0)  # Reshape for the model
        predictions = model(input_eval)

        # We don't squeeze here since we want to keep the batch dimension
        predictions = tf.squeeze(predictions, axis=0)

        # Sample a new word
        predicted_id = tf.random.categorical(predictions[-1, :], num_samples=1)[-1, 0].numpy()
        text_generated.append(idx_to_word[predicted_id])

    return ' '.join(text_generated)

# Example of generating text
question = "Who are the key characters in the story?"
response = generate_text(model, question, num_generate=50)
print(f"Model Response: {response}")


2024-10-16 02:56:11.443339: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: Can not squeeze dim[0], expected a dimension of 1, got 64


InvalidArgumentError: {{function_node __wrapped__Squeeze_device_/job:localhost/replica:0/task:0/device:CPU:0}} Can not squeeze dim[0], expected a dimension of 1, got 64 [Op:Squeeze] name: 

In [25]:
import numpy as np
import re
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Step 1: Data Preprocessing
def preprocess(text):
    text = re.sub(r'[" "]+', " ", text)  # Replace multiple spaces with a single space
    text = re.sub(r"[^a-zA-Z?.!,]+", " ", text)  # Keep only certain characters
    return text.lower()  # Convert to lowercase

# Sample text
text = """
From fairest creatures we desire increase. 
That thereby beauty’s rose might never die. 
But as the riper should by time decease.
"""

# Preprocess the text
text_cleaned = preprocess(text)
tokens = text_cleaned.split()  # Simple tokenization

# Step 2: Create word-to-index and index-to-word mappings
word_to_index = {word: i for i, word in enumerate(set(tokens))}
index_to_word = {i: word for word, i in word_to_index.items()}

vocabulary_size = len(word_to_index)
print("Vocabulary Size:", vocabulary_size)

# Step 3: Prepare Training Data with Trigrams
def create_ngrams(tokens, n=3):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def create_sequences(tokens, n=3):
    sequences = create_ngrams(tokens, n)
    X = []
    y = []
    
    for seq in sequences:
        X.append([word_to_index[word] for word in seq[:-1]])  # First two words as input
        y.append(word_to_index[seq[-1]])  # Last word as output
    
    return np.array(X), np.array(y)

# Create sequences
X, y = create_sequences(tokens, n=3)
print("X shape:", X.shape)
print("y shape:", y.shape)

# Convert y to categorical
y = to_categorical(y, num_classes=vocabulary_size)

# Step 4: Build the RNN Model
def create_model(vocabulary_size, embedding_dim=50, rnn_units=100):
    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=X.shape[1]))
    model.add(SimpleRNN(units=rnn_units))
    model.add(Dense(vocabulary_size, activation='softmax'))
    return model

model = create_model(vocabulary_size)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Step 5: Train the Model
epochs = 100
model.fit(X, y, batch_size=32, epochs=epochs)

# Step 6: Generate Text
def generate_text(model, seed_text, n_words, word_to_index, index_to_word):
    for _ in range(n_words):
        tokenized_seed = [word_to_index[word] for word in seed_text.split()]
        tokenized_seed = pad_sequences([tokenized_seed], maxlen=X.shape[1], padding='pre')
        predicted_probs = model.predict(tokenized_seed, verbose=0)[0]
        
        # Sample a word based on probabilities
        sampled_word = np.random.choice(vocabulary_size, p=predicted_probs)
        
        # Add sampled word to the seed text
        seed_text += ' ' + index_to_word[sampled_word]
    
    return seed_text

# Generate text starting with a specific seed
seed_text = 'from fairest'  # Change this to your desired start words
generated_text = generate_text(model, seed_text, 10, word_to_index, index_to_word)
print("Generated Text:", generated_text)


Vocabulary Size: 22
X shape: (20, 2)
y shape: (20,)


ValueError: Unrecognized keyword arguments passed to Embedding: {'input_length': 2}