In [38]:
# https://karpathy.github.io/2015/05/21/rnn-effectiveness/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
with open("./tiny_tiny_shakespere.txt" ) as f:
    text = f.read()
tokens = text.lower().split()

In [40]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

In [41]:
sequences = []
window_size = 5  # number of input words

encoded = tokenizer.texts_to_sequences([text])[0]

for i in range(window_size, len(encoded)):
    seq = encoded[i-window_size:i+1]   # previous N words + next word
    sequences.append(seq)

print(sequences[0])

[5, 9, 276, 26, 129, 130]


In [42]:
max_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [43]:
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [44]:
from sklearn.model_selection import train_test_split

# Split data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [45]:
mlp = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

mlp.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
mlp.fit(X_train, y_train, epochs=200, verbose=0)


<keras.src.callbacks.history.History at 0x1d2cefa4790>

In [46]:
pred_probs = mlp.predict(X_test)
y_pred = np.argmax(pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)  # convert one-hot back to integers
accuracy = np.mean(y_true == y_pred)
print("Accuracy:", accuracy)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Accuracy: 0.02053388090349076


In [47]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# RNN model (vanilla RNN)
rnn = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 50, input_length=X.shape[1]), # Embedding converts each word ID into a dense vector of features (e.g., 50-dim). words with similar contexts get similar embeddings.
    tf.keras.layers.SimpleRNN(100),   # <- plain RNN here
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
rnn.fit(X_train, y_train, epochs=100, verbose=0)


<keras.src.callbacks.history.History at 0x1d2cdf07090>

In [48]:
pred_probs = rnn.predict(X_test)
y_pred = np.argmax(pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)  # convert one-hot back to integers
accuracy = np.mean(y_true == y_pred)
print("Accuracy:", accuracy)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Accuracy: 0.006160164271047228


In [49]:
def predict_next_word(model, tokenizer, text_seq, max_len):
    encoded = tokenizer.texts_to_sequences([text_seq])[0]
    
    encoded = pad_sequences([encoded], maxlen=max_len-1, padding='pre')
    
    y_pred = model.predict(encoded, verbose=0).argmax()
    
    for word, index in tokenizer.word_index.items():
        if index == y_pred:
            return word


In [50]:
seed_text = "No more"
next_word = ""
for i in range(10):
    next_word = predict_next_word(mlp, tokenizer, seed_text + next_word, max_len)
    seed_text += " " + next_word

print(seed_text, next_word)

No more the the hunt own whether well thither flatter broke and and


In [51]:
seed_text = "No more"
next_word = ""
for i in range(10):
    next_word = predict_next_word(rnn, tokenizer, seed_text + next_word, max_len)
    seed_text += " " + next_word

print(seed_text, next_word)

No more for for to speak it what very speak first bread bread
