In [4]:
# https://karpathy.github.io/2015/05/21/rnn-effectiveness/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# Load the whole corpus into memory. The encoding is pinned so decoding does not
# depend on the platform's locale default (e.g. cp1252 on Windows).
with open("./shakespere.txt", encoding="utf-8") as f:
    text = f.read()

# Simple lower-cased whitespace split of the corpus.
tokens = text.lower().split()

In [5]:
# Build the word-level vocabulary from the full corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size is the number of known words plus one extra slot, since the
# fitted word ids start at 1 and id 0 is left for padding.
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)

In [8]:
# Sliding-window training samples: every sample holds `window_size` context
# word ids followed by the id of the word that comes right after them.
window_size = 5  # number of input words
encoded = tokenizer.texts_to_sequences([text])[0]

sequences = [
    encoded[start:start + window_size + 1]   # previous N words + next word
    for start in range(len(encoded) - window_size)
]

print(sequences[0])

[88, 269, 139, 35, 969, 143]


In [9]:
# Length of the longest sample (context words + target word); shorter samples,
# if any, are left-padded with zeros up to this length.
max_len = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [10]:
# Inputs are all columns but the last; the last column is the word to predict,
# one-hot encoded over the vocabulary for categorical cross-entropy.
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [11]:
# Feed-forward baseline for next-word prediction.
#
# Word ids are arbitrary categorical labels, not quantities, so they must not be
# fed to a Dense layer as raw numbers. Each id is first mapped to a learned
# 50-dim vector (Embedding), and the per-position vectors are concatenated
# (Flatten) before the fully connected layers.
mlp = tf.keras.Sequential([
    tf.keras.Input(shape=(X.shape[1],), dtype='int32'),
    tf.keras.layers.Embedding(vocab_size, 50),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

mlp.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# verbose=2 prints one summary line per epoch so a long run shows its progress
# instead of appearing hung.
mlp.fit(X, y, epochs=200, verbose=2)


  bias_constraint=None,


KeyboardInterrupt: 

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# RNN model (vanilla RNN)
#
# The input length is declared with an explicit Input layer rather than the
# Embedding `input_length` argument, which is deprecated in Keras 3.
rnn = tf.keras.Sequential([
    tf.keras.Input(shape=(X.shape[1],), dtype='int32'),
    # Embedding converts each word ID into a dense vector of features
    # (here 50-dim); words with similar contexts get similar embeddings.
    tf.keras.layers.Embedding(vocab_size, 50),
    tf.keras.layers.SimpleRNN(100),   # <- plain RNN here
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
rnn.fit(X, y, epochs=50, verbose=1)   # fewer epochs usually enough


In [None]:
def predict_next_word(model, tokenizer, text_seq, max_len):
    """Return the word the model considers most likely to follow ``text_seq``.

    Args:
        model: Trained model whose ``predict`` takes a batch of word-id rows of
            length ``max_len - 1`` and returns one score per vocabulary id.
        tokenizer: Fitted Keras ``Tokenizer`` used to encode the prompt and to
            map the predicted id back to a word.
        text_seq: Prompt text.
        max_len: Length of the training samples (context words + target word);
            the model input is therefore ``max_len - 1`` ids long.

    Returns:
        The predicted word, or ``None`` when the highest-scoring id has no
        word attached (e.g. the padding id 0).
    """
    token_ids = tokenizer.texts_to_sequences([text_seq])[0]

    # Left-pad short prompts with zeros; prompts longer than the context are
    # cut by pad_sequences to fit the model input length.
    model_input = pad_sequences([token_ids], maxlen=max_len - 1, padding='pre')

    # Single-row batch, so the flat argmax is the winning vocabulary id.
    best_id = int(model.predict(model_input, verbose=0).argmax())

    # index_word is the tokenizer's id -> word map; a direct lookup replaces a
    # linear scan over word_index.
    return tokenizer.index_word.get(best_id)


In [None]:
# Demo: ask the MLP for the word that should follow a short prompt and show
# the prompt together with the prediction.
seed_text = "to be or not"
next_word = predict_next_word(mlp, tokenizer, seed_text, max_len)
print(f"{seed_text} {next_word}")
