In [10]:
pip install watermark

Note: you may need to restart the kernel to use updated packages.


In [None]:
!pip install tensorflow

In [9]:
import os
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from collections import Counter
from keras.utils import to_categorical
from tensorflow.keras.utils import get_file
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint

# pick up edits made to imported modules without having to restart the kernel
%load_ext autoreload
%autoreload 2
# render matplotlib figures inside the notebook, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# the watermark extension has to be loaded before the %watermark magic is available;
# it prints the author together with the python and package versions used for this run
%load_ext watermark
%watermark -a 'Ethen' -d -t -v -p keras,numpy,matplotlib,tensorflow

def elapsed(sec):
    """
    Format a duration given in seconds as a human readable string.

    Durations shorter than a minute are reported in seconds, durations
    shorter than an hour in minutes, and everything else in hours. The
    number is rendered with ``str``, so no rounding is applied.

    Parameters
    ----------
    sec : int or float
        Elapsed time in seconds.

    Returns
    -------
    str
        The converted value followed by ' seconds', ' minutes' or ' hours'.

    Examples
    --------
    from time import time

    start = time()
    # do something that's worth timing, like training a model
    elapse = time() - start
    elapsed(elapse)
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * 60

    if sec < seconds_per_minute:
        return str(sec) + ' seconds'

    if sec < seconds_per_hour:
        return str(sec / seconds_per_minute) + ' minutes'

    return str(sec / seconds_per_hour) + ' hours'

# fetch the corpus, get_file hands back the local path of the downloaded file,
# then read the whole file into memory as one string
path = get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt'
)
with open(path, encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()

# quick sanity check of what was loaded
print('corpus length:', len(raw_text))
print('example text:', raw_text[:150])

# ideally, we would save the cleaned text, to prevent
# doing this step every single time;
# the corpus joins words with a double hyphen (e.g. "woman--what"),
# so that is turned into a space before splitting on whitespace
tokens = raw_text.replace('--', ' ').split()

# strip punctuation from every token, then keep only the results
# that are purely alphabetic, lower-cased
table = str.maketrans('', '', string.punctuation)
stripped_tokens = (token.translate(table) for token in tokens)
cleaned_tokens = [token.lower() for token in stripped_tokens if token.isalpha()]

print('sampled original text: ', tokens[:10])
print('sampled cleaned text: ', cleaned_tokens[:10])

# build up vocabulary,
# words that occur fewer than min_count times are treated as out of
# vocabulary words and are all represented by a single unknown token,
# which always sits at index 0
min_count = 2
unknown_token = '<unk>'

counter = Counter(cleaned_tokens)
frequent_words = [word for word, count in counter.items() if count >= min_count]

# index2word and word2index are inverse mappings of each other
index2word = [unknown_token] + frequent_words
word2index = {word: index for index, word in enumerate(index2word)}

# every distinct word that did not make it into the vocabulary
filtered_words = len(counter) - len(frequent_words)

num_classes = len(word2index)
print('vocabulary size: ', num_classes)
print('filtered words: ', filtered_words)

# create semi-overlapping sequences of words with
# a fixed length specified by the maxlen parameter,
# each sequence is paired with the word that immediately follows it
step = 3
maxlen = 40

# map every token to its vocabulary index once up front, tokens that
# are not part of the vocabulary share the unknown token's index
unknown_index = word2index[unknown_token]
encoded_tokens = [word2index.get(token, unknown_index) for token in cleaned_tokens]

# a new window starts every `step` tokens, the last window stops early
# enough that there is still one token left to serve as its target
window_starts = range(0, len(cleaned_tokens) - maxlen, step)
X = [encoded_tokens[start:start + maxlen] for start in window_starts]
y = [encoded_tokens[start + maxlen] for start in window_starts]

# keras expects the target to be in one-hot encoded format,
# ideally we would use a generator that performs this conversion
# only on the batch of data that is currently required by the model
# to be more memory-efficient
X = np.array(X)
Y = to_categorical(y, num_classes)
print('sequence dimension: ', X.shape)
print('target dimension: ', Y.shape)
print('example sequence:\n', X[0])


Author: Ethen

Python implementation: CPython
Python version       : 3.12.7
IPython version      : 8.30.0

keras     : 3.7.0
numpy     : 1.26.4
matplotlib: 3.9.2
tensorflow: 2.18.0

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
600901/600901 ━━━━━━━━━━━━━━━━━━━━ 2s 3us/step
corpus length: 600893
example text: PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists
sampled original text:  ['PREFACE', 'SUPPOSING', 'that', 'Truth', 'is', 'a', 'woman', 'what', 'then?', 'Is']
sampled cleaned text:  ['preface', 'supposing', 'that', 'truth', 'is', 'a', 'woman', 'what', 'then', 'is']
vocabulary size:  5090
filtered words:  5097
sequence dimension:  (33342, 40)
target dimension:  (33342, 5090)
example sequence:
 [ 1  2  3  4  5  6  7  8  9  5 10 11 12 13  0  3 14 15 16 17 18 19 20 21
 22 23 21 24 25 26 27  3 28 29 30 31 32  0 33 34]
