<a href="https://colab.research.google.com/github/SveaWilkening/Gilmore_Girls_Episode_Writer/blob/main/GG_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import sys

In [3]:
# Colab-only helper: upload the corpus file into the runtime
# (the recorded run below saved gg_s1.txt).
from google.colab import files
uploaded = files.upload()

Saving gg_s1.txt to gg_s1.txt


In [4]:
# Read the whole uploaded corpus into memory. The context manager closes the
# file handle as soon as the text has been read instead of leaving it open.
with open(r"gg_s1.txt", encoding="latin-1") as corpus_file:
    file = corpus_file.read()


In [5]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads from.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Tokenize the text: lowercase it, split it into word tokens, and remove English stop words:

In [6]:
def tokenize_words(input):
    """Lowercase the text, split it into word tokens and drop English stop words.

    Args:
        input: Raw text as a string. (The name shadows the builtin ``input``;
            it is kept so existing keyword callers keep working.)

    Returns:
        A single string containing the remaining tokens joined by one space.
    """
    text = input.lower()

    # \w+ matches runs of word characters, so punctuation and whitespace are discarded
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Build the stop-word set once: calling stopwords.words() per token re-creates
    # the list every time, and list membership is a linear scan.
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)

# Clean the full corpus; tokenize_words returns one space-separated string.
processed_inputs = tokenize_words(file)

build a dictionary representing every character as a number

In [7]:
# Vocabulary: every distinct character of the cleaned text, in sorted order
chars = sorted(set(processed_inputs))
# Lookup from a character to its position in `chars`
char_to_num = {ch: idx for idx, ch in enumerate(chars)}

vocab_len = len(chars)
input_len = len(processed_inputs)

print(input_len)  # total number of characters in the cleaned text
print(vocab_len)  # number of unique characters

513230
45


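Set the sequence length (100 characters here) and build the training pairs: each input is a window of 100 consecutive characters, and its target is the character that follows the window.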

In [8]:
seq_length = 100

# Encode the corpus once. Looking characters up inside the window loop would
# repeat the same dictionary lookup for every window a character falls in.
encoded = [char_to_num[char] for char in processed_inputs]

x_data = []
y_data = []

# Slide a window of seq_length characters over the text one step at a time:
# the window is the input, the character right after it is the target.
for i in range(input_len - seq_length):
    x_data.append(encoded[i:i + seq_length])  # slicing yields an independent list
    y_data.append(encoded[i + seq_length])

In [9]:
n_patterns = len(x_data)
# Lay the windows out as (samples, time steps, features) and divide by the
# vocabulary size so every character index is scaled into [0, 1).
X = np.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

one-hot encode y-data

In [10]:
# One-hot encode the targets. num_classes is pinned to the vocabulary size so the
# encoding always has one column per character; without it to_categorical infers
# max(y_data) + 1 columns, which is too few if the highest-indexed character
# never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

build the model

In [11]:
# Three stacked LSTM layers, each followed by 20% dropout. The first two return
# the full sequence so the next LSTM receives one vector per time step; the last
# returns only its final state, which feeds a softmax over the target classes.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [12]:
# Categorical cross-entropy against the one-hot targets, optimised with Adam
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Checkpoint to disk whenever the training loss reaches a new minimum
# (monitor='loss' with mode='min' and save_best_only=True).
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [19]:
# Train for 4 epochs in batches of 256; the checkpoint callback saves to disk
# each time the loss improves.
# NOTE(review): the recorded output starts at "improved from 1.91114" in epoch 1,
# which suggests this cell was run more than once on the same model - confirm
# that a fresh Restart & Run All reproduces the reported losses.
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from 1.91114 to 1.84607, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 1.84607 to 1.79827, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 1.79827 to 1.75968, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 1.75968 to 1.72771, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x7fbce5572780>

Reload the weights

In [20]:
# Restore the checkpointed weights into the existing model and compile it again
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Inverse lookup: index -> character, used to decode predictions back to text
num_to_char = dict(enumerate(chars))

Pick a random sequence from the training data to use as the seed text

In [21]:
# The upper bound of np.random.randint is exclusive, so len(x_data) makes every
# training pattern (including the last one) eligible as a seed.
start = np.random.randint(0, len(x_data))
# Copy the seed: `pattern` is modified during generation, and aliasing
# x_data[start] would silently change the training data on re-runs.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" small children four baby chicks rory nice time lorelai bad rory big smile bad night lorelai yeah wel "


Generate 1,000 characters of dialogue by repeatedly predicting the next character and sliding the input window forward by one

In [22]:
# Generate 1000 characters, one per iteration.
for i in range(1000):
    # Shape the current window as a single sample and scale it like the training data
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)

    # Greedy decoding: always take the most probable next character. This is
    # deterministic for a given window, so the output can fall into a repeating loop.
    index = int(np.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Slide the window: drop the oldest character and add the prediction. Building
    # a new list (rather than append + slice) never mutates the list the seed came from.
    pattern = pattern[1:] + [index]

l lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lorelai oh go lore