##IMPORTS

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re

In [3]:
dataset = open("corpus.txt", "r", encoding = "utf-8-sig").read()

In [4]:
dataset[:100]

'In the Devoid of Sorrow\nThe Air and my Heart full of Luster\nBehind the Mountains\nThere must be a key'

##PREPROCESSING

In [5]:
# preserves the \n character, unlike the regular .split()
tokens = re.findall(r"\S+|\n", dataset)

In [6]:
tokens[:10]

['In', 'the', 'Devoid', 'of', 'Sorrow', '\n', 'The', 'Air', 'and', 'my']

In [7]:
def clean_text(text):
    # punctuation, except ' - and \
    text = re.sub(r"[^\a-zA-Z'-]", " ", text)
    # double whitespaces
    text = re.sub(r" +", " ", text)
    # lower-case
    text = text.lower()
    # clean archaisms and other special cases
    text = re.sub(r"thy|thine", "your", text)
    text = re.sub(r"thou|thee|ye", "you", text)
    text = re.sub(r"'cause", "because", text)
    text = re.sub(r"shalt", "shall", text)
    #text = re.sub(r"", "", text)
    #ADD OTHERS
  
    return text

In [8]:
text = [clean_text(token) for token in tokens]

In [9]:
text[:10]

['in', 'the', 'devoid', 'of', 'sorrow', '\n', 'the', 'air', 'and', 'my']

##EXPLORATORY DATA ANALYSIS

In [10]:
# total tokens
len(tokens)

390866

In [11]:
# unique tokens
len(set(tokens))

29614

In [12]:
# hapax legomena - outliers - possibly remove?

In [13]:
# vocab analysis - lemmatize

##SEQUENCES

In [14]:
seq_len = 50
X_items = []
y = []

for i in range(0, len(tokens)-seq_len):
	X_items.append(tokens[i:i+seq_len])
	y.append(tokens[i+seq_len])

In [16]:
X = [" ".join(item) for item in X_items]

In [17]:
X[0]

'In the Devoid of Sorrow \n The Air and my Heart full of Luster \n Behind the Mountains \n There must be a key to the Gates \n Right passed easy Talk \n We can once again walk the Path of Sorrow \n Let the Turn of Search bring you'

In [18]:
y[0]

'away'

In [19]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

In [20]:
X_embedded = tokenizer.texts_to_sequences(X)
y_embedded = tokenizer.texts_to_sequences(y)

In [None]:
X_embedded[0]

In [48]:
# CONVERT TO TENSORS
type(X_embedded)

list

In [None]:
y_embedded[0]

In [49]:
# CONVERT TO (CATEGORICAL) TENSORS
type(y_embedded)

list

##MODEL

In [23]:
from keras.models import Sequential
from keras.layers import Input, Embedding, LSTM, Dense, Dropout

In [24]:
INPUT_SHAPE = seq_len
VOCABULARY_SIZE = len(tokenizer.word_index)
EMBEDDING_DIMENSION = 50
LSTM_UNITS = 50
DROPOUT_RATE = 0.2

In [34]:
rnn = Sequential()

rnn.add(Input(INPUT_SHAPE,))
# +1 here resolves the indexing problem during training
rnn.add(Embedding(VOCABULARY_SIZE+1, EMBEDDING_DIMENSION))
rnn.add(LSTM(LSTM_UNITS, return_sequences=True))
rnn.add(Dense(VOCABULARY_SIZE, activation="softmax"))

In [35]:
rnn.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [36]:
rnn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 50)            886750    
                                                                 
 lstm_1 (LSTM)               (None, 50, 50)            20200     
                                                                 
 dense (Dense)               (None, 50, 17734)         904434    
                                                                 
Total params: 1,811,384
Trainable params: 1,811,384
Non-trainable params: 0
_________________________________________________________________


##MODEL TRAINING

##GENERATION