##IMPORTS

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re

In [None]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it to the garbage
# collector; "utf-8-sig" transparently drops a leading byte-order mark.
with open("corpus.txt", "r", encoding="utf-8-sig") as corpus_file:
    dataset = corpus_file.read()

In [None]:
# Preview the first 100 characters of the raw corpus.
dataset[:100]

##PREPROCESSING

In [None]:
def clean_text(text):
    """Normalise raw corpus text for word-level tokenisation.

    The text is lower-cased, a fixed list of archaic or stylised spellings is
    rewritten to modern English, every character other than ASCII letters,
    apostrophes and newlines is replaced by a space, and runs of spaces are
    collapsed to one. Newlines are preserved so that they can be tokenised
    as tokens of their own later on.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, lower-cased text.
    """

    def whole_word(alternatives):
        # Match only when not embedded in a longer run of letters, so that
        # "thou" is rewritten while "without" and "thought" are left alone.
        return rf"(?<![a-zA-Z])(?:{alternatives})(?![a-zA-Z])"

    # lower-case
    text = text.lower()

    # Rules are applied top to bottom; the order is significant.
    rewrites = [
        # archaisms and other special cases
        # Elided forms must not be followed by a letter, otherwise an opening
        # quote in front of a word ("'embers") would be rewritten as well.
        # The space after "'cause" is deliberately not consumed so the next
        # word does not get glued onto "because".
        (r"'cause(?![a-zA-Z])", "because"),
        (r"shalt", "shall"),
        (r"kaos", "chaos"),
        (r"'till", "until"),
        (r"fuckin'", "fucking"),
        (r"couldst", "could"),
        (r"sayeth", "says"),
        (r"calleth", "calls"),
        (r"&", " and "),
        (r"sathan", "satan"),
        (r"'em(?![a-zA-Z])", "them"),
        # short forms that also occur inside other words
        (whole_word(r"thy|thine"), "your"),
        (whole_word(r"thou|thee|ye"), "you"),
        (whole_word(r"o"), "oh"),
        (whole_word(r"'tis"), "it is"),
        (whole_word(r"thru"), "through"),
        (whole_word(r"ov"), "of"),
        (whole_word(r"hast"), "have"),
    ]
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    # punctuation, except \n and '
    text = re.sub(r"[^\na-zA-Z']", " ", text)

    # double whitespaces
    text = re.sub(r" +", " ", text)

    return text

In [None]:
# Normalise the raw corpus before tokenising it.
text = clean_text(dataset)

In [None]:
# Split into tokens: every run of non-whitespace characters is one token and
# every newline is a token of its own.
tokens = re.compile(r"\S+|\n").findall(text)

In [None]:
# Spot-check a slice of the token list.
tokens[600:666]

##EXPLORATORY DATA ANALYSIS

In [None]:
# total number of tokens in the corpus (newline tokens included)
len(tokens)

In [None]:
# number of distinct tokens
len(set(tokens))

In [None]:
# TODO: count hapax legomena (tokens that occur only once); they behave like outliers, so consider removing them

In [None]:
# TODO: analyse the vocabulary and consider lemmatizing the tokens

##SEQUENCES

In [None]:
# Number of tokens in each input window.
seq_len = 50

# Slide a window of seq_len tokens over the corpus one token at a time: each
# window is one input sample and the token right after it is its target.
X_items = [
    tokens[start:start + seq_len]
    for start in range(len(tokens) - seq_len)
]
y = [tokens[start + seq_len] for start in range(len(tokens) - seq_len)]

In [None]:
# Turn every token window back into a single space-separated string.
X = list(map(" ".join, X_items))

In [None]:
# First five input strings.
X[:5]

In [None]:
# First five target tokens.
y[:5]

In [None]:
# TESTING THE DIMENSIONS

In [None]:
# Re-tokenise every joined input string with the corpus tokenising pattern.
x_test = list(map(re.compile(r"\S+|\n").findall, X))

In [None]:
# Token count of every re-tokenised input.
lengths_x = list(map(len, x_test))

In [None]:
# Distinct input lengths; a single value means all inputs have the same size.
set(lengths_x)

In [None]:
# Re-tokenise every target with the corpus tokenising pattern.
y_test = list(map(re.compile(r"\S+|\n").findall, y))

In [None]:
# Token count of every re-tokenised target.
lengths_y = list(map(len, y_test))

In [None]:
# Distinct target lengths; a single value means all targets have the same size.
set(lengths_y)

##TOKENIZATION

In [None]:
from keras.preprocessing.text import Tokenizer

# Keras' default filter set minus "\n", so newline tokens are kept.
filters_ = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t'
tokenizer = Tokenizer(filters=filters_)

# Build the vocabulary from the complete token stream, once. Fitting on the
# overlapping windows in X would count every token up to seq_len times and,
# more importantly, would never see the corpus's final token: it only ever
# appears as a target in y, so a unique final word would encode to an empty
# sequence there.
tokenizer.fit_on_texts([" ".join(tokens)])

In [None]:
# Encode inputs and targets as lists of integer ids from the fitted tokenizer.
X_emb = tokenizer.texts_to_sequences(X)
y_emb = tokenizer.texts_to_sequences(y)

In [None]:
# TESTING THE DIMENSIONS

In [None]:
# Number of ids in every encoded input.
lengths_X_emb = list(map(len, X_emb))

In [None]:
# Distinct encoded input lengths; more than one value means ragged inputs.
set(lengths_X_emb)

In [None]:
# Number of ids in every encoded target.
lengths_y_emb = list(map(len, y_emb))

In [None]:
# Distinct encoded target lengths; more than one value means ragged targets.
set(lengths_y_emb)

In [None]:
from tensorflow.keras.utils import to_categorical
classes = len(tokenizer.word_index)

# Tokenizer ids start at 1 (0 is never assigned to a word), so the largest
# label equals `classes`. to_categorical needs num_classes to be strictly
# greater than the largest label, hence the extra column for the unused 0.
y_categorical = to_categorical(y_emb, num_classes=classes + 1)

##MODEL

In [None]:
from keras.models import Sequential
from keras.layers import Input, Embedding, LSTM, Dense, Dropout

In [None]:
# Hyper-parameters of the network.
INPUT_SHAPE = seq_len  # tokens per input window
VOCABULARY_SIZE = len(tokenizer.word_index)  # distinct words known to the tokenizer
EMBEDDING_DIMENSION = 50
LSTM_UNITS = 50
# NOTE(review): no Dropout layer is added in the visible model cell, so this
# rate appears to be unused — confirm.
DROPOUT_RATE = 0.2

In [None]:
# Next-token model: token ids -> embeddings -> LSTM -> softmax over classes.
rnn = Sequential()

# The shape must be a tuple; one sample is a sequence of INPUT_SHAPE ids.
rnn.add(Input(shape=(INPUT_SHAPE,)))
# +1 because Tokenizer ids run from 1 to VOCABULARY_SIZE, so the embedding
# table needs VOCABULARY_SIZE + 1 rows (row 0 stays unused).
rnn.add(Embedding(VOCABULARY_SIZE+1, EMBEDDING_DIMENSION))
# Each window has a single target token, so only the last LSTM state is
# needed; returning the full sequence would produce a 3-D output that cannot
# be compared with the 2-D one-hot targets.
rnn.add(LSTM(LSTM_UNITS))
# The softmax width has to equal the width of the one-hot targets.
rnn.add(Dense(y_categorical.shape[-1], activation="softmax"))

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
rnn.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Print the layer-by-layer overview of the model.
rnn.summary()

##MODEL TRAINING

In [None]:
# fixes some errors
X_train = np.array(X_emb, dtype=float)
y_train = np.array(y_categorical, dtype=float)

In [None]:
# Training configuration.
BATCH_SIZE = 32  # samples per gradient update
EPOCHS = 5  # full passes over the training data

In [None]:
# Train the model on the prepared arrays.
rnn.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

##GENERATION