In [1]:
import tensorflow as tf
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import pickle 
from pathlib import Path
from tensorflow import keras
from keras.models import Sequential
from keras.layers import GRU, LSTM, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import os

# This script calculates character embeddings from description texts and
# saves the results in embeddings.html.
# Original code:
# https://keras.io/examples/generative/lstm_character_level_text_generation/
# Hyperparameters that have been little explored in this work are SEQ_LENGTH and
# EMBED_LEN (see below). SEQ_LENGTH defines the length of the substring from which
# the next character is predicted. EMBED_LEN specifies the dimension of the embedding vector.
# The smaller EMBED_LEN is, the more accurate the segmentation of the MHA layer output
# will be, but training of the FCNN + RNN model may be more difficult.

# Run from the project directory so that every relative path used by this
# script (input CSV, pickled tokenizer, HTML report) resolves against it.
WORKING_DIRECTORY = 'C:/Pilot/test/'
os.chdir(WORKING_DIRECTORY)

# Load the description texts. Missing descriptions become empty strings:
# filling them with the integer 0 made the whitespace normalisation below
# fail with AttributeError, because an int has no .split().
fpini = Path('inidata.csv')
df = pd.read_csv(fpini)[['txt']].fillna('')

# Collapse every run of whitespace into a single space. str() guards against
# cells that pandas parsed as numbers rather than text.
ltext = [" ".join(str(x).split()) for x in df.txt.values.tolist()]

# Character-level vocabulary built from the normalised texts.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(ltext)

print(tokenizer.word_index)

# Save tokenizer to use in "model p1" script
fpT = Path('tokenizer.pickle')
with open(fpT, 'wb') as handle:
    pickle.dump(tokenizer, handle)

SEQ_LENGTH = 8 # hyperparameter to tune 
x_train, y_train = [], []

# Slide a SEQ_LENGTH-character window over every text: the window is the
# model input and the character right after it is the prediction target.
# Texts no longer than SEQ_LENGTH contribute no samples.
for text in ltext:
    # Lower-case the whole text once, before slicing. Lower-casing slices or
    # single characters separately can give a different result for characters
    # whose lower-case form depends on context or changes length, which led
    # to KeyError on the target lookup or to windows of unequal length.
    lowered = text.lower()
    for start in range(len(lowered) - SEQ_LENGTH):
        stop = start + SEQ_LENGTH
        x_train.append(lowered[start:stop])
        y_train.append(tokenizer.word_index[lowered[stop]])

# Replace every character of every window with its vocabulary index.
x_train = tokenizer.texts_to_sequences(x_train)

x_train, y_train = np.array(x_train, dtype=np.int32), np.array(y_train)

print('x_train.shape = ', x_train.shape)
print('y_train.shape = ', y_train.shape)

# Model and training hyperparameters.
EMBED_LEN = 16  # hyperparameter to tune
RNN_UNITS = 256
EPOCHS = 6

# The embedding table and the softmax output both need one slot per
# vocabulary entry plus one extra slot for index 0.
vocab_size = len(tokenizer.word_index) + 1

# Next-character predictor: embedding -> recurrent layer -> softmax.
model = Sequential()
model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=EMBED_LEN,
        input_length=SEQ_LENGTH,
    )
)
# Alternative recurrent layer that was left commented out: GRU (RNN_UNITS)
model.add(LSTM(RNN_UNITS))
model.add(Dense(vocab_size, activation="softmax"))

model.summary()

# Targets are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
)
model.fit(x_train, y_train, batch_size=1024, epochs=EPOCHS)

# Export the trained embedding vectors as an HTML table.
fpE = Path('embeddings.html')

# Vocabulary as a two-column table: the character and its token id.
dfEmb = pd.DataFrame(tokenizer.word_index.items(), columns=['char', 'tkid'])

# Every token id in ascending order, with id 0 added explicitly.
token_ids = sorted([*tokenizer.word_index.values(), 0])
id_tensor = tf.constant(token_ids)

# Look up the embedding vector of each id in the model's first layer and
# tag each row with the id it belongs to.
embedding_rows = tf.nn.embedding_lookup(model.layers[0].embeddings, id_tensor)
vectors_df = pd.DataFrame(embedding_rows)
vectors_df['eid'] = id_tensor

# A right join keeps every embedding row, including ids that have no
# matching character in the vocabulary table.
dfEmb = dfEmb.merge(vectors_df, how='right', left_on='tkid', right_on='eid')
dfEmb.to_html(fpE, index=False, encoding='UTF-8')


{' ': 1, '0': 2, 'l': 3, 'p': 4, 's': 5, '1': 6, 'u': 7, 'm': 8, 't': 9, '%': 10, 'c': 11, '5': 12, 'r': 13, 'g': 14, 'k': 15, 'e': 16, '.': 17, 'b': 18, 'f': 19, '3': 20, '2': 21, 'h': 22, 'n': 23, 'a': 24, '6': 25, '4': 26, 'v': 27, 'd': 28, 'i': 29, 'o': 30, 'j': 31, '9': 32, 'z': 33}
x_train.shape =  (3999126, 8)
y_train.shape =  (3999126,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 16)             544       
                                                                 
 lstm (LSTM)                 (None, 256)               279552    
                                                                 
 dense (Dense)               (None, 34)                8738      
                                                                 
Total params: 288,834
Trainable params: 288,834
Non-trainable params: 0
_________________________________

KeyboardInterrupt: 