In [247]:
import tensorflow_datasets as tfds
import numpy as np
import tensorflow as tf
import string

# 1. The Dataset

In [248]:
def preprocess(ds, trans_dict=False, sentence_wise=True):
    """Tokenize the text held in the first element of ``ds`` into lowercase words.

    The text is lowercased, newlines become spaces, apostrophes are deleted
    (``don't`` -> ``dont``) and every other punctuation character acts as a
    word separator.

    Args:
        ds: Iterable whose first element is a mapping with a ``'text'`` entry
            that exposes ``.numpy()`` returning UTF-8 bytes (the shape this
            notebook gets from ``tfds.load``). Only the first element is read.
        trans_dict: If True, additionally return the vocabulary lookup tables.
        sentence_wise: If True, return a list of sentences, each a list of
            words, where sentences are delimited by ``'.'`` only. If False,
            return one flat list of words.

    Returns:
        The tokenized text, or ``(tokens, dict_to_id, dict_to_val)`` when
        ``trans_dict`` is True. ``dict_to_id`` maps each distinct word to its
        index in the alphabetically sorted vocabulary; ``dict_to_val`` is the
        inverse mapping.
    """
    raw_text = next(iter(ds))['text'].numpy().decode()

    # Normalization shared by both output modes (previously computed twice).
    text = raw_text.lower().replace('\n', ' ').translate({ord("'"): None})

    # Apostrophes are already gone; everything else in string.punctuation
    # becomes a space so that it separates words.
    separators = string.punctuation.replace("'", '')
    words = text.translate(str.maketrans(separators, ' ' * len(separators))).split()

    if sentence_wise:
        # Keep '.' intact so it can serve as the sentence delimiter.
        inner_separators = separators.replace('.', '')
        table = str.maketrans(inner_separators, ' ' * len(inner_separators))
        candidates = (chunk.split() for chunk in text.translate(table).split('.'))
        # A trailing '.' or runs such as '...' produce empty chunks. Drop them:
        # an empty sentence has no words to sample from downstream.
        tokens = [sentence for sentence in candidates if sentence]
    else:
        tokens = words

    if trans_dict:
        # Two lookup tables over the full vocabulary: word -> id and id -> word.
        dict_to_id = {val: i for i, val in enumerate(sorted(set(words)))}
        dict_to_val = {id_: val for val, id_ in dict_to_id.items()}

        return tokens, dict_to_id, dict_to_val

    return tokens

In [251]:
# Fetch the raw splits; the order of the names in `split` fixes the unpacking order.
test_ds, train_ds = tfds.load(
    name='tiny_shakespeare',
    split=['test', 'train'],
    shuffle_files=False,
)

# Tokenize both splits sentence-wise (the default mode of preprocess);
# vocabulary lookup tables are requested for the test split only.
test_ds, test_to_id, test_to_val = preprocess(test_ds, trans_dict=True)
train_ds = preprocess(train_ds)

# Peek at the first five tokenized training sentences.
print(train_ds[:5])


[['first', 'citizen', 'before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak'], ['all', 'speak', 'speak'], ['first', 'citizen', 'you', 'are', 'all', 'resolved', 'rather', 'to', 'die', 'than', 'to', 'famish', 'all', 'resolved'], ['resolved'], ['first', 'citizen', 'first', 'you', 'know', 'caius', 'marcius', 'is', 'chief', 'enemy', 'to', 'the', 'people']]


In [253]:
# Corpus that gen_word_embeddings reads through the module-level name `ds`.
# This is an alias, not a copy: `ds` and `test_ds` refer to the same list object.
# NOTE(review): the test split is selected here; confirm train_ds was not intended.
ds = test_ds

def gen_word_embeddings(sentences=None, window=2):
    """Endlessly yield ``(context_word, target_word)`` training pairs.

    Each pass visits every sentence once in a fresh random order. For every
    sentence one target word is drawn uniformly at random, and each word lying
    within ``window`` positions of it (on either side, inside the same
    sentence) is yielded as its context, in random order.

    Args:
        sentences: Sequence of tokenized sentences (each a list of words).
            Defaults to the module-level ``ds`` so that calling the function
            with no arguments keeps working.
        window: Maximum number of context words taken on each side of the
            target. Must be at least 1.

    Yields:
        ``(context_word, target_word)`` tuples.

    Raises:
        ValueError: On first iteration, if ``window < 1`` or if no sentence has
            at least two words; in both cases the generator could never
            produce a pair and would otherwise loop forever.
    """
    if sentences is None:
        sentences = ds  # notebook-level corpus, kept as the default input
    if window < 1:
        raise ValueError("window must be at least 1")
    if not any(len(sentence) > 1 for sentence in sentences):
        raise ValueError("need at least one sentence with two or more words")

    while True:
        # Iterate over a random permutation of indices instead of shuffling the
        # list itself, so the caller's corpus keeps its order.
        for idx in np.random.permutation(len(sentences)):
            sentence = sentences[idx]
            # Empty sentences would make randint(0, 0) raise, and one-word
            # sentences have no context to pair with; skip both.
            if len(sentence) < 2:
                continue

            target_id = np.random.randint(0, len(sentence))
            target = sentence[target_id]

            # Clamp the left edge at 0: a negative slice start would wrap
            # around to the end of the sentence and drop the left neighbours
            # of targets near the beginning.
            left = sentence[max(0, target_id - window):target_id]
            right = sentence[target_id + 1:target_id + 1 + window]
            context_window = left + right
            np.random.shuffle(context_window)

            for word in context_window:
                yield (word, target)
                
# Smoke test: draw 13 pairs from the generator and print them for inspection.
gen = gen_word_embeddings()
for i in range(13):
    print(next(gen))

('ready', 'now')
('am', 'now')
('bring', 'to')
('try', 'to')
('with', 'to')
('her', 'to')
('clouds', 'thy')
('to', 'thy')
('strong', 'thy')
('bidding', 'thy')
('never', 'i')
('had', 'i')
('govern', 'to')
