In [1]:
import tensorflow_datasets as tfds
import numpy as np
import tensorflow as tf
import string
import time

# 1. The Dataset

In [3]:
def preprocess_strings(ds, sentence_wise=True):
    # make numpy string array from tfds    
    tfds_to_numpy = lambda x: next(iter(x))['text'].numpy()
    ds = tfds_to_numpy(ds).decode()                             
    
    # make list of just words
    ds_words = ds.lower().replace('\n', ' ').translate({ord("'"): None})
    exclude = string.punctuation.translate({ord("'"): None})
    table = ds_words.maketrans(exclude, ' '*len(exclude))                   
    ds_words = np.array(ds_words.translate(table).split())
    
    # creates two lookup tables, val->id and id->val
    dict_to_id = {val: i for i, val in enumerate(sorted(set(ds_words)))}        
    dict_to_val = {id_: val for val, id_ in dict_to_id.items()}
    vocab_size = len(ds_words)
    
    # define occurances of each token
    word_freq = [np.count_nonzero(ds_words==val) for _, val in dict_to_val.items()]

    # create a list of words split into sentences
    if sentence_wise: 
        ds = ds.lower().replace('\n', ' ').translate({ord("'"): None})
        exclude = string.punctuation.translate({ord("'"): None, ord('.'): None})
        table = ds.maketrans(exclude, ' '*len(exclude))
        ds = ' '.join(ds.translate(table).split()).split('.')
        ds = [sentence.translate({ord("."): None}).split() for sentence in ds]        
        
        ds = [[dict_to_id[word] for word in sentence] for sentence in ds]
        
    # use list of words
    else:
        ds = [dict_to_id[word] for words in ds_words]
    
    return ds, dict_to_id, dict_to_val, word_freq, vocab_size

In [4]:
def preprocess_tf_dataset(ds, word_to_id, vocab_size, threads=16, batch_size=32):
    # no need to shuffle, as dataset is shuffled within generator
    
    ds = ds.map(lambda x, y: (tf.one_hot(x, depth=vocab_size), 
                              tf.one_hot(y, depth=vocab_size)), 
                num_parallel_calls=threads)
    
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    return ds

In [5]:
train_ds = tfds.load(name='tiny_shakespeare',
                    shuffle_files=False, 
                    split='train')

train_ds, train_to_id, train_to_val, word_freq, vocab_size = preprocess_strings(train_ds)

print(train_ds[0:5], vocab_size)

[[3805, 1748, 830, 11048, 7595, 407, 4159, 4686, 6120, 9224], [279, 9224, 9224], [3805, 1748, 11456, 480, 279, 8183, 7913, 10151, 2757, 9960, 10151, 3612, 279, 8183], [8183], [3805, 1748, 3805, 11456, 5520, 1423, 6046, 5321, 1682, 3319, 10151, 9972, 7132]] 183574


In [6]:
ds = train_ds

def gen_word_embeddings():    
    while True:
        np.random.shuffle(ds)      
        
        # for each sentence generate one target and make input, target pairs from leftover words within sentence
        for sentence in ds:
            if len(sentence) == 0: continue
            
            word_id = np.random.randint(0, len(sentence))
            word = sentence[word_id]
            
            context_window = sentence[word_id-2:word_id] + sentence[word_id+1:word_id+3]
            np.random.shuffle(context_window)
            
            for target in context_window:
                yield word, target
                
gen = gen_word_embeddings()
for i in range(13):
    print(next(gen)[0])

11048
11048
11048
11048
10151
10151
10151
10151
6007
6007
6007
6007
2978


In [7]:
train_ds = tf.data.Dataset.from_generator(gen_word_embeddings,
                               output_signature=(tf.TensorSpec(shape=(), dtype=tf.int64),
                                                 tf.TensorSpec(shape=(), dtype=tf.int64)))

for i, x in train_ds.take(5):
    print(i)
    print(i.numpy())
    
train_ds = preprocess_tf_dataset(train_ds, train_to_id, vocab_size)

for x, t in train_ds.take(1):
    print(x)
    print(t)
    break

tf.Tensor(6410, shape=(), dtype=int64)
6410
tf.Tensor(6410, shape=(), dtype=int64)
6410
tf.Tensor(6410, shape=(), dtype=int64)
6410
tf.Tensor(6410, shape=(), dtype=int64)
6410
tf.Tensor(4107, shape=(), dtype=int64)
4107
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(32, 183574), dtype=float32)
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(32, 183574), dtype=float32)


In [8]:
for x, t in train_ds.take(1):
    print(x)
    print(t)
    break

tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(32, 183574), dtype=float32)
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(32, 183574), dtype=float32)


# 2. The Model

In [7]:
class SkipGram(tf.keras.layers.Layer):
    
    def __init__(self, vocab_size, embedding_size):
        super(SkipGram, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.act = tf.keras.layers.Softmax()
        
    def build(self, shape):
        self.w_in_embedding = self.add_weight(shape=(self.vocab_size, self.embedding_size), 
                                         initializer="random_normal", 
                                         trainable=True)

        # no bias as recommended
#         self.w_in_embedding_bias = self.add_weight(shape=(self.embedding_size), 
#                                          initializer="zeros", 
#                                          trainable=True)
        
        
        self.w_out_embedding = self.add_weight(shape=(self.embedding_size, self.vocab_size), 
                                   initializer="random_normal", 
                                   trainable=True)

    def call(self, x):
        # standart way
        in_embedding = tf.matmul(x, self.w_in_embedding)
        score_vec = tf.matmul(in_embedding, self.w_out_embedding)
        context_word = self.act(score_vec)

        # recommended with lookup
#         embed = tf.nn.embedding_lookup(self.w_in_embedding, tf.argmax(x, axis=-1))
        
#         loss = tf.nn.nce_loss(weights=self.w_in_embedding,             # [vocab_size, embed_size]
#                              biases=self.w_in_embedding_bias,          # [vocab_size]
#                              labels=target,                            # [bs, 1]
#                              inputs=embed,                                 # [bs, embed_size]
#                              num_sampled=20,
#                              num_classes=self.vocab_size)
       
        return context_word

# 3. Training

In [8]:
class Timer():
    """
    A small class for making timings.
    """
    def __init__(self):
        self._start_time = None

    def start(self):
        """
        Start a new timer
        """
        if self._start_time is not None:
            raise TimerError(f"Timer is running. Use .stop() to stop it")

        self._start_time = time.perf_counter()

    def stop(self):
        """
        Stop the timer, and report the elapsed time
        """
        if self._start_time is None:
            print(f"Timer is not running. Use .start() to start it")
            return 0
    
        elapsed_time = time.perf_counter() - self._start_time
        self._start_time = None
        return elapsed_time  

In [9]:
def SkipGram_train(model, train_ds, loss_function, optimizer, train_loss_metric, train_acc_metric):

    for x, target in train_ds.take(100):
        # forward pass with GradientTape
        with tf.GradientTape() as tape:
            prediction = model(x)
            loss = loss_function(target, prediction)

        # backward pass via GradienTape (auto-gradient calc)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # update metrics
        train_loss_metric.update_state(loss)
        train_acc_metric.update_state(target, prediction)


In [10]:
epochs = 100
learning_rate = 0.001
beta = 0.95
embedding_dim = 64

tf.keras.backend.clear_session()
SG_model = SkipGram(vocab_size, embedding_dim)
loss_function = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate,  beta)

timer = Timer()

In [11]:
# prepare metrics
train_acc_metric = tf.keras.metrics.CategoricalAccuracy('train_accuracy')
# test_acc_metric = tf.keras.metrics.CategoricalAccuracy('test_accuracy')

train_loss_metric = tf.keras.metrics.Mean('train_loss')
# test_loss_metric = tf.keras.metrics.Mean('test_loss')

# Initialize lists for later visualization.
train_losses = []
train_accuracies = []
# test_losses = []
# test_accuracies = []
times = []

In [12]:
epochs = 240
for epoch in range(epochs):
    print(f'\n[EPOCH] ____________________{epoch}____________________')
    
    # training step with metrics update--------------------------------------------------------
    timer.start()

    SkipGram_train(SG_model, train_ds, loss_function, optimizer, train_loss_metric, train_acc_metric)

    # Evaluating training metrics
    train_loss = train_loss_metric.result()
    train_acc = train_acc_metric.result()
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    
    elapsed_time = timer.stop()
    times.append(elapsed_time)
    
    print(f'[{epoch}] - Finished Epoch in {elapsed_time:0.2f} seconds - train_loss: {train_loss:0.4f}, train_acc: {train_acc:0.4f}')
    
    # evaluation step with metrics update--------------------------------------------------------
#     timer.start()

#     eval_step(model, test_ds, loss_function, 
#               loss_metric=test_loss_metric, 
#               acc_metric=test_acc_metric)

#     # Evaluating validation metrics
#     test_loss = test_loss_metric.result()
#     test_acc = test_acc_metric.result()
#     test_losses.append(test_loss)
#     test_accuracies.append(test_acc)
    
#     print(f'\n[{epoch}] - Finished evaluation - test_loss: {test_loss:0.4f}, test_accuracy: {test_acc:0.4f}')
    
    # Resetting train and validation metrics-----------------------------------------------------
    train_acc_metric.reset_states()
#     test_acc_metric.reset_states()
    train_loss_metric.reset_states()
#     test_loss_metric.reset_states()
    
#     elapsed_time = timer.stop()
#     times.append(elapsed_time)
  
    if epoch%3 == 0:
        print(f'\n[INFO] - Total time elapsed: {np.sum(times)/60:0.4f} min. Total time remaining: {(np.sum(times)/(epoch+1))*(epochs-epoch-1)/60:0.4f} min.')

print(f'[INFO] - Total run time: {np.sum(times)/60:0.4f} min.')


[EPOCH] ____________________0____________________
[0] - Finished Epoch in 5.61 seconds - train_loss: 12.1193, train_acc: 0.0000

[INFO] - Total time elapsed: 0.0935 min. Total time remaining: 22.3537 min.

[EPOCH] ____________________1____________________
[1] - Finished Epoch in 5.31 seconds - train_loss: 12.1141, train_acc: 0.0016

[EPOCH] ____________________2____________________
[2] - Finished Epoch in 5.21 seconds - train_loss: 12.0883, train_acc: 0.0131

[EPOCH] ____________________3____________________
[3] - Finished Epoch in 5.19 seconds - train_loss: 11.9829, train_acc: 0.0253

[INFO] - Total time elapsed: 0.3553 min. Total time remaining: 20.9624 min.

[EPOCH] ____________________4____________________
[4] - Finished Epoch in 5.24 seconds - train_loss: 11.6916, train_acc: 0.0231

[EPOCH] ____________________5____________________


KeyboardInterrupt: 