<a href="https://colab.research.google.com/github/Spinkk/Implementing-ANNs-with-Tensorflow/blob/main/HW10_problem02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import time
import datetime
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.neighbors import NearestNeighbors

# 1. The Dataset

Load the dataset, create dictionaries and separate each word

In [2]:
def preprocess(ds, trans_dict=False, sentence_wise=True):
    # make numpy string array from tfds    
    tfds_to_numpy = lambda x: next(iter(x))['text'].numpy()
    ds = tfds_to_numpy(ds).decode()                             
    
    ds_words = ds.lower().replace('\n', ' ').translate({ord("'"): None})
    exclude = string.punctuation.translate({ord("'"): None})
    table = ds_words.maketrans(exclude, ' '*len(exclude))                   
    ds_words = ds_words.translate(table).split()
    
    # create a list of words split into sentences
    # TODO: apply subsampling to remove invaluable words
    if sentence_wise: 
        ds = ds.lower().replace('\n', ' ').translate({ord("'"): None})
        exclude = string.punctuation.translate({ord("'"): None, ord('.'): None})
        table = ds.maketrans(exclude, ' '*len(exclude))
        ds = ' '.join(ds.translate(table).split()).split('.')
        ds = [sentence.translate({ord("."): None}).split() for sentence in ds]        
    
    # create a list of words concatenated
    else:
        ds = ds_words
    
    if trans_dict:
        # creates two lookup tables, val->id and id->val
        dict_to_id = {val: i for i, val in enumerate(sorted(set(ds_words)))}        
        dict_to_val = {id_: val for val, id_ in dict_to_id.items()}
        
        return ds, dict_to_id, dict_to_val
    
    return ds

In [3]:
# TODO: preprocess doesn't work if we only call train partition
test_ds, ds = tfds.load(name='tiny_shakespeare',
                              shuffle_files=False, 
                        split=['test', 'train'])

ds, val_to_id, id_to_val = preprocess(ds, trans_dict=True)

print(ds[0:5])

[1mDownloading and preparing dataset tiny_shakespeare/1.0.0 (download: Unknown size, generated: 1.06 MiB, total: 1.06 MiB) to /root/tensorflow_datasets/tiny_shakespeare/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incompleteBT3FTF/tiny_shakespeare-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incompleteBT3FTF/tiny_shakespeare-validation.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incompleteBT3FTF/tiny_shakespeare-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

[1mDataset tiny_shakespeare downloaded and prepared to /root/tensorflow_datasets/tiny_shakespeare/1.0.0. Subsequent calls will reuse this data.[0m
[['first', 'citizen', 'before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak'], ['all', 'speak', 'speak'], ['first', 'citizen', 'you', 'are', 'all', 'resolved', 'rather', 'to', 'die', 'than', 'to', 'famish', 'all', 'resolved'], ['resolved'], ['first', 'citizen', 'first', 'you', 'know', 'caius', 'marcius', 'is', 'chief', 'enemy', 'to', 'the', 'people']]


Create input-output pairs with id

In [4]:
BUFFER_SIZE = 512
BATCH_SIZE = 64

# TODO: maybe treat whole text as one instead of parsing per sentence?
# flatten = lambda *n: (e for a in n 
#                       for e in (flatten(*a) if isinstance(a, (tuple, list)) else (a,)))
# list(flatten(train_ds))

   
def make_gen_callable(ds, id_dict):
    '''
    Wrapper to make the generator callable
    '''

    def gen_word_embeddings_int():
        '''
        Generate a input target pair in int id
        '''
        
        # for each sentence generate one target and make input, target pairs from leftover words within sentence
        for sentence in ds:
            input_id = np.random.randint(0, len(sentence))
            input = sentence[input_id]
            
            # window size of 4
            context_window = sentence[input_id-2:input_id] + sentence[input_id+1:input_id+3]            
            # TODO: sample uniformly
            for target in context_window:
                # instead of the word, yield one-hot indices
                yield (id_dict[input], id_dict[target])
    return gen_word_embeddings_int


ds = tf.data.Dataset.from_generator(make_gen_callable(ds, val_to_id),
                                          output_types=(tf.int32, tf.int32))
ds = ds.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE)

In [5]:
for i,t in ds.take(1):
    print(i,t)

tf.Tensor(
[10036   301  6511   351  6410 11150 11269  9972  4653  7713  9029  8921
   480  3178 11379  6849     1  6627  9972  5013  9398  1714  8527  8644
   830     1 11207  6635 10151  3584  8792  4832 10036  3178  1172 11128
  5321  1748  4774 10151  4981  4358 11226  9808  4949   363 11467  8921
 11281 11281 10036 10083  3760  5711  3164  5948  9960 11281  4141  9666
  4668  9972  8792 10151], shape=(64,), dtype=int32) tf.Tensor(
[ 9970 10722 11347 11083  5334  7912  6463  4349  8683  7240   520  6817
 10754  5101 11456  8963  5321  9960  6109  9972 11281 10293  1748  5013
  1748  4668  4437  3245  9972  4358 11379  9880  5101  5339  6511  6178
   520  5013 11048  3319  4554  9972 11269  6815 11467  6881  5438  6495
  9614  4358   772   363  2365  4757 11441 10916  5702  9970  3771 10218
  6046  1500 10014  1076], shape=(64,), dtype=int32)


# 2.2 Model

In [6]:
class SkipGram(tf.keras.layers.Layer):
    def __init__(self, num_vocabulary, embedding_dim=64):
        super(SkipGram, self).__init__()
        self.v = num_vocabulary
        self.h = embedding_dim
    
    def build(self,_):
        self.embedding_mat = self.add_weight(shape=(self.v, self.h),
                                             initializer="random_normal",
                                             trainable=True) 
        self.output_mat = self.add_weight(shape=(self.v, self.h),
                                          initializer="random_normal",
                                          trainable=True) 
        self.output_bias = self.add_weight(shape=(self.v,),
                                           initializer="random_normal",
                                           trainable=True)

    def call(self, input_id, target_id):
        # (batch,h) = from (v,h) select 'batch_num' v* by lookup
        embedding_vec = tf.nn.embedding_lookup(self.embedding_mat, input_id)

        # compute score vector, softmax of it and loss in one function call
        loss = tf.nn.nce_loss(weights=self.output_mat,  # (v,h)
                              biases=self.output_bias,  # (v,)
                              labels=tf.expand_dims(target_id,axis=1),  # (batch,1)
                              inputs=embedding_vec,  # (batch,h)
                              num_sampled=1,  # TODO: negative sampling
                              num_classes=self.v)  
        return tf.math.reduce_mean(loss)  # average over loss of each sample

    def embedding(self, input_id):
        return tf.nn.embedding_lookup(self.embedding_mat, input_id)

In [7]:
ex = SkipGram(num_vocabulary=500, embedding_dim=100)
# example case where batch size is two
ex(input_id=tf.constant([1,2]),  
   target_id=tf.constant([3,99]))

<tf.Tensor: shape=(), dtype=float32, numpy=4.7685738>

# 2.3 Training

In [8]:
def train_step(model, input_batch, target_batch, optimizer):
    '''
    Training for one batch
    '''

    with tf.GradientTape() as tape:
        loss = model(input_batch, target_batch)  # call directly returns the loss
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        return loss


def nearest_neighbours(model, num_vocab, selected_word_id, val_dict, k=5):
    '''
    For selected words, find out k neighbouring words and print them
    '''

    cosine_similarity = lambda x,y : np.dot(x,y) / (np.linalg.norm(x) * np.linalg.norm(y))

    # embeddings of words
    embedding_selected_word = skipgram.embedding(tf.constant(selected_word_id))
    embedding_every_word = skipgram.embedding(tf.constant(list(range(num_vocab))))

    # fit nearest neighbours using cosine similarity and embeddings of all words
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', metric=cosine_similarity)
    nbrs.fit(embedding_every_word)
    # find k_nearest nbrs of selected words. dim:(num_selected, k)
    id_nbrs = nbrs.kneighbors(embedding_selected_word, n_neighbors=k, return_distance=False)
    
    # print neighbours in words instead of id
    for i, sel_w_id in enumerate(selected_word_id):
        query_w = val_dict[sel_w_id]
        neigh_w = []
        for j in range(k):
            neigh_w.append(val_dict[id_nbrs[i,j]])
        print('{} {} most similar words: {}'.format(query_w, k, neigh_w))

In [9]:
NUM_VOCABULARY = max(val_to_id.values())+1  # number of vocabulary in the whole dataset
LEARNING_RATE = 0.01

# define model
skipgram = SkipGram(num_vocabulary=NUM_VOCABULARY)

# define optimizer
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)

# initialize the logger for Tensorboard visualization
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/' + current_time + '/train'    
train_summary_writer = tf.summary.create_file_writer(train_log_dir)  

In [10]:
EPOCHS = 20
SELECTED_WORDS = [val_to_id[word] for word in ['queen', 'throne', 'wine', 'poison', 'love', 'strong', 'day']]

for epoch in range(EPOCHS):
    print('Epoch: ', epoch)
    ### Training step
    train_losses = []  # each entry is averaged loss of each batch
    # train over all batches
    for input_batch, target_batch in ds:
        train_losses.append(train_step(skipgram, input_batch, target_batch, optimizer))
    # log train loss
    with train_summary_writer.as_default():  
        tf.summary.scalar('loss', np.mean(train_losses), step=epoch)

    ### Nearest neighbours to check embeddings
    nearest_neighbours(skipgram, NUM_VOCABULARY, SELECTED_WORDS, id_to_val)
    print(' ')

Epoch:  0
queen 5 most similar words: ['ensue', 'aimd', 'ascent', 'thraldom', 'nayward']
throne 5 most similar words: ['discourses', 'ensue', 'shineth', 'aimd', 'concludes']
wine 5 most similar words: ['running', 'couch', 'sternness', 'supposest', 'regiment']
poison 5 most similar words: ['concludes', 'running', 'sufferance', 'ascent', 'already']
love 5 most similar words: ['ensue', 'thraldom', 'sneak', 'aimd', 'shineth']
strong 5 most similar words: ['ensue', 'common', 'discourses', 'aimd', 'ascent']
day 5 most similar words: ['ensue', 'discourses', 'common', 'aimd', 'shineth']
 
Epoch:  1
queen 5 most similar words: ['francisca', 'laughing', 'resemblance', 'monsters', 'simply']
throne 5 most similar words: ['laughing', 'francisca', 'resemblance', 'redeemd', 'race']
wine 5 most similar words: ['laughing', 'francisca', 'simply', 'tellst', 'resemblance']
poison 5 most similar words: ['francisca', 'laughing', 'resemblance', 'tellst', 'simply']
love 5 most similar words: ['laughing', 'fra