<a href="https://colab.research.google.com/github/Spinkk/Implementing-ANNs-with-Tensorflow/blob/main/HW10_Janosch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import time
import datetime
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.neighbors import NearestNeighbors

# 1. The Dataset

Load the dataset, create dictionaries and separate each word

In [2]:

def preprocess_strings(ds, sentence_wise=True):
    # make numpy string array from tfds    
    tfds_to_numpy = lambda x: next(iter(x))['text'].numpy()
    ds = tfds_to_numpy(ds).decode()                             
    
    # make list of just words
    ds_words = ds.lower().replace('\n', ' ').translate({ord("'"): None})
    exclude = string.punctuation.translate({ord("'"): None})
    table = ds_words.maketrans(exclude, ' '*len(exclude))                   
    ds_words = np.array(ds_words.translate(table).split())
    
    # creates two lookup tables, val->id and id->val
    dict_to_id = {val: i for i, val in enumerate(sorted(set(ds_words)))}        
    dict_to_val = {id_: val for val, id_ in dict_to_id.items()}
    vocab_size = len(ds_words)
    
    # define occurances of each token
    word_freq = [np.count_nonzero(ds_words==val) for _, val in dict_to_val.items()]

    # create a list of words split into sentences
    if sentence_wise: 
        ds = ds.lower().replace('\n', ' ').translate({ord("'"): None})
        exclude = string.punctuation.translate({ord("'"): None, ord('.'): None})
        table = ds.maketrans(exclude, ' '*len(exclude))
        ds = ' '.join(ds.translate(table).split()).split('.')
        ds = [sentence.translate({ord("."): None}).split() for sentence in ds]        
        
        ds = [[dict_to_id[word] for word in sentence] for sentence in ds]
        
    # use list of words
    else:
        ds = [dict_to_id[word] for words in ds_words]
    
    return ds, dict_to_id, dict_to_val, word_freq, vocab_size
def preprocess_tf_dataset(ds, word_to_id, vocab_size, threads=16, batch_size=32):
    # no need to shuffle, as dataset is shuffled within generator
    
    ds = ds.map(lambda x, y: (tf.one_hot(x, depth=vocab_size), 
                              tf.one_hot(y, depth=vocab_size)), 
                num_parallel_calls=threads)
    
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    return ds
train_ds = tfds.load(name='tiny_shakespeare',
                    shuffle_files=False, 
                    split='train')

train_ds, train_to_id, train_to_val, WORD_FREQ, VOCAB_SIZE = preprocess_strings(train_ds)

ds = train_ds

def gen_word_embeddings():    
    while True:
        np.random.shuffle(ds)      
        
        # for each sentence generate one target and make input, target pairs from leftover words within sentence
        for sentence in ds:
            if len(sentence) == 0: continue
            
            word_id = np.random.randint(0, len(sentence))
            word = sentence[word_id]
            
            context_window = sentence[word_id-2:word_id] + sentence[word_id+1:word_id+3]
            np.random.shuffle(context_window)
            
            for target in context_window:
                yield word, target
                
gen = gen_word_embeddings()
for i in range(13):
    print(next(gen)[0])
train_ds = tf.data.Dataset.from_generator(gen_word_embeddings,
                               output_signature=(tf.TensorSpec(shape=(), dtype=tf.int64),
                                                 tf.TensorSpec(shape=(), dtype=tf.int64)))

for i, x in train_ds.take(5):
    print(i)
    print(i.numpy())
    
train_ds = preprocess_tf_dataset(train_ds, train_to_id, VOCAB_SIZE)

for x, t in train_ds.take(1):
    print(x)
    print(t)
    break


[1mDownloading and preparing dataset tiny_shakespeare/1.0.0 (download: Unknown size, generated: 1.06 MiB, total: 1.06 MiB) to /root/tensorflow_datasets/tiny_shakespeare/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incompletePZTCCC/tiny_shakespeare-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incompletePZTCCC/tiny_shakespeare-validation.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incompletePZTCCC/tiny_shakespeare-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

[1mDataset tiny_shakespeare downloaded and prepared to /root/tensorflow_datasets/tiny_shakespeare/1.0.0. Subsequent calls will reuse this data.[0m
11467
11467
5013
5013
5013
5013
10037
10037
10037
10037
4322
4322
10881
tf.Tensor(9149, shape=(), dtype=int64)
9149
tf.Tensor(9149, shape=(), dtype=int64)
9149
tf.Tensor(9149, shape=(), dtype=int64)
9149
tf.Tensor(5013, shape=(), dtype=int64)
5013
tf.Tensor(5013, shape=(), dtype=int64)
5013
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(32, 183574), dtype=float32)
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(32, 183574), dtype=float32)


# 2.2 Model

In [3]:
class SkipGram(tf.keras.layers.Layer):
    def __init__(self, num_vocabulary, embedding_dim=64, num_negative_samples=100, word_frequencies=WORD_FREQ):
        super(SkipGram, self).__init__()
        self.v = num_vocabulary
        self.h = embedding_dim
        self.num_neg = num_negative_samples
        self.word_freq = WORD_FREQ
    
    def build(self,_):
        self.embedding_mat = self.add_weight(shape=(self.v, self.h),
                                             initializer="random_normal",
                                             trainable=True) 
        self.output_mat = self.add_weight(shape=(self.v, self.h),
                                          initializer="random_normal",
                                          trainable=True) 
        self.output_bias = self.add_weight(shape=(self.v,),
                                           initializer="random_normal",
                                           trainable=True)

    def call(self, input_id, target_id):
        # (batch,h) = from (v,h) select 'batch_num' v* by lookup
        embedding_vec = tf.nn.embedding_lookup(self.embedding_mat, input_id)
        
        # output indices
        true_classes = tf.expand_dims(target_id, axis=1) # (batch,1)
        # used for negative sampling based on word frequencies
        negative_sample_dist = tf.random.fixed_unigram_candidate_sampler(true_classes=true_classes,
                                                                         num_true=1,
                                                                         num_sampled=self.num_neg,
                                                                         unique=False,
                                                                         range_max=self.v,
                                                                         unigrams=self.word_freq)
        # compute score vector, softmax of it and loss in one function call
        loss = tf.nn.nce_loss(weights=self.output_mat,  # (v,h)
                              biases=self.output_bias,  # (v,)
                              labels=true_classes,  # (batch,1)
                              inputs=embedding_vec,  # (batch,h)
                              num_sampled=self.num_neg,
                              num_classes=self.v,
                              sampled_values = negative_sample_dist)  
        return tf.math.reduce_mean(loss)  # average over loss of each sample

    def embedding(self, input_id):
        return tf.nn.embedding_lookup(self.embedding_mat, input_id)

In [4]:
# ex = SkipGram(num_vocabulary=500, embedding_dim=100)
# # example case where batch size is two
# ex(input_id=tf.constant([1,2],dtype=tf.int64),  
#    target_id=tf.constant([3,99],dtype=tf.int64))

# 2.3 Training

In [5]:
@tf.function
def train_step(model, input_batch, target_batch, optimizer):
    '''
    Training for one batch
    '''

    with tf.GradientTape() as tape:
        loss = model(input_batch, target_batch)  # call directly returns the loss
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        return loss


def nearest_neighbours(model, num_vocab, selected_word_id, val_dict, k=5):
    '''
    For selected words, find out k neighbouring words and print them
    '''

    cosine_similarity = lambda x,y : np.dot(x,y) / (np.linalg.norm(x) * np.linalg.norm(y))

    # embeddings of words
    embedding_selected_word = skipgram.embedding(tf.constant(selected_word_id))
    embedding_every_word = skipgram.embedding(tf.constant(list(range(num_vocab))))

    # fit nearest neighbours using cosine similarity and embeddings of all words
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', metric=cosine_similarity)
    nbrs.fit(embedding_every_word)
    # find k_nearest nbrs of selected words. dim:(num_selected, k)
    id_nbrs = nbrs.kneighbors(embedding_selected_word, n_neighbors=k, return_distance=False)
    
    # print neighbours in words instead of id
    for i, sel_w_id in enumerate(selected_word_id):
        query_w = val_dict[sel_w_id]
        neigh_w = []
        for j in range(k):
            neigh_w.append(val_dict[id_nbrs[i,j]])
        print('{} {} most similar words: {}'.format(query_w, k, neigh_w))

In [6]:
LEARNING_RATE = 0.001

# define model
skipgram = SkipGram(num_vocabulary=VOCAB_SIZE)

# define optimizer
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)

# initialize the logger for Tensorboard visualization
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/' + current_time + '/train'    
train_summary_writer = tf.summary.create_file_writer(train_log_dir)  

In [7]:
EPOCHS = 50
SELECTED_WORDS = [val_to_id[word] for word in ['queen', 'throne', 'wine', 'poison', 'love', 'strong', 'day', 'the']]

for epoch in range(EPOCHS):
    print('Epoch: ', epoch)
    ### Training step
    train_losses = []  # each entry is averaged loss of each batch
    # train over all batches
    for input_batch, target_batch in ds:
        train_losses.append(train_step(skipgram, input_batch, target_batch, optimizer))
    # log train loss
    with train_summary_writer.as_default():  
        tf.summary.scalar('loss', np.mean(train_losses), step=epoch)

    ### Nearest neighbours to check embeddings
    nearest_neighbours(skipgram, VOCAB_SIZE, SELECTED_WORDS, id_to_val)
    print(' ')

NameError: ignored

In [None]:
%reload_ext tensorboard
%tensorboard --logdir logs/