<a href="https://colab.research.google.com/github/Spinkk/Implementing-ANNs-with-Tensorflow/blob/main/HW10_Janosch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import time
import datetime
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.neighbors import NearestNeighbors

# 1. The Dataset

Load the dataset, build the word-to-id and id-to-word lookup dictionaries, and split the text into sentences of individual words.

In [2]:
def preprocess_strings(ds, sentence_wise=True):
    '''
    Tokenise the text of the first dataset element and map every word to an id.

    The text is lower-cased, newlines become spaces, apostrophes are removed
    (so "dog's" becomes "dogs") and all other punctuation is treated as a
    word separator.

    Args:
        ds: iterable whose first element is a mapping with a 'text' entry that
            exposes `.numpy()` returning the raw bytes of the corpus.
        sentence_wise: if True, return a list of sentences (text split on '.'),
            each a list of word ids; if False, return one flat list of word ids.

    Returns:
        Tuple `(ds, val_to_id, id_to_val, word_freq, vocab_size)` where
        `ds` is the id-encoded corpus, `val_to_id` / `id_to_val` are the two
        lookup dictionaries (ids follow the sorted vocabulary), `word_freq[i]`
        is the number of occurrences of the word with id `i`, and `vocab_size`
        is the number of distinct words.
    '''
    # pull the raw text out of the first dataset element
    text = next(iter(ds))['text'].numpy().decode()

    # normalisation shared by both tokenisation modes
    text = text.lower().replace('\n', ' ').translate({ord("'"): None})

    # flat word list: every remaining punctuation mark becomes whitespace
    punctuation = string.punctuation.translate({ord("'"): None})
    word_table = str.maketrans(punctuation, ' ' * len(punctuation))
    ds_words = np.array(text.translate(word_table).split())

    # sorted vocabulary and per-word counts in a single pass
    # (the counts line up with the ids because both follow the sorted order)
    vocab, counts = np.unique(ds_words, return_counts=True)
    val_to_id = {str(val): i for i, val in enumerate(vocab)}
    id_to_val = {id_: val for val, id_ in val_to_id.items()}
    vocab_size = len(val_to_id)
    word_freq = counts.tolist()

    if sentence_wise:
        # keep '.' as the sentence delimiter, blank out all other punctuation
        no_dot = string.punctuation.translate({ord("'"): None, ord('.'): None})
        sentence_table = str.maketrans(no_dot, ' ' * len(no_dot))
        sentences = text.translate(sentence_table).split('.')
        ds = [[val_to_id[word] for word in sentence.split()] for sentence in sentences]
    else:
        # flat list of word ids in corpus order
        ds = [val_to_id[word] for word in ds_words.tolist()]

    return ds, val_to_id, id_to_val, word_freq, vocab_size

In [3]:
# Raw tiny_shakespeare training split; file shuffling is disabled.
train_ds = tfds.load(
    name='tiny_shakespeare',
    split='train',
    shuffle_files=False,
)

# Encode the corpus as sentences of word ids and build the vocabulary tables.
(ds,
 val_to_id,
 id_to_val,
 word_freq,
 vocab_size) = preprocess_strings(train_ds)

In [4]:
def gen_word_embeddings(sentences=None, window=2):
    '''
    Endlessly yield (input word id, target word id) skip-gram pairs.

    On every pass the sentences are visited in a fresh random order. For each
    sentence one position is drawn at random as the input word, and every word
    at most `window` positions to its left or right is yielded as a target,
    in random order.

    Args:
        sentences: list of sentences, each a list of word ids. Defaults to the
            module-level `ds`.
        window: number of context words taken on each side of the input word.

    Yields:
        Tuples `(word, target)` of word ids.
    '''
    if sentences is None:
        sentences = ds

    # a sentence with fewer than two words can never produce a pair
    usable = [sentence for sentence in sentences if len(sentence) > 1]
    if not usable:
        # nothing to yield; returning avoids spinning forever in the loop below
        return

    while True:
        # visit sentences in random order without mutating the caller's list
        for idx in np.random.permutation(len(usable)):
            sentence = usable[idx]

            word_id = np.random.randint(0, len(sentence))
            word = sentence[word_id]

            # clamp the left edge: a negative slice start would wrap around
            # to the end of the sentence and drop the left neighbours
            left = sentence[max(0, word_id - window):word_id]
            right = sentence[word_id + 1:word_id + 1 + window]
            context_window = list(left) + list(right)
            np.random.shuffle(context_window)

            for target in context_window:
                yield word, target
                
# Standalone generator instance; the pipeline below is built from the function itself.
gen = gen_word_embeddings()

# Stream (input id, target id) pairs as scalar int64 tensors,
# grouped into batches of 32 with background prefetching.
train_ds = (
    tf.data.Dataset.from_generator(
        gen_word_embeddings,
        output_signature=(
            tf.TensorSpec(shape=(), dtype=tf.int64),
            tf.TensorSpec(shape=(), dtype=tf.int64),
        ),
    )
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# 2.2 Model

In [6]:
class SkipGram(tf.keras.layers.Layer):
    '''
    Skip-gram layer whose forward pass returns the mean NCE loss of a batch.

    Holds an input embedding matrix plus an output weight matrix and bias,
    all created in `build`. Negative classes are drawn with a fixed unigram
    sampler fed with the supplied word frequencies.
    '''

    def __init__(self, num_vocabulary, embedding_dim=64, num_negative_samples=100, word_frequencies=word_freq):
        '''
        Args:
            num_vocabulary: number of distinct word ids (rows of every weight).
            embedding_dim: width of the embedding vectors.
            num_negative_samples: negative classes sampled per batch.
            word_frequencies: per-id counts passed to the sampler as `unigrams`.
        '''
        super().__init__()
        self.v = num_vocabulary
        self.h = embedding_dim
        self.num_neg = num_negative_samples
        self.word_freq = word_frequencies

    def build(self, _):
        '''Create the trainable weights; the input-shape argument is ignored.'''

        def new_weight(shape):
            # every weight uses the same initializer and is trainable
            return self.add_weight(shape=shape,
                                   initializer="random_normal",
                                   trainable=True)

        self.embedding_mat = new_weight((self.v, self.h))  # (v,h)
        self.output_mat = new_weight((self.v, self.h))     # (v,h)
        self.output_bias = new_weight((self.v,))           # (v,)

    def call(self, input_id, target_id):
        '''
        Return the NCE loss averaged over the batch.

        Args:
            input_id: ids of the input words, one per sample.
            target_id: ids of the context words to predict, one per sample.
        '''
        # (batch,h): one embedding row per input id
        input_vecs = self.embedding(input_id)

        # (batch,1): labels in the layout expected by the sampler and the loss
        labels = tf.expand_dims(target_id, axis=1)

        # negative classes drawn according to the word frequencies
        sampled = tf.random.fixed_unigram_candidate_sampler(
            true_classes=labels,
            num_true=1,
            num_sampled=self.num_neg,
            unique=False,
            range_max=self.v,
            unigrams=self.word_freq,
        )

        # noise-contrastive estimation loss, one value per sample
        per_sample_loss = tf.nn.nce_loss(
            weights=self.output_mat,   # (v,h)
            biases=self.output_bias,   # (v,)
            labels=labels,             # (batch,1)
            inputs=input_vecs,         # (batch,h)
            num_sampled=self.num_neg,
            num_classes=self.v,
            sampled_values=sampled,
        )
        return tf.math.reduce_mean(per_sample_loss)

    def embedding(self, input_id):
        '''Look up the embedding rows for the given word ids.'''
        return tf.nn.embedding_lookup(self.embedding_mat, input_id)

# 2.3 Training

In [7]:
# @tf.function
def train_step(model, input_batch, target_batch, optimizer):
    '''
    Training for one batch

    Args:
        model: callable as `model(input_batch, target_batch)` returning the
            scalar loss, and exposing `trainable_variables`.
        input_batch: batch of input word ids.
        target_batch: batch of target word ids.
        optimizer: optimizer whose `apply_gradients` updates the model.

    Returns:
        The loss of this batch, computed before the weight update.
    '''
    # record only the forward pass on the tape
    with tf.GradientTape() as tape:
        loss = model(input_batch, target_batch)  # call directly returns the loss

    # differentiate and update outside the tape context so that neither the
    # gradient computation nor the optimizer update is recorded
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss

def nearest_neighbours(model, num_vocab, selected_word_id, val_dict, k=5):
    '''
    For selected words, find out k neighbouring words and print them

    Neighbours are ranked by cosine similarity between embedding vectors,
    highest first; the query word itself is never listed.

    Args:
        model: object whose `embedding(ids)` returns one embedding row per id.
        num_vocab: number of word ids; ids `0 .. num_vocab-1` are searched.
        selected_word_id: sequence of word ids to query.
        val_dict: mapping from word id to word, used for printing.
        k: number of neighbours printed per query word (capped at
           `num_vocab - 1`).
    '''
    query_ids = np.asarray(selected_word_id, dtype=np.int64).reshape(-1)
    if num_vocab <= 0 or query_ids.size == 0:
        return

    # embeddings of every word, taken from the model passed in
    embedding_every_word = np.asarray(model.embedding(np.arange(num_vocab)),
                                      dtype=np.float64)

    # unit-normalise the rows so a dot product is the cosine similarity;
    # the floor on the norm keeps all-zero rows from dividing by zero
    norms = np.linalg.norm(embedding_every_word, axis=1, keepdims=True)
    unit = embedding_every_word / np.maximum(norms, np.finfo(np.float64).tiny)

    # (num_selected, num_vocab) similarity of each query against every word
    similarity = unit[query_ids] @ unit.T
    # a word is always most similar to itself; rule it out
    similarity[np.arange(query_ids.size), query_ids] = -np.inf

    # highest similarity first. dim:(num_selected, num_nbrs)
    num_nbrs = max(0, min(k, num_vocab - 1))
    id_nbrs = np.argsort(-similarity, axis=1, kind='stable')[:, :num_nbrs]

    # print neighbours in words instead of id
    for row, sel_w_id in enumerate(query_ids):
        query_w = val_dict[int(sel_w_id)]
        neigh_w = [val_dict[int(nbr_id)] for nbr_id in id_nbrs[row]]
        print('{} {} most similar words: {}'.format(query_w, k, neigh_w))

In [8]:
# --- TensorBoard logging: one timestamped run directory under logs/ ---
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

# --- model: skip-gram layer sized to the vocabulary ---
skipgram = SkipGram(num_vocabulary=vocab_size)

# --- optimizer: Adam with the step size below ---
learning_rate = 0.001
optimizer = tf.keras.optimizers.Adam(learning_rate)

In [9]:
epochs = 50

# ids of the probe words whose neighbours are printed after every epoch
selected_words = [
    val_to_id[probe]
    for probe in ('queen', 'throne', 'wine', 'poison', 'love', 'strong', 'day', 'the')
]

for epoch in range(epochs):
    print('\nEpoch: ', epoch)

    # one epoch = 100 batches drawn from the endless training stream;
    # each entry of train_losses is the mean loss of one batch
    train_losses = []
    for input_batch, target_batch in train_ds.take(100):
        batch_loss = train_step(skipgram, input_batch, target_batch, optimizer)
        train_losses.append(batch_loss)

    # write the epoch-average loss to TensorBoard
    epoch_loss = np.mean(train_losses)
    with train_summary_writer.as_default():
        tf.summary.scalar('loss', epoch_loss, step=epoch)

    # print the nearest neighbours of the probe words to check the embeddings
    nearest_neighbours(skipgram, vocab_size, selected_words, id_to_val)


Epoch:  0
queen 5 most similar words: ['loudst', 'coldest', 'undertaen', 'glimpse', 'pursuit']
throne 5 most similar words: ['coldest', 'scandald', 'lots', 'profit', 'wakefield']
wine 5 most similar words: ['pugging', 'valentine', 'unvulnerable', 'lots', 'unbarbed']
poison 5 most similar words: ['woo', 'fade', 'terrible', 'unskilfully', 'meal']
love 5 most similar words: ['meal', 'strengthend', 'woo', 'ashore', 'lots']
strong 5 most similar words: ['mistaking', 'unskilfully', 'morrow', 'remembrances', 'talks']
day 5 most similar words: ['planets', 'rend', 'omit', 'lots', 'talks']
the 5 most similar words: ['profit', 'morrow', 'mistaking', 'lots', 'unjust']

Epoch:  1
queen 5 most similar words: ['light', 'drawn', 'denials', 'endures', 'tuae']
throne 5 most similar words: ['level', 'sunders', 'wolvish', 'continuance', 'according']
wine 5 most similar words: ['drest', 'expedient', 'level', 'swift', 'unsaluted']
poison 5 most similar words: ['glove', 'offering', 'eunuch', 'earth', 'perch

queen 5 most similar words: ['denying', 'property', 'sugar', 'terrors', 'vault']
throne 5 most similar words: ['covenant', 'foolishly', 'claps', 'workmanly', 'borough']
wine 5 most similar words: ['denying', 'unmoand', 'sugar', 'property', 'perch']
poison 5 most similar words: ['property', 'sugar', 'seemers', 'scroll', 'workmanly']
love 5 most similar words: ['denying', 'property', 'terrors', 'sugar', 'unmoand']
strong 5 most similar words: ['seemers', 'denying', 'sugar', 'terrors', 'scoffing']
day 5 most similar words: ['denying', 'perch', 'vault', 'property', 'unmoand']
the 5 most similar words: ['denying', 'property', 'sugar', 'terrors', 'vault']

Epoch:  14
queen 5 most similar words: ['distemperature', 'butt', 'ifs', 'tying', 'cuckolds']
throne 5 most similar words: ['impartial', 'flung', 'jesus', 'incorporate', 'advantage']
wine 5 most similar words: ['tying', 'butt', 'stays', 'nails', 'ifs']
poison 5 most similar words: ['conquerors', 'tying', 'mount', 'secret', 'ifs']
love 5 mo

queen 5 most similar words: ['seasons', 'oppress', 'handling', 'confesses', 'jested']
throne 5 most similar words: ['cats', 'seasons', 'breathest', 'determine', 'galld']
wine 5 most similar words: ['seasons', 'adoreth', 'tuners', 'grating', 'deputation']
poison 5 most similar words: ['seasons', 'intercepted', 'breathest', 'worships', 'herdsmen']
love 5 most similar words: ['seasons', 'unfeeling', 'poland', 'hyperbolical', 'dangerously']
strong 5 most similar words: ['seasons', 'comb', 'edict', 'doubleness', 'foreign']
day 5 most similar words: ['unfeeling', 'poland', 'seasons', 'meekness', 'hyperbolical']
the 5 most similar words: ['seasons', 'poland', 'unfeeling', 'meekness', 'hyperbolical']

Epoch:  26
queen 5 most similar words: ['hereby', 'orange', 'parchd', 'distinguishd', 'ransackd']
throne 5 most similar words: ['behests', 'wreak', 'mads', 'idles', 'acted']
wine 5 most similar words: ['unarmd', 'importuned', 'named', 'idles', 'behests']
poison 5 most similar words: ['importuned'

queen 5 most similar words: ['recovery', 'boding', 'instrument', 'thankings', 'abusing']
throne 5 most similar words: ['entombed', 'sap', 'ledas', 'ethiopes', 'squarest']
wine 5 most similar words: ['sap', 'ethiopes', 'enfranchised', 'tempers', 'prolixity']
poison 5 most similar words: ['sap', 'exhales', 'continually', 'ginger', 'enfranchised']
love 5 most similar words: ['sap', 'enfranchised', 'quoifs', 'exhales', 'businesses']
strong 5 most similar words: ['sap', 'prolixity', 'enfranchised', 'entombed', 'continually']
day 5 most similar words: ['sap', 'exhales', 'businesses', 'enfranchised', 'quoifs']
the 5 most similar words: ['refuse', 'unvenerable', 'distinguishd', 'orange', 'wave']

Epoch:  38
queen 5 most similar words: ['recovery', 'instrument', 'exiled', 'jacob', 'fairest']
throne 5 most similar words: ['entombed', 'sap', 'ledas', 'ethiopes', 'squarest']
wine 5 most similar words: ['sap', 'enfranchised', 'ethiopes', 'tempers', 'prolixity']
poison 5 most similar words: ['sap', 

queen 5 most similar words: ['solace', 'passengers', 'sciatica', 'vitae', 'softest']
throne 5 most similar words: ['softest', 'slight', 'choked', 'interest', 'unreverent']
wine 5 most similar words: ['softest', 'touched', 'intercepted', 'intents', 'viewing']
poison 5 most similar words: ['dice', 'intercepted', 'softest', 'atlas', 'cue']
love 5 most similar words: ['solace', 'passengers', 'sciatica', 'softest', 'jelly']
strong 5 most similar words: ['poland', 'ifs', 'softest', 'blinding', 'alterd']
day 5 most similar words: ['solace', 'passengers', 'sciatica', 'softest', 'associate']
the 5 most similar words: ['solace', 'passengers', 'sciatica', 'vitae', 'graceless']


In [10]:
# (Re)load the TensorBoard notebook extension, then open the dashboard on the logs/ directory.
%reload_ext tensorboard
%tensorboard --logdir logs/

Reusing TensorBoard on port 6006 (pid 41062), started 0:45:47 ago. (Use '!kill 41062' to kill it.)