<a href="https://colab.research.google.com/github/Spinkk/Implementing-ANNs-with-Tensorflow/blob/main/HW10_Janosch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import time
import datetime
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.neighbors import NearestNeighbors
import random

# 1. The Dataset

Load the dataset, create dictionaries and separate each word

In [2]:
def preprocess_strings(ds, sentence_wise=True):
    # make numpy string array from tfds    
    tfds_to_numpy = lambda x: next(iter(x))['text'].numpy()
    ds = tfds_to_numpy(ds).decode()                             
    
    # make list of just words
    ds_words = ds.lower().replace('\n', ' ').translate({ord("'"): None})
    exclude = string.punctuation.translate({ord("'"): None})
    table = ds_words.maketrans(exclude, ' '*len(exclude))                   
    ds_words = np.array(ds_words.translate(table).split())
    
    # creates two lookup tables, val->id and id->val
    val_to_id = {val: i for i, val in enumerate(sorted(set(ds_words)))}    
    id_to_val = {id_: val for val, id_ in val_to_id.items()}
    vocab_size = len(val_to_id)
    
    # define occurances of each token
    word_freq = [np.count_nonzero(ds_words==val) for _, val in id_to_val.items()]

    # create a list of words split into sentences
    if sentence_wise: 
        ds = ds.lower().replace('\n', ' ').translate({ord("'"): None})
        exclude = string.punctuation.translate({ord("'"): None, ord('.'): None})
        table = ds.maketrans(exclude, ' '*len(exclude))
        ds = ' '.join(ds.translate(table).split()).split('.')
        ds = [sentence.translate({ord("."): None}).split() for sentence in ds]        
        
        ds = [[val_to_id[word] for word in sentence] for sentence in ds]
        
    # use list of words
    else:
        ds = [val_to_id[word] for words in ds_words]
    
    return ds, val_to_id, id_to_val, word_freq, vocab_size

In [3]:
train_ds = tfds.load(name='tiny_shakespeare',
                    shuffle_files=False, 
                    split='train')

ds, val_to_id, id_to_val, word_freq, vocab_size = preprocess_strings(train_ds)

In [4]:
s = 0.005
prob = lambda freq : (np.sqrt((freq/s))+1)*(s/freq)

def gen_word_embeddings():    
    while True:
        np.random.shuffle(ds)      
        
        # for each sentence generate one target and make input, target pairs from leftover words within sentence
        for sentence in ds:
            if len(sentence) == 0: continue
            
            word_id = np.random.randint(0, len(sentence))
            word = sentence[word_id]
            
            context_window = sentence[word_id-2:word_id] + sentence[word_id+1:word_id+3]
            np.random.shuffle(context_window)
            
            for target in context_window:
                # subsampling
                if random.random() < prob(word_freq[target]): 
                    continue
                yield word, target
                
gen = gen_word_embeddings()
    
train_ds = tf.data.Dataset.from_generator(gen_word_embeddings,
                               output_signature=(tf.TensorSpec(shape=(), dtype=tf.int64),
                                                 tf.TensorSpec(shape=(), dtype=tf.int64)))
    
train_ds = train_ds.batch(32).prefetch(tf.data.AUTOTUNE)

# 2.2 Model

In [5]:
class SkipGram(tf.keras.layers.Layer):
    def __init__(self, num_vocabulary, embedding_dim=64, num_negative_samples=100, word_frequencies=word_freq):
        super(SkipGram, self).__init__()
        self.v = num_vocabulary
        self.h = embedding_dim
        self.num_neg = num_negative_samples
        self.word_freq = word_frequencies
    
    def build(self,_):
        self.embedding_mat = self.add_weight(shape=(self.v, self.h),
                                             initializer="random_normal",
                                             trainable=True) 
        self.output_mat = self.add_weight(shape=(self.v, self.h),
                                          initializer="random_normal",
                                          trainable=True) 
        self.output_bias = self.add_weight(shape=(self.v,),
                                           initializer="random_normal",
                                           trainable=True)

    def call(self, input_id, target_id):
        # (batch,h) = from (v,h) select 'batch_num' v* by lookup
        embedding_vec = tf.nn.embedding_lookup(self.embedding_mat, input_id)
        
        # output indices
        true_classes = tf.expand_dims(target_id, axis=1) # (batch,1)
        # used for negative sampling based on word frequencies
        negative_sample_dist = tf.random.fixed_unigram_candidate_sampler(true_classes=true_classes,
                                                                         num_true=1,
                                                                         num_sampled=self.num_neg,
                                                                         unique=False,
                                                                         range_max=self.v,
                                                                         unigrams=self.word_freq)
        # compute score vector, softmax of it and loss in one function call
        loss = tf.nn.nce_loss(weights=self.output_mat,  # (v,h)
                              biases=self.output_bias,  # (v,)
                              labels=true_classes,  # (batch,1)
                              inputs=embedding_vec,  # (batch,h)
                              num_sampled=self.num_neg,
                              num_classes=self.v,
                              sampled_values = negative_sample_dist)  
        return tf.math.reduce_mean(loss)  # average over loss of each sample

    def embedding(self, input_id):
        return tf.nn.embedding_lookup(self.embedding_mat, input_id)

# 2.3 Training

In [6]:
# @tf.function
def train_step(model, input_batch, target_batch, optimizer):
    '''
    Training for one batch
    '''

    with tf.GradientTape() as tape:
        loss = model(input_batch, target_batch)  # call directly returns the loss
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        return loss

def nearest_neighbours(model, num_vocab, selected_word_id, val_dict, k=5):
    '''
    For selected words, find out k neighbouring words and print them
    '''

    cosine_similarity = lambda x,y : np.dot(x,y) / (np.linalg.norm(x) * np.linalg.norm(y))

    # embeddings of words
    embedding_selected_word = skipgram.embedding(tf.constant(selected_word_id))
    embedding_every_word = skipgram.embedding(tf.constant(list(range(num_vocab))))

    # fit nearest neighbours using cosine similarity and embeddings of all words
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', metric=cosine_similarity)
    nbrs.fit(embedding_every_word)
    # find k_nearest nbrs of selected words. dim:(num_selected, k)
    id_nbrs = nbrs.kneighbors(embedding_selected_word, n_neighbors=k, return_distance=False)
    
    # print neighbours in words instead of id
    for i, sel_w_id in enumerate(selected_word_id):
        query_w = val_dict[sel_w_id]
        neigh_w = []
        for j in range(k):
            neigh_w.append(val_dict[id_nbrs[i,j]])
        print('{} {} most similar words: {}'.format(query_w, k, neigh_w))

In [7]:
learning_rate = 0.001

# define model
skipgram = SkipGram(num_vocabulary=vocab_size)

# define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate)

# initialize the logger for Tensorboard visualization
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/' + current_time + '/train'    
train_summary_writer = tf.summary.create_file_writer(train_log_dir)  

In [8]:
epochs = 50
selected_words = [val_to_id[word] for word in ['queen', 'throne', 'wine', 'poison', 'love', 'strong', 'day', 'the']]

for epoch in range(epochs):
    print('\nEpoch: ', epoch)
    ### Training step
    train_losses = []  # each entry is averaged loss of each batch
    
    # train over all batches
    for input_batch, target_batch in train_ds.take(1000):
        train_losses.append(train_step(skipgram, input_batch, target_batch, optimizer))
    
    # log train loss
    with train_summary_writer.as_default():  
        tf.summary.scalar('loss', np.mean(train_losses), step=epoch)

    ### Nearest neighbours to check embeddings
    nearest_neighbours(skipgram, vocab_size, selected_words, id_to_val)


Epoch:  0
queen 5 most similar words: ['freezes', 'lot', 'marted', 'shivering', 'dismayd']
throne 5 most similar words: ['amerce', 'functions', 'gap', 'enacts', 'mab']
wine 5 most similar words: ['baits', 'complainings', 'counterpoised', 'marted', 'dismayd']
poison 5 most similar words: ['staffords', 'acre', 'enemys', 'spray', 'lot']
love 5 most similar words: ['freezes', 'lot', 'shivering', 'marted', 'pass']
strong 5 most similar words: ['raged', 'ribbons', 'innovator', 'functions', 'complainings']
day 5 most similar words: ['freezes', 'dismayd', 'lot', 'smock', 'marted']
the 5 most similar words: ['freezes', 'lot', 'dismayd', 'marted', 'shivering']

Epoch:  1
queen 5 most similar words: ['goest', 'petticoat', 'glean', 'biancas', 'savoury']
throne 5 most similar words: ['christopher', 'tilts', 'sprung', 'courses', 'savoury']
wine 5 most similar words: ['glean', 'goest', 'petticoat', 'sprung', 'biancas']
poison 5 most similar words: ['christopher', 'goest', 'misdoubteth', 'glean', 'sh

KeyboardInterrupt: 

In [None]:
%reload_ext tensorboard
%tensorboard --logdir logs/

# 3. Text generator

In [None]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [None]:
# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

In [None]:
# The unique characters in the file
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))

# prepare the text (sequence of unique indices instead of characters)
# Data pipeline

In [None]:
def get_dictionaries(text):
    """
    Takes a text and maps its character vocabulary to unique indices and also outputs the reverse mapping
    """
    vocab = np.array(list(set(text)))
    token_to_index = {token_type: i for i, token_type in enumerate(vocab)}
    index_to_token = {v: k for k, v in token_to_index.items()}
    
    return token_to_index, index_to_token

In [None]:
token_to_index, index_to_token = get_dictionaries(text)


def char_idx(txt, dictionary = token_to_index):
    return np.vectorize(dictionary.get)(txt)

def idx_char(idx_txt, dictionary = index_to_token):
    return np.vectorize(dictionary.get)(idx_txt)

In [None]:
idx_char(tf.constant(np.array([0,1,4,2])))

In [None]:
text_np = np.array(list(text))

text_indices = char_idx(text_np)

dataset = tf.data.Dataset.from_tensor_slices(text_indices)

In [None]:
# batching
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

dataset = dataset.batch(seq_length+1, drop_remainder=True)

for seq in dataset.take(1):
    print(idx_char(seq))

In [None]:
dataset = dataset.map(lambda x: (x[:-1],x[1:]))

In [None]:
for input_example, target_example in  dataset.take(1):
    print("Input :", idx_char(input_example))
    print("Target:", idx_char(target_example))

In [None]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

In [None]:
class Simple_RNN_CELL(tf.keras.layers.Layer):
    
    def __init__(self, hidden_dim):
        super(Simple_RNN_CELL, self).__init__()
        
        self.units = hidden_dim
        self.dense = tf.keras.layers.Dense(hidden_dim)
        #self.act = tf.keras.layers.Activation(tf.nn.tanh)
    
    
    def call(self, x, state):
        
        hidden_state = state
        concat_input = tf.concat((x, hidden_state), axis=-1)
        out = self.dense(concat_input)
        #act_out = self.act(out)
        
        return out

In [None]:
class RNN(tf.keras.models.Model):
    def __init__(self,cell,context):
        super(RNN, self).__init__()
        self.cell = cell
        self.units = context

    def call(self,x,state):  
        seq_len = tf.shape(x)[1]
        # Tensor Array only needed in graph mode
        outs = tf.TensorArray(dtype=tf.float32, size=seq_len, clear_after_read=True)

        for t in tf.range(seq_len):
            t_out = self.cell(x[:,t,:], state)
            outs = outs.write(t, t_out)
            state = t_out
        out = outs.stack()
        out = tf.transpose(out, perm=[1,0,2])
        return out

    def zero_state(self, batch_size):
        return (tf.zeros((batch_size, self.cell.units)))

In [None]:
########    MODEL TO USE/REFINE (SELF-MADE SIMPLE RNN)
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim,rnn_units):
        super(MyModel, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        self.rnn_cell = Simple_RNN_CELL(embedding_dim)
        self.rnn = RNN(self.rnn_cell, context = 100)
        
        self.out = tf.keras.layers.Dense(vocab_size)
        
        #self.sm = tf.keras.layers.Activation(tf.nn.softmax)
        
    def call(self, x):
        batch_size = tf.shape(x)[0]
        x = self.embedding(x)
        zero_state = self.rnn.zero_state(batch_size)
        x = self.rnn(x, zero_state)
        x = self.out(x)
        #x = self.sm(x)

        return x

In [None]:
# Length of the vocabulary in chars
vocab_size = len(index_to_token.keys())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

model = MyModel(
    vocab_size=len(token_to_index.keys()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
@tf.function
def train_step(model, train_ds, loss_function, optimizer, train_loss_metric):
    '''
    Training for one epoch.
    '''
    for in_txt, out_txt in train_ds:
        # forward pass with GradientTape
        with tf.GradientTape() as tape:
            prediction = model(in_txt)#, training=True)
            loss = loss_function(out_txt, prediction)
            loss_reg = loss + tf.reduce_sum(model.losses)

        # backward pass via GradienTape (auto-gradient calc)
        gradients = tape.gradient(loss_reg, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # update metrics
        train_loss_metric.update_state(loss)

In [None]:
import time
import datetime
class Timer():
    """
    A small class for making timings.
    """
    def __init__(self):
        self._start_time = None

    def start(self):
        """
        Start a new timer
        """
        if self._start_time is not None:
            raise TimerError(f"Timer is running. Use .stop() to stop it")

        self._start_time = time.perf_counter()

    def stop(self):
        """
        Stop the timer, and report the elapsed time
        """
        if self._start_time is None:
            print(f"Timer is not running. Use .start() to start it")
            return 0
    
        elapsed_time = time.perf_counter() - self._start_time
        self._start_time = None
        return elapsed_time  

In [None]:
epochs = 25
learning_rate = 0.0005

tf.keras.backend.clear_session()
timer = Timer()

model = MyModel(
    vocab_size=len(token_to_index.keys()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

loss_function = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate)

# prepare metrics
train_loss_metric = tf.keras.metrics.Mean('train_loss')

# Initialize lists for later visualization.
train_losses = []
times = []

In [None]:
# prepare metrics
train_loss_metric = tf.keras.metrics.Mean('train_loss')

# initialize the logger for Tensorboard visualization
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train_ResNet'      # defining the log dir

train_summary_writer = tf.summary.create_file_writer(train_log_dir)  # training logger

# Initialize lists for later visualization.
train_losses = []
times = []

In [None]:
# Resetting train metrics
train_loss_metric.reset_states()

for epoch in range(epochs):
    print(f'\n[EPOCH] ____________________{epoch}____________________')
    
    # training step with metrics update--------------------------------------------------------
    timer.start()

    train_step(model, dataset, loss_function, optimizer, train_loss_metric)

    # Evaluating training metrics
    train_loss = train_loss_metric.result()
    
    with train_summary_writer.as_default():     # logging our metrics to a file which is used by tensorboard
        tf.summary.scalar('loss', train_loss, step=epoch)

    train_losses.append(train_loss)
    
    elapsed_time = timer.stop()
    times.append(elapsed_time)
    
    print(f'[{epoch}] - Finished Epoch in {elapsed_time:0.2f} seconds - train_loss: {train_loss:0.4f}')

    
    # Resetting train and validation metrics-----------------------------------------------------
    train_loss_metric.reset_states()
    
    elapsed_time = timer.stop()
    times.append(elapsed_time)
  
    if epoch%3 == 0:
        print(f'\n[INFO] - Total time elapsed: {np.sum(times)/60:0.4f} min. Total time remaining: {(np.sum(times)/(epoch+1))*(epochs-epoch-1)/60:0.4f} min.')

print(f'[INFO] - Total run time: {np.sum(times)/60:0.4f} min.')

In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [None]:
print("Input:\n", idx_char(input_example_batch[0]))
print("\n \n Next Char Predictions:\n", idx_char(sampled_indices))

In [None]:
def generate_next(input_txt, model, temperature, states = None):
    
    predicted_logits = model(inputs = input_txt, states = states)
    predicted_logits = predicted_logits[:, -1, :] # last predicted character
    predicted_logits = tf.nn.softmax(predicted_logits)
    predicted_logits = predicted_logits/temperature

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)
    
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = idx_char(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states