<a href="https://colab.research.google.com/github/Siarzis/ai-tutor/blob/main/numpy_lstm_text_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [1]:
import numpy as np

from torch.utils.data import Dataset

Create a simple dataset of text sequences

In [2]:
np.random.seed(42)

def generate_dataset(num_words=100, max_word_size=10):

    sequences = []

    for _ in range (num_words):

        random_number =  np.random.randint(1, max_word_size)
        sample = ['s'] * random_number + ['t'] * random_number +  ['EOS']
        sequences.append(sample)

    return sequences

num_words=100
max_word_size=10

vocabulary = generate_dataset(num_words, max_word_size)
print(vocabulary[88])

['s', 's', 's', 's', 's', 's', 's', 's', 't', 't', 't', 't', 't', 't', 't', 't', 'EOS']


Correspond Vocabulary Tokens to Indices

In [None]:
def tokens_to_indices(vocabulary):

    token_to_index = {}
    index_to_token = {}

    # flatten the given vocabulary to convert it into tokens
    flatten_vocabulary = sum(vocabulary, [])

    tokens = list(set(flatten_vocabulary))

    # append the unknown-character token
    tokens.append('UNK')

    for i, t in enumerate(tokens):
        token_to_index[t] = i
        index_to_token[i] = t

    return token_to_index, index_to_token

tokens_to_indices(vocabulary)

({'t': 0, 'EOS': 1, 's': 2, 'UNK': 3}, {0: 't', 1: 'EOS', 2: 's', 3: 'UNK'})

Perform one-hot encoding on the generated vocabulary

In [None]:
def seq_to_dict(vocabulary):

    word_to_index = {}
    index_to_word = {}

    str_vocabulary = [''.join(word) for word in vocabulary]

    # remove duplicate words in the dictionary
    unique_vocabulary = set(str_vocabulary)
    unique_vocabulary = list(unique_vocabulary)

    # find the word with the maximum length in the vocabulary
    max_word_size = max(unique_vocabulary, key=len)
    # compute the maximum word length
    max_word_size = int((len(max_word_size) - 3 ) / 2)

    # create a diagonal list of lists
    encoding = [['0' if i != j else '1' for j in range(max_word_size)] for i in range(max_word_size)]
    encoding = [''.join(row) for row in encoding]

    for word in unique_vocabulary:

        word_length = int((len(word) - 3 ) / 2)
        word_to_index[word] = encoding[word_length-1]
        index_to_word[encoding[word_length-1]] = word

    return word_to_index, index_to_word

word_to_index, index_to_word = seq_to_dict(vocabulary)

print(word_to_index)
print(index_to_word)

index_to_word = 1

In [None]:
class SimpleSequencesDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __len__(self):
        return len(self.data)

    def __getitem__(self, id):
        sample = {'data': self.data[id], 'target': self.targets[id]}


Initialization of the RNN

In [None]:
def rnn_init(num_hidden_nodes, num_tokens):

    # initialize input weight matrix
    W_x = np.random.uniform(-1, 1, size=(num_hidden_nodes, num_tokens))

    # initialize hidden state weight matrix
    W_h = np.zeros((num_hidden_nodes, num_hidden_nodes))

    # initialize output weight matrix
    W_y = np.random.uniform(-1, 1, size=(num_tokens, num_hidden_nodes))

    return W_x, W_h, W_y

rnn_init(3, 4)

(array([[ 0.33797651,  0.16137324, -0.25543447,  0.88026688],
        [ 0.94732767, -0.43215805, -0.38927228, -0.02877249],
        [-0.10315171,  0.98891493, -0.64814949, -0.96384927]]),
 array([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]),
 array([[-0.01221257, -0.64235458, -0.26706243],
        [ 0.48834105,  0.44187985, -0.38387842],
        [ 0.08508046,  0.01762815,  0.27266524],
        [-0.49907636,  0.1797417 ,  0.95778572]]))

Definition of activation functions

In [None]:
def softmax(x):
    # calculate the exponential of each element in the array
    exp_x = np.exp(x)

    # calculate the sum of the exponentials
    sum_exp_x = np.sum(exp_x)

    # calculate the softmax values by dividing each exponential by the sum of exponentials
    softmax_values = exp_x / sum_exp_x

    return softmax_values

Forward pass

In [None]:
def rnn_forward_pass():

    # calculate hidden nodes (a_t)
    a_t = W_h @ h_t_1 + W_x @ x_t

    # calculate hidden state
    h_t = np.tanh(a_t) # TO CHECK

    # calculate RNN output
    # softmax maps a given vector to range [0.0, 1.0]. Since we have a one-hot
    # encoding representation, we need to find which index of the generated
    # output has the largest probability to be predicted
    y_t = softmax(W_y @ h_t)



In [None]:
#