In [44]:
import re
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.nn import Embedding

In [51]:
# Load the corpus: one list entry per line of the file, trailing newlines kept.
# The context manager guarantees the file handle is closed once the lines are
# read, instead of leaving it open until garbage collection.
# NOTE(review): no encoding is passed, so the platform default is used. The
# corpus contains non-ASCII punctuation (typographic apostrophes, em dashes);
# consider passing encoding="utf-8" once the file's encoding is confirmed.
with open("data/quotes.txt", "r") as corpus_file:
    text_corpus = corpus_file.readlines()
text_corpus

['Let us see what love can do.\n',
 'We can’t heal the world today. But we can begin with a voice of compassion, a heart of love, and an act of kindness.\n',
 'Listen with curiosity. Speak with honesty. Act with integrity.\n',
 'The most basic and powerful way to connect to another person is to listen. Just listen.\n',
 'Knowledge speaks, but wisdom listens.\n',
 'Deep listening is the kind of listening that can help relieve the suffering of another person.\n',
 'Every person in this life has something to teach—and as soon as you accept that, you open yourself to truly listening.\n',
 'There are no problems we cannot solve together, and very few that we can solve by ourselves.\n',
 'I always wondered why somebody didn’t do something about that; then I realized that I am somebody.\n',
 'Let us develop respect for all living things. Let us try to replace violence and intolerance with understanding, compassion and love.\n',
 "We say 'things will turn out', but no. It's what we do that act

In [47]:
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character that is not an ASCII letter, a digit or one of
    ``( ) , ! ? ' ``` is replaced by a space, English clitics
    ('s, 've, n't, 're, 'd, 'll) are split off as separate tokens, and the
    punctuation marks ``, ! ( ) ?`` are padded with spaces so that each one
    becomes a token of its own.

    Unlike the referenced original, parentheses and question marks are
    emitted as the bare characters "(", ")" and "?"; the original replacement
    strings produced tokens with a literal leading backslash.

    Args:
        string: text to tokenize.
        tolower: when True (default), the text is lower-cased before splitting.

    Returns:
        list of str: the tokens, in order of appearance.
    """
    # Whitelist filter. NOTE: typographic apostrophes are not whitelisted, so
    # a contraction written with one is split at the apostrophe.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Detach clitics from the preceding word; order matches the original.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)

    # Surround punctuation with spaces so split() isolates each mark.
    # Plain literal replacement avoids escape sequences leaking into tokens.
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    # Collapse whitespace runs left behind by the substitutions above.
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.split()

In [48]:
# Special tokens occupy the first two vocabulary slots
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

# Turn every line of the corpus into a list of cleaned tokens
tokenized = [list(clean_str(sent)) for sent in text_corpus]

# Build the vocabulary: each new word gets the next free integer id,
# in order of first appearance
vocab = {PAD_TOKEN:0, UNK_TOKEN:1}
for tokens in tokenized:
    for tok in tokens:
        if tok in vocab:
            continue
        vocab[tok] = len(vocab)

# Encode the sentences as id sequences, right-padded with the PAD id
# up to the length of the longest sentence
max_len = max(map(len, tokenized))
pad_id = vocab[PAD_TOKEN]
padded_rows = [
    [vocab[tok] for tok in tokens] + [pad_id] * (max_len - len(tokens))
    for tokens in tokenized
]

# One row per sentence, one column per token position
tensor_data = torch.tensor(padded_rows, dtype=torch.int32)

In [43]:
# Embedding table: one 6-dimensional vector per vocabulary entry.
# Index 0 (the id assigned to PAD_TOKEN) is declared as the padding index.
emb = Embedding(
    num_embeddings=len(vocab),
    embedding_dim=6,
    padding_idx=0,
)
# Look up the vector for every token id in the padded batch
embeddings = emb(tensor_data)
embeddings

tensor([[[-0.8351,  1.6095, -0.0973,  0.0694, -1.0906, -0.2793],
         [-0.1760,  0.7173,  1.2187,  0.6141, -1.1585,  1.3559],
         [ 0.8960,  1.6278, -1.3412, -1.9711, -0.3956,  1.4752],
         [ 0.4667,  0.4189, -0.5867,  0.2211,  0.9855, -0.4954],
         [ 0.7711, -0.1208, -0.4734, -1.2218,  0.9545,  0.4131],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 1.2328,  0.5914, -0.0406, -0.0970,  0.1206,  0.5278],
         [-0.1760,  0.7173,  1.2187,  0.6141, -1.1585,  1.3559],
         [-1.3019,  0.0800,  0.1454, -0.7564,  0.5704,  1.5285],
         [ 1.0322,  1.0707, -0.9803,  0.1859,  1.3251, -1.0462],
         [ 1.3646, -0.4283,  0.6444, -0.3330,  1.7856,  1.7103],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0

In [None]:
##### STEP 2: Transformers model #####
class Transformers:
    """Skeleton for the model of step 2.

    Every method is currently an empty placeholder that returns None.
    TODO: implement; the intended roles below are guessed from the method
    names only and must be confirmed.
    """

    def __init__(self):
        """Placeholder constructor; sets no attributes yet."""

    def positional_enc(self):
        """Placeholder, not implemented (presumably positional encoding, TODO confirm)."""

    def self_attn(self):
        """Placeholder, not implemented (presumably self-attention, TODO confirm)."""

    def multi_head_self_attn(self):
        """Placeholder, not implemented (presumably multi-head self-attention, TODO confirm)."""

    def encoder(self):
        """Placeholder, not implemented (presumably the encoder stack, TODO confirm)."""

    def decoder(self):
        """Placeholder, not implemented (presumably the decoder stack, TODO confirm)."""

    def forward(self, input):
        """Placeholder, not implemented; `input` is accepted but ignored for now."""