In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class scaledDotProduct(nn.Module):
    '''
    Scaled dot-product attention:

        Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V

    Args:
        dim: size of the last axis of the queries and keys (d_k).
        drop: dropout probability applied to the attention weights.
    '''
    def __init__(self, dim, drop=0.0):
        super(scaledDotProduct, self).__init__()
        # Scores are divided by sqrt(d_k) so that large key dimensions do not push
        # the softmax into regions with vanishing gradients.
        self.d_k = np.sqrt(dim)
        # Dropout on the attention weights (described in the paper's text, not its diagram).
        self.drop = nn.Dropout(drop)

    def forward(self, q, k, v, mask=None):
        '''
        Args:
            q: queries, (..., len_q, d_k).
            k: keys, (..., len_k, d_k).
            v: values, (..., len_k, d_v).
            mask: optional tensor broadcastable to (..., len_q, len_k); positions
                where it equals 0 receive (effectively) zero attention weight.

        Returns:
            (out, att): the attended values (..., len_q, d_v) and the attention
            weights (..., len_q, len_k).
        '''
        # Transpose the last two axes so any number of leading batch/head axes works.
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.d_k

        if mask is not None:
            # Out-of-place fill: leaves the unmasked scores tensor untouched.
            scores = scores.masked_fill(mask == 0, -1e9)

        att = self.drop(F.softmax(scores, dim=-1))

        # Weight the values by the normalised attention weights, not the raw scores.
        out = torch.matmul(att, v)

        return out, att
        
        


In [None]:
# Smoke test for scaled dot-product attention.
# The constructor argument is d_k, i.e. the size of the last axis of q and k.
scaled_dot = scaledDotProduct(3)
q, k = torch.rand(1, 1, 2, 3), torch.rand(1, 1, 2, 3)
v = torch.rand(1, 1, 2, 4)

scaled_dot(q, k, v)

In [None]:
class multiHeadedAttention(nn.Module):
    '''
    Multi-head attention followed by a residual connection and layer normalisation.

    Args:
        n_heads: number of attention heads.
        dims: model width (size of the last axis of the inputs and of the output).
        d_k: per-head query/key size.
        d_v: per-head value size.
        dropout: dropout probability applied after the output projection.
    '''
    def __init__(self, n_heads, dims, d_k, d_v, dropout=0.0):
        super(multiHeadedAttention, self).__init__()

        self.n_heads = n_heads
        self.d_k = d_k
        self.d_v = d_v
        # Pre-attention projections; each produces all heads at once.
        self.w_q = nn.Linear(dims, n_heads * d_k, bias=False)
        self.w_k = nn.Linear(dims, n_heads * d_k, bias=False)
        self.w_v = nn.Linear(dims, n_heads * d_v, bias=False)

        self.att = scaledDotProduct(d_k)
        # Output projection applied to the concatenated heads.
        self.fc = nn.Linear(n_heads * d_v, dims)

        self.drop = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(dims, eps=1e-6)

    def forward(self, q, k, v, mask=None):
        '''
        Args:
            q: (batch, len_q, dims) queries; also used as the residual input.
            k: (batch, len_k, dims) keys.
            v: (batch, len_v, dims) values.
            mask: optional mask; a head axis is inserted at position 1 so the same
                mask is broadcast over every head.

        Returns:
            (out, attn): the layer output (batch, len_q, dims) and the attention
            weights returned by the inner attention module.
        '''
        d_k, d_v, heads = self.d_k, self.d_v, self.n_heads
        batch_len, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)

        res = q

        # Project, then split the last axis into (heads, per-head size).
        q = self.w_q(q).view(batch_len, len_q, heads, d_k)
        k = self.w_k(k).view(batch_len, len_k, heads, d_k)
        v = self.w_v(v).view(batch_len, len_v, heads, d_v)

        # Move the head axis forward: (batch, heads, length, per-head size).
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        # Identity check: comparing a tensor to None with != is not a reliable None test.
        if mask is not None:
            mask = mask.unsqueeze(1)

        ctx, attn = self.att(q, k, v, mask=mask)
        # Move the head axis back and merge the heads: (batch, len_q, heads * d_v).
        ctx = ctx.transpose(1, 2).contiguous().view(batch_len, len_q, -1)
        out = self.drop(self.fc(ctx))
        # Residual connection followed by layer norm (out-of-place add).
        out = self.norm(out + res)

        return out, attn

        


In [None]:
# Smoke test with the paper's sizes: 8 heads, d_model=512, d_k=d_v=64.
multiHead = multiHeadedAttention(8, 512, 64, 64)
# Random inputs shaped (batch, sequence length, d_model).
q, k, v = (torch.rand(1, 512, 512) for _ in range(3))

multiHead(q, k, v)

In [None]:
class positionFeedFoward(nn.Module):
    '''
    Two-layer feed-forward block with ReLU, dropout, a residual connection and
    layer normalisation.

    Args:
        inp: size of the last axis of the input (and output).
        hid: width of the hidden layer.
        drop: dropout probability applied to the second linear layer's output.
    '''
    def __init__(self, inp, hid, drop=0.0):
        super().__init__()
        self.w1 = nn.Linear(inp, hid)
        self.w2 = nn.Linear(hid, inp)
        self.norm = nn.LayerNorm(inp, eps=1e-6)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        '''Return LayerNorm(x + Dropout(W2 relu(W1 x))); the shape matches x.'''
        hidden = F.relu(self.w1(x))
        projected = self.drop(self.w2(hidden))
        return self.norm(projected + x)

In [None]:
class EncoderLayer(nn.Module):
    '''One encoder layer: multi-head self-attention followed by the feed-forward block.'''
    def __init__(self, dims, hid, nheads, d_k, d_v, drop=0.0):
        super().__init__()
        self.attn = multiHeadedAttention(nheads, dims, d_k, d_v, dropout=drop)
        self.ffn = positionFeedFoward(dims, hid, drop=drop)

    def forward(self, inp, mask=None):
        '''Self-attend over `inp` (used as query, key and value), then apply the
        feed-forward block. Returns (output, attention weights).'''
        attended, attn = self.attn(inp, inp, inp, mask)
        return self.ffn(attended), attn
    
class DecoderLayer(nn.Module):
    '''One decoder layer: self-attention, attention over the encoder output, then
    the feed-forward block.'''
    def __init__(self, dims, hid, nheads, d_k, d_v, drop=0.0):
        super().__init__()
        self.slf_attn = multiHeadedAttention(nheads, dims, d_k, d_v, dropout=drop)
        self.enc_attn = multiHeadedAttention(nheads, dims, d_k, d_v, dropout=drop)
        self.ffn = positionFeedFoward(dims, hid, drop=drop)

    def forward(self, inp, enc_out, slf_mask, enc_mask=None):
        '''
        Args:
            inp: decoder-side input, used as query, key and value for self-attention.
            enc_out: encoder output, used as key and value for the second attention.
            slf_mask: mask for the self-attention.
            enc_mask: optional mask for the attention over `enc_out`.

        Returns:
            (output, self-attention weights, encoder-attention weights).
        '''
        hidden, dec_attn = self.slf_attn(inp, inp, inp, slf_mask)
        hidden, enc_attn = self.enc_attn(hidden, enc_out, enc_out, enc_mask)
        return self.ffn(hidden), dec_attn, enc_attn

In [None]:
# Smoke test for a single encoder layer: d_model=512, hidden width 20, 8 heads, d_k=d_v=64.
enc = EncoderLayer(512, 20, 8, 64, 64)
# Random inputs shaped (batch, sequence length, d_model); only v is fed to the layer.
q, k, v = (torch.rand(1, 512, 512) for _ in range(3))

enc(v)

In [None]:
# PyTorch version adapted from https://pub.aimind.so/creating-sinusoidal-positional-embedding-from-scratch-in-pytorch-98c49e153d6
class PosEncoding(nn.Module):
    '''
    Fixed sinusoidal positional encoding added to a sequence of embeddings.

        PE(pos, 2i)   = sin(pos / 10000^(2i / hid))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / hid))

    Args:
        hid: embedding size (must be even).
        n_pos: number of positions to precompute; inputs may be at most this long.
    '''
    def __init__(self, hid, n_pos=200):
        super(PosEncoding, self).__init__()

        # A buffer (not a parameter): moves with .to(device) but is never trained.
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_pos, hid))

    def _get_sinusoid_encoding_table(self, n_pos, hid):
        '''Build the (1, n_pos, hid) encoding table. Raises ValueError if `hid` is odd.'''
        if hid % 2 != 0:
            raise ValueError("Sinusoidal positional embedding cannot apply to odd token embedding dim={}".format(hid))

        positions = torch.arange(n_pos, dtype=torch.float32).unsqueeze(1)
        pair_index = torch.arange(hid // 2, dtype=torch.float32)

        # The exponent is 2i / hid, so wavelengths span 1 .. 10000 across the embedding.
        # (Dividing by a constant instead of hid makes the denominator overflow to inf.)
        denom = torch.pow(10000.0, 2 * pair_index / hid)
        angles = positions / denom

        embeds = torch.zeros(n_pos, hid)
        embeds[:, 0::2] = torch.sin(angles)
        embeds[:, 1::2] = torch.cos(angles)

        # Leading axis of size 1 so the table broadcasts over the batch.
        return embeds.unsqueeze(0)

    def forward(self, x):
        '''Add the encodings for the first x.size(1) positions to x, shaped (batch, length, hid).'''
        return x + self.pos_table[:, :x.size(1)].clone().detach()




In [None]:
class Encoder(nn.Module):
    '''Encoder: token embedding + positional encoding, then a stack of encoder layers.'''
    def __init__(
            self, n_vocab, d_word, n_layers, n_head, d_k, d_v, dims, hid, pad, dropout=0.0, n_pos=200, scale_emb=False
    ):
        super().__init__()

        self.word_emb = nn.Embedding(n_vocab, d_word, padding_idx=pad)
        self.pos_enc = PosEncoding(d_word, n_pos=n_pos)
        self.drop = nn.Dropout(p=dropout)
        layers = [
            EncoderLayer(dims, hid, n_head, d_k, d_v, drop=dropout)
            for _ in range(n_layers)
        ]
        self.stack = nn.ModuleList(layers)
        self.layer_norm = nn.LayerNorm(dims, eps=1e-6)
        self.scale_emb = scale_emb
        self.dims = dims

    def forward(self, seq, mask, ret_attns=False):
        '''
        Args:
            seq: token indices to embed.
            mask: attention mask handed to every encoder layer.
            ret_attns: when true, collect each layer's attention weights.

        Returns:
            (encoder output, list of per-layer attention weights). The list is
            empty unless `ret_attns` is true.
        '''
        attn_maps = []

        hidden = self.word_emb(seq)
        if self.scale_emb:
            # Scale the embeddings by sqrt(dims).
            hidden *= self.dims ** 0.5
        hidden = self.layer_norm(self.drop(self.pos_enc(hidden)))

        for layer in self.stack:
            hidden, layer_attn = layer(hidden, mask=mask)
            if ret_attns:
                attn_maps.append(layer_attn)

        return hidden, attn_maps

class Decoder(nn.Module):
    '''Decoder: token embedding + positional encoding, then a stack of decoder layers.'''
    def __init__(
            self, n_vocab, d_word, n_layers, n_head, d_k, d_v, dims, hid, pad, dropout=0.0 , n_pos=200, scale_emb=False
    ):
        super().__init__()

        self.word_emb = nn.Embedding(n_vocab, d_word, padding_idx=pad)
        self.pos_enc = PosEncoding(d_word, n_pos=n_pos)
        self.drop = nn.Dropout(p=dropout)
        layers = [
            DecoderLayer(dims, hid, n_head, d_k, d_v, drop=dropout)
            for _ in range(n_layers)
        ]
        self.stack = nn.ModuleList(layers)
        self.layer_norm = nn.LayerNorm(dims, eps=1e-6)
        self.scale_emb = scale_emb
        self.dims = dims

    def forward(self, seq, mask, enc_out, src_mask, ret_attns=False):
        '''
        Args:
            seq: target token indices to embed.
            mask: self-attention mask handed to every decoder layer.
            enc_out: encoder output attended to by every decoder layer.
            src_mask: mask for the attention over `enc_out`.
            ret_attns: when true, collect each layer's attention weights.

        Returns:
            (decoder output, self-attention weights per layer, encoder-attention
            weights per layer). Both lists are empty unless `ret_attns` is true.
        '''
        self_attn_maps, cross_attn_maps = [], []

        hidden = self.word_emb(seq)
        if self.scale_emb:
            # Scale the embeddings by sqrt(dims).
            hidden *= self.dims ** 0.5
        hidden = self.layer_norm(self.drop(self.pos_enc(hidden)))

        for layer in self.stack:
            hidden, self_attn, cross_attn = layer(
                hidden, enc_out, slf_mask=mask, enc_mask=src_mask
            )
            if ret_attns:
                self_attn_maps.append(self_attn)
                cross_attn_maps.append(cross_attn)

        return hidden, self_attn_maps, cross_attn_maps



In [None]:
def get_pad_mask(seq, pad_idx):
    '''Boolean mask that is True wherever `seq` is not `pad_idx`, with an extra
    axis of size 1 inserted before the last axis.'''
    not_pad = seq != pad_idx
    return not_pad.unsqueeze(-2)

def get_subsequent_mask(seq):
    '''Causal mask of shape (1, len, len): entry [0, i, j] is True when j <= i,
    so position i cannot see later positions. `len` is the last axis of `seq`.'''
    length = seq.size(-1)
    # Ones strictly above the diagonal mark the "future" positions to hide.
    future = torch.triu(
        torch.ones((1, length, length), device=seq.device), diagonal=1)
    return future == 0

In [None]:
class Transformer(nn.Module):
    '''
    Sequence-to-sequence Transformer: an Encoder, a Decoder and a linear
    projection from decoder outputs to target-vocabulary logits.

    Args:
        src_vocab, trg_vocab: source / target vocabulary sizes.
        src_pad, trg_pad: padding indices for the source / target sequences.
        d_word: embedding size; must equal `dims`.
        dims: model width.
        hid: hidden width of the feed-forward blocks.
        n_layers, n_heads, d_k, d_v, drop, n_pos: passed to the encoder and decoder.
        trg_emb_prj_weight_sharing: share the output projection weight with the
            decoder embedding.
        emb_src_trg_weight_sharing: share the encoder embedding weight with the
            decoder embedding.
        scale_emb_or_prj: 'emb', 'prj' or 'none'; only takes effect when
            `trg_emb_prj_weight_sharing` is true.
    '''
    def __init__(
            self, src_vocab, trg_vocab, src_pad, trg_pad,
            d_word=512, dims=512, hid=2048, n_layers=6, n_heads=8,
            d_k=64, d_v=64, drop=0.0, n_pos=200,
            trg_emb_prj_weight_sharing=True, emb_src_trg_weight_sharing=True,
            scale_emb_or_prj='prj'
    ):
        super().__init__()

        self.src_pad, self.trg_pad = src_pad, trg_pad

        assert scale_emb_or_prj in ['emb', 'prj', 'none']
        # Scaling is only applied when the projection shares the embedding weights.
        tied_projection = bool(trg_emb_prj_weight_sharing)
        scale_emb = tied_projection and scale_emb_or_prj == 'emb'
        self.scale_prj = tied_projection and scale_emb_or_prj == 'prj'
        self.dims = dims

        # Settings common to both halves of the model.
        shared = dict(
            n_pos=n_pos, d_word=d_word, dims=dims, hid=hid,
            n_layers=n_layers, n_head=n_heads, d_k=d_k, d_v=d_v,
            dropout=drop, scale_emb=scale_emb,
        )
        self.encoder = Encoder(n_vocab=src_vocab, pad=src_pad, **shared)
        self.decoder = Decoder(n_vocab=trg_vocab, pad=trg_pad, **shared)

        self.trg_word_prj = nn.Linear(dims, trg_vocab, bias=False)

        # Xavier-initialise every weight matrix (parameters with more than one axis).
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

        assert dims == d_word

        if trg_emb_prj_weight_sharing:
            self.trg_word_prj.weight = self.decoder.word_emb.weight

        if emb_src_trg_weight_sharing:
            self.encoder.word_emb.weight = self.decoder.word_emb.weight

    def forward(self, src_seq, trg_seq):
        '''Return target-vocabulary logits flattened over batch and target position,
        i.e. one row per (sequence, position) pair.'''
        src_mask = get_pad_mask(src_seq, self.src_pad)
        # A target position may attend only to non-pad positions at or before itself.
        trg_mask = get_pad_mask(trg_seq, self.trg_pad) & get_subsequent_mask(trg_seq)

        enc_out = self.encoder(src_seq, src_mask)[0]
        dec_out = self.decoder(trg_seq, trg_mask, enc_out, src_mask)[0]

        logits = self.trg_word_prj(dec_out)
        if self.scale_prj:
            logits *= self.dims ** -0.5

        return logits.view(-1, logits.size(2))

In [None]:
# Text file containing English sentences, one per line.
file_path = './raw_sentences.txt'

# Lower-case every word and split each line on whitespace.
# The context manager closes the file handle even if reading fails part-way.
with open(file_path) as f:
    sentences = [[word.lower() for word in line.split()] for line in f]

# Every distinct (lower-cased) word in the corpus.
vocab = {w for s in sentences for w in s}

print(len(sentences)) # 97162
print(len(vocab))

In [None]:
# Hold out the first 10,000 sentences for testing and the next 10,000 for
# validation; everything after that is training data.
test = sentences[:10000]
valid = sentences[10000:20000]
train = sentences[20000:]

# Peek at the first ten training sentences.
for i in range(10):
    print(train[i])

In [None]:
# Corpus statistics: average sentence length, vocabulary size, most frequent
# words, and each word's share of all tokens.
count = Counter()
# Named total_words rather than `sum` so the builtin sum() stays usable in later cells.
total_words = 0
for sent in sentences:
    total_words += len(sent)
    count.update(sent)

# Average over the number of sentences (not the last loop index, which is one short).
avg_len = total_words / len(sentences) if sentences else 0.0
print('avg: ' + str(avg_len))
print('unique: '+str(len(count)))
print('10 most common: ')
keys = sorted(count, key=count.get, reverse=True)[:10]
print(keys)
print("total words: ")
# The per-word counts add up to the token total accumulated above.
print(total_words)

# Percentage of all tokens accounted for by each word, rounded to 2 decimals.
count_percent = {
    key: round((val / total_words) * 100, 2) for key, val in count.items()
}

print("Percentages")
print(count_percent)

In [None]:
# A mapping of index => word. The vocabulary is sorted first because the iteration
# order of a set of strings can differ between interpreter runs, which would give
# every word a different index after a kernel restart and invalidate saved checkpoints.
vocab_itos = dict(enumerate(sorted(vocab)))
# A mapping of word => its index
vocab_stoi = {word: index for index, word in vocab_itos.items()}

def convert_words_to_indices(sents):
    """
    Map every word of every sentence to its index in `vocab_stoi`.

    Takes a list of sentences (list of lists of words) and returns a new list
    with the same nesting in which each word is replaced by its index.
    A word missing from `vocab_stoi` raises KeyError.

    Example:
    >>> convert_words_to_indices([['one', 'in', 'five', 'are', 'over', 'here'],
                                  ['other', 'one', 'since', 'yesterday'],
                                  ['you']])
    [[148, 98, 70, 23, 154, 89], [151, 148, 181, 246], [248]]
    """
    return [[vocab_stoi[word] for word in sent] for sent in sents]

def generate_4grams(seqs):
    """
    Collect every window of four consecutive items from each sequence.

    Takes a list of sequences (list of lists) and returns one flat list of
    4-grams, in order of appearance. Duplicates are kept: a 4-gram appears once
    for each place it occurs in `seqs`. Sequences shorter than four items
    contribute nothing.

    Example:

    >>> generate_4grams([[148, 98, 70, 23, 154, 89], [151, 148, 181, 246], [248]])
    [[148, 98, 70, 23], [98, 70, 23, 154], [70, 23, 154, 89], [151, 148, 181, 246]]
    >>> generate_4grams([[1, 1, 1, 1, 1]])
    [[1, 1, 1, 1], [1, 1, 1, 1]]
    """
    return [
        seq[start:start + 4]
        for seq in seqs
        for start in range(len(seq) - 3)
    ]

def process_data(sents):
    """
    Turn a list of sentences (list of lists of words) into a numpy array of
    word-index 4-grams, one 4-gram per row.
    """
    return np.array(generate_4grams(convert_words_to_indices(sents)))

# Sanity-check generate_4grams on the docstring example, then build the 4-gram
# matrix for each data split.
print(generate_4grams([[148, 98, 70, 23, 154, 89], [151, 148, 181, 246], [248]]))
train4grams, valid4grams, test4grams = (
    process_data(split) for split in (train, valid, test)
)

In [None]:
# Display the first training 4-gram (a row of four word indices).
train4grams[0]

In [None]:
# Index 0 is a real word in the vocabulary, so using it as the padding index would
# mask that word out of attention and freeze its embedding. Reserve one extra
# index past the end of the vocabulary for padding instead; it never occurs in the data.
pad_idx = len(vocab)
model = Transformer(
    len(vocab) + 1, len(vocab) + 1, pad_idx, pad_idx, n_heads=2, n_layers=2, n_pos=3
    ).to(device)



In [None]:
# Smoke test: run the first two training 4-grams through the model, using
# words 0-2 as the source sequence and words 1-3 as the target sequence.
test_inp = torch.tensor(train4grams[:2]).to(device)
out = model(test_inp[:,:3], test_inp[:,1:])
out

In [None]:
def estimate_accuracy_torch(model, data, batch_size=5000, max_N=100000):
    """
    Estimate next-word accuracy of `model` on `data`.

    Each row of `data` is a 4-gram of word indices: the first three columns are
    fed to the model (as both source and target sequence) and the fourth is the
    word to predict. The prediction is the argmax of the logits at the LAST
    sequence position. To limit computation, evaluation stops once more than
    `max_N` rows have been scored.

    Args:
        model: callable as model(src, trg), returning logits of shape
            (batch * seq_len, vocab) with rows ordered sequence by sequence.
        data: array of shape [N, 4] holding word indices.
        batch_size: number of rows scored per forward pass.
        max_N: soft cap on the number of rows used for the estimate.

    Returns:
        Fraction of correctly predicted rows as a float (0.0 if `data` is empty).
    """
    # Run on whatever device the model's parameters live on.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    # Evaluate without dropout, and put the model back in its previous mode afterwards.
    was_training = model.training
    model.eval()

    correct = 0
    N = 0
    try:
        with torch.no_grad():  # no gradients needed for evaluation
            for i in range(0, data.shape[0], batch_size):
                batch = torch.as_tensor(
                    data[i:i+batch_size], dtype=torch.long, device=model_device)
                xs, ts = batch[:, :3], batch[:, 3]

                # The model emits one row of logits per (sequence, position);
                # regroup them and keep only the last position of each sequence.
                z = model(xs, xs)
                z = z.reshape(xs.size(0), xs.size(1), -1)[:, -1, :]

                pred = z.argmax(dim=1)
                correct += int((pred == ts).sum())
                N += ts.size(0)

                if N > max_N:
                    break
    finally:
        model.train(was_training)

    return correct / N if N > 0 else 0.0

In [None]:
def train(model, train_data=train4grams, validation_data = valid4grams, batch_size=300, lr=0.003, weight_decay=0, max_iters=2500, checkpoint_path=None):
    """
    Train `model` to predict the fourth word of each 4-gram from the first three.

    The first three columns of every row are fed to the model as both source and
    target sequence; the logits at the last sequence position are scored against
    the fourth column with cross-entropy and optimised with Adam.

    Args:
        model: callable as model(src, trg), returning logits of shape
            (batch * seq_len, vocab) with rows ordered sequence by sequence.
        train_data: array of shape [N, 4] holding word indices.
        validation_data: array of the same layout used for validation accuracy.
        batch_size: rows per optimisation step; a trailing partial batch is skipped.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        max_iters: training stops once the iteration counter exceeds this value.
        checkpoint_path: optional format string (formatted with the iteration
            number) where the model state dict is saved every 500 iterations.

    Returns:
        (iters, losses, iters_sub, train_accs, val_accs): per-iteration indices
        and mean batch losses, plus the iterations at which accuracy was
        measured and the corresponding train/validation accuracies.

    Raises:
        ValueError: if `train_data` has fewer rows than `batch_size`; no full
            batch could ever be formed and the loop would never terminate.
    """
    n_examples = train_data.shape[0]
    if n_examples < batch_size:
        raise ValueError(
            "train_data has %d rows, fewer than batch_size=%d" % (n_examples, batch_size))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # Build batches on whatever device the model's parameters live on.
    model_device = next(model.parameters()).device

    iters, losses = [], []
    iters_sub, train_accs, val_accs  = [], [] ,[]

    n = 0

    while True:
        # Full batches only: the last start index leaves room for batch_size rows.
        for i in range(0, n_examples - batch_size + 1, batch_size):
            batch = torch.as_tensor(
                train_data[i:i+batch_size], dtype=torch.long, device=model_device)
            xs, ts = batch[:, :3], batch[:, 3]

            # The model emits one row of logits per (sequence, position);
            # regroup them and keep only the last position of each sequence.
            logits = model(xs, xs)
            zs = logits.reshape(xs.size(0), xs.size(1), -1)[:, -1, :]

            loss = criterion(zs, ts)
            optimizer.zero_grad()    # clear gradients left over from the previous step
            loss.backward()          # compute gradients for each parameter
            optimizer.step()         # apply the update

            # save the current training information
            iters.append(n)
            # CrossEntropyLoss already averages over the batch; do not divide again.
            losses.append(loss.item())

            if n % 500 == 0:
                iters_sub.append(n)
                train_cost = loss.item()
                train_acc = estimate_accuracy_torch(model, train_data)
                train_accs.append(train_acc)
                val_acc = estimate_accuracy_torch(model, validation_data)
                val_accs.append(val_acc)
                print("Iter %d. [Val Acc %.0f%%] [Train Acc %.0f%%, Loss %f]" % (
                      n, val_acc * 100, train_acc * 100, train_cost))

                if (checkpoint_path is not None) and n > 0:
                    torch.save(model.state_dict(), checkpoint_path.format(n))

            # increment the iteration number
            n += 1

            if n > max_iters:
                return iters, losses, iters_sub, train_accs, val_accs



In [None]:
def plot_learning_curve(iters, losses, iters_sub, train_accs, val_accs):
    """
    Draw two figures: training loss against iteration, then training and
    validation accuracy against the iterations at which they were measured.
    """
    # Figure 1: loss recorded at every iteration.
    plt.plot(iters, losses, label="Train")
    plt.title("Learning Curve: Loss per Iteration")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.show()

    # Figure 2: accuracy recorded at the sub-sampled iterations.
    for series, name in ((train_accs, "Train"), (val_accs, "Validation")):
        plt.plot(iters_sub, series, label=name)
    plt.title("Learning Curve: Accuracy per Iteration")
    plt.xlabel("Iterations")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

In [None]:
# Train with the default settings, then plot the returned loss and accuracy curves.
train_info = train(model)
plot_learning_curve(*train_info)