In [24]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import gensim.downloader as gd

## Step 1: Convert text into embeddings

In [2]:
print(list(gd.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [3]:
embedding_model = gd.load('glove-wiki-gigaword-50')



In [4]:
text = "Hello! How are you? Don't you have some work to do?"
text = text.split()
print(text)

['Hello!', 'How', 'are', 'you?', "Don't", 'you', 'have', 'some', 'work', 'to', 'do?']


In [5]:
import re

# function to remove punctuations
def clean_word(word):
    return re.sub(r'[^\w\s]', "", word)

text = list(map(clean_word, text))
print(text)

['Hello', 'How', 'are', 'you', 'Dont', 'you', 'have', 'some', 'work', 'to', 'do']


In [6]:
vector = list(map(lambda x : embedding_model[x.lower()], text))
print(vector)

[array([-0.38497 ,  0.80092 ,  0.064106, -0.28355 , -0.026759, -0.34532 ,
       -0.64253 , -0.11729 , -0.33257 ,  0.55243 , -0.087813,  0.9035  ,
        0.47102 ,  0.56657 ,  0.6985  , -0.35229 , -0.86542 ,  0.90573 ,
        0.03576 , -0.071705, -0.12327 ,  0.54923 ,  0.47005 ,  0.35572 ,
        1.2611  , -0.67581 , -0.94983 ,  0.68666 ,  0.3871  , -1.3492  ,
        0.63512 ,  0.46416 , -0.48814 ,  0.83827 , -0.9246  , -0.33722 ,
        0.53741 , -1.0616  , -0.081403, -0.67111 ,  0.30923 , -0.3923  ,
       -0.55002 , -0.68827 ,  0.58049 , -0.11626 ,  0.013139, -0.57654 ,
        0.048833,  0.67204 ], dtype=float32), array([ 6.8938e-01, -1.0644e-01,  1.7083e-01, -3.7583e-01,  7.5170e-01,
        7.8149e-04, -5.3102e-01, -1.9903e-01, -1.4419e-01,  1.2748e-01,
       -2.8038e-01,  7.0723e-01, -5.4100e-01,  1.9625e-01,  9.6635e-01,
        6.0519e-01,  4.0918e-01, -3.1612e-02,  5.3900e-01, -8.7086e-01,
       -2.0912e-01,  5.6853e-01,  6.5983e-01,  1.4583e-01,  1.0112e+00,
       -2

## Transformer Model Implementation

### Embedding Layer

In [7]:
class EmbeddingLayer(nn.Module):
    def __init__(self):
        super(EmbeddingLayer, self).__init__()
        self.model = gd.load('glove-wiki-gigaword-50')

    def forward(self, x):
        x = list(map(lambda x : self.model[x], x))
        x = torch.Tensor(x)
        return x

### Positional Encoding 

In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, max_words, d_model):
        super(PositionalEncoding, self).__init__() # skip the positionalEncoding inside super (after python 3)

        self.matrix = torch.zeros(max_words, d_model)
        val = torch.Tensor([pos / (10000 ** ((2*i) / d_model)) for pos in torch.arange(max_words) for i in torch.arange(0, d_model, 2)]).reshape(max_words, d_model//2)
        self.matrix[:, 0::2] = torch.sin(val)
        self.matrix[:, 1::2] = torch.cos(val)

    def forward(self, x):
        x += self.matrix
        return x

### Self attention block

In [9]:
class SelfAttention(nn.Module):
    def __init__(self, d_model, d_k, d_v):
        super(SelfAttention, self).__init__() 
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        
        self.w_q = nn.Linear(d_model, d_k)
        self.w_k = nn.Linear(d_model, d_k)
        self.w_v = nn.Linear(d_model, d_v)

    def forward(self, q, k, v, mask=None):

        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)

        score = (q@k.T) / math.sqrt(self.d_k)

        if mask is not None:
            score = score.masked_fill(mask == 0, -100000)

        x = F.softmax(score, dim=-1) @ v 
        return x

### Multi-Head attention block

In [10]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model):
        super(MultiHeadAttention, self).__init__()

        self.n_head = n_heads
        self.d_model = d_model
        self.k = self.v = d_model // n_heads

        self.attentions = [SelfAttention(self.d_model, self.k, self.v) for _ in range(n_heads)]
        self.w_o = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        
        x = [attention(q, k, v, mask) for attention in self.attentions]
        x = torch.cat(x, dim=1)
        x = self.w_o(x)
        
        return x

### FeedForward Network

In [11]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForward, self).__init__()

        self.layer1 = nn.Linear(d_model, d_ff)
        self.layer2 = nn.Linear(d_ff, d_model)

    def forward(self,x):
        
        x = self.layer1(x)
        x = F.relu(x)
        x = self.layer2(x)
        
        return x

### Encoder Block

In [19]:
class Encoder(nn.Module):
    def __init__(self, n_heads, d_model, d_ff):
        super(Encoder, self).__init__()

        self.mha = MultiHeadAttention(n_heads, d_model)
        self.layernorm1 = nn.LayerNorm(d_model)

        self.feedforward = FeedForward(d_model, d_ff)
        self.layernorm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # MHA and (Skip + layernorm)
        x = self.layernorm1(x + self.mha(x, x, x))

        # FeedForward and (Skip + layernorm)
        x = self.layernorm2(x + self.feedforward(x))
        
        return x

### Decoder Block

In [27]:
class Decoder(nn.Module):
    def __init__(self, n_heads, d_model, d_ff):
        super(Decoder, self).__init__()

        self.masked_mha = MultiHeadAttention(n_heads, d_model)
        self.layernorm1 = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttention(n_heads, d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, d_ff)
        self.layernorm3 = nn.LayerNorm(d_model)

    def forward(self, x_encoder, x_prev):

        # masked mha + add & norm
        mask = np.ones(x_prev.shape)
        for i in range(x_prev.shape[0]):
            for j in range(x_prev.shape[1]):
                mask[i][j] = i <= j
        
        x = self.layernorm1(x_prev + self.masked_mha(x_prev, x_prev, x_prev, mask))

        # encoder mha + add & norm
        x = self.layernorm2(x + self.mha(x, x_encoder, x_encoder))

        # feedforward + add & norm
        x = self.layernorm3(x + self.ff(x))
        
        return x

### Transformer Implementation 

In [40]:
class MyTransformer(nn.Module):
    def __init__(self, 
                 max_words, # total words at a time
                 d_model, # embedding dimension
                 n_encoder, # number of encoders
                 n_decoder, # number of decoders
                 n_heads,  # for multi_head_attention block
                 d_ff # for hidden layer in feed forward block
                ):
        super(MyTransformer, self).__init__()

        self.n_encoder = n_encoder
        self.n_decoder = n_decoder
        
        # embedding layer
        self.embedding_layer = EmbeddingLayer()

        # positional encoding
        self.positional_encoding = PositionalEncoding(max_words, d_model)

        # encoders
        self.encoders = [Encoder(n_heads, d_model, d_ff) for _ in range(self.n_encoder)]

        # decoders
        self.decoders = [Decoder(n_heads, d_model, d_ff) for _ in range(self.n_decoder)]

        # linear
        self.linear = nn.Linear(d_model, d_model)

        # softmax
        self.sftmax = nn.Softmax(dim=1)

    def forward(self, x_input, x_output):
        # input embedding
        # x_input = self.embedding_layer(x_input)
        x_input = self.positional_encoding(x_input)

        # output embedding
        # x_output = self.embedding_layer(x_output)
        x_output = self.positional_encoding(x_output)

        # encoder pass
        for encoder in self.encoders:
            x_input = encoder(x_input)

        
        # decoder pass
        for decoder in self.decoders:
            x_output = decoder(x_input, x_output)

        # linear paas
        x = self.linear(x_output)
        # x = self.sftmax(x)
        
        return x

## Rough work

In [14]:
max_words = 10
d_model = 6

matrix = torch.zeros(max_words, d_model)
val = torch.Tensor([pos / (10000 ** ((2*i) / d_model)) for pos in torch.arange(max_words) for i in torch.arange(0, d_model, 2)]).reshape(max_words, d_model//2)
matrix[:, 0::2] = torch.sin(val)
matrix[:, 1::2] = torch.cos(val)

print(matrix.shape)

torch.Size([10, 6])


In [15]:
class SimpleModel(nn.Module):
    def __init__(self, max_words, d_model):
        super().__init__()

        self.layer = nn.Linear(d_model, d_model//2)

    def forward(self, x):
        x = self.layer(x)
        return x

In [41]:
d_ff = 2 * d_model
n_encoders = n_decoders = 3
n_heads = d_model // 2

model = MyTransformer(max_words, d_model, n_encoders, n_decoders, n_heads, d_ff)
x = model(matrix, matrix)
print(x)

tensor([[ 0.3100,  0.5627,  1.1991, -0.5743, -1.0250, -0.1144],
        [ 0.1725,  0.6664,  0.7969, -0.4591, -1.1124,  0.0212],
        [ 0.0804,  0.4723,  0.4589, -0.6279, -0.9467,  0.1105],
        [ 0.0753,  0.2308,  0.5175, -1.0255, -0.6617,  0.0654],
        [ 0.1655,  0.1761,  0.9299, -1.2916, -0.4912, -0.0833],
        [ 0.3047,  0.3369,  1.3149, -1.0990, -0.6280, -0.2151],
        [ 0.3426,  0.5006,  1.2809, -0.6603, -0.9456, -0.1604],
        [ 0.2157,  0.6602,  0.9143, -0.4557, -1.1163, -0.0189],
        [ 0.0965,  0.5471,  0.5243, -0.5591, -1.0085,  0.0944],
        [ 0.0754,  0.2810,  0.4591, -0.8967, -0.7446,  0.0856]],
       grad_fn=<AddmmBackward0>)


In [42]:
reply = list(map(lambda word : embedding_model.similar_by_vector(word, topn=1), x))
print(reply)

KeyError: "Key '0.31003668904304504' not present in vocabulary"

In [None]:
a = torch.Tensor([[1,2, 3, 4],
                 [5, 6, 7, 8],
                 [9, 10, 11, 12],
                 [13, 14, 15, 16]])
# soft0 = F.softmax(a, dim=-1)
# print(soft0)
b = a * 2
# print(b)

c = torch.cat((a, b), dim=1)
print(c)