In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import copy

In [3]:
import numpy as np
import math

In [4]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected with their own linear layer,
    split into ``heads`` chunks of size ``d_k = d_model // heads``, attended
    independently via the module-level ``attention`` function, re-joined and
    passed through a final linear layer.

    Args:
        heads: number of attention heads; must divide ``d_model`` evenly.
        d_model: size of the last (feature) axis of the inputs and output.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``d_model`` is not a multiple of ``heads``.
    """

    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        # Without this check d_k is silently truncated and the view() calls in
        # forward() either fail with an opaque shape error or, when the element
        # count happens to divide, mix the sequence and feature axes together.
        if d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by heads ({heads})"
            )

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: tensors whose first axis is the batch and whose last axis
                has size ``d_model``.
            mask: optional tensor handed to ``attention``; positions where it
                equals 0 are excluded. ``None`` disables masking.

        Returns:
            Tensor with last axis ``d_model`` and the same batch size as ``q``.
        """
        bs = q.size(0)

        # Project, then split the feature axis into (heads, d_k).
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # Move the head axis forward: bs * h * sl * d_k
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)

        # Per-head attention output (already weighted values, not raw scores).
        attended = attention(q, k, v, self.d_k, mask, self.dropout)

        # Undo the head split: bs * sl * (h * d_k) == bs * sl * d_model.
        # contiguous() is required because transpose() returns a strided view.
        concat = attended.transpose(1, 2).contiguous().view(bs, -1, self.d_model)

        return self.out(concat)

In [5]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    Computes ``softmax(q @ k^T / sqrt(d_k)) @ v`` over the last two axes.

    Args:
        q, k, v: query, key and value tensors; ``q`` and ``k`` must agree on
            the last axis, ``k`` and ``v`` on the second-to-last.
        d_k: scaling denominator (its square root divides the raw scores).
        mask: optional tensor; after an axis is inserted at position 1 it must
            broadcast against the score tensor. Positions where it equals 0
            receive zero attention weight.
        dropout: optional callable applied to the attention weights.

    Returns:
        The attention-weighted combination of ``v``.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # Extra axis at dim 1 lets a single mask broadcast across all heads.
        mask = mask.unsqueeze(1)
        # Fill with the most negative finite value of the score dtype instead
        # of a hard-coded -1e9: that constant is not representable in float16
        # and makes masked_fill fail under half precision. After softmax the
        # masked positions are 0 either way.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

    weights = F.softmax(scores, dim=-1)

    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v)

In [6]:
class FeedForward(nn.Module):
    """Two-layer feed-forward block: d_model -> d_ff -> d_model.

    A ReLU followed by dropout sits between the two linear layers. Both
    linear layers act on the last axis of the input only.

    Args:
        d_model: input and output feature size.
        d_ff: width of the hidden layer (defaults to 2048).
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to d_ff, apply ReLU and dropout, project back to d_model."""
        hidden = F.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

In [7]:
class Norm(nn.Module):
    """Normalisation over the last axis with a learnable scale and shift.

    Each vector along the last axis is centred on its mean and divided by
    ``std + eps`` (``torch.Tensor.std`` with its default estimator; ``eps`` is
    added to the standard deviation itself), then multiplied by ``alpha`` and
    offset by ``bias``.

    Args:
        d_model: size of the last axis of the inputs.
        eps: constant added to the standard deviation to avoid division by 0.
    """

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
        self.size = d_model
        self.eps = eps
        # Learnable calibration: starts as the identity (scale 1, shift 0).
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` along the last axis."""
        centred = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centred / spread + self.bias

In [8]:
class EncoderLayer(nn.Module):
    """One pre-norm transformer encoder layer.

    Applies self-attention and then a feed-forward block, each preceded by a
    ``Norm`` and wrapped in a residual connection with dropout.

    Args:
        d_model: feature size of the input and output.
        heads: number of attention heads.
        dropout: dropout probability used for the residual branches and also
            forwarded to the attention and feed-forward sub-modules.
    """

    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        # Forward the dropout rate: previously the sub-modules always used
        # their own 0.1 default regardless of what was requested here.
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the layer on ``x``.

        Args:
            x: input tensor whose last axis has size ``d_model``.
            mask: optional attention mask passed through to the attention
                module unchanged; ``None`` (the default) disables masking.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Self-attention sub-layer: normalise first, then add the residual.
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn(normed, normed, normed, mask))
        # Feed-forward sub-layer, same pre-norm + residual pattern.
        normed = self.norm_2(x)
        x = x + self.dropout_2(self.ff(normed))
        return x

In [9]:
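# NOTE: unused draft, kept for reference only. MyModel (defined further down)
# builds the same embed -> N x EncoderLayer -> Norm stack inline.
# class Encoder(nn.Module):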
#     def __init__(self, in_size, d_model, N, heads):
#         super().__init__()
#         self.N = N
# #         self.embed = nn.Embedding(num_embeddings=in_size, embedding_dim=d_model)
#         self.embed = nn.Linear(in_size, d_model)
# #         self.pe = PositionalEncoder(d_model)
#         self.layers = get_clones(EncoderLayer(d_model, heads), N)
#         self.norm = Norm(d_model)
        
#     def forward(self, src, mask):
#         x = self.embed(src)
# #         x = self.pe(x)
#         for i in range(self.N):
#             x = self.layers[i](x, mask)
#         return self.norm(x)

In [10]:
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``.

    The copies share no parameters with each other or with ``module``; they
    start out with identical values because they are copied from the same
    instance.
    """
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

In [22]:
class MyModel(nn.Module):
    """Transformer encoder followed by a small per-position sigmoid head.

    Pipeline: linear input projection -> ``N`` encoder layers -> ``Norm`` ->
    linear + ReLU -> linear + sigmoid. The head is applied to every position,
    so the output keeps the leading axes of the input and has a last axis of
    size 1 with values in (0, 1).

    Args:
        in_size: number of features on the last axis of the input.
        d_model: internal feature size of the encoder.
        N: number of stacked encoder layers.
        heads: number of attention heads per encoder layer.
        n_hid: width of the hidden layer in the output head.
    """

    def __init__(self, in_size, d_model, N, heads, n_hid):
        super().__init__()
        self.N = N
        # Inputs are projected with a linear layer. An nn.Embedding front-end
        # and a positional encoder were tried and are disabled, so no
        # positional information is added.
        self.embed = nn.Linear(in_size, d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
        # Output head.
        self.hid = nn.Linear(d_model, n_hid)
        self.out = nn.Linear(n_hid, 1)
        self.act1 = nn.ReLU()
        self.act2 = nn.Sigmoid()

    def forward(self, src, mask):
        """Encode ``src`` and return one sigmoid score per position.

        Args:
            src: input tensor whose last axis has size ``in_size``.
            mask: attention mask passed to every encoder layer, or ``None``.
        """
        hidden = self.embed(src)
        for layer in self.layers:
            hidden = layer(hidden, mask)
        hidden = self.norm(hidden)
        # Fully connected head.
        hidden = self.act1(self.hid(hidden))
        return self.act2(self.out(hidden))

In [23]:
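# NOTE: abandoned draft of MyModel, not runnable as written: it never calls
# super().__init__(), `n_enc` is undefined, and nn.ReLU / nn.Sigmoid do not wrap
# another layer. The MyModel class defined above is the one in use.
# class MyModel(nn.Module):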
#     def __init__(self, in_size, d_model, N, heads, n_hid):
#         self.encoder = Encoder(in_size, d_model, N, heads)
#         self.hid = nn.ReLU(nn.Linear(n_enc, n_hid))
#         self.out = nn.Sigmoid(nn.Linear(n_hid, 1))
        
#     def forward(self, x, mask=None):
#         x = self.encoder(x, mask)
#         x = self.hid(x)
#         return self.out(x)

In [24]:
# Earlier experiments, left disabled (Encoder itself is commented out above):
# tmp = EncoderLayer(d_model=10, heads=1, dropout=0.1)
# encoder = Encoder(in_size=10, d_model=15, N=6, heads=1)
# Small demo model: 10 input features projected to d_model=15, six
# single-head encoder layers, and a 5-unit hidden layer in the output head.
model = MyModel(in_size=10, d_model=15, N=6, heads=1, n_hid=5)

In [25]:
# Random demo input of shape (5, 20, 10); the last axis matches in_size=10 of
# the model. No seed is set, so the values differ on every run.
x = np.random.randn(5, 20, 10)

In [26]:
x.shape

(5, 20, 10)

In [27]:
# Rebinds x from a NumPy array to a torch tensor. The dtype stays float64
# (visible in the printed slice of x further down).
x = torch.from_numpy(x)

In [28]:
# Cast to float32 to match the dtype of the model's nn.Linear parameters.
# model.eval() has not been called, so dropout is active and y is not
# reproducible between runs.
y = model(x.float(), mask=None)

In [29]:
y.shape

torch.Size([5, 20, 1])

In [30]:
x[0, 0, :]

tensor([ 0.2241, -0.1174,  0.6142,  1.0847, -0.9897, -1.2112,  0.4271,  1.6907,
         0.5544, -0.3430], dtype=torch.float64)

In [31]:
y[0, 0, :]

tensor([0.6723], grad_fn=<SliceBackward>)