<a href="https://colab.research.google.com/github/Ravikiran2611/opencv/blob/master/transformer(cuda).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn

In [2]:
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [0]:
# datacreating
from torchtext import data, datasets
if True:
    import spacy
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, 
                     eos_token = EOS_WORD, pad_token=BLANK_WORD)

    MAX_LEN = 100
    train, val, test = datasets.IWSLT.splits(
        exts=('.en', '.de'), fields=(SRC, TGT), 
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
            len(vars(x)['trg']) <= MAX_LEN)
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator,val_iterator,text_iterator = data.BucketIterator.splits((train, val, test), batch_size = 64,sort_within_batch = True, device = device)

In [5]:
len(train_iterator)

3072

In [6]:
train_iterator

<torchtext.data.iterator.BucketIterator at 0x7fec21006940>

In [0]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
    def forward(self, x):
        return self.embed(x)


In [0]:
import math
from torch.autograd import Variable
class PositionalEncoder(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, max_len=5000):
        super().__init__()
                
#         since in transformer there is no recurrence involved , it does know about the order of the words in the sentence.
#         For that purpose we add postional encoding that contains inforation about the relative or absolute postions of the words in the sentence.
#         the size of postional encoding is equal to the word embedding size
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0.0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0.0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False)
        return x

In [0]:
import copy
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 
# provides multiple copies of the given modules

In [0]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()
#         core part of transformer and this function is just multiple time  attention function
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)
    
    def forward(self, q, k, v, mask=None):
        
        bs = q.size(0)
              
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
        
        
       
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        
        concat = scores.transpose(1,2).contiguous().view(bs, -1, self.d_model)
        output = self.out(concat)
        return output

In [0]:
def attention(q, k, v, d_k, mask=None, dropout=None):
#   in attention function it tries to calculate , how important the other words are to specific words in the sentence
#  so at a single time step  each word will be able to see all the other words in the sentence 
    scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = torch.nn.functional.softmax(scores, dim=-1)
    if dropout is not None:
        scores = dropout(scores)
        
    output = torch.matmul(scores, v)
    return output

In [0]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__() 
        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        x = self.dropout(torch.nn.functional.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x


In [0]:
class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
    
        self.size = d_model
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm


In [0]:
class EncoderLayer(nn.Module):
#   the encoder layer consist of two functions, one is multihead attention anf another is feed forward layer 
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x

In [0]:
class DecoderLayer(nn.Module):
#   the decoder layer consist of three functions ,one is masked multi head attention which makes sures that the word in the 
# target sentence is only based on the previous words and this is done by masking the future words in te target sentence.
# The second is a normal multihead attention layer followed by the feed forward layer
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        
        self.attn_1 = MultiHeadAttention(heads, d_model)
        self.attn_2 = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
    def forward(self, x, e_outputs, src_mask, trg_mask):
#         print(x.shape)
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs,
        src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x

In [0]:
class Encoder(nn.Module):
#   consist of  N number of encoder layer (here the value of  N is 6)
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(N):
            x = self.layers[i](x, mask)
        return self.norm(x)

In [0]:
class Decoder(nn.Module):
#   consist of  N number of decoder layer (here the value of  N is 6)
  
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)

In [0]:
class Transformer(nn.Module):
#   the transformer consist of encoder followed by the decoder .The decoder inputs is passed to the last linear layer 
# whose size is equal to the vocab size of target language. Softmax is applied by the loss function (cross entropy)
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = nn.Linear(d_model, trg_vocab)
    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        print("encoder over")
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        print("decoder over")
        output = self.out(d_output)
        return output

In [0]:
d_model = 512
heads = 8
N = 6

In [0]:
src_vocab = len(SRC.vocab)
trg_vocab = len(TGT.vocab)
model = Transformer(src_vocab, trg_vocab, d_model, N, heads)
model = model.to(device)


In [21]:
import numpy as np
for i, batch in enumerate(train_iterator):
  src = batch.src.transpose(0,1)
  trg = batch.trg.transpose(0,1)
          
  trg_input = trg[:, :-1]
          
  optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

  targets = trg[:, 1:].contiguous().view(-1)
        

            
  input_pad = SRC.vocab.stoi['<blank>']
  input_msk = (src != input_pad).unsqueeze(1)

            
  target_pad = TGT.vocab.stoi['<blank>']
  target_msk = (trg_input != target_pad).unsqueeze(1)
  size = trg_input.size(1)
  shape = (1,size,size)
  nopeak_mask = np.triu(np.ones(shape),k=1).astype('uint8')
  nopeak_mask = Variable(torch.from_numpy(nopeak_mask) == 0)
  target_msk = target_msk & nopeak_mask.cuda()

 
  preds = model(src, trg_input, input_msk, target_msk)
  optim.zero_grad()
  loss = torch.nn.functional.cross_entropy(preds.view(-1, preds.size(-1)),
  targets, ignore_index=target_pad)
  loss.backward()
  optim.step()
  print(i)
  if(i == 2000):
       break

encoder over
decoder over
0
encoder over
decoder over
1
encoder over
decoder over
2
encoder over
decoder over
3
encoder over
decoder over
4
encoder over
decoder over
5
encoder over
decoder over
6
encoder over
decoder over
7
encoder over
decoder over
8
encoder over
decoder over
9
encoder over
decoder over
10
encoder over
decoder over
11
encoder over
decoder over
12
encoder over
decoder over
13
encoder over
decoder over
14
encoder over
decoder over
15
encoder over
decoder over
16
encoder over
decoder over
17
encoder over
decoder over
18
encoder over
decoder over
19
encoder over
decoder over
20
encoder over
decoder over
21
encoder over
decoder over
22
encoder over
decoder over
23
encoder over
decoder over
24
encoder over
decoder over
25
encoder over
decoder over
26
encoder over
decoder over
27
encoder over
decoder over
28
encoder over
decoder over
29
encoder over
decoder over
30
encoder over
decoder over
31
encoder over
decoder over
32
encoder over
decoder over
33
encoder over
decoder ove

RuntimeError: ignored

In [0]:
import numpy as np
def translate(model, src, max_len = 10, custom_string=False):
    device1 =  torch.device('cpu')
    model.eval()
    model = model.to(device1)
    input_pad = SRC.vocab.stoi['<blank>']
    if custom_string == True:
        src = tokenize_en(src)
        sentence=Variable(torch.LongTensor([[SRC.vocab.stoi[tok] for tok in src]]))
    src_mask = (sentence != input_pad).unsqueeze(-2)
    e_outputs = model.encoder(sentence, src_mask)
    outputs = torch.zeros(max_len).type_as(sentence.data)
    outputs[0] = torch.LongTensor([TGT.vocab.stoi['<sos>']])
    for i in range(1, max_len):    
        shape = (1,i,i)    
        testtrg_mask = np.triu(np.ones(shape), k=1).astype('uint8')
        
        testtrg_mask = Variable(torch.from_numpy(testtrg_mask) == 0)
        out = model.decoder(outputs[:i].unsqueeze(0), e_outputs, src_mask, testtrg_mask)
        out = model.out(out)
        out = torch.nn.functional.softmax(out, dim=-1)
        val, ix = out[:, -1].data.topk(1) # returns the max value and the position of the max value
        outputs[i] = ix[0][0]
        if ix[0][0] == TGT.vocab.stoi['<eos>']:
            break
    return ' '.join([TGT.vocab.itos[ix] for ix in outputs[:i]] )

In [0]:
translated_sentence = translate(model,'have a nice day',40,custom_string= True)

In [0]:
translated_sentence