In [1]:
import torch
import torchtext
from torchtext import data
from torchtext import datasets
import re
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import math, copy, time
import torch.nn.functional as F
import matplotlib.pyplot as plt
import dill as pickle
import spacy
import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset

In [2]:
import spacy

class tokenize(object):
    
    def __init__(self, lang):
        self.nlp = spacy.load(lang)
            
    def tokenizer(self, sentence):
        sentence = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
        sentence = re.sub(r"[ ]+", " ", sentence)
        sentence = re.sub(r"\!+", "!", sentence)
        sentence = re.sub(r"\,+", ",", sentence)
        sentence = re.sub(r"\?+", "?", sentence)
        
        return [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]

In [3]:
tokenize_fr = tokenize('fr')
tokenize_en = tokenize('en')

In [4]:
FR_TEXT = data.Field(lower=True, tokenize=tokenize_fr.tokenizer, init_token='<sos>', eos_token='<eos>', pad_token="<pad>")
EN_TEXT = data.Field(lower=True, tokenize=tokenize_en.tokenizer, pad_token="<pad>")

In [None]:
enlines = open()

In [5]:
EN_TEXT = pickle.load(open('./pytorch_transformer/data/pickles/EN_IN.pkl', 'rb'))
#pickle.dump(EN_IN, open('./pytorch_transformer/data/pickles/EN_IN.pkl', 'wb'))

In [23]:
#EN_IN = data.Field(lower=True, tokenize=tokenize_en.tokenizer, pad_token="<blank>")
#EN_OUT = data.Field(lower=True, tokenize=tokenize_en.tokenizer, init_token='<sos>', eos_token='<eos>', pad_token="<blank")

In [30]:
data_fields = [('English', EN_IN), ('out', EN_OUT)]
trn = TabularDataset('./pytorch_transformer/data/en-en.csv', format='csv', fields=data_fields)

In [31]:
EN_IN.build_vocab(trn)
EN_OUT.build_vocab(trn)

In [3]:
EN_TEXT = pickle.load(open('./pytorch_transformer/data/pickles/EN_TEXT_big.pkl', 'rb'))
FR_TEXT = pickle.load(open('./pytorch_transformer/data/pickles/FR_TEXT_big.pkl', 'rb'))

In [4]:
data_fields = [('id', None), ('English', EN_TEXT), ('French', FR_TEXT)]
trn, vl = data.TabularDataset.splits(path='./pytorch_transformer/data/', train='train_short.csv', validation='val_short.csv', format='csv', fields=data_fields) 
vl.examples = torch.load("./pytorch_transformer/data/pickles/eurofra_examples_vl")
trn.examples = torch.load("./pytorch_transformer/data/pickles/eurofra_examples_trn")

In [7]:
#torch.save(trn.examples, "./pytorch_transformer/data/pickles/eurofra_examples_trn")
#torch.save(vl.examples, "./pytorch_transformer/data/pickles/eurofra_examples_vl")

In [5]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
    def forward(self, x):
        return self.embed(x)

In [6]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len = 200, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # create constant 'pe' matrix with values dependant on 
        # pos and i
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = \
                math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = \
                math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
 
    
    def forward(self, x):
        # make embeddings relatively larger
        x = x * math.sqrt(self.d_model)
        #add constant to embedding
        seq_len = x.size(1)
        x = x + Variable(self.pe[:,:seq_len], requires_grad=False).cuda()
        return self.dropout(x)

In [7]:
SRC = EN_TEXT
TRG = FR_TEXT

In [8]:
input_pad = SRC.vocab.stoi['<pad>']
target_pad = TRG.vocab.stoi['<pad>']

def create_masks(src, trg=None):
    
    src_mask = (src != input_pad).unsqueeze(-2)

    if trg is not None:
        trg_mask = (trg != target_pad).unsqueeze(-2)
        size = trg.size(1) # get seq_len for matrix
        nopeak_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
        nopeak_mask = Variable(torch.from_numpy(nopeak_mask) == 0).cuda()
        trg_mask = trg_mask & nopeak_mask
        
    else:
        trg_mask = None
    return src_mask, trg_mask

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()
        
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)
    def forward(self, q, k, v, mask=None):
        
        bs = q.size(0)
        
        # perform linear operation and split into N heads
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
        # transpose to get dimensions bs * N * sl * d_model
       
       
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)
        

        # calculate attention using function we will define next
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous()\
        .view(bs, -1, self.d_model)
        output = self.out(concat)
    
        return output

In [10]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    
    scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    

    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)
    

    scores = F.softmax(scores, dim=-1)
    
    if dropout is not None:
        scores = dropout(scores)
        
    output = torch.matmul(scores, v)
    return output

In [11]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__() 
        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

In [12]:
class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
    
        self.size = d_model
        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
        / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

In [13]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x
    
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        
        self.attn_1 = MultiHeadAttention(heads, d_model)
        self.attn_2 = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model).cuda()

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \
        src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x

In [14]:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [15]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(N):
            x = self.layers[i](x, mask)
        return self.norm(x)
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)

In [16]:
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = nn.Linear(d_model, trg_vocab)
    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        #print("DECODER")
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output

In [17]:
d_model = 512
heads = 8
N = 6
src_vocab = len(SRC.vocab)
trg_vocab = len(TRG.vocab)
model = Transformer(src_vocab, trg_vocab, d_model, N, heads).cuda()
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)


In [20]:
model.load_state_dict(torch.load('./pytorch_transformer/data/weights/europarl_weigthse13'))

In [26]:
def train_model(epochs, print_every=100):
    
    model.train()
    
    start = time.time()
    temp = time.time()
    save_last = time.time()
    
    total_loss = 0
    
    for epoch in range(epochs):
        
       
        for i, batch in enumerate(train_iter): 

            src = batch.English.transpose(0,1)
            trg = batch.French.transpose(0,1)
            #print(trg[0, 1:])
            #batch = rebatch(target_pad, b)
    
            # the French sentence we input will have all words except
            # the last, as it is using each word to predict the next
            trg_input = trg[:, :-1]
            
           
            
            # the words we are trying to predict
            results = trg[:, 1:].contiguous().view(-1)
            
            # create function to make masks using mask code above
            src_mask, trg_mask = create_masks(src, trg_input)
            preds = model(src, trg_input, src_mask, trg_mask)
         #   val, ix = preds.data.topk(1)
            
            optimizer.zero_grad()
            
            # when calculating loss we use all words except the first
            # as this will be in line with the predictions
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), results, ignore_index=target_pad)
            #val, ix = preds[0].data.topk(1)
            #print(ix)
            loss.backward()
            optimizer.step()
            sched.step()
            
            total_loss += loss.item()
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters" % \
                ((time.time() - start) // 60, epoch + 1, i + 1, loss_avg, time.time() - temp, print_every))
                total_loss = 0
                temp = time.time()
                if (temp - save_last) // 60 >= 45:
                    torch.save(model.state_dict(), './pytorch_transformer/data/weights/europarl_weigths')
                    print("saved")
                    save_last = time.time()
           
        
        


In [19]:
class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
            
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                          self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

In [20]:
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new.English))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new.French) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)

In [24]:
def get_val_loss():
    
    model.eval()

    total_loss = 0
    
    for i, b in enumerate(val_iter):

            src = b.English.transpose(0,1)
            trg = b.French.transpose(0,1)
            trg_input = trg[:, :-1]
            
           
            
            # the words we are trying to predict
            results = trg[:, 1:].contiguous().view(-1)
            
            # create function to make masks using mask code above
            src_mask, trg_mask = create_masks(src, trg_input)
            
            
            preds = model(src, trg_input, src_mask, trg_mask)
            #loss = F.cross_entropy(preds.view(-1, preds.size(-1)), batch.trg_y.contiguous().view(-1), ignore_index=pad_idx)
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), results, ignore_index=target_pad)
            total_loss += loss.data[0]
    
    model.train()
    return (total_loss / (i + 1))

In [25]:
val_iter = MyIterator(vl, batch_size=1300, device=0,
                        repeat=False, sort_key=lambda x: (len(x.English), len(x.French)),
                        batch_size_fn=batch_size_fn, train=True, shuffle=True)

In [26]:
print(get_val_loss())



tensor(1.3302, device='cuda:0')


In [21]:
train_iter = MyIterator(trn, batch_size=1300, device=0,
                        repeat=False, sort_key=lambda x: (len(x.English), len(x.French)),
                        batch_size_fn=batch_size_fn, train=True, shuffle=True)

In [22]:
class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
    """
    Cosine annealing with restarts.

    Parameters
    ----------
    optimizer : torch.optim.Optimizer

    T_max : int
        The maximum number of iterations within the first cycle.

    eta_min : float, optional (default: 0)
        The minimum learning rate.

    last_epoch : int, optional (default: -1)
        The index of the last epoch.

    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 T_max: int,
                 eta_min: float = 0.,
                 last_epoch: int = -1,
                 factor: float = 1.) -> None:
        # pylint: disable=invalid-name
        self.T_max = T_max
        self.eta_min = eta_min
        self.factor = factor
        self._last_restart: int = 0
        self._cycle_counter: int = 0
        self._cycle_factor: float = 1.
        self._updated_cycle_len: int = T_max
        self._initialized: bool = False
        super(CosineWithRestarts, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Get updated learning rate."""
        # HACK: We need to check if this is the first time get_lr() was called, since
        # we want to start with step = 0, but _LRScheduler calls get_lr with
        # last_epoch + 1 when initialized.
        if not self._initialized:
            self._initialized = True
            return self.base_lrs

        step = self.last_epoch + 1
        self._cycle_counter = step - self._last_restart

        lrs = [
            (
                self.eta_min + ((lr - self.eta_min) / 2) *
                (
                    np.cos(
                        np.pi *
                        ((self._cycle_counter) % self._updated_cycle_len) /
                        self._updated_cycle_len
                    ) + 1
                )
            ) for lr in self.base_lrs
        ]

        if self._cycle_counter % self._updated_cycle_len == 0:
            # Adjust the cycle length.
            self._cycle_factor *= self.factor
            self._cycle_counter = 0
            self._updated_cycle_len = int(self._cycle_factor * self.T_max)
            self._last_restart = step

        return lrs

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.000005, betas=(0.9, 0.98), eps=1e-9)
sched = CosineWithRestarts(optimizer, T_max=46646*2)

In [71]:
torch.save(model.state_dict(), './pytorch_transformer/data/weights/europarl_weigthse13')

In [28]:
torch.cuda.empty_cache()

In [30]:
SRC.preprocess("the cat didn't eat the man")

['the', 'cat', 'did', "n't", 'eat', 'the', 'man']

In [27]:
train_model(2, print_every=500)

time = 2m, epoch 1, iter = 500, loss = 10.001, 144s per 500 iters


KeyboardInterrupt: 

In [None]:
time = 225m, epoch 2, iter = 500, loss = 1.725, 190s per 500 iters
time = 227m, epoch 2, iter = 1000, loss = 1.314, 143s per 500 iters
saved
time = 230m, epoch 2, iter = 1500, loss = 1.327, 144s per 500 iters
time = 232m, epoch 2, iter = 2000, loss = 1.324, 143s per 500 iters
time = 234m, epoch 2, iter = 2500, loss = 1.329, 143s per 500 iters
time = 237m, epoch 2, iter = 3000, loss = 1.321, 142s per 500 iters
time = 239m, epoch 2, iter = 3500, loss = 1.322, 143s per 500 iters
time = 242m, epoch 2, iter = 4000, loss = 1.321, 143s per 500 iters
time = 244m, epoch 2, iter = 4500, loss = 1.325, 143s per 500 iters
time = 246m, epoch 2, iter = 5000, loss = 1.334, 143s per 500 iters
time = 249m, epoch 2, iter = 5500, loss = 1.321, 143s per 500 iters
time = 251m, epoch 2, iter = 6000, loss = 1.325, 143s per 500 iters
time = 254m, epoch 2, iter = 6500, loss = 1.333, 144s per 500 iters
time = 256m, epoch 2, iter = 7000, loss = 1.328, 143s per 500 iters
time = 258m, epoch 2, iter = 7500, loss = 1.337, 144s per 500 iters
time = 261m, epoch 2, iter = 8000, loss = 1.307, 143s per 500 iters
time = 263m, epoch 2, iter = 8500, loss = 1.337, 143s per 500 iters
time = 266m, epoch 2, iter = 9000, loss = 1.326, 143s per 500 iters
time = 268m, epoch 2, iter = 9500, loss = 1.328, 143s per 500 iters
time = 270m, epoch 2, iter = 10000, loss = 1.325, 143s per 500 iters
time = 273m, epoch 2, iter = 10500, loss = 1.320, 143s per 500 iters
saved
time = 275m, epoch 2, iter = 11000, loss = 1.328, 144s per 500 iters
time = 278m, epoch 2, iter = 11500, loss = 1.328, 143s per 500 iters
time = 280m, epoch 2, iter = 12000, loss = 1.332, 143s per 500 iters
time = 282m, epoch 2, iter = 12500, loss = 1.319, 142s per 500 iters
time = 285m, epoch 2, iter = 13000, loss = 1.334, 143s per 500 iters
time = 287m, epoch 2, iter = 13500, loss = 1.335, 143s per 500 iters
time = 289m, epoch 2, iter = 14000, loss = 1.326, 142s per 500 iters
time = 292m, epoch 2, iter = 14500, loss = 1.325, 143s per 500 iters
time = 294m, epoch 2, iter = 15000, loss = 1.323, 143s per 500 iters
time = 297m, epoch 2, iter = 15500, loss = 1.316, 142s per 500 iters
time = 299m, epoch 2, iter = 16000, loss = 1.328, 142s per 500 iters
time = 301m, epoch 2, iter = 16500, loss = 1.330, 143s per 500 iters
time = 304m, epoch 2, iter = 17000, loss = 1.327, 142s per 500 iters
time = 306m, epoch 2, iter = 17500, loss = 1.314, 143s per 500 iters
time = 309m, epoch 2, iter = 18000, loss = 1.318, 143s per 500 iters
time = 311m, epoch 2, iter = 18500, loss = 1.326, 142s per 500 iters
time = 313m, epoch 2, iter = 19000, loss = 1.333, 142s per 500 iters
time = 316m, epoch 2, iter = 19500, loss = 1.319, 143s per 500 iters
time = 318m, epoch 2, iter = 20000, loss = 1.317, 142s per 500 iters
saved
time = 320m, epoch 2, iter = 20500, loss = 1.328, 144s per 500 iters
time = 323m, epoch 2, iter = 21000, loss = 1.324, 143s per 500 iters
time = 325m, epoch 2, iter = 21500, loss = 1.324, 143s per 500 iters
time = 328m, epoch 2, iter = 22000, loss = 1.338, 143s per 500 iters
time = 330m, epoch 2, iter = 22500, loss = 1.312, 143s per 500 iters
time = 332m, epoch 2, iter = 23000, loss = 1.310, 143s per 500 iters
time = 335m, epoch 2, iter = 23500, loss = 1.321, 142s per 500 iters
time = 337m, epoch 2, iter = 24000, loss = 1.316, 142s per 500 iters

In [29]:
max_len = 80
eos_tok = FR_TEXT.vocab.stoi['<eos>']

def get_mask(size):
    trg_mask = np.triu(np.ones((1, size, size)),
    k=1).astype('uint8')
    return Variable(torch.from_numpy(trg_mask) == 0).cuda()


def init_vars(e_output, model, src_mask, k):
    
    init_tok = FR_TEXT.vocab.stoi['<sos>']
    
    outputs = torch.LongTensor([[init_tok]]).cuda()
    trg_mask = get_mask(1)
    
    out = model.out(model.decoder(outputs,
    e_output, src_mask, trg_mask))
    out = F.softmax(out, dim=-1)
    
    probs, ix = out[:, -1].data.topk(k)
    log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)
    
    outputs = torch.zeros(k, max_len).long().cuda()
    outputs[:, 0] = 2
    outputs[:, 1] = ix[0]
    
    e_outputs = torch.zeros(k, e_output.size(-2),e_output.size(-1)).cuda()
    e_outputs[:, :] = e_output[0]
    
    return outputs, e_outputs, log_scores

def k_best_outputs(outputs, out, log_scores, i, k):
    
    probs, ix = out[:, -1].data.topk(k)
    log_probs = torch.Tensor([math.log(p) for p in probs.data.view(-1)]).view(k, -1) + log_scores.transpose(0,1)
    k_probs, k_ix = log_probs.view(-1).topk(k)
    
    row = k_ix // k
    col = k_ix % k

    outputs[:, :i] = outputs[row, :i]
    outputs[:, i] = ix[row, col]

    log_scores = k_probs.unsqueeze(0)
    
    return outputs, log_scores

def beam_search(e_output, model, src_mask, k):
    
    outputs, e_outputs, log_scores = init_vars(e_output, model, src_mask, k)

    for i in range(2, max_len):
    
        trg_mask = get_mask(i)

        out = model.out(model.decoder(outputs[:,:i],
        e_outputs, src_mask, trg_mask))

        out = F.softmax(out, dim=-1)
    
        outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, k)
        
        if (outputs==eos_tok).nonzero().size(0) == k:
            alpha = 0.7
            div = 1/((outputs==eos_tok).nonzero()[:,1].type_as(log_scores)**alpha)
            _, ind = torch.max(log_scores * div, 1)
            ind = ind.data[0]
            break
    
    l = (outputs[ind]==eos_tok).nonzero()
    sentence = ' '.join([FR_TEXT.vocab.itos[tok] for tok in outputs[ind][1:l]])
    sentence = multiple_replace({' ?' : '?',' !':'!',' .':'.','\' ':'\'',' ,':','}, sentence)   
    return sentence
    
def translate(src, model, k, custom=False):
    
    model.eval()
    
    if custom == True:
        indexed = []
        sentence = tokenize_en.tokenizer(src)
        for tok in sentence:
            if EN_TEXT.vocab.stoi[tok] != 0:
                indexed.append(EN_TEXT.vocab.stoi[tok])
            else:
                indexed.append(get_synonym(tok))
        src=Variable(torch.LongTensor([indexed])).cuda()
    
    src_mask = (src != input_pad).unsqueeze(-2)
    e_outputs = model.encoder(src, src_mask)
    
    
    if k != 0:
        sentence = beam_search(e_outputs, model, src_mask, k)
        
    return sentence
    

In [55]:
torch.cuda.empty_cache()
#string = 'moscow and beijing came to understand that Western governments will always face pressure to back democracy activists, regardless of what cooperative relations existed beforehand. They saw how media organizations published material that destabilized their regimes. They worried about Google and social-media companies. Crucially, they realized that these companies made their choices independent of Washington. They were an intrinsic part of the liberal order'
#string = 'We expected genetics to reveal the keys to life. And, in a sense, it has; but the keys are too complex to be computed. “The most recent estimate for how many genes are involved in complex traits like height or intelligence is approximately ‘all of them’ – by the latest count, about twenty thousand. There are three billion base pairs in the human genome. But each of those base pairs is a nice, clean, discrete unit with one of four values. In a way, saying ‘everything has three billion possible causes’ is a mercy; it’s placing an upper bound on how terrible genetics can be'
#string= 'Disarming introduction to the new Penguin Book Of Japanese Short Stories. I know next to nothing about modern and contemporary Japanese fiction. I would guess that most readers of this book know as little as I do. Which is why, in this introduction, I am not standing a step above you, but taking a position on the same level, so that together we can think about how best to approach this anthology. Let’s say that you are being guided through a foreign town by someone who lives in the country and speaks the language, but doesn’t know that much about the geography or history'
#string = 'if released today, it would be criticized for its moralising of American nationalism, and for celebrating French colonial rule. Read as a migration narrative, however, Casablanca reminds us that the identification papers we carry were created not to give us freedom but rather to curtail it.'
#string = "my friend eats cheese everyday except wednesday when he goes to fish with his friends"
string = 'A father takes his two small daughters fishing. One of them drowns. Accident or murder? A jury finds for murder. The father goes to jail. This gripping account argues that the death was accidental, and that the prosecution greatly exaggerated the evidence for homicide. But then there is the $50,000 insurance policy which the father took out on his child’s life a week before her death; his history of petty crime; his tempestuous remarriage to a woman with 12 other husbands; and the allegation that the drowning took place in just 17 inches of water'
#string = 'What we can learn about ants’ decision-making by requiring an ant colony to site a new nest. “We know the ants prefer dark nests with small entrances. We found that when light differences are tiny, the colonies make better decisions than individual ants do. But where there’s a big difference in brightness, the individuals working on their own actually get the question right more often than the colony does. That surprised us, because we were expecting the wisdom of crowds to work across the board'
#string = "i will one day take over the world with my hacking abilities"


def translator(text):
    sentences = text.lower().split('.')
    print(text + '\n\n')
    translated = []
    for sentence in sentences:
        translated.append(translate(sentence + '.', model, 5, custom=True).capitalize())

    torch.cuda.empty_cache()
    print(' '.join(translated))
   
        

In [56]:
translator('moscow and beijing came to understand that Western governments will always face pressure to back democracy activists, regardless of what cooperative relations existed beforehand. They saw how media organizations published material that destabilized their regimes. They worried about Google and social-media companies. Crucially, they realized that these companies made their choices independent of Washington. They were an intrinsic part of the liberal order')

moscow and beijing came to understand that Western governments will always face pressure to back democracy activists, regardless of what cooperative relations existed beforehand. They saw how media organizations published material that destabilized their regimes. They worried about Google and social-media companies. Crucially, they realized that these companies made their choices independent of Washington. They were an intrinsic part of the liberal order


Moscou et pékin ont compris que les gouvernements occidentaux seront toujours soumis à des pressions pour soutenir les militants de la démocratie, indépendamment de ce qui existait auparavant dans les relations de coopération. Ils ont vu comment des organisations de médias ont publié des documents qui ont jusqu'à déstabiliser leurs régimes. Ils s'inquiètent de la présence de la société de la presse. Ils ont pris conscience, de manière cruciale, que ces entreprises ont fait leurs choix indépendants de washington. Elles faisaient parti

In [30]:
def multiple_replace(dict, text):
  # Create a regular expression  from the dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # For each match, look-up corresponding value in dictionary
  return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text) 

In [51]:
from nltk.corpus import wordnet
unknown_words = []

def get_synonym(word):
    unknown_words.append(word)
    syns = wordnet.synsets(word)
    for s in syns:
        for l in s.lemmas():
            if EN_TEXT.vocab.stoi[l.name()] != 0:
                return EN_TEXT.vocab.stoi[l.name()]
            
    return 0

In [315]:
sentence = tokenize_en.tokenizer('I am a tireless worker')

In [342]:
sentence = tokenize_en.tokenizer('Cadavers had always made me feel queasy. In the near corner was a group of doctors and nurses, and next to them was a plastic stretcher. Behind the group was a wooden table where a nurse and two medical students were sitting grim-faced')
indexed = []

for tok in sentence:
    if EN_TEXT.vocab.stoi[tok] != 0:
        indexed.append(EN_TEXT.vocab.stoi[tok])
    else:
        indexed.append(get_synonym(tok))
        
print(' '.join([EN_TEXT.vocab.itos[ix] for ix in indexed]))



entered
entered
corpse had always made me feel nauseous . in the near corner was a group of doctors and nurses , and next to them was a plastic stretcher . behind the group was a wooden table where a nurse and two medical students were sitting grim faced


In [263]:
i = 0
for k in sorted(EN_TEXT.vocab.stoi):
    print(k)
    i += 1
    if i == 20000:
        break

  
#
$
%
&
'
''
'd
'em
'll
'm
're
's
've
,
.
..
...
....
.....
......
.......
........
.........
...........
............
.............
.1
.2.that
.a
.after
.as
.at
.avian
.be
.between
.bizarrely
.by
.bytes
.com
.consequently
.constant
.dealing
.eu
.europe
.even
.fifty
.governmental
.here
.how
.i
.i'll
.in
.industrial
.info
.inspired
.it
.job
.kid
.latin
.like
.madam
.mr
.mrs
.net
.nothing
.on
.one
.only
.org
.our
.portugal
.pro
.responding
.sco
.small
.successful
.term
.thanks
.the
.there
.they
.this
.uk
.we
.what
.when
.with
.yesterday
.–
.–at
.–immediately
.–the
.–we
0
0.0
0.00
0.0000003
0.0001
0.00014
0.00029
0.003
0.004
0.005
0.006
0.009
0.01
0.012
0.015
0.018
0.02
0.03
0.036
0.038
0.04
0.042
0.05
0.06
0.06o
0.07
0.08
0.09
0.095
0.0x%
0.1
0.10
0.11
0.12
0.13
0.1397
0.14
0.15
0.16
0.17
0.18
0.19
0.2
0.20
0.21
0.22
0.23
0.24
0.25
0.26
0.27
0.28
0.29
0.2oc
0.3
0.30
0.31
0.32
0.33
0.34
0.35
0.36
0.37
0.38
0.39
0.4
0.40
0.41
0.42
0.43
0.44
0.45
0.46
0.47
0.48
0.5
0.50
0.51
0.54
0.55
0.

131
131.6
1310
13114
1312
13175
13177
1318
132
1320
1322
13245
1325
13254
13258
13262
13274
13292
133
133.7
133.8
13300
1331
1332
13328
1333
1334
13348
1336
13376
1338
13380
13382
1339
133rd
133ºc
134
134.2
134.4
13428
1343
13464
1347
13472
13475
1348
13482
13484
134a
135
135,000
135.13
1350
1351
1355
1359
13593
136
136.4
1360
1364
13651
13659
1368
137
137.5
1371
1372
1373
1374
13740
1376
13767
1378
13781
138
138.b
1383
13836
1384
1386
13874
13886
1389
138a
138b
138d
138e
139
1390
1395
13968
1397
13a
13bn
13th
14
14,000
14.00
14.1
14.1.2
14.10
14.2
14.3
14.30
14.34
14.37
14.4
14.40
14.5
14.55
14.6
14.65
14.7
14.7.1999
14.8
14.9
140
1400
14000
14001
14010
14011
14015
1402
1403
14039
1405
1406
14060
14061
1407
1408
140871
1409
14091
14094
140a
141
141.4
141.8
1412
1419
141bn
142
14203
14207
1422
14247
14248
14249
1426
14263
14272
14287
14288
1429
143
1434
14340
14341
1435
14363
1438
1439
144
144.8
14402
1441
1442
1445
1447
1448
145
145.95
14502
1452
1453
14539
1454
14540
14541
14546
1454

3.5%in
3.50
3.55
3.6
3.6%on
3.60
3.604
3.61
3.63
3.66
3.68
3.7
3.75
3.793
3.8
3.87
3.89
3.9
3.98
3.99
30
30,000
30.1
30.3
30.33
30.4
30.45
30.5
30.76
30.8
30.8.2002
300
300,000
3000
3001
3002
3003
3004
3005
3008
300s
300th
301
3014
3015
302
3020
3021
303
3030
3035
3037
304
3040
3041
3042
305
3052
3053
306
3061
3062
3069
307
307.6
307.9
3072
308
309
309.9
3090
3093
3094
30a
30k
30p
30s
30th
31
31.1.2008
31.12.1996
31.12.2000
31.12.2003
31.12.2006
31.2
31.4
31.5
31.6
31.7
31.8
31.9
310
3100
311
3111
3118
312
313
313th
314
314th
315
316
317
318
3182
319
31p
31st
32
32.1
32.4
32.5
32.6
32.678
320
320,000
3200
3201
3202
321
3212
322
322.8
3223
3227
323
324
324.8
3245
325
3254
3258
3259
326
326.6
3262
327
328
3281
329
3295
32nd
33
33.3
33.4
33.5
33.55
33.6
33.7
33.8
33.88
33.98
330
3300
3304
3307
3308
331
3314
332
3325
333
3330
334
33400
335
336
336.1
336.2
336b
337
338
3381
339
3392
3394
33a
33rd
34
34,000
34.1
34.1.1
34.3
34.4
34.5
34.6
34.75
34.8
34.94
34.the
340
340.75
3400
341
34102
342

988
989
99
99,105
99.1
99.2
99.4
99.5
99.6
99.69
99.7
99.8
99.9
99.99
990
9904
991
9919
992
9923
9924
993
994
9947
995
996
997
998
9980
999
9a
9b
9c
9e
9th
<blank
<pad>
<unk>
>
?
@lis
Eat
Eating
I
We
_
`
a
a.
a.1
a.2
a.4
a.746
a.837
a.a.j.g.
a.d.
a.j.
a.k.a
a.l.
a.m
a.m.
a.m.and
a.m.i.
a.q.
a1
a10
a12
a120
a17
a1900
a2
a20
a200
a22
a2s
a3
a30
a32
a320
a320s
a33
a337
a350
a360
a380
a3xx
a4
a40
a400
a40077
a40108
a5
a5.0376
a5­
a5­0003
a5­0030
a5­0037
a5­0045
a5­0057
a5­0058
a5­0059
a5­0063
a5­0064
a5­0078
a5­0083
a5­0084
a5­0098
a5­0104
a5­0135
a5­0144
a5­0163
a5­0165
a5­0187
a5­0191
a5­0193
a5­0203
a5­0207
a5­0210
a5­0250
a5­0277
a5­0278
a5­0373
a6
a6000
a60115
a60117
a60118
a60120
a60124
a60127
a60128
a60137
a60144
a60147
a6053
a6–0204
a6–0234
a6–0235
a6–0255
a6–0262
a6–0267
a7
a7000
a73
a74
a75
a7s
a8
a9
aa
aaa
aaaltonen
aaas
aachen
aair
aaiun
aaiún
aal
aalborg
aalto
aaltonen
aamer
aan
aanmerking
aap
aargh
aarhus
aaron
aartsen
aau
ab
ababa
abac
abacha
aback
abacus
abad
abairt
abandon

allowable
allowance
allowances
allowed
allowing
allows
alloys
allpervading
allpowerful
allround
alls
allude
alluded
alludes
alluding
allure
allurement
allures
alluring
allusion
allusions
allusive
allusively
alluvial
allwheeldrive
ally
allying
allyson
all­embracing
allègre
allée
alma
almada
almaden
almadraba
almadén
alman
almansa
almaty
almaz
almeida
almeira
almelo
almendralejo
almeria
almería
almighty
almindelige
almond
almondo
almonds
almost
alms
almunia
aloft
alogoskoufis
alois
alojz
alone
along
along100
alongside
aloni
alonso
aloof
aloofness
aloud
aloun
aloyzas
alp
alpe
alpes
alpha
alphabenzopyrenes
alphabet
alphabetical
alphabetically
alphanumeric
alphanumerical
alphaorder
alphen
alphonse
alpine
alps
alqueva
already
alright
alrosa
als
alsace
alsatian
alsatians
alsina
alsmede
also
alsthom
alstom
alstoms
alston
alta
altafaj
altaner
altar
altarpieces
altars
altas
alte
altea
altenburg
altener
altenkirchen
alter
altera
alterable
alteration
alterations
altercation
altercations
altered
al

astoundingly
astounds
astravo
astray
astrazeneca
astreinte
astrid
astride
astrium
astrologers
astrological
astrology
astronaut
astronautics
astronauts
astronomer
astronomers
astronomic
astronomical
astronomically
astronomy
astrophysical
astrophysics
asturian
asturians
asturias
astute
astutely
astuteness
astérix
asumu
asunción
asunder
asunmaa
asylum
asylums
asylumseekers
asymmetric
asymmetrical
asymmetrically
asymmetries
asymmetry
asymptomatic
asymptotically
asynchronicity
asynchronous
asynchrony
asyr
así
at
at12
ata
atabeh
ataf
ataka
atakov
atalanta
atalante
atalay
atanas
atanasof
atassi
ataturk
atatürk
atavism
atavistic
ataxia
atc
atcn
atd
ate
atefeh
atefi
atenco
ateş
ath
athamna
athanasios
athanasiu
athaontú
athar
atheism
atheist
atheistic
atheists
athena
athenian
athenians
athens
atherosclerosis
athis
athisaari
athist
athlete
athletes
athletic
athleticism
athletics
athos
athruithe
athy
atiyah
atjeh
atk
atkins
atlanta
atlantbanan
atlantic
atlantica
atlanticism
atlanticist
atlantique


beida
beidh
beidou
beige
beijing
beilin
being
beings
beira
beiras
beiro
beirut
beit
beitenu
bejing
beka
bekaa
bekele
bekim
beknazarov
bel
bela
belabour
belaboured
belakang
belami
belang
belardinelli
belarus
belarusian
belarusians
belarussian
belarussians
belated
belatedly
belato
belch
belchite
belder
beleaguered
beleid
belen
belene
belenguer
belet
beleta
beleza
belfast
belfort
belga
belgacom
belge
belgian
belgians
belgicism
belgique
belgium
belgrade
belhassen
belhouchet
belie
belied
belief
beliefs
belies
believable
believe
believed
believer
believers
believes
believing
belin
belinda
belittle
belittled
belittles
belittling
belize
belka
belkhadem
bell
bella
bellacruz
bellamy
bellany
bellas
bellboy
belle
belleau
bellerive
bellerè
belleré
belli
belliard
bellicose
bellies
belligerence
belligerency
belligerent
belligerently
belligerents
bellin
bellinger
bellingshausen
bellingwolde
bellini
bellona
bellowed
bellows
bells
bellshill
bellum
belly
bellyaching
belmarsh
belneyski
belo
belohorská
bel

brüning
brăila
brčko
bsc
bse
bsec
bsefree
bseinfected
bses
bskyb
bskyb.
bsp
bss
bst
bt
btb
btf
btob
btp
btwc
buartha
buav
bubble
bubbles
bubbling
bubbly
buber
bubi
bubis
bubonic
buc
bucar
buccaneer
buccaneering
buccaneers
bucella
buch
buchanan
bucharest
buchenwald
buchmesse
buchtar
buck
buckby
bucked
bucket
buckets
buckfast
bucking
buckingham
buckinghamshire
buckle
buckled
buckles
buckling
bucks
buckshot
buckwell
buco
bucolic
bud
buda
budaházy
budakeszi
budanov
budapest
budbergyt
budbergytas
budbergythas
budbergytė
budd
buddha
buddhas
buddhism
buddhist
buddhists
buddies
budding
buddy
budg
budge
budged
budgerigars
budget
budgetarily
budgetarist
budgetary
budgeted
budgeteers
budgeting
budgetisation
budgetise
budgetised
budgetising
budgetization
budgetize
budgetized
budgetizing
budgetpresident
budgets
budgetwatch
budging
budiman
budimir
budreikaitfor
budreikaitis
budreikaiton
budreikaittalked
budreikaitvery
budreikaitè
budreikaité
budreikaitė
budreikatė
budrus
buds
buena
buenas
buenos
bue

caucasian
caucasians
caucasus
cauchon
caucus
caucuses
caucusus
caudillo
caudillos
caudron
caught
cauldron
caulerpa
cauliflower
caulk
caullery
cauquil
causa
causal
causalities
causality
causally
causam
causation
causations
causative
cause
causebook
caused
causes
causing
caustic
cauterised
cauterize
caution
cautionary
cautioned
cautions
cautious
cautiously
cautiousness
cauwenberghe
cavacini
cavaco
cavada
cavalcade
cavalese
cavalier
cavalierly
cavallina
cavalry
cavan
cave
caveant
caveat
caveats
caved
cavemen
caveri
cavern
caverns
caves
caviar
cavil
caving
cavities
cavity
cavour
cawing
cawley
caxito
cayenne
cayman
cazalet
cañete
cb
cbc
cbd
cbfa
cbi
cbrn
cbs
cbss
cc
cca
ccamlr
ccashh
ccb
ccbe
ccc
cccctb
ccctb
ccd
ccee
ccees
cci
ccis
ccl
ccm
cco
cconomic
ccps
ccs
cctb
cctld
cctv
ccv
ccw
ccwc
cd
cda
cdbi
cdc
cde
cdf
cdi
cdm
cdms
cdos
cdp
cds
cdss
cdte
cdu
ce
ce'marking
cead
ceangal
ceann
ceannairí
ceas
cease
ceased
ceasefire
ceasefires
ceaseless
ceaselessly
ceases
ceasing
ceaucescu
ceaucşescu

coercively
coerciveness
coest
coeur
coeuropean
coexist
coexisted
coexistence
coexistent
coexisting
coexists
coextensive
coface
coffee
coffeepot
coffees
coffeeshop
coffer
cofferati
coffers
coffin
coffins
cofin
cofinance
cofinanced
cofinances
cofinanciers
cofinancing
cofounded
cofounder
cofounders
cofunded
cofunding
cog
cogealac
cogeca
cogema
cogen
cogency
cogenerated
cogeneration
cogent
cogently
coggeshall
cognac
cognate
cognisable
cognisance
cognisant
cognition
cognitive
cognizance
cognizant
cogs
cogwheels
cohabit
cohabitation
cohabited
cohabitee
cohabitees
cohabiting
cohen
cohension
cohere
cohered
coherence
coherences
coherency
coherent
coherently
coheres
cohesion
cohesions
cohesive
cohesively
cohesiveness
cohn
cohnbendit
cohn­bendit
cohom
cohort
cohorts
cohuma
coiepa
coil
coillte
coils
coimbra
coimisiúin
coin
coinage
coincide
coincided
coincidence
coincidences
coincidental
coincidentally
coincidently
coincides
coinciding
coincineration
coined
coining
coins
coiste
coitus
cojur
coke
co

In [None]:
requires_grad = false

time = 0m, epoch 1, iter = 100, loss = 7.002, 23s per 100 iters
time = 0m, epoch 1, iter = 200, loss = 5.153, 22s per 100 iters
time = 1m, epoch 1, iter = 300, loss = 4.653, 22s per 100 iters
time = 1m, epoch 1, iter = 400, loss = 4.319, 22s per 100 iters
time = 1m, epoch 1, iter = 500, loss = 4.036, 22s per 100 iters
time = 2m, epoch 1, iter = 600, loss = 3.751, 22s per 100 iters
time = 2m, epoch 1, iter = 700, loss = 3.585, 22s per 100 iters
time = 3m, epoch 1, iter = 800, loss = 3.471, 22s per 100 iters
time = 3m, epoch 1, iter = 900, loss = 3.514, 22s per 100 iters
time = 3m, epoch 1, iter = 1000, loss = 3.498, 22s per 100 iters
time = 4m, epoch 2, iter = 100, loss = 5.285, 35s per 100 iters
time = 4m, epoch 2, iter = 200, loss = 3.166, 22s per 100 iters
time = 5m, epoch 2, iter = 300, loss = 2.950, 22s per 100 iters
time = 5m, epoch 2, iter = 400, loss = 2.898, 22s per 100 iters
time = 5m, epoch 2, iter = 500, loss = 2.778, 23s per 100 iters
time = 6m, epoch 2, iter = 600, loss = 2.695, 22s per 100 iters
time = 6m, epoch 2, iter = 700, loss = 2.631, 22s per 100 iters
time = 7m, epoch 2, iter = 800, loss = 2.561, 22s per 100 iters
time = 7m, epoch 2, iter = 900, loss = 2.583, 22s per 100 iters
time = 7m, epoch 2, iter = 1000, loss = 2.509, 22s per 100 iters
time = 8m, epoch 3, iter = 100, loss = 3.955, 35s per 100 iters
time = 8m, epoch 3, iter = 200, loss = 2.362, 22s per 100 iters
time = 9m, epoch 3, iter = 300, loss = 2.338, 22s per 100 iters
time = 9m, epoch 3, iter = 400, loss = 2.279, 22s per 100 iters
time = 9m, epoch 3, iter = 500, loss = 2.226, 22s per 100 iters
time = 10m, epoch 3, iter = 600, loss = 2.206, 22s per 100 iters
time = 10m, epoch 3, iter = 700, loss = 2.112, 22s per 100 iters
time = 11m, epoch 3, iter = 800, loss = 2.071, 22s per 100 iters
time = 11m, epoch 3, iter = 900, loss = 2.071, 22s per 100 iters
time = 11m, epoch 3, iter = 1000, loss = 2.073, 22s per 100 iters
time = 12m, epoch 4, iter = 100, loss = 3.148, 36s per 100 iters
time = 12m, epoch 4, iter = 200, loss = 1.966, 22s per 100 iters
time = 13m, epoch 4, iter = 300, loss = 2.017, 22s per 100 iters
time = 13m, epoch 4, iter = 400, loss = 1.934, 22s per 100 iters
time = 13m, epoch 4, iter = 500, loss = 1.928, 22s per 100 iters
time = 14m, epoch 4, iter = 600, loss = 1.922, 22s per 100 iters
time = 14m, epoch 4, iter = 700, loss = 1.823, 22s per 100 iters
time = 15m, epoch 4, iter = 800, loss = 1.801, 22s per 100 iters
time = 15m, epoch 4, iter = 900, loss = 1.789, 22s per 100 iters
time = 15m, epoch 4, iter = 1000, loss = 1.732, 22s per 100 iters
time = 16m, epoch 5, iter = 100, loss = 2.737, 36s per 100 iters
time = 16m, epoch 5, iter = 200, loss = 1.767, 22s per 100 iters
time = 17m, epoch 5, iter = 300, loss = 1.794, 23s per 100 iters
time = 17m, epoch 5, iter = 400, loss = 1.674, 22s per 100 iters
time = 17m, epoch 5, iter = 500, loss = 1.654, 22s per 100 iters
time = 18m, epoch 5, iter = 600, loss = 1.663, 22s per 100 iters
time = 18m, epoch 5, iter = 700, loss = 1.639, 22s per 100 iters
time = 19m, epoch 5, iter = 800, loss = 1.599, 22s per 100 iters
time = 19m, epoch 5, iter = 900, loss = 1.608, 22s per 100 iters
time = 19m, epoch 5, iter = 1000, loss = 1.602, 22s per 100 iters
time = 20m, epoch 6, iter = 100, loss = 2.418, 35s per 100 iters
time = 20m, epoch 6, iter = 200, loss = 1.637, 22s per 100 iters
time = 21m, epoch 6, iter = 300, loss = 1.586, 22s per 100 iters
time = 21m, epoch 6, iter = 400, loss = 1.570, 22s per 100 iters
time = 21m, epoch 6, iter = 500, loss = 1.560, 22s per 100 iters
time = 22m, epoch 6, iter = 600, loss = 1.564, 22s per 100 iters
time = 22m, epoch 6, iter = 700, loss = 1.470, 22s per 100 iters
time = 23m, epoch 6, iter = 800, loss = 1.435, 22s per 100 iters
time = 23m, epoch 6, iter = 900, loss = 1.443, 22s per 100 iters
time = 23m, epoch 6, iter = 1000, loss = 1.432, 22s per 100 iters
time = 24m, epoch 7, iter = 100, loss = 2.197, 35s per 100 iters
time = 24m, epoch 7, iter = 200, loss = 1.479, 22s per 100 iters
time = 25m, epoch 7, iter = 300, loss = 1.460, 22s per 100 iters
time = 25m, epoch 7, iter = 400, loss = 1.417, 22s per 100 iters
time = 26m, epoch 7, iter = 500, loss = 1.333, 23s per 100 iters
time = 26m, epoch 7, iter = 600, loss = 1.406, 22s per 100 iters
time = 26m, epoch 7, iter = 700, loss = 1.381, 22s per 100 iters
time = 27m, epoch 7, iter = 800, loss = 1.356, 22s per 100 iters
time = 27m, epoch 7, iter = 900, loss = 1.374, 22s per 100 iters
time = 27m, epoch 7, iter = 1000, loss = 1.354, 22s per 100 iters
time = 28m, epoch 8, iter = 100, loss = 1.977, 36s per 100 iters
time = 28m, epoch 8, iter = 200, loss = 1.321, 22s per 100 iters
time = 29m, epoch 8, iter = 300, loss = 1.338, 22s per 100 iters
time = 29m, epoch 8, iter = 400, loss = 1.375, 22s per 100 iters
time = 30m, epoch 8, iter = 500, loss = 1.330, 23s per 100 iters
time = 30m, epoch 8, iter = 600, loss = 1.318, 22s per 100 iters
time = 30m, epoch 8, iter = 700, loss = 1.292, 22s per 100 iters
time = 31m, epoch 8, iter = 800, loss = 1.280, 22s per 100 iters
time = 31m, epoch 8, iter = 900, loss = 1.258, 22s per 100 iters
time = 31m, epoch 8, iter = 1000, loss = 1.268, 23s per 100 iters
time = 32m, epoch 9, iter = 100, loss = 1.854, 35s per 100 iters
time = 32m, epoch 9, iter = 200, loss = 1.255, 22s per 100 iters
time = 33m, epoch 9, iter = 300, loss = 1.252, 23s per 100 iters
time = 33m, epoch 9, iter = 400, loss = 1.260, 22s per 100 iters
time = 34m, epoch 9, iter = 500, loss = 1.204, 22s per 100 iters
time = 34m, epoch 9, iter = 600, loss = 1.237, 23s per 100 iters
time = 34m, epoch 9, iter = 700, loss = 1.242, 22s per 100 iters
time = 35m, epoch 9, iter = 800, loss = 1.188, 22s per 100 iters
time = 35m, epoch 9, iter = 900, loss = 1.205, 22s per 100 iters
time = 35m, epoch 9, iter = 1000, loss = 1.145, 22s per 100 iters
time = 36m, epoch 10, iter = 100, loss = 1.777, 36s per 100 iters
time = 36m, epoch 10, iter = 200, loss = 1.150, 22s per 100 iters
time = 37m, epoch 10, iter = 300, loss = 1.158, 22s per 100 iters
time = 37m, epoch 10, iter = 400, loss = 1.187, 22s per 100 iters
time = 38m, epoch 10, iter = 500, loss = 1.143, 22s per 100 iters
time = 38m, epoch 10, iter = 600, loss = 1.196, 22s per 100 iters

In [26]:
model.load_state_dict(torch.load("./pytorch_transformer/data/weights/encoder_pretrained_fra_gradFalse40m"))

In [None]:
time = 0m, epoch 1, iter = 100, loss = 7.188, 23s per 100 iters
time = 0m, epoch 1, iter = 200, loss = 5.358, 22s per 100 iters
time = 1m, epoch 1, iter = 300, loss = 4.825, 22s per 100 iters
time = 1m, epoch 1, iter = 400, loss = 4.438, 23s per 100 iters
time = 1m, epoch 1, iter = 500, loss = 4.240, 22s per 100 iters
time = 2m, epoch 1, iter = 600, loss = 3.878, 22s per 100 iters
time = 2m, epoch 1, iter = 700, loss = 3.798, 22s per 100 iters
time = 3m, epoch 1, iter = 800, loss = 3.606, 23s per 100 iters
time = 3m, epoch 1, iter = 900, loss = 3.543, 23s per 100 iters
time = 3m, epoch 1, iter = 1000, loss = 3.443, 23s per 100 iters
time = 4m, epoch 2, iter = 100, loss = 5.295, 39s per 100 iters
time = 4m, epoch 2, iter = 200, loss = 3.056, 23s per 100 iters
time = 5m, epoch 2, iter = 300, loss = 2.992, 23s per 100 iters
time = 5m, epoch 2, iter = 400, loss = 2.855, 23s per 100 iters
time = 6m, epoch 2, iter = 500, loss = 2.774, 23s per 100 iters
time = 6m, epoch 2, iter = 600, loss = 2.719, 23s per 100 iters
time = 6m, epoch 2, iter = 700, loss = 2.569, 23s per 100 iters
time = 7m, epoch 2, iter = 800, loss = 2.510, 23s per 100 iters
time = 7m, epoch 2, iter = 900, loss = 2.482, 23s per 100 iters
time = 7m, epoch 2, iter = 1000, loss = 2.351, 22s per 100 iters
time = 8m, epoch 3, iter = 100, loss = 3.741, 39s per 100 iters
time = 8m, epoch 3, iter = 200, loss = 2.152, 23s per 100 iters
time = 9m, epoch 3, iter = 300, loss = 2.113, 23s per 100 iters
time = 9m, epoch 3, iter = 400, loss = 2.060, 23s per 100 iters
time = 10m, epoch 3, iter = 500, loss = 2.095, 22s per 100 iters
time = 10m, epoch 3, iter = 600, loss = 2.023, 23s per 100 iters
time = 10m, epoch 3, iter = 700, loss = 1.942, 23s per 100 iters
time = 11m, epoch 3, iter = 800, loss = 1.918, 23s per 100 iters
time = 11m, epoch 3, iter = 900, loss = 1.927, 23s per 100 iters
time = 12m, epoch 3, iter = 1000, loss = 1.849, 23s per 100 iters
time = 12m, epoch 4, iter = 100, loss = 2.933, 39s per 100 iters
time = 13m, epoch 4, iter = 200, loss = 1.721, 23s per 100 iters
time = 13m, epoch 4, iter = 300, loss = 1.670, 23s per 100 iters
time = 13m, epoch 4, iter = 400, loss = 1.715, 22s per 100 iters
time = 14m, epoch 4, iter = 500, loss = 1.640, 23s per 100 iters


In [38]:
def translate(model, src, max_len = 80, custom=False):
    
    
    model.eval()
    if custom == True:
        sentence = tokenize_en.tokenizer(src)
        src=\
        Variable(torch.LongTensor([[SRC.vocab.stoi[tok] for tok
        in sentence]])).cuda()
    src_mask = (src != input_pad).unsqueeze(-2)
    e_outputs = model.encoder(src, src_mask)
    
    outputs = torch.zeros(max_len).type_as(src.data)
    outputs[0] = torch.LongTensor([FR_TEXT.vocab.stoi['<sos>']])
    for i in range(1, max_len):    
            
        trg_mask = np.triu(np.ones((1, i, i)),
        k=1).astype('uint8')
        trg_mask= Variable(torch.from_numpy(trg_mask) == 0).cuda()
        
        out = model.out(model.decoder(outputs[:i].unsqueeze(0),
        e_outputs, src_mask, trg_mask))
        out = F.softmax(out, dim=-1)
        val, ix = out[:, -1].data.topk(1)
        
        outputs[i] = ix[0][0]
        if ix[0][0] == TRG.vocab.stoi['<eos>']:
            break
    return ' '.join(
    [SRC.vocab.itos[ix] for ix in src[0]]
    ),' '.join(
    [TRG.vocab.itos[ix] for ix in outputs[1:i]]
    )

In [36]:
results_iter = BucketIterator(trn, batch_size=1, shuffle=True)

In [42]:
#xp = next(iter(results_iter))
in_phrase, out = translate(model, "i will come to the party?", custom=True)
print(in_phrase)
print(out)

i will come to the party ?
je vais venir au parti ?


In [266]:
sp = spacy.load('en')

In [267]:
print([tok for tok in sp.tokenizer('I like cheese and Cheese and CHEESE sentence')])

[I, like, cheese, and, Cheese, and, CHEESE, sentence]
