<a href="https://colab.research.google.com/github/Orient12/HelloWorld/blob/master/transformer1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<p><img alt="Colaboratory logo" height="45px" src="https://colab.research.google.com/img/colab_favicon.ico" align="left" hspace="10px" vspace="0px"></p>

<h1>欢迎使用 Colaboratory！</h1>


Colaboratory 是一个免费的 Jupyter 笔记本环境，不需要进行任何设置就可以使用，并且完全在云端运行。

借助 Colaboratory，您可以编写和执行代码、保存和共享分析结果，以及利用强大的计算资源，所有这些都可通过浏览器免费使用。

In [23]:
!pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
!pip3 install https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
!pip install tqdm
!pip install pandas
!pip install numpy
!pip install nltk




In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import torch.nn.functional as F
from nltk.translate.bleu_score import sentence_bleu
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import json
import torch.utils.data.dataset as Dataset
import torch.optim as optim

In [0]:
class constants():
  def __init__(self):
    self.PAD = 0
    self.UNK = 1
    self.BOS = 2
    self.EOS = 3

    self.PAD_WORD = '<blank>'
    self.UNK_WORD = '<unk>'
    self.BOS_WORD = '<s>'
    self.EOS_WORD = '</s>'
Constants = constants()

In [0]:
class MultiHeadedAttention(torch.nn.Module):
    def __init__(self, model_dim=512, headed_count=8, dropout=0.0):
        super(MultiHeadedAttention, self).__init__()
        self.headed_count = headed_count
        self.head_dim = model_dim//headed_count
        self.model_dim = model_dim
        self.linear_q = torch.nn.Linear(model_dim, headed_count * self.head_dim)
        self.linear_k = torch.nn.Linear(model_dim, headed_count * self.head_dim)
        self.linear_v = torch.nn.Linear(model_dim, headed_count * self.head_dim)
        self.linear_final = torch.nn.Linear(model_dim, model_dim)
        self.ScaledDotProductAttention = ScaledDotProductAttention(dropout)
        self.dropout = torch.nn.Dropout(dropout)
        self.layer_norm = torch.nn.LayerNorm(model_dim)
    def forward(self, query, key, value, Mask=None):

        """
        :param X: 输入张量， （Batch_size, Sentence_Length, model_dim）
        :param Mask: Mask张量， (Sentence_Length, Sentence_Length)
        :return: output, (batch_size, Sentence_Length, model_dim)
        """
        X_size = query.size()
        batch_size = X_size[0]
        sentence_length = X_size[1]
        model_dim = X_size[2]

        residual = query

        #生成query,key,value向量
        query = self.linear_q(query)
        key = self.linear_k(key)
        value = self.linear_v(value)

        #分出headed_count个头
        query = query.view(batch_size * self.headed_count, -1, self.head_dim)
        key = key.view(batch_size * self.headed_count, -1, self.head_dim)
        value = value.view(batch_size * self.headed_count, -1, self.head_dim)

        # if Mask:
        Mask = Mask.repeat(self.headed_count, 1, 1)

        scale = self.head_dim ** -0.5

        #ScaledDotAttention
        context, attention = self.ScaledDotProductAttention(query, key, value, scale=scale, Mask=Mask)

        #concat head
        context = context.view(batch_size, -1, self.headed_count * self.head_dim)

        output = self.linear_final(context)

        output = self.layer_norm(residual + output)

        return output, attention

In [0]:
class ScaledDotProductAttention(torch.nn.Module):
    def __init__(self, attention_dropout = 0.0):
        super(ScaledDotProductAttention, self).__init__()
        self.dropout = torch.nn.Dropout(attention_dropout)
        self.softmax = torch.nn.Softmax(dim = 2)
    def forward(self, q, k, v, scale=None, Mask=None):

        """
        :param q:查询向量，（Batch_size, Sentence_Length, model_dim/headed_count）
        :param k: 键向量，（Batch_size, Sentence_Length, model_dim/headed_count）
        :param v: 值向量，（Batch_size, Sentence_Length, model_dim/headed_count）
        :param scale: 缩放因子，浮点标量
        :param Mask: Mask矩阵，对Attention矩阵进行Mask,与Attention矩阵的维度相同
        :return: Context，上下文张量；Attention,Attention张量
        """
        Attention = torch.bmm(q, k.transpose(1, 2))
        if scale:
            Attention *= scale
        # if Mask:
        #print(Attention.size(), Mask.size())
        Attention = Attention.masked_fill(Mask, -np.inf)

        Attention = self.softmax(Attention)
        Attention = self.dropout(Attention)
        context = torch.bmm(Attention, v)

        return context, Attention

In [0]:
class embedding_layer(torch.nn.Module):
    def __init__(self, vocab_size, model_dim):
        super(embedding_layer, self).__init__()
        self.vocab_size = vocab_size
        self.model_dim = model_dim
        self.Embedding_Matrix = torch.nn.Embedding(vocab_size, model_dim)
    def forward(self, X):
        return self.Embedding_Matrix(X) * (self.model_dim ** (0.5))
    def share_weight_linear(self, X):
        """
        :param X:Decoder输出，（batch_size, sentence_length, model_dim）
        :return: (batch_size, sentence_length, vocab_size)
        """
        batch_size = X.size(0)
        sentence_length = X.size(1)
        model_dim = X.size(2)
        out = X.view(-1, model_dim)
        out = torch.mm(out, self.Embedding_Matrix.weight.permute(1, 0))
        out = out.view(batch_size, sentence_length, self.vocab_size)
        return  out


In [0]:
class PositionEmbedding(torch.nn.Module):
    def __init__(self, Max_Sentence_Length, Model_dim):
        super(PositionEmbedding, self).__init__()

        #生成位置嵌入矩阵
        position_encoding = np.array([[pos / np.power(10000, 2.0 * (i // 2)/Model_dim) for i in range(Model_dim)]for pos in range(Max_Sentence_Length)])
        position_encoding[:, 0::2] = np.sin(position_encoding[:, 0::2])
        position_encoding[:, 1::2] = np.cos(position_encoding[:, 1::2])
        position_encoding = torch.Tensor(position_encoding)

        pad_row =  torch.zeros((1, Model_dim))

        position_encoding = torch.cat((pad_row, position_encoding), dim=0)

        self.embedding = torch.nn.Embedding.from_pretrained(position_encoding, freeze=True)

    def forward(self, input_len):
        """
        :param input:一个batch的句子长度张量，（batch_size, 1）
        :return: 一个batch句子的位置嵌入矩阵，（batch_size, max(input_len), Model_dim）
        """

        max_len = torch.max(input_len)
        position = torch.LongTensor([list(range(1, k+1)) + [0] * (max_len - k).item() for k in input_len]).cuda()
        out = self.embedding(position)
        return out

In [0]:
def padding_mask(seq_k, seq_q):
    len_q = seq_q.size(1)
    pad_mask = seq_k.eq(0)
    pad_mask = pad_mask.unsqueeze(1).expand(-1, len_q, -1)  # shape [B, L_q, L_k]
    return pad_mask
def sequence_mask(seq):
    batch_size, seq_len = seq.size()
    mask = torch.triu(torch.ones((seq_len, seq_len), dtype=torch.uint8),
                    diagonal=1)
    mask = mask.unsqueeze(0).expand(batch_size, -1, -1)  # [B, L, L]
    return mask
class EncoderLayer(torch.nn.Module):
    def __init__(self, model_dim=512, headed_count=8, ffn_dim=2048, dropout=0.0):
        super(EncoderLayer, self).__init__()
        self.MultiHeadedAttention = MultiHeadedAttention(model_dim, headed_count, dropout)
        self.FeedForward = FeedForward(model_dim, ffn_dim)
    def forward(self, X, Mask=None):
        out, attention = self.MultiHeadedAttention(X, X, X, Mask)
        out = self.FeedForward(out)
        return out, attention
class Encoder(torch.nn.Module):
    def __init__(self, source_vocab_size, Max_Sentence_Length, num_layers=6, model_dim=512, headed_count=8, ffn_dim=2048, dropout=0.0):
        super(Encoder, self).__init__()
        self.encoder = torch.nn.ModuleList(
            [EncoderLayer(model_dim, headed_count, ffn_dim, dropout)  for _ in range(num_layers)]
        )

        self.position_embedding = PositionEmbedding(Max_Sentence_Length=Max_Sentence_Length, Model_dim=model_dim)
        self.word_embedding = torch.nn.Embedding(source_vocab_size, model_dim)#Word_Embedding(pretrained)

    def forward(self,X, input_len, padding_mask):

        word_embedding = self.word_embedding(X)
        position_embedding = self.position_embedding(input_len)
        out = word_embedding + position_embedding

        attentions = []
        for encoder in self.encoder:
            out, attention = encoder(out, padding_mask)
            attentions.append(attention)

        return out, attentions

In [0]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, model_dim, head_count=8, ffn_dim=2048, dropout=0.0):
        super(DecoderLayer, self).__init__()
        self.EncoderDecoderAttention = MultiHeadedAttention(model_dim, head_count, dropout)
        self.Self_Attention = MultiHeadedAttention(model_dim, head_count, dropout)
        self.FeedForward = FeedForward(model_dim, ffn_dim, dropout)
    def forward(self, encoder_output, X=None, self_attention_mask=None, encoder_decoder_attention_mask=None):
        """
        :param X:上一层decoder的输出或者词向量； （batch_size, sentence_length, model_dim）
        :param encoder_output: encoder的输出； (batch_size, sentence_length, model_dim)
        :param mask_pad: 句子补齐产生的0向量， 负无穷填充； (batch_size, sentence_length, sentence_length)
        :param mask_sequence: attention执行过程为防止前面的词语可以看到后面的词语进行负无穷填充； (batch_size, sentence_length, sentence_length)
        :return: 一层decoder的输出，self-attention矩阵，EncoderDecoder_Attention矩阵
        """
        decoder_output, Self_attention = self.Self_Attention(X, X, X, self_attention_mask)

        decoder_output, EncoderDecoder_attention = self.EncoderDecoderAttention(decoder_output, encoder_output, encoder_output, encoder_decoder_attention_mask)

        output = self.FeedForward(decoder_output)

        return output, Self_attention, EncoderDecoder_attention
class Decoder(torch.nn.Module):
    def __init__(self, num_layer=6, model_dim=512, head_count=8, ffn_dim=2048, dropout=0.0):
        super(Decoder, self).__init__()

        self.num_layer = num_layer
        # self.word_embedding = WordEmbedding(pretrained)
        # self.Position_Embedding = PositionEmbedding(max_sentence_length, model_dim)
        self.decoder_layer = torch.nn.ModuleList([DecoderLayer(model_dim, head_count, ffn_dim, dropout) for _ in range(num_layer)])

    def forward(self, encoder_output, self_attention_mask, encoder_decoder_attention_mask, X=None):
        """
        :param X:目标语言输入； （batch_size, Sentence_Length）
        :param input_len: 目标语言句子长度，一个batch中每个句子的长度；（batch_size, 1）
        :param encoder_output: encoder的输出；（batch_size, model_dim）
        :return: Decoder输出
        注：这里的padding有点特殊，在self-attention应用padding_mask和sequence_mask的加和，在encoder-decoder attention中只应用padding_mask
        """
        output = X
        #mask = torch.gt((mask_padding+mask_context), 0)

        self_attention = []
        encoderdecoder_attention = []
        for decoder in self.decoder_layer:
            output, Self_attention, EncoderDecoder_attention = decoder(encoder_output, output, self_attention_mask, encoder_decoder_attention_mask)
            self_attention.append(Self_attention)
            encoderdecoder_attention.append(EncoderDecoder_attention)

        return output, self_attention, encoderdecoder_attention

In [0]:
class FeedForward(torch.nn.Module):
    def __init__(self, model_dim=512, ffn_dim=2048, dropout=0.0):
        super(FeedForward, self).__init__()
        self.w1 = torch.nn.Conv1d(model_dim, ffn_dim, 1)
        self.w2 = torch.nn.Conv1d(ffn_dim, model_dim, 1)
        self.dropout = torch.nn.Dropout(dropout)
        self.layer_norm = torch.nn.LayerNorm(model_dim)
    def forward(self, X):
        """
        :param X:输入， （Batch_size, Sentence_Length, model_dim）
        :return: 输出， （Batch_size, Sentence_Length, model_dim）

        """
        output = self.w2(F.relu(self.w1(X.transpose(1, 2))))
        output = output.transpose(1, 2)
        output = self.dropout(output)
        output = self.layer_norm(X+output)

        return output

In [0]:
class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

In [0]:
def padding_mask(seq_k, seq_q):
    len_q = seq_q.size(1)
    pad_mask = seq_k.eq(0)
    pad_mask = pad_mask.unsqueeze(1).expand(-1, len_q, -1)  # shape [B, L_q, L_k]
    return pad_mask.cuda()
def sequence_mask(seq):
    batch_size, seq_len = seq.size()
    mask = torch.triu(torch.ones((seq_len, seq_len), dtype=torch.uint8),
                    diagonal=1)
    mask = mask.unsqueeze(0).expand(batch_size, -1, -1)  # [B, L, L]
    return mask.cuda()
class transformer(torch.nn.Module):
    def __init__(self, source_vocab_size, target_vocab_size, max_sentence_length, num_layers=6, model_dim=512, headed_count=8, ffn_dim=2048, dropout=0.0):
        super(transformer, self).__init__()
        self.Encoder = Encoder(source_vocab_size, max_sentence_length, num_layers, model_dim, headed_count, ffn_dim, dropout)
        self.Decoder = Decoder(num_layers, model_dim, headed_count, ffn_dim, dropout)
        self.Position_Embedding = PositionEmbedding(max_sentence_length, model_dim)
        self.embedding_layer = embedding_layer(target_vocab_size, model_dim)
        self.linear = torch.nn.Linear(model_dim, target_vocab_size)
    def forward(self, source_word, source_len, target_word, target_len):
        #生成mask

        source_self_attention_mask = padding_mask(source_word, source_word)

        target_padding_mask = padding_mask(target_word, target_word)
        target_sequence_mask = sequence_mask(target_word)
        target_self_attention_mask = (target_padding_mask + target_sequence_mask).gt(0)
        target_encoder_decoder_mask = padding_mask(source_word, target_word)

        #生成嵌入
        #source_word_embedding = self.embedding_layer(source_word)
        #source_position_embedding = self.Position_Embedding(source_len)
        #source_embedding = source_word_embedding + source_position_embedding
        target_word_embedding = self.embedding_layer(target_word)
        target_position_embedding = self.Position_Embedding(target_len)
        target_embedding = target_word_embedding + target_position_embedding

        #encoder
        encoder_output, encoder_attention = self.Encoder(source_word, source_len, source_self_attention_mask)

        #decoder
        decoder_output, decoder_self_attention, decoder_encoder_decoder_attention = self.Decoder(encoder_output, target_self_attention_mask, target_encoder_decoder_mask, target_embedding)

        #生成logits
        logits = self.embedding_layer.share_weight_linear(decoder_output)
        # logits = self.linear(decoder_output)
        # del source_self_attention_mask, target_self_attention_mask, target_encoder_decoder_mask
        # del target_sequence_mask, target_padding_mask, source_embedding
        # del source_position_embedding, target_embedding, target_position_embedding, encoder_output

        return logits, encoder_attention, decoder_self_attention, decoder_encoder_decoder_attention

In [0]:
def loss(logits, target, smoothing, vocabsize):
    t1 = target.size(0)
    t2 = target.size(1)
    confidence = 1 - smoothing
    low_confidence = (1 - confidence)/(vocabsize - 1)
    gold = torch.zeros_like(logits).scatter(2, target.unsqueeze(-1), 1)
    gold = confidence * gold + low_confidence * (1 - gold)
    logits_softmax = F.log_softmax(logits, dim=-1)

    non_pad_mask = target.ne(Constants.PAD)
    count = non_pad_mask.sum(dim=-1).sum(dim=-1)
    loss = -(logits_softmax * gold).sum(dim=-1)
    loss = loss.masked_select(non_pad_mask).sum()

    return loss/count
def get_bleu(logits, target, index2word):
    pre = torch.argmax(logits, dim=2)
    all_bleu = []
    for i in range(logits.size(0)):
        candidate = [index2word[str(k.item())] for k in pre[i]]
        reference = [index2word[str(k.item())] for k in target[i] if k.item()!= Constants.PAD]
        if Constants.EOS_WORD in candidate:c_end = candidate.index(Constants.EOS_WORD)
        else:continue
        can = candidate[:c_end+1]

        if Constants.EOS_WORD in reference:r_end = reference.index(Constants.EOS_WORD)
        else:continue
        ref = reference[:r_end+1]
        bleu = sentence_bleu([ref], can)
        print(ref, can)
        all_bleu.append(bleu)
    return np.average(all_bleu)

In [0]:
def read_from_file(path, max_sentence_length, keep_case):
    All = []
    trimed_sentence_count = 0
    with open(path, encoding="utf-8") as file:
        for i, data in enumerate(file):
            if not keep_case:
                data = str(data).strip().lower()
            words = data.strip()
            if(len(words)>max_sentence_length):
                trimed_sentence_count += 1
            words_trimed = words[:max_sentence_length]
            if words_trimed:
                All += [[Constants.BOS] + words_trimed + [Constants.EOS]]
            else:
                All += [None]
    return All
def build_word2index(path1, path2):
    word2index = {
        Constants.BOS_WORD: Constants.BOS,
        Constants.EOS_WORD: Constants.EOS,
        Constants.PAD_WORD: Constants.PAD,
        Constants.UNK_WORD: Constants.UNK}
    with open(path1, encoding="utf-8") as file:
        for i, data in enumerate(file):
            if (word2index.get(str(data).strip())==None):
                word2index[str(data).strip()] = len(word2index)
            else:
                continue
    with open(path2, encoding="utf-8") as file:
        for i, data in enumerate(file):
            if (word2index.get(str(data).strip())==None):
                word2index[str(data).strip()] = len(word2index)
            else:
                continue
    with open("./data/word2index.json", "w", encoding="utf-8") as file:
        w2i = json.dumps(word2index)
        file.write(w2i)
    return  word2index
def convert_word_2_index(sentence, word2index):
    return [word2index.get(k, Constants.UNK) for k in sentence]
class subDataset(Dataset.Dataset):
    def __init__(self, source, target):
        super(subDataset, self).__init__()
        self.source = open(source, "r", encoding="utf-8")
        self.target = open(target, "r", encoding="utf-8")
        self.len = 0
        a = 0
        for i,data in enumerate(self.source):
            a = i
        self.len = a+1
        self.source = open(source, "r", encoding="utf-8")
        with open("/content/drive/My Drive/data/w2i.json", encoding="utf-8") as file:
            self.w2i = [json.loads(k) for k in file][0]
        with open("/content/drive/My Drive/data/dew2i.json", encoding="utf-8") as file:
            self.dew2i = [json.loads(k) for k in file][0]
    def __len__(self):
        return self.len
    def __getitem__(self, item):
        source = convert_word_2_index(str(self.source.readline()).strip().split(), self.w2i)
        source_len = len(source)
        target = convert_word_2_index(str(self.target.readline()).strip().split(), self.dew2i)
        target_len = len(target)
        return source, source_len, target, target_len
class Dataloader():
    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size
        self.len = dataset.len
    def get_batch(self):
        source = []
        source_len = []
        target_input = []
        target_len_input = []
        target_output = []
        target_len_output = []
        for i in range(self.batch_size):
            s1, sl1, t1, tl1 = self.dataset.__getitem__(1)
            source.append(s1)
            source_len.append(sl1)
            target_input.append(t1[:-1])
            target_len_input.append(tl1-1)
            target_output.append(t1[1:])
            target_len_output.append(tl1-1)
        source_max_len = np.max(source_len)
        target_max_len_input = np.max(target_len_input)
        target_max_len_output = np.max(target_len_output)
        source = [k + list(np.zeros(source_max_len-len(k))) for k in source]
        target_input = [k + list(np.zeros(target_max_len_input-len(k))) for k in target_input]
        target_output = [k + list(np.zeros(target_max_len_output-len(k))) for k in target_output]
        source_len = [source_max_len for _ in range(len(source))]
        target_len_input = [target_max_len_input for _ in range(len(target_input))]
        target_len_output = [target_max_len_output for _ in range(len(target_output))]
        return torch.LongTensor(source).cuda(), torch.LongTensor(source_len).cuda(), torch.LongTensor(target_input).cuda(), torch.LongTensor(target_len_input).cuda(), torch.LongTensor(target_output).cuda(), torch.LongTensor(target_len_output).cuda()


In [0]:
class ARG():
  def __init__(self):
    self.batch_size = 40
    self.model_dim = 512
    self.epochs = 10
    self.head_count = 8
    self.dropout = 0.5
    self.save_model = "/content/drive/My Drive/transformer/transformer"
    self.label_smoothing = True
    self.learning_rate = 0.1
    self.max_sentence_length = 110
    self.vocab_size = 85718 + 1
    self.num_layer = 6
    self.ffn_dim = 2048
    self.warm_up_steps = 4000
    self.smoothing = 0.1
    self.source_vocab_size = 25809+1
    self.target_vocab_size = 24333+1
opt = ARG()

In [0]:
def train():

    #build_word2index("./data/vocab.50K.en", "./data/vocab.50K.de")
    with open("/content/drive/My Drive/data/dei2w.json", encoding="utf-8") as file:
        index2word = [json.loads(k) for k in file][0]

    model = transformer(opt.source_vocab_size, opt.target_vocab_size, opt.max_sentence_length, opt.num_layer, opt.model_dim, opt.head_count, opt.ffn_dim, opt.dropout)

    print('# generator parameters:', sum(param.numel() for param in model.parameters()))
    device = torch.device("cuda")
    # model = torch.nn.DataParallel(model, [0,1])
    model.to(device)
    model_state_dict = model.state_dict()
    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, model.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.model_dim, opt.warm_up_steps)
    checkpoint = {
        'model': model_state_dict,
        'settings': opt,
        'epoch': 0}
    torch.save(checkpoint, "/content/drive/My Drive/Transformer%d" % 0)
    # optimizer = optim.Adam(model.parameters(), lr = 0.9)
    for j in range(opt.epochs):

        dataset = subDataset("/content/drive/My Drive/data/pre/train_pre.en", "/content/drive/My Drive/data/pre/train_pre.de")
        dataloader = Dataloader(dataset, opt.batch_size)
        bar = tqdm(range(dataset.len // opt.batch_size))
        for i in bar:

            source, source_len, target_input, target_len_input, target_output, target_len_output = dataloader.get_batch()
            output, encoder_attention, decoder_self_attention, decoder_encoder_decoder_attention = model(source, source_len, target_input, target_len_input)

            torch.cuda.empty_cache()

            L = loss(output, target_output, opt.smoothing, opt.target_vocab_size)

            if(i%100 == 0):
                bleu = get_bleu(output, target_output, index2word)
                print(bleu)
            bar.set_description("loss:%f" % (L.item()))
            optimizer.zero_grad()
            L.backward()
            optimizer.step_and_update_lr()
            # if(i % 10000 == 0):
            #     temp = ""
            #     input(temp)
            #     if(temp == "go"):
            #         continue
            #     else:
            #         encoder_output = model.Encoder()
            #         model.Decoder()\

        model_state_dict = model.state_dict()
        checkpoint = {
            'model': model_state_dict,
            'settings': opt,
            'epoch': j}
        torch.save(checkpoint, "/content/drive/My Drive/Transformer%d"%j)


In [40]:
train()

# generator parameters: 82409230



  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)

loss:285.284760:   0%|          | 0/111721 [00:00<?, ?it/s][A

nan



loss:285.284760:   0%|          | 1/111721 [00:00<17:52:42,  1.74it/s][A
loss:277.146820:   0%|          | 1/111721 [00:00<17:52:42,  1.74it/s][A
loss:277.146820:   0%|          | 2/111721 [00:00<15:36:14,  1.99it/s][A
loss:261.785828:   0%|          | 2/111721 [00:01<15:36:14,  1.99it/s][A
loss:261.785828:   0%|          | 3/111721 [00:01<13:46:00,  2.25it/s][A
loss:287.094391:   0%|          | 3/111721 [00:01<13:46:00,  2.25it/s][A
loss:287.094391:   0%|          | 4/111721 [00:01<12:46:04,  2.43it/s][A
loss:279.404022:   0%|          | 4/111721 [00:01<12:46:04,  2.43it/s][A
loss:279.404022:   0%|          | 5/111721 [00:01<11:22:12,  2.73it/s][A
loss:263.480103:   0%|          | 5/111721 [00:01<11:22:12,  2.73it/s][A
loss:263.480103:   0%|          | 6/111721 [00:02<10:17:05,  3.02it/s][A
loss:261.482605:   0%|          | 6/111721 [00:02<10:17:05,  3.02it/s][A
loss:261.482605:   0%|          | 7/111721 [00:02<10:00:17,  3.10it/s][A
loss:278.468018:   0%|          | 7/1

['<unk>', '<unk>', 'einiger', '<unk>', 'des', '<unk>', ',', '<unk>', 'statt', '<unk>', '5', '<unk>', 'tatsächlich', '45', '<unk>', '<unk>', 'beim', '<unk>', '.', '</s>'] ['<unk>', 'konto', '<unk>', ',', 'ernannte', 'rechtsvorschrift', '<unk>', ',', '<unk>', '<unk>', 'ausnahmeregelungen', '</s>']
['<unk>', '<unk>', 'und', 'stilvolles', '<unk>', 'mit', '<unk>', '<unk>', '(', '<unk>', 'in', '/', 'im', '<unk>', 'und', '<unk>', ')', 'und', 'gemütlicher', '<unk>', 'im', '<unk>', '.', '</s>'] ['<unk>', 'wortmeldung', 'zentraler', 'und', 'aps', '<unk>', 'kommission', '<unk>', '<unk>', '<unk>', '<unk>', 'briten', '<unk>', 'ausgeräumt', 'entgegengesetzte', 'und', 'syntax', 'nelson', 'und', 'hybrid', '<unk>', 'spaß', 's', 'volksabstimmungen', 'wertvolles', '<unk>', '<unk>', 'even', 'stadium', 'ixquick', 'kultur', 'faro', 'einreichen', 'sinnvoll', 'tauchen', '<unk>', 'begleichen', 'takt', '1920', 'finanzrahmens', '</s>']
['<unk>', '<unk>', '<unk>', 'im', '<unk>', 'war', 'der', '<unk>', 'teuer', '.


loss:40.631134:   0%|          | 101/111721 [00:36<10:19:31,  3.00it/s][A
loss:41.636913:   0%|          | 101/111721 [00:36<10:19:31,  3.00it/s][A
loss:41.636913:   0%|          | 102/111721 [00:37<10:34:31,  2.93it/s][A
loss:43.085751:   0%|          | 102/111721 [00:37<10:34:31,  2.93it/s][A
loss:43.085751:   0%|          | 103/111721 [00:37<10:43:33,  2.89it/s][A
loss:40.738220:   0%|          | 103/111721 [00:37<10:43:33,  2.89it/s][A
loss:40.738220:   0%|          | 104/111721 [00:37<11:51:41,  2.61it/s][A
loss:40.168205:   0%|          | 104/111721 [00:38<11:51:41,  2.61it/s][A
loss:40.168205:   0%|          | 105/111721 [00:38<11:53:10,  2.61it/s][A
loss:44.814365:   0%|          | 105/111721 [00:38<11:53:10,  2.61it/s][A
loss:44.814365:   0%|          | 106/111721 [00:38<12:07:34,  2.56it/s][A
loss:41.672928:   0%|          | 106/111721 [00:38<12:07:34,  2.56it/s][A
loss:41.672928:   0%|          | 107/111721 [00:38<11:03:28,  2.80it/s][A
loss:40.417980:   0%|   

['<unk>', 'vermieten', 'nur', '15', '<unk>', 'zu', '<unk>', 'von', 'der', '<unk>', 'von', '...', '</s>'] ['wortes', ',', ',', 'angesetzt', 'festgestellten', 'schwach', '<unk>', 'substanzielle', 'unklarheit', 'der', '<unk>', '</s>']
['<unk>', 'der', 'mittleren', '<unk>', 'des', '<unk>', '<unk>', '<unk>', 'liegt', '...', '</s>'] ['zweimal', '.', '<unk>', 'ssen', '<unk>', '<unk>', '</s>']
['<unk>', 'dem', 'bekannten', '<unk>', '<unk>', '<unk>', '<unk>', 'liegt', '<unk>', '<unk>', 'im', '...', '</s>'] ['51', 'ssen', 'im', '<unk>', 'auf', 'anzeichen', 'regina', 'meet', 'zu', 'optimierte', 'motto', 'im', 'meeting', 'energieverbrauchs', '&apos;', 'weinen', 'vitorino', 'zauberhaften', 'die', '</s>']
['<unk>', '<unk>', '<unk>', '<unk>', 'befindet', 'sich', 'im', 'oberen', '<unk>', 'von', '<unk>', ',', '...', '</s>'] ['<unk>', 'substanzielle', 'die', ',', 'der', 'kollektiv', 'symbolen', '21', 'auszeichnungen', 'frau', 'von', '<unk>', ',', 'schulen', 'rumänien', 'tan', '.', '</s>']
['<unk>', '<un


loss:28.385872:   0%|          | 201/111721 [01:10<11:07:00,  2.79it/s][A
loss:24.446167:   0%|          | 201/111721 [01:10<11:07:00,  2.79it/s][A
loss:24.446167:   0%|          | 202/111721 [01:11<10:42:00,  2.90it/s][A
loss:26.929201:   0%|          | 202/111721 [01:11<10:42:00,  2.90it/s][A
loss:26.929201:   0%|          | 203/111721 [01:11<10:27:33,  2.96it/s][A
loss:28.133989:   0%|          | 203/111721 [01:11<10:27:33,  2.96it/s][A
loss:28.133989:   0%|          | 204/111721 [01:11<9:35:20,  3.23it/s] [A
loss:23.421118:   0%|          | 204/111721 [01:11<9:35:20,  3.23it/s][A
loss:23.421118:   0%|          | 205/111721 [01:12<9:54:45,  3.12it/s][A
loss:25.803942:   0%|          | 205/111721 [01:12<9:54:45,  3.12it/s][A
loss:25.803942:   0%|          | 206/111721 [01:12<10:28:37,  2.96it/s][A
loss:26.372120:   0%|          | 206/111721 [01:12<10:28:37,  2.96it/s][A
loss:26.372120:   0%|          | 207/111721 [01:12<9:59:44,  3.10it/s] [A
loss:24.594940:   0%|      

['<unk>', '<unk>', 'bietet', '270', '<unk>', 'einschließlich', 'einer', '<unk>', '<unk>', ',', '4', '<unk>', '<unk>', 'und', '12', '<unk>', '.', '</s>'] ['<unk>', 'langes', '<unk>', '<unk>', 'nahezu', 'in', 'buenos', 'und', 'ein', ',', ',', '.', '.', ',', 'und', 'eröffnet', 'friedrichshafen', '</s>']
['<unk>', '<unk>', 'sind', 'aufgeteilt', 'zwischen', 'den', '<unk>', 'aus', 'dem', '17.', '<unk>', 'und', '<unk>', '<unk>', '.', '</s>'] ['<unk>', 'von', '<unk>', 'sind', '<unk>', 'ein', '</s>']
['<unk>', '<unk>', 'bieten', 'ein', 'innovatives', '<unk>', '<unk>', '<unk>', '<unk>', 'während', '<unk>', 'in', 'den', 'älteren', '<unk>', 'antike', '<unk>', 'und', '<unk>', 'bieten', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', 'ein', 'kundenzufriedenheit', 'und', 'box', 'teneriffa', 'operator', 'ba', ',', 'in', 'die', '.', 'eröffnet', '.', ',', 'und', '.', 'sind', '</s>']
['<unk>', '<unk>', 'sind', 'mit', 'zahlreichen', 'modernen', '<unk>', 'einschließlich', '<unk>', ',', '<unk>', ',', '<un


loss:19.647888:   0%|          | 301/111721 [01:47<10:45:17,  2.88it/s][A
loss:19.840527:   0%|          | 301/111721 [01:47<10:45:17,  2.88it/s][A
loss:19.840527:   0%|          | 302/111721 [01:47<10:13:54,  3.02it/s][A
loss:20.747631:   0%|          | 302/111721 [01:47<10:13:54,  3.02it/s][A
loss:20.747631:   0%|          | 303/111721 [01:48<10:01:57,  3.08it/s][A
loss:18.672720:   0%|          | 303/111721 [01:48<10:01:57,  3.08it/s][A
loss:18.672720:   0%|          | 304/111721 [01:48<9:51:40,  3.14it/s] [A
loss:21.494415:   0%|          | 304/111721 [01:48<9:51:40,  3.14it/s][A
loss:21.494415:   0%|          | 305/111721 [01:48<9:45:53,  3.17it/s][A
loss:19.327276:   0%|          | 305/111721 [01:48<9:45:53,  3.17it/s][A
loss:19.327276:   0%|          | 306/111721 [01:48<9:48:59,  3.15it/s][A
loss:24.352888:   0%|          | 306/111721 [01:49<9:48:59,  3.15it/s][A
loss:24.352888:   0%|          | 307/111721 [01:49<9:28:15,  3.27it/s][A
loss:21.652935:   0%|         

['<unk>', 'kommt', '<unk>', 'um', '<unk>', '-', 'die', '<unk>', '<unk>', '<unk>', '-', 'eine', 'neue', 'schnelle', 'und', 'unterhaltsame', '<unk>', ',', 'das', 'weltweit', '<unk>', '<unk>', 'zu', 'genießen', '.', '<unk>', '<unk>', 'kann', 'mit', 'allen', '<unk>', 'um', '<unk>', '<unk>', 'gespielt', 'werden', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', ',', '<unk>', '<unk>', '<unk>', ',', ',', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', 'befinden', 'sich', 'augenblicklich', 'in', 'den', '<unk>', 'of', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'für', '<unk>', ',', '2008', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', ',', '<unk>', '<unk>', '<unk>', ',', ',', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', 'fahren', 'ist', 'einfach', 'auf', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<


loss:18.827118:   0%|          | 401/111721 [02:22<11:13:16,  2.76it/s][A
loss:16.658056:   0%|          | 401/111721 [02:23<11:13:16,  2.76it/s][A
loss:16.658056:   0%|          | 402/111721 [02:23<11:46:32,  2.63it/s][A
loss:16.338259:   0%|          | 402/111721 [02:23<11:46:32,  2.63it/s][A
loss:16.338259:   0%|          | 403/111721 [02:23<10:56:22,  2.83it/s][A
loss:18.479395:   0%|          | 403/111721 [02:23<10:56:22,  2.83it/s][A
loss:18.479395:   0%|          | 404/111721 [02:24<11:10:10,  2.77it/s][A
loss:19.436821:   0%|          | 404/111721 [02:24<11:10:10,  2.77it/s][A
loss:19.436821:   0%|          | 405/111721 [02:24<10:19:03,  3.00it/s][A
loss:18.162991:   0%|          | 405/111721 [02:24<10:19:03,  3.00it/s][A
loss:18.162991:   0%|          | 406/111721 [02:24<10:02:47,  3.08it/s][A
loss:18.349997:   0%|          | 406/111721 [02:24<10:02:47,  3.08it/s][A
loss:18.349997:   0%|          | 407/111721 [02:25<10:30:10,  2.94it/s][A
loss:18.309586:   0%|   

['<unk>', 'stilvolle', '<unk>', 'mit', 'privater', '<unk>', 'serviert', 'typische', '<unk>', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', ',', '<unk>', '<unk>', ',', '<unk>', '<unk>', '<unk>', 'überarbeiten', '</s>']
['<unk>', '<unk>', 'die', 'romantische', '<unk>', 'auf', 'die', '<unk>', '<unk>', '.', '</s>'] ['<unk>', 'sind', '<unk>', '<unk>', 'nichtständigen', ',', '<unk>', '<unk>', '<unk>', ',', '</s>']
['<unk>', 'dem', '<unk>', 'erholen', '<unk>', 'sich', 'auch', 'bei', 'einem', '<unk>', 'in', 'der', '<unk>', 'mit', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', ',', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', ',', '<unk>', '<unk>', '</s>']
['<unk>', 'möchten', '<unk>', 'im', '<unk>', '<unk>', '<unk>', 'übernachten', '?', '</s>'] ['<unk>', ',', '<unk>', '<unk>', '<unk>', '<unk>', ',', '<unk>', '<unk>', '</s>']
['<unk>', ',', '24', '<unk>', '<unk>', '<unk>', '<unk>', ',', '<unk>', ',', '<unk>', ',', 'behindertengerechte', '<unk>', ',', '


loss:15.067760:   0%|          | 501/111721 [02:58<9:41:37,  3.19it/s][A
loss:17.111561:   0%|          | 501/111721 [02:58<9:41:37,  3.19it/s][A
loss:17.111561:   0%|          | 502/111721 [02:58<10:19:43,  2.99it/s][A
loss:19.366932:   0%|          | 502/111721 [02:58<10:19:43,  2.99it/s][A
loss:19.366932:   0%|          | 503/111721 [02:59<10:20:24,  2.99it/s][A
loss:14.952331:   0%|          | 503/111721 [02:59<10:20:24,  2.99it/s][A
loss:14.952331:   0%|          | 504/111721 [02:59<9:47:52,  3.15it/s] [A
loss:16.593266:   0%|          | 504/111721 [02:59<9:47:52,  3.15it/s][A
loss:16.593266:   0%|          | 505/111721 [02:59<10:13:02,  3.02it/s][A
loss:16.240623:   0%|          | 505/111721 [02:59<10:13:02,  3.02it/s][A
loss:16.240623:   0%|          | 506/111721 [03:00<9:44:53,  3.17it/s] [A
loss:15.595231:   0%|          | 506/111721 [03:00<9:44:53,  3.17it/s][A
loss:15.595231:   0%|          | 507/111721 [03:00<9:40:09,  3.19it/s][A
loss:16.769127:   0%|        

[':', 'entfernt', 'alle', '<unk>', 'von', 'einer', 'gegebenen', '<unk>', ',', 'oder', 'von', 'allen', '<unk>', 'mit', '<unk>', '.', '</s>'] ['<unk>', '<unk>', 'der', '<unk>', 'und', 'der', '<unk>', 'der', '.', 'der', '<unk>', '<unk>', '.', '<unk>', '<unk>', '.', '</s>']
['<unk>', 'einigen', '<unk>', ',', 'will', 'man', 'einen', '<unk>', 'erzwingen', 'der', 'verwendet', 'werden', 'soll', ';', 'zum', '<unk>', 'wenn', 'man', 'nur', 'den', '<unk>', '<unk>', 'erlauben', 'will', 'wenn', '<unk>', '<unk>', 'aktiviert', 'ist', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', 'der', '.', 'die', '<unk>', '.', '<unk>', '<unk>', '.', '.', '<unk>', '<unk>', '<unk>', '.', '.', 'die', '.', '<unk>', '.', 'der', '.', 'der', 'der', 'der', '<unk>', '<unk>', '.', '</s>']
['<unk>', '<unk>', 'an', '<unk>', 'kann', 'verwendet', 'werden', 'um', 'das', '<unk>', 'des', '<unk>', '<unk>', 'zu', 'verändern', '.', '</s>'] ['<unk>', '<unk>', 'der', 'der', '.', 'der', '<unk>', '.', '.', '<unk>', '<unk>', '<unk>', 'de


loss:16.049452:   1%|          | 601/111721 [03:32<9:55:50,  3.11it/s][A
loss:15.099164:   1%|          | 601/111721 [03:33<9:55:50,  3.11it/s][A
loss:15.099164:   1%|          | 602/111721 [03:33<9:29:41,  3.25it/s][A
loss:15.171709:   1%|          | 602/111721 [03:33<9:29:41,  3.25it/s][A
loss:15.171709:   1%|          | 603/111721 [03:33<9:16:05,  3.33it/s][A
loss:14.495722:   1%|          | 603/111721 [03:33<9:16:05,  3.33it/s][A
loss:14.495722:   1%|          | 604/111721 [03:33<10:05:05,  3.06it/s][A
loss:13.422892:   1%|          | 604/111721 [03:33<10:05:05,  3.06it/s][A
loss:13.422892:   1%|          | 605/111721 [03:34<10:04:04,  3.07it/s][A
loss:13.879802:   1%|          | 605/111721 [03:34<10:04:04,  3.07it/s][A
loss:13.879802:   1%|          | 606/111721 [03:34<10:13:21,  3.02it/s][A
loss:12.743694:   1%|          | 606/111721 [03:34<10:13:21,  3.02it/s][A
loss:12.743694:   1%|          | 607/111721 [03:34<10:43:53,  2.88it/s][A
loss:14.694719:   1%|         

['<unk>', '<unk>', 'gelegen', '!', '<unk>', '<unk>', 'waren', 'ausgesprochen', 'freundlich', 'und', 'hilfsbereit', '!', '!', '!', '!', '</s>'] ['<unk>', '&apos;', ',', ',', ',', ',', ',', ',', ',', ',', '<unk>', ',', '</s>']
['<unk>', '<unk>', 'wurde', 'das', '<unk>', 'frisch', 'gemacht', 'und', 'es', 'gab', 'neue', '<unk>', '.', '</s>'] ['<unk>', ',', 'ist', ',', '<unk>', ',', '&apos;', ',', 'die', ',', ',', '<unk>', ',', '</s>']
['<unk>', '<unk>', 'konnten', 'wir', 'sowohl', 'vor', 'dem', '<unk>', ',', 'als', 'auch', 'am', '<unk>', 'noch', 'dort', 'lassen', '.', '</s>'] ['<unk>', ',', 'im', 'in', ',', 'die', 'der', '<unk>', ',', 'die', '<unk>', ',', 'politische', ',', ',', ',', ',', '</s>']
['<unk>', '<unk>', 'ist', 'recht', 'klein', ',', 'aber', 'neu', 'renoviert', ',', 'neues', '<unk>', ',', '<unk>', ',', 'ausreichend', 'großer', '<unk>', ',', 'sehr', 'nettes', '<unk>', '.', '</s>'] ['<unk>', ',', ',', ',', ',', ',', 'die', 'in', 'in', ',', 'die', ',', ',', 'die', ',', 'die', ',', 


loss:16.312284:   1%|          | 701/111721 [04:08<10:13:30,  3.02it/s][A
loss:12.757774:   1%|          | 701/111721 [04:08<10:13:30,  3.02it/s][A
loss:12.757774:   1%|          | 702/111721 [04:09<9:48:15,  3.15it/s] [A
loss:14.089296:   1%|          | 702/111721 [04:09<9:48:15,  3.15it/s][A
loss:14.089296:   1%|          | 703/111721 [04:09<9:12:46,  3.35it/s][A
loss:16.234241:   1%|          | 703/111721 [04:09<9:12:46,  3.35it/s][A
loss:16.234241:   1%|          | 704/111721 [04:09<10:02:36,  3.07it/s][A
loss:16.557949:   1%|          | 704/111721 [04:09<10:02:36,  3.07it/s][A
loss:16.557949:   1%|          | 705/111721 [04:10<9:41:55,  3.18it/s] [A
loss:11.164313:   1%|          | 705/111721 [04:10<9:41:55,  3.18it/s][A
loss:11.164313:   1%|          | 706/111721 [04:10<10:30:08,  2.94it/s][A
loss:12.740628:   1%|          | 706/111721 [04:10<10:30:08,  2.94it/s][A
loss:12.740628:   1%|          | 707/111721 [04:10<10:11:15,  3.03it/s][A
loss:12.817673:   1%|       

['<unk>', 'entfällt', 'die', '<unk>', 'von', 'zwei', 'verschiedenen', '<unk>', '<unk>', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', '<unk>', '<unk>', '2', 'für', '<unk>', '/', '<unk>', '<unk>', 'ist', 'zum', '<unk>', 'verfügbar', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', 'parallele', '<unk>', 'von', '<unk>', 'unter', '<unk>', 'wird', 'nunmehr', 'verbessert', 'unterstützt', '.', '<unk>', 'vorliegende', '<unk>', 'sollte', 'von', 'allen', '<unk>', '<unk>', 'durchgeführt', 'werden', ',', 'um', 'auf', 'dem', 'jeweils', 'neuesten', '<unk>', 'zu', 'sein', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', 'weitere', '<unk>', 'folgen', '<unk>', 'bitte', '<unk>', '<unk>', '.', '</s>'] ['<unk>', '<un


loss:10.168121:   1%|          | 801/111721 [04:46<10:19:31,  2.98it/s][A
loss:9.534101:   1%|          | 801/111721 [04:46<10:19:31,  2.98it/s] [A
loss:9.534101:   1%|          | 802/111721 [04:46<10:29:08,  2.94it/s][A
loss:10.538352:   1%|          | 802/111721 [04:46<10:29:08,  2.94it/s][A
loss:10.538352:   1%|          | 803/111721 [04:47<10:05:16,  3.05it/s][A
loss:11.937861:   1%|          | 803/111721 [04:47<10:05:16,  3.05it/s][A
loss:11.937861:   1%|          | 804/111721 [04:47<9:49:35,  3.14it/s] [A
loss:11.430323:   1%|          | 804/111721 [04:47<9:49:35,  3.14it/s][A
loss:11.430323:   1%|          | 805/111721 [04:47<10:25:48,  2.95it/s][A
loss:11.545352:   1%|          | 805/111721 [04:47<10:25:48,  2.95it/s][A
loss:11.545352:   1%|          | 806/111721 [04:48<11:22:45,  2.71it/s][A
loss:10.118241:   1%|          | 806/111721 [04:48<11:22:45,  2.71it/s][A
loss:10.118241:   1%|          | 807/111721 [04:48<11:11:46,  2.75it/s][A
loss:9.174225:   1%|      

['<unk>', '<unk>', 'zeigen', 'sich', 'in', 'einen', 'modernen', ',', '<unk>', '<unk>', 'mit', 'verschiedenen', '<unk>', '.', '<unk>', '<unk>', '<unk>', 'und', '<unk>', 'aus', '<unk>', 'und', '<unk>', ',', 'weißen', '<unk>', 'dominieren', 'die', '<unk>', '.', '</s>'] ['<unk>', 'ist', 'und', 'und', 'und', 'der', '.', '.', 'die', '.', '.', 'einem', '.', '.', '</s>']
['<unk>', 'sehen', '<unk>', 'die', '<unk>', 'von', '<unk>', '<unk>', '<unk>', '.', '</s>'] ['<unk>', 'sind', '.', 'und', '.', '.', 'von', '.', '.', '.', '</s>']
['<unk>', '<unk>', 'war', 'so', '<unk>', 'dass', 'noch', 'nicht', 'mal', 'die', '<unk>', 'ganz', 'geöffnet', 'werden', 'konnte', '.', '</s>'] ['<unk>', '.', 'und', 'und', '.', '.', '.', '.', '.', '.', 'dev', '.', '.', '.', '.', '.', '</s>']
['<unk>', '<unk>', 'ist', '<unk>', 'und', 'von', '<unk>', '<unk>', '.', '</s>'] ['<unk>', '.', 'ist', 'die', '.', 'und', '.', '.', '.', '</s>']
['<unk>', '<unk>', 'so', 'laut', 'das', 'an', '<unk>', 'nicht', 'zu', 'denken', 'war', '


loss:11.738765:   1%|          | 901/111721 [05:22<10:47:12,  2.85it/s][A
loss:10.278146:   1%|          | 901/111721 [05:23<10:47:12,  2.85it/s][A
loss:10.278146:   1%|          | 902/111721 [05:23<9:51:03,  3.12it/s] [A
loss:10.140477:   1%|          | 902/111721 [05:23<9:51:03,  3.12it/s][A
loss:10.140477:   1%|          | 903/111721 [05:23<9:45:49,  3.15it/s][A
loss:10.642219:   1%|          | 903/111721 [05:23<9:45:49,  3.15it/s][A
loss:10.642219:   1%|          | 904/111721 [05:23<9:26:49,  3.26it/s][A
loss:10.570476:   1%|          | 904/111721 [05:23<9:26:49,  3.26it/s][A
loss:10.570476:   1%|          | 905/111721 [05:24<9:33:05,  3.22it/s][A
loss:10.951356:   1%|          | 905/111721 [05:24<9:33:05,  3.22it/s][A
loss:10.951356:   1%|          | 906/111721 [05:24<9:43:01,  3.17it/s][A
loss:12.057496:   1%|          | 906/111721 [05:24<9:43:01,  3.17it/s][A
loss:12.057496:   1%|          | 907/111721 [05:24<9:42:18,  3.17it/s][A
loss:11.074536:   1%|          | 9

['<unk>', 'auch', 'die', '<unk>', 'des', '<unk>', 'ist', 'wichtig', ':', '<unk>', '<unk>', '(', '<unk>', ',', '<unk>', ')', 'sind', '<unk>', 'ideal', 'für', 'die', '<unk>', 'geeignet', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', '<unk>', 'sind', 'natürlich', 'fließend', 'und', 'hängen', 'auch', 'sehr', 'von', 'der', 'jeweiligen', '<unk>', 'ab', ',', 'doch', 'kann', 'hier', 'ein', 'praktischer', '<unk>', ',', 'den', 'wir', 'gerne', 'für', 'sie', 'durchführen', ',', 'schnell', '<unk>', 'schaffen', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<


loss:10.082620:   1%|          | 1001/111721 [05:57<11:18:45,  2.72it/s][A
loss:10.151292:   1%|          | 1001/111721 [05:57<11:18:45,  2.72it/s][A
loss:10.151292:   1%|          | 1002/111721 [05:58<11:05:51,  2.77it/s][A
loss:10.080654:   1%|          | 1002/111721 [05:58<11:05:51,  2.77it/s][A
loss:10.080654:   1%|          | 1003/111721 [05:58<11:11:19,  2.75it/s][A
loss:9.455183:   1%|          | 1003/111721 [05:58<11:11:19,  2.75it/s] [A
loss:9.455183:   1%|          | 1004/111721 [05:58<10:49:56,  2.84it/s][A
loss:8.761138:   1%|          | 1004/111721 [05:59<10:49:56,  2.84it/s][A
loss:8.761138:   1%|          | 1005/111721 [05:59<12:13:23,  2.52it/s][A
loss:9.112267:   1%|          | 1005/111721 [05:59<12:13:23,  2.52it/s][A
loss:9.112267:   1%|          | 1006/111721 [05:59<11:36:17,  2.65it/s][A
loss:8.465251:   1%|          | 1006/111721 [05:59<11:36:17,  2.65it/s][A
loss:8.465251:   1%|          | 1007/111721 [06:00<11:22:33,  2.70it/s][A
loss:9.052039:   1

['<unk>', 'jeder', '<unk>', 'gibt', 'es', 'nicht', 'nur', 'im', '<unk>', '<unk>', 'auf', '<unk>', '<unk>', ',', 'sondern', 'so', 'gut', 'wie', 'an', 'jeder', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', 'man', 'auch', 'in', 'in', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', '<unk>', 'gilt', ':', 'auch', 'beim', 'gepflegten', '<unk>', 'ist', '<unk>', 'nur', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', 'in', '<unk>', '<unk>', '<unk>', 'man', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', 'beliebt', 'sind', 'in', '<unk>', 'auch', 'die', 'italienische', 'und', 'asiatische', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', 'der', '<unk>', '<unk>', 'in', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', 'sei', 'an', 'dieser', '<unk>', 'das', '<unk>', '<unk>', '(', '<unk>', '&apos;', '<unk>', '<unk>', ')', 'genannt', '.', '</s>'] ['<unk>', '<unk>'


loss:10.287349:   1%|          | 1101/111721 [06:34<11:40:20,  2.63it/s][A
loss:10.109623:   1%|          | 1101/111721 [06:34<11:40:20,  2.63it/s][A
loss:10.109623:   1%|          | 1102/111721 [06:34<10:33:54,  2.91it/s][A
loss:9.767568:   1%|          | 1102/111721 [06:34<10:33:54,  2.91it/s] [A
loss:9.767568:   1%|          | 1103/111721 [06:35<10:26:16,  2.94it/s][A
loss:9.448397:   1%|          | 1103/111721 [06:35<10:26:16,  2.94it/s][A
loss:9.448397:   1%|          | 1104/111721 [06:35<10:21:09,  2.97it/s][A
loss:9.189953:   1%|          | 1104/111721 [06:35<10:21:09,  2.97it/s][A
loss:9.189953:   1%|          | 1105/111721 [06:35<10:58:54,  2.80it/s][A
loss:9.183713:   1%|          | 1105/111721 [06:35<10:58:54,  2.80it/s][A
loss:9.183713:   1%|          | 1106/111721 [06:36<10:28:22,  2.93it/s][A
loss:7.840610:   1%|          | 1106/111721 [06:36<10:28:22,  2.93it/s][A
loss:7.840610:   1%|          | 1107/111721 [06:36<10:09:44,  3.02it/s][A
loss:7.660269:   1%|

['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '-', '<unk>', '<unk>', 'von', '<unk>', 'und', 'in', '<unk>', '.', '<unk>', 'ideale', '<unk>', 'fr', '<unk>', ',', '<unk>', 'und', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '</s>']
['<unk>', '<unk>', '<unk>', '<unk>', '-', '<unk>', 'erster', '<unk>', 'und', 'nur', 'wenige', '<unk>', 'von', '<unk>', ',', '<unk>', '<unk>', 'und', 'dem', '<unk>', 'von', '<unk>', '<unk>', 'entfernt', '.', '<unk>', 'in', 'letzter', '<unk>', 'und', '<unk>', '<unk>', 'fr', 'die', '<unk>', 'des', '<unk>', 'fr', '<unk>', '<unk>', 'in', '<unk>', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'das', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', ',', '</s>']
['<unk>', '<unk>', '<unk>', '<unk>', '5', '<unk>', '<unk>', '-', '<unk>',


loss:7.595603:   1%|          | 1201/111721 [07:05<11:17:17,  2.72it/s][A
loss:7.283512:   1%|          | 1201/111721 [07:05<11:17:17,  2.72it/s][A
loss:7.283512:   1%|          | 1202/111721 [07:06<12:10:02,  2.52it/s][A
loss:7.145710:   1%|          | 1202/111721 [07:06<12:10:02,  2.52it/s][A
loss:7.145710:   1%|          | 1203/111721 [07:06<11:53:08,  2.58it/s][A
loss:7.316093:   1%|          | 1203/111721 [07:06<11:53:08,  2.58it/s][A
loss:7.316093:   1%|          | 1204/111721 [07:06<12:03:00,  2.55it/s][A
loss:7.132479:   1%|          | 1204/111721 [07:07<12:03:00,  2.55it/s][A
loss:7.132479:   1%|          | 1205/111721 [07:07<11:26:14,  2.68it/s][A
loss:6.809629:   1%|          | 1205/111721 [07:07<11:26:14,  2.68it/s][A
loss:6.809629:   1%|          | 1206/111721 [07:07<11:26:08,  2.68it/s][A
loss:7.470200:   1%|          | 1206/111721 [07:07<11:26:08,  2.68it/s][A
loss:7.470200:   1%|          | 1207/111721 [07:08<11:23:43,  2.69it/s][A
loss:7.455213:   1%|    

['<unk>', '<unk>', 'setzt', 'sich', 'aus', 'dem', '<unk>', 'und', 'den', 'örtlichen', '<unk>', 'verschiedener', '<unk>', 'zusammen', '.', '</s>'] ['<unk>', '<unk>', 'der', 'sich', 'in', 'der', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'der', '<unk>', 'der', '.', '</s>']
['<unk>', '<unk>', 'unterstehen', 'die', '<unk>', 'von', '16', '<unk>', 'und', 'zwei', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'der', '<unk>', '.', 'der', '</s>']
['<unk>', '<unk>', '<unk>', 'sich', 'wiederum', 'in', '<unk>', ',', 'zum', '<unk>', 'auch', 'in', '<unk>', ',', '<unk>', 'und', '„', '<unk>', '“', '(', 'auf', '<unk>', ')', '.', '</s>'] ['<unk>', '<unk>', '<unk>', 'der', 'in', '<unk>', 'der', 'der', 'der', '<unk>', 'der', 'in', 'der', 'der', 'der', '<unk>', '<unk>', '<unk>', 'der', '<unk>', '<unk>', 'der', 'der', '.', '</s>']
['<unk>', '<unk>', 'setzen', 'sich', 'hingegen', 'aus', '<unk>', 'und', '<unk>', 'zusammen', ',', 'nur', 'im', '<unk>', '<unk>', 


loss:7.625777:   1%|          | 1301/111721 [07:42<13:14:51,  2.32it/s][A
loss:7.894316:   1%|          | 1301/111721 [07:42<13:14:51,  2.32it/s][A
loss:7.894316:   1%|          | 1302/111721 [07:42<14:18:22,  2.14it/s][A
loss:7.376709:   1%|          | 1302/111721 [07:42<14:18:22,  2.14it/s][A
loss:7.376709:   1%|          | 1303/111721 [07:43<13:55:11,  2.20it/s][A
loss:7.008856:   1%|          | 1303/111721 [07:43<13:55:11,  2.20it/s][A
loss:7.008856:   1%|          | 1304/111721 [07:43<13:43:22,  2.24it/s][A
loss:6.890112:   1%|          | 1304/111721 [07:43<13:43:22,  2.24it/s][A
loss:6.890112:   1%|          | 1305/111721 [07:43<13:02:28,  2.35it/s][A
loss:6.330023:   1%|          | 1305/111721 [07:44<13:02:28,  2.35it/s][A
loss:6.330023:   1%|          | 1306/111721 [07:44<12:58:28,  2.36it/s][A
loss:7.109238:   1%|          | 1306/111721 [07:44<12:58:28,  2.36it/s][A
loss:7.109238:   1%|          | 1307/111721 [07:44<13:29:09,  2.27it/s][A
loss:6.720879:   1%|    

['<unk>', 'der', '<unk>', 'des', '<unk>', '<unk>', 'ist', 'es', ',', 'eine', 'möglichst', '<unk>', '<unk>', 'der', '<unk>', 'im', '<unk>', '<unk>', 'zu', 'dokumentieren', '.', '</s>'] ['<unk>', 'der', '<unk>', 'der', '<unk>', 'der', 'der', 'die', 'die', 'die', 'der', 'die', 'der', 'der', '<unk>', 'der', '<unk>', 'der', 'der', 'einem', '<unk>', '</s>']
['<unk>', 'werden', 'von', '<unk>', '<unk>', 'die', 'notwendigen', '<unk>', 'zur', '<unk>', 'aus', '<unk>', 'geliefert', '.', '</s>'] ['<unk>', '<unk>', 'die', 'der', 'der', 'der', '<unk>', '<unk>', 'der', '<unk>', 'der', 'der', 'der', '<unk>', '</s>']
['<unk>', 'der', '<unk>', '-', 'die', 'späte', '<unk>', '<unk>', '<unk>', '(', '450', '<unk>', '650', 'n.', '<unk>', ')', '-', 'war', 'durch', '<unk>', 'in', 'den', '<unk>', '<unk>', 'zeitlichen', '<unk>', '<unk>', '<unk>', 'und', '<unk>', '<unk>', 'bisher', 'noch', 'nicht', 'dokumentiert', '.', '</s>'] ['<unk>', 'der', '<unk>', 'der', 'der', '<unk>', '<unk>', 'der', 'der', 'mit', '<unk>', 


loss:6.346705:   1%|▏         | 1401/111721 [08:16<12:13:52,  2.51it/s][A
loss:6.164670:   1%|▏         | 1401/111721 [08:16<12:13:52,  2.51it/s][A
loss:6.164670:   1%|▏         | 1402/111721 [08:17<11:01:11,  2.78it/s][A
loss:6.347475:   1%|▏         | 1402/111721 [08:17<11:01:11,  2.78it/s][A
loss:6.347475:   1%|▏         | 1403/111721 [08:17<10:27:32,  2.93it/s][A
loss:7.278375:   1%|▏         | 1403/111721 [08:17<10:27:32,  2.93it/s][A
loss:7.278375:   1%|▏         | 1404/111721 [08:17<11:21:39,  2.70it/s][A
loss:6.893407:   1%|▏         | 1404/111721 [08:18<11:21:39,  2.70it/s][A
loss:6.893407:   1%|▏         | 1405/111721 [08:18<11:38:10,  2.63it/s][A
loss:6.805099:   1%|▏         | 1405/111721 [08:18<11:38:10,  2.63it/s][A
loss:6.805099:   1%|▏         | 1406/111721 [08:18<11:34:19,  2.65it/s][A
loss:6.474840:   1%|▏         | 1406/111721 [08:18<11:34:19,  2.65it/s][A
loss:6.474840:   1%|▏         | 1407/111721 [08:19<11:24:15,  2.69it/s][A
loss:6.130690:   1%|▏   

['<unk>', '<unk>', '<unk>', '<unk>', 'des', '<unk>', '<unk>', '<unk>', 'in', '<unk>', 'on', '<unk>', 'benötigen', '<unk>', 'eine', '<unk>', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '.', '<unk>', '<unk>', '</s>']
['<unk>', 'bei', '<unk>', 'für', 'das', '<unk>', '<unk>', '<unk>', 'in', '<unk>', 'on', '<unk>', 'angegebenen', '<unk>', 'verstehen', 'sich', 'als', '<unk>', 'pro', '<unk>', ',', '<unk>', '“', 'pro', '<unk>', '”', '.', '</s>'] ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'sich', '<unk>', '<unk>', '.', '<unk>', '.', '<unk>', '.', '.', '<unk>', '.', '.', '</s>']
['<unk>', 'bei', '<unk>', 'für', 'das', '<unk>', '<unk>', '<unk>', 'in', '<unk>', 'angegebenen', '<unk>', 'verstehen', 'sich', 'als', '<unk>', 'pro', '<unk>', ',', '<unk>', '“', 'pro', '<unk>', '”', '.', '</s>'] ['<unk>', '<unk>', '<un


loss:6.447412:   1%|▏         | 1501/111721 [08:54<13:35:20,  2.25it/s][A
loss:6.710729:   1%|▏         | 1501/111721 [08:54<13:35:20,  2.25it/s][A
loss:6.710729:   1%|▏         | 1502/111721 [08:54<12:09:47,  2.52it/s][A
loss:6.347120:   1%|▏         | 1502/111721 [08:54<12:09:47,  2.52it/s][A
loss:6.347120:   1%|▏         | 1503/111721 [08:55<11:43:56,  2.61it/s][A
loss:6.500820:   1%|▏         | 1503/111721 [08:55<11:43:56,  2.61it/s][A
loss:6.500820:   1%|▏         | 1504/111721 [08:55<10:50:10,  2.83it/s][A
loss:6.989711:   1%|▏         | 1504/111721 [08:55<10:50:10,  2.83it/s][A
loss:6.989711:   1%|▏         | 1505/111721 [08:55<11:25:14,  2.68it/s][A
loss:6.812026:   1%|▏         | 1505/111721 [08:55<11:25:14,  2.68it/s][A
loss:6.812026:   1%|▏         | 1506/111721 [08:56<11:40:07,  2.62it/s][A
loss:6.247619:   1%|▏         | 1506/111721 [08:56<11:40:07,  2.62it/s][A
loss:6.247619:   1%|▏         | 1507/111721 [08:56<12:39:25,  2.42it/s][A
loss:7.130170:   1%|▏   

['<unk>', '<unk>', 'de', '<unk>', ',', 'betrachtet', 'als', 'eine', 'der', 'besten', 'der', '<unk>', ',', 'ist', 'nur', '100', 'm', 'von', 'der', '<unk>', 'entfernt', '.', '</s>'] ['<unk>', ',', ',', '<unk>', ',', '<unk>', ',', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', ',', '<unk>', ',', '<unk>', '<unk>', ',', '<unk>', '<unk>', ',', '.', '</s>']
['<unk>', 'können', '<unk>', 'viele', 'französischen', '<unk>', 'und', '<unk>', 'in', 'der', '<unk>', 'finden', '.', '</s>'] ['<unk>', ',', ',', ',', '<unk>', '<unk>', ',', '<unk>', '.', '<unk>', '<unk>', ',', '<unk>', '</s>']
['<unk>', 'de', 'la', '<unk>', 'und', '<unk>', 'de', '<unk>', ',', 'beide', 'nah', 'beim', '<unk>', ',', 'sind', 'berühmt', 'für', 'ihr', '<unk>', '<unk>', '.', '</s>'] ['<unk>', ',', '<unk>', '<unk>', ',', '<unk>', ',', '<unk>', ',', '<unk>', '<unk>', ',', '<unk>', ',', '<unk>', ',', ',', '<unk>', '<unk>', ',', '.', '</s>']
['<unk>', ',', 'da', 'die', '<unk>', 'im', '6.', '<unk>', 'ist', ',', 'wird', '<unk>', 'der', '


loss:6.590926:   1%|▏         | 1601/111721 [09:27<10:34:45,  2.89it/s][A
loss:7.236525:   1%|▏         | 1601/111721 [09:27<10:34:45,  2.89it/s][A
loss:7.236525:   1%|▏         | 1602/111721 [09:27<10:43:02,  2.85it/s][A
loss:6.628062:   1%|▏         | 1602/111721 [09:27<10:43:02,  2.85it/s][A
loss:6.628062:   1%|▏         | 1603/111721 [09:28<10:15:48,  2.98it/s][A
loss:6.800742:   1%|▏         | 1603/111721 [09:28<10:15:48,  2.98it/s][A
loss:6.800742:   1%|▏         | 1604/111721 [09:28<10:48:59,  2.83it/s][A
loss:6.463130:   1%|▏         | 1604/111721 [09:28<10:48:59,  2.83it/s][A
loss:6.463130:   1%|▏         | 1605/111721 [09:28<11:19:22,  2.70it/s][A
loss:5.969344:   1%|▏         | 1605/111721 [09:29<11:19:22,  2.70it/s][A
loss:5.969344:   1%|▏         | 1606/111721 [09:29<11:05:56,  2.76it/s][A
loss:5.459010:   1%|▏         | 1606/111721 [09:29<11:05:56,  2.76it/s][A
loss:5.459010:   1%|▏         | 1607/111721 [09:29<12:34:16,  2.43it/s][A
loss:5.571442:   1%|▏   

['<unk>', 'siehe', ',', 'auch', 'die', '<unk>', '<unk>', ',', 'die', 'die', '<unk>', 'und', 'die', '<unk>', 'unserer', '<unk>', 'von', 'der', '<unk>', 'an', ',', 'da', 'sie', '<unk>', 'verlassen', 'haben', ',', 'bis', 'jetzt', 'enthalten', ',', 'und', 'sie', 'sind', 'wahr', ';', 'und', 'wir', 'können', 'von', 'ihrer', '<unk>', 'wissen', ',', 'weil', 'wir', 'sie', 'vor', '<unk>', 'haben', '.', '</s>'] ['<unk>', '<unk>', '<unk>', 'daß', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '.', '<unk>', 'zu', '<unk>', '<unk>', 'daß', '<unk>', '<unk>', '<unk>', '<unk>', 'und', '<unk>', '<unk>', '.', '<unk>', '<unk>', '<unk>', '.', 'und', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '.', '</s>']
['7', '<unk>', 'nun', ',', 'meine', '<unk>', ',', 'möchte', 'ich', ',', 'daß', 'ihr', 'daran', 'denkt', ',', 'eifrig', 'darin', 'zu', 'a', 'forschen', ',',


loss:5.816508:   2%|▏         | 1701/111721 [10:04<17:12:00,  1.78it/s][A
loss:5.160712:   2%|▏         | 1701/111721 [10:04<17:12:00,  1.78it/s][A
loss:5.160712:   2%|▏         | 1702/111721 [10:04<14:39:31,  2.08it/s][A
loss:5.727708:   2%|▏         | 1702/111721 [10:04<14:39:31,  2.08it/s][A
loss:5.727708:   2%|▏         | 1703/111721 [10:04<12:50:42,  2.38it/s][A
loss:5.428538:   2%|▏         | 1703/111721 [10:04<12:50:42,  2.38it/s][A
loss:5.428538:   2%|▏         | 1704/111721 [10:04<11:13:26,  2.72it/s][A
loss:6.289327:   2%|▏         | 1704/111721 [10:05<11:13:26,  2.72it/s][A
loss:6.289327:   2%|▏         | 1705/111721 [10:05<12:21:45,  2.47it/s][A
loss:6.266634:   2%|▏         | 1705/111721 [10:05<12:21:45,  2.47it/s][A
loss:6.266634:   2%|▏         | 1706/111721 [10:05<11:47:24,  2.59it/s][A
loss:5.552575:   2%|▏         | 1706/111721 [10:05<11:47:24,  2.59it/s][A
loss:5.552575:   2%|▏         | 1707/111721 [10:06<10:39:45,  2.87it/s][A
loss:5.660098:   2%|▏   

KeyboardInterrupt: ignored

In [0]:
import torch.nn.functional as F
from nltk.translate.bleu_score import sentence_bleu
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import json
import torch.utils.data.dataset as Dataset
import torch.optim as optim

In [0]:
a = "where is shanghai?"
b = "where is beijing?"

In [0]:
sentence_bleu([a], b)