# Model

In [1]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
# from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
# from torchtext.vocab import build_vocab_from_iterator
# import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import pandas as pd
import random


# Set to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True

In [2]:
# Some convenience helper functions used throughout the notebook


def is_interactive_notebook():
    return __name__ == "__main__"


def show_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)


def execute_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    def __init__(self):
        self.param_groups = [{"lr": 0}]
        None

    def step(self):
        None

    def zero_grad(self, set_to_none=False):
        None


class DummyScheduler:
    def step(self):
        None

In [3]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many
    other models.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [4]:
class Generator(nn.Module):
    "Define standard linear + softmax generation step."

    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return log_softmax(self.proj(x), dim=-1)

In [5]:
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [6]:
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"

    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [7]:
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [8]:
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

In [9]:
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"

    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

In [10]:
class Decoder(nn.Module):
    "Generic N layer decoder with masking."

    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [11]:
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [12]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(
        torch.uint8
    )
    return subsequent_mask == 0

print(subsequent_mask(5))

tensor([[[ True, False, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True]]])


In [13]:
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

# query = torch.empty(5, 5).fill_(5)
# key = torch.empty(5, 5).fill_(5)
# value = torch.empty(5, 5).fill_(5)
# mask = subsequent_mask(5)
# mask = torch.ones(1, 5)
# print(attention(query, key, value, mask=mask, dropout=None)[1])

In [14]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
            
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [
            lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ]

        # print(query.shape)

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # 3) "Concat" using a view and apply a final linear.
        x = (
            x.transpose(1, 2)
            .contiguous()
            .view(nbatches, -1, self.h * self.d_k)
        )
        del query
        del key
        del value
        return self.linears[-1](x)

# x = torch.Tensor(16, 10, 512)
# mask = subsequent_mask(10)

# model = MultiHeadedAttention(8, 512)
# print(model(x, x, x, mask).shape)


In [15]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(self.w_1(x).relu()))

In [16]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [17]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        # print(position)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

# PositionalEncoding(512, 0.1)

In [18]:
def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model



# LOSS & OPTIMIZATION

In [19]:
def rate(step, model_size, factor, warmup):
    """
    we have to default the step to 1 for LambdaLR function
    to avoid zero raising to negative power.
    """
    if step == 0:
        step = 1
    return factor * (
        model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
    )

In [20]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."

    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist

        # print(self.true_dist)
        # print(x)
        # print("-----------------------------------------")

        return self.criterion(x, true_dist.clone().detach())

In [21]:
class SimpleLossCompute:
    "A simple loss compute and train function."

    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion

    def __call__(self, x, y, norm):
        x = self.generator(x)
        sloss = (
            self.criterion(
                x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)
            )
            / norm
        )
        return sloss.data * norm, sloss

# DATA

In [22]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch.nn as nn
# Function to build a word-to-index mapping (vocabulary)
def built_curpus(train_texts):
    word_2_index = {"<PAD>": 0, "<UNK>": 1, "<SOS>": 2, "<EOS>": 3}
    for text in train_texts:
        for word in text.split():
            word_2_index[word] = word_2_index.get(word, len(word_2_index))
    return word_2_index

# TranslationDataset class
class TranslationDataset(Dataset):
    def __init__(self, data, source_language, target_language, source_word_2_index, target_word_2_index, max_length=1000):
        """
        Initializes the TranslationDataset.

        Parameters:
        - data (pd.DataFrame): The DataFrame containing the translation data.
        - source_language (str): Column name for the source language.
        - target_language (str): Column name for the target language.
        - source_word_2_index (dict): Word-to-index mapping for the source language.
        - target_word_2_index (dict): Word-to-index mapping for the target language.
        - max_length (int): Maximum sequence length for padding/truncation.
        """
        self.source_texts = data[source_language].tolist()
        self.target_texts = data[target_language].tolist()
        self.source_word_2_index = source_word_2_index
        self.target_word_2_index = target_word_2_index
        self.max_length = max_length

    def tokenize_and_pad_src(self, text, word_2_index):
        """
        Tokenizes and pads a single text.

        Parameters:
        - text (str): The input text (sentence).
        - word_2_index (dict): Word-to-index mapping.

        Returns:
        - torch.Tensor: Tokenized and padded tensor.
        """
        # Tokenize words to indices
        tokenized = [word_2_index.get(word, word_2_index["<UNK>"]) for word in text.split()]
        # Add <SOS> and <EOS> tokens
        tokenized = tokenized + [word_2_index["<EOS>"]]
        # Pad or truncate to max_length
        if len(tokenized) < self.max_length:
            tokenized += [word_2_index["<PAD>"]] * (self.max_length - len(tokenized))
        else:
            tokenized = tokenized[:self.max_length]
        return torch.tensor(tokenized, dtype=torch.long)
    
    def tokenize_and_pad_tgt(self, text, word_2_index):
        """
        Tokenizes and pads a single text.

        Parameters:
        - text (str): The input text (sentence).
        - word_2_index (dict): Word-to-index mapping.

        Returns:
        - torch.Tensor: Tokenized and padded tensor.
        """
        # Tokenize words to indices
        tokenized = [word_2_index.get(word, word_2_index["<UNK>"]) for word in text.split()]
        # Add <SOS> and <EOS> tokens
        tokenized = [word_2_index["<SOS>"]] + tokenized 
        # Pad or truncate to max_length
        if len(tokenized) < self.max_length:
            tokenized += [word_2_index["<PAD>"]] * (self.max_length - len(tokenized))
        else:
            tokenized = tokenized[:self.max_length]
        return torch.tensor(tokenized, dtype=torch.long)
    
    def tokenize_and_pad_tgt_label(self, text, word_2_index):
        """
        Tokenizes and pads a single text.

        Parameters:
        - text (str): The input text (sentence).
        - word_2_index (dict): Word-to-index mapping.

        Returns:
        - torch.Tensor: Tokenized and padded tensor.
        """
        # Tokenize words to indices
        tokenized = [word_2_index.get(word, word_2_index["<UNK>"]) for word in text.split()]
        # Add <SOS> and <EOS> tokens
        tokenized = tokenized + [word_2_index["<EOS>"]]
        # Pad or truncate to max_length
        if len(tokenized) < self.max_length:
            tokenized += [word_2_index["<PAD>"]] * (self.max_length - len(tokenized))
        else:
            tokenized = tokenized[:self.max_length]
        return torch.tensor(tokenized, dtype=torch.long)

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        source_text = self.source_texts[idx]
        target_text = self.target_texts[idx]

        # Tokenize and pad the source and target texts
        source_tokens = self.tokenize_and_pad_src(source_text, self.source_word_2_index)
        target_tokens = self.tokenize_and_pad_tgt(target_text, self.target_word_2_index)
        target_tokens_label = self.tokenize_and_pad_tgt_label(target_text, self.target_word_2_index)

        return source_tokens, target_tokens, target_tokens_label

In [23]:
file_path_train = './nusax-main/datasets/mt/train.csv'
data_trian = pd.read_csv(file_path_train, usecols=lambda col: col != 'Unnamed: 0')

file_path_test = './nusax-main/datasets/mt/test.csv'
data_test = pd.read_csv(file_path_test, usecols=lambda col: col != 'Unnamed: 0')

file_path_valid = './nusax-main/datasets/mt/valid.csv'
data_valid = pd.read_csv(file_path_test, usecols=lambda col: col != 'Unnamed: 0')

data = pd.concat([data_trian, data_test], ignore_index=True)

source_language = 'indonesian'
target_language = 'english'

    # Build vocabularies
source_word_2_index = built_curpus(data[source_language])
target_word_2_index = built_curpus(data[target_language])
print(len(source_word_2_index)) # Check whether the index list is correctly formed 
print(len(target_word_2_index)) # Check whether the index list is correctly formed 

5667
5814


# Train

### Only Class Train need to modifiy

In [24]:
class Train:
    src_vocab: int = len(source_word_2_index)
    target_vocab: int = len(target_word_2_index)
    N: int = 2
    Fixed_len: int = 50
    Epoch: int = 30
    Batch_size: int = 16

In [25]:
dataset = TranslationDataset(data, source_language, target_language, source_word_2_index, target_word_2_index, Train.Fixed_len)
dataloader = DataLoader(dataset, batch_size=Train.Batch_size, shuffle=True)

In [26]:
class TrainState:
    """Track number of steps, examples, and tokens processed"""

    step: int = 0  # Steps in the current epoch
    accum_step: int = 0  # Number of gradient accumulation steps
    samples: int = 0  # total # of examples used
    tokens: int = 0  # total # of tokens processed
    epoch: int = 0

In [27]:
def run_epoch(
    data_iter,
    model,
    loss_compute,
    optimizer,
    scheduler,
    mode="train",
    accum_iter=1,
    train_state=TrainState(),
):
    """Train a single epoch"""
    start = time.time()
    total_tokens = 0  # 统计总的 token 数量
    total_loss = 0  # 统计总的 loss
    n_accum = 0  # 累积更新计数

    # 初始化模型的 masks
    src_mask = torch.ones(1, 1, Train.Fixed_len)
    tgt_mask = subsequent_mask(Train.Fixed_len)

    # 遍历数据集的每个 batch
    for i, (src, tgt, tgt_label) in enumerate(data_iter):

        ntokens = (tgt_label != 0).sum().item()

        # 前向传播，计算输出
        out = model.forward(
            src, tgt, src_mask, tgt_mask
        )
        # 计算当前 batch 的损失
        loss, loss_node = loss_compute(out, tgt_label, ntokens)

        if mode == "train" or mode == "train+log":
            # 反向传播
            loss_node.backward()
            train_state.step += 1
            train_state.samples += src.shape[0]
            train_state.tokens += ntokens
            # 累积更新
            if i % accum_iter == 0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                n_accum += 1
                train_state.accum_step += 1
            # 学习率调度器更新
            scheduler.step()

        # 累积总损失和总 token 数
        total_loss += loss
        total_tokens += ntokens

        # 释放不需要的变量
        del loss
        del loss_node

    # 计算并返回每个 epoch 的平均 loss
    avg_loss = total_loss / total_tokens
    print(f"Epoch {TrainState.epoch} completed. Average Loss: {avg_loss}")
    TrainState.epoch += 1
    return avg_loss, train_state

In [28]:
def tokenize_and_pad_src(text, word_2_index, max_length):
        """
        Tokenizes and pads a single text.

        Parameters:
        - text (str): The input text (sentence).
        - word_2_index (dict): Word-to-index mapping.

        Returns:
        - torch.Tensor: Tokenized and padded tensor.
        """
        # Tokenize words to indices
        tokenized = [word_2_index.get(word, word_2_index["<UNK>"]) for word in text.split()]
        # Add <SOS> and <EOS> tokens
        tokenized = tokenized + [word_2_index["<EOS>"]]
        # Pad or truncate to max_length
        if len(tokenized) < max_length:
            tokenized += [word_2_index["<PAD>"]] * (max_length - len(tokenized))
        else:
            tokenized = tokenized[:max_length]
        return torch.tensor(tokenized, dtype=torch.long)


def tokenize_and_pad_tgt_label(text, word_2_index, max_length):
        """
        Tokenizes and pads a single text.

        Parameters:
        - text (str): The input text (sentence).
        - word_2_index (dict): Word-to-index mapping.

        Returns:
        - torch.Tensor: Tokenized and padded tensor.
        """
        # Tokenize words to indices
        tokenized = [word_2_index.get(word, word_2_index["<UNK>"]) for word in text.split()]
        # Add <SOS> and <EOS> tokens
        tokenized = tokenized + [word_2_index["<EOS>"]]
        # Pad or truncate to max_length
        if len(tokenized) < max_length:
            tokenized += [word_2_index["<PAD>"]] * (max_length - len(tokenized))
        else:
            tokenized = tokenized[: max_length]
        return torch.tensor(tokenized, dtype=torch.long)


def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.zeros(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len - 1):
        out = model.decode(
            memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src.data)
        )
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat(
            [ys, torch.zeros(1, 1).type_as(src.data).fill_(next_word)], dim=1
        )
    return ys


def translate(model, src, start_symbol):
     
     model.eval()
     
     src = tokenize_and_pad_src(src, source_word_2_index, Train.Fixed_len).unsqueeze(0)
     src_mask = torch.ones(1, 1, Train.Fixed_len)
     tgt = greedy_decode(model, src, src_mask, max_len=Train.Fixed_len, start_symbol=start_symbol).squeeze()
     temp_dict = {v: k for k, v in target_word_2_index.items()}
     words = [temp_dict[idx.item()] for idx in tgt]
     sentence = ' '.join(words)

     return sentence
     

In [29]:
def train():
    criterion = LabelSmoothing(size=Train.target_vocab, padding_idx=0, smoothing=0.1)
    model = make_model(Train.src_vocab, Train.target_vocab, N=Train.N)

    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, model_size=model.src_embed[0].d_model, factor=1.0, warmup=400
        ),
    )

    for epoch in range(Train.Epoch):
        model.train()
        run_epoch(
            dataloader,
            model,
            SimpleLossCompute(model.generator, criterion),
            optimizer,
            lr_scheduler,
            mode="train",
        )

        randint = random.randint(0, 100)
        src = data[source_language][randint]
        print(src)
        print(translate(model, src, 2))
        print()

        if epoch == Train.Epoch - 1:
            torch.save(model.state_dict(), './transformer.pth')


train()

Epoch 0 completed. Average Loss: 6.854353427886963
Barangnya lumayan, cuma yang saya heran xiaomi redmi note 2 ini tombol onnya memang agak rusak? Terus baterai memang cepat low bat juragan?
<SOS> the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the

Epoch 1 completed. Average Loss: 5.976646900177002
Untuk memastikannya ibu bisa datang ke kantor cab bank mandiri tersebut pada hari dan jam kerja untuk penukaran uang.
<SOS> The is the the the the the the the the the the the the the the the the the the the the the the the <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>

Epoch 2 completed. Average Loss: 5.729384422302246
Saya coba yang di dekat kantor walikota. Saya pesan bakso malang super komplet. Rasanya enak. Kenyang. Harga juga masuk akal.
<SOS> The 

KeyboardInterrupt: 

In [30]:
def eval():

    model = make_model(Train.src_vocab, Train.target_vocab, N=Train.N)
    model.load_state_dict(torch.load('./transformer.pth'))
    for _ in range(5):
        randint = random.randint(0, 99)
        src = data_valid[source_language][randint]
        print(src)
        print(translate(model, src, 2))
        print()

eval()

Restoran ini adalah sebuah destinasi kuliner yang wajib dikunjungi bila ke bandung. Selain makanan yang sangat memanjakan lidah makanan sunda, restoran ini juga dibangun dengan konsep makan santai di dalam saung yang dikelilingi alam hutan dan pegunungan yang dihias pepohonan rindang dan dihibur oleh gemericik air sungai sehingga menawarkan pengalaman kuliner yang teramat mengesankan. Harus dicoba bagi siapapun yang ke bandung.
<SOS> This restaurant is the prime culinary destination if you're going to Bandung. Aside from the exceptionally delicious Sundanese menu, this restaurant is also built on the basis of a relaxing meal time in a saung surrounded by the woods and mountains accompanied by shady trees and splashing sounds of

Tidak mengerti ya kenapa resto ini menduduki peringkat 12 dari 1800an resto yang ada di jakarta. Harga mahal! Lokasi tersembunyi, dan rasa makanan biasa saja. Jangan terlalu percaya sama review yang menyesatkanlah.
<SOS> I don't get why this restaurant is ranke