In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

In [2]:

from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import text_problems
from tensor2tensor.data_generators import translate
from tensor2tensor.data_generators import wiki_lm
from tensor2tensor.utils import registry

[K     |████████████████████████████████| 1.4 MB 7.1 MB/s 
[K     |████████████████████████████████| 649 kB 36.3 MB/s 
[K     |████████████████████████████████| 352 kB 54.8 MB/s 
[K     |████████████████████████████████| 79 kB 8.5 MB/s 
[K     |████████████████████████████████| 679 kB 52.9 MB/s 
[K     |████████████████████████████████| 366 kB 54.1 MB/s 
[K     |████████████████████████████████| 981 kB 56.8 MB/s 
[K     |████████████████████████████████| 5.6 MB 53.0 MB/s 
[K     |████████████████████████████████| 191 kB 48.9 MB/s 
[K     |████████████████████████████████| 365 kB 39.9 MB/s 
[K     |████████████████████████████████| 251 kB 45.1 MB/s 
[K     |████████████████████████████████| 191 kB 54.7 MB/s 
[K     |████████████████████████████████| 178 kB 54.3 MB/s 
[?25h  Building wheel for bz2file (setup.py) ... [?25l[?25hdone
  Building wheel for pypng (setup.py) ... [?25l[?25hdone


In [3]:
# Module-level alias for text_encoder.EOS_ID (presumably the end-of-sequence
# token id — confirm against text_encoder).
EOS = text_encoder.EOS_ID

_ENVI_TRAIN_SMALL_DATA = [
    [
        "https://github.com/WilliamDunbar/Machine_Translation/blob/master/Data.rar",
        ("dev.en",
         "dev.vi")
    ],
]
_ENVI_TEST_SMALL_DATA = [
    [
        "https://github.com/WilliamDunbar/Machine_Translation/blob/master/Data.rar",
        ("test.en",
         "test.vi")
    ],
]
_ENVI_TRAIN_LARGE_DATA = [
    [
        "https://github.com/WilliamDunbar/Machine_Translation/blob/master/Data.rar",
        ("train.en", "train.vi")
    ],
]
_ENVI_TEST_LARGE_DATA = [
    [
        "https://github.com/WilliamDunbar/Machine_Translation/blob/master/Data.rar",
        ("test.en", "test.vi")
    ],
]


In [4]:
@registry.register_problem
class TranslateEnViWmtSmall8k(translate.TranslateProblem):
  """En-Vi translation problem: small dataset, 2**13 approximate vocab size.

  Subclasses switch to the large dataset by overriding `use_small_dataset` and
  change the vocabulary size by overriding `approx_vocab_size`.
  """

  @property
  def approx_vocab_size(self):
    """Approximate vocabulary size requested for this problem."""
    return 2**13  # 8192

  @property
  def use_small_dataset(self):
    """True selects the *_SMALL_DATA lists, False the *_LARGE_DATA lists."""
    return True

  def source_data_files(self, dataset_split):
    """Returns the dataset list to use for `dataset_split`.

    Args:
      dataset_split: a problem.DatasetSplit value.

    Returns:
      The training list when the split is TRAIN, otherwise the test list;
      taken from the small or the large set according to `use_small_dataset`.
    """
    train = dataset_split == problem.DatasetSplit.TRAIN
    if self.use_small_dataset:
      return _ENVI_TRAIN_SMALL_DATA if train else _ENVI_TEST_SMALL_DATA
    # The test list was previously spelled `_ENVi_TEST_LARGE_DATA`, an
    # undefined name that raised NameError for large-dataset, non-train splits.
    return _ENVI_TRAIN_LARGE_DATA if train else _ENVI_TEST_LARGE_DATA

  def vocab_data_files(self):
    """Returns the training dataset list matching `use_small_dataset`."""
    return (_ENVI_TRAIN_SMALL_DATA if self.use_small_dataset
            else _ENVI_TRAIN_LARGE_DATA)


@registry.register_problem
class TranslateEnViWmtSmall32k(TranslateEnViWmtSmall8k):
  """Variant of TranslateEnViWmtSmall8k with a 2**15 approximate vocab size."""

  @property
  def approx_vocab_size(self):
    """Approximate vocabulary size requested for this problem."""
    return 2**15  # 32768


@registry.register_problem
class TranslateEnViWmt8k(TranslateEnViWmtSmall8k):
  """Variant of TranslateEnViWmtSmall8k that selects the large dataset."""

  @property
  def use_small_dataset(self):
    """False, so the inherited file selectors use the *_LARGE_DATA lists."""
    return False


@registry.register_problem
class TranslateEnViWmt32k(TranslateEnViWmtSmall32k):
  """Variant of TranslateEnViWmtSmall32k that selects the large dataset."""

  @property
  def use_small_dataset(self):
    """False, so the inherited file selectors use the *_LARGE_DATA lists."""
    return False


@registry.register_problem
class TranslateEnViWmt32kPacked(TranslateEnViWmt32k):
  """Variant of TranslateEnViWmt32k with a packed length of 256.

  The vocabulary is taken from TranslateEnViWmt32k rather than built anew.
  """

  @property
  def packed_length(self):
    # Presumably the sequence length examples are packed into — TODO confirm
    # against the base Problem class.
    return 256

  @property
  def use_vocab_from_other_problem(self):
    """Returns a TranslateEnViWmt32k instance as the vocabulary source."""
    return TranslateEnViWmt32k()


@registry.register_problem
class TranslateEnViWmt32kWithBacktranslateVi(TranslateEnViWmt32k):
  """En-Vi translation whose training set adds back-translated data.

  Training samples are the synthetic (back-translated) pairs followed by the
  authentic pairs; evaluation samples are authentic pairs only.
  """

  @property
  def use_vocab_from_other_problem(self):
    """Returns a TranslateEnViWmt32k instance as the vocabulary source."""
    vocab_problem = TranslateEnViWmt32k()
    return vocab_problem

  @property
  def already_shuffled(self):
    """Always True for this problem."""
    return True

  @property
  def skip_random_fraction_when_training(self):
    """Always False for this problem."""
    return False

  @property
  def backtranslate_data_filenames(self):
    """(source, target) file-name pairs holding the back-translated data.

    The files are looked up in tmp_dir and should each be of similar size to
    the authentic data.
    """
    synthetic_pair = ("en_mono_en.txt", "en_mono_vi.txt")
    return [synthetic_pair]

  @property
  def dataset_splits(self):
    """Splits to produce, each with the number of output shards."""
    # A single training shard keeps synthetic and authentic data from mixing.
    train_split = {"split": problem.DatasetSplit.TRAIN, "shards": 1}
    eval_split = {"split": problem.DatasetSplit.EVAL, "shards": 1}
    return [train_split, eval_split]

  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Yields translation examples for `dataset_split`.

    Args:
      data_dir: unused here.
      tmp_dir: directory holding the compiled data and back-translated files.
      dataset_split: a problem.DatasetSplit value.

    Yields:
      Examples produced by text_problems.text2text_txt_iterator.
    """
    is_training = dataset_split == problem.DatasetSplit.TRAIN
    compiled_prefix = translate.compile_data(
        tmp_dir,
        self.source_data_files(dataset_split),
        "%s-compiled-%s" % (self.name, "train" if is_training else "dev"))
    authentic_source = compiled_prefix + ".lang1"
    authentic_target = compiled_prefix + ".lang2"

    if not is_training:
      # Evaluation uses authentic data only.
      for sample in text_problems.text2text_txt_iterator(
          authentic_source, authentic_target):
        yield sample
      return

    # Training: for every back-translated pair, emit the synthetic examples
    # first and the authentic examples afterwards.
    for synthetic_source, synthetic_target in self.backtranslate_data_filenames:
      synthetic_samples = text_problems.text2text_txt_iterator(
          os.path.join(tmp_dir, synthetic_source),
          os.path.join(tmp_dir, synthetic_target))
      for sample in synthetic_samples:
        yield sample
      for sample in text_problems.text2text_txt_iterator(
          authentic_source, authentic_target):
        yield sample


@registry.register_problem
class TranslateEnViWmt32kWithBacktranslateEn(
    TranslateEnViWmt32kWithBacktranslateVi):
  """Back-translation variant that reads three numbered synthetic file pairs."""

  @property
  def backtranslate_data_filenames(self):
    """(source, target) file-name pairs holding the back-translated data.

    The files are looked up in tmp_dir and should each be of similar size to
    the authentic data.
    """
    numbered_pairs = []
    for shard in range(3):
      source_name = "en_mono_en.txt%d" % shard
      target_name = "en_mono_vi.txt%d" % shard
      numbered_pairs.append((source_name, target_name))
    return numbered_pairs


@registry.register_problem
class TranslateEnViWmtSmallCharacters(translate.TranslateProblem):
  """Character-level En-Vi translation problem on the small dataset."""

  @property
  def vocab_type(self):
    """Uses the character vocabulary type."""
    return text_problems.VocabType.CHARACTER

  @property
  def use_small_dataset(self):
    """True selects the *_SMALL_DATA lists, False the *_LARGE_DATA lists."""
    return True

  def source_data_files(self, dataset_split):
    """Returns the dataset list to use for `dataset_split`.

    The training list is returned for the TRAIN split and the test list for
    any other split, from the small or large set per `use_small_dataset`.
    """
    is_training = dataset_split == problem.DatasetSplit.TRAIN
    if self.use_small_dataset:
      return _ENVI_TRAIN_SMALL_DATA if is_training else _ENVI_TEST_SMALL_DATA
    return _ENVI_TRAIN_LARGE_DATA if is_training else _ENVI_TEST_LARGE_DATA


@registry.register_problem
class TranslateEnViWmtCharacters(TranslateEnViWmtSmallCharacters):
  """Variant of TranslateEnViWmtSmallCharacters on the large dataset."""

  @property
  def use_small_dataset(self):
    """False, so the inherited source_data_files uses the *_LARGE_DATA lists."""
    return False


@registry.register_problem
class TranslateEnViWmtMulti64k(TranslateEnViWmtSmall32k):
  """Translation with multi-lingual vocabulary."""

  @property
  def use_small_dataset(self):
    """False, so the inherited file selectors use the *_LARGE_DATA lists."""
    return False

  @property
  def use_vocab_from_other_problem(self):
    """Returns the wiki_lm problem instance used as the vocabulary source."""
    # NOTE(review): verify that wiki_lm actually defines
    # LanguagemodelDeEnViRoWiki64k; upstream tensor2tensor presumably only
    # ships LanguagemodelDeEnFrRoWiki64k, in which case this raises
    # AttributeError — TODO confirm.
    return wiki_lm.LanguagemodelDeEnViRoWiki64k()


@registry.register_problem
class TranslateEnViWmtMulti64kPacked1k(TranslateEnViWmtMulti64k):
  """Translation with multi-lingual vocabulary, packed length 1024."""

  @property
  def packed_length(self):
    # Presumably the sequence length examples are packed into — TODO confirm
    # against the base Problem class.
    return 1024

  @property
  def num_training_examples(self):
    # NOTE(review): hard-coded count; confirm it matches the En-Vi training
    # corpus actually used here.
    return 1760600

  @property
  def inputs_prefix(self):
    # The trailing space is part of the prefix string.
    return "translate English VietNam "

  @property
  def targets_prefix(self):
    # The trailing space is part of the prefix string.
    return "translate VietNam English "

In [6]:
# Debugging aid: installs kora and starts its console (presumably a web
# terminal for the runtime; follow the link it prints).
# `%pip` installs into the environment of the running kernel, and the version
# is pinned to the release this notebook was last executed with so that a
# re-run resolves the same package.
%pip install kora==0.9.19
from kora import console
console.start()  # and click link

Collecting kora
  Downloading kora-0.9.19-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 3.2 MB/s 
[?25hCollecting fastcore
  Downloading fastcore-1.3.20-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.1 MB/s 
Installing collected packages: fastcore, kora
Successfully installed fastcore-1.3.20 kora-0.9.19



In [7]:
# Starts a bash shell in the runtime's working directory; it is interactive and
# blocks the cell until `exit` is entered.
!bash

bash: cannot set terminal process group (61): Inappropriate ioctl for device
bash: no job control in this shell
[1;36m/content[m# 
[1;36m/content[m# 
[1;36m/content[m# exit
