In [1]:
import sockeye
import mosestokenizer
import html

import mxnet as mx
import sentencepiece as spm

import ipywidgets as widgets

from truecaser import applytc
from time import time
from sockeye.translate import inference

### File path constants

In [2]:
# Sockeye model folder(s). Kept as a list because it is passed unchanged as
# `model_folders` to `inference.load_models` in `get_translator`.
SOCKEYE_MODEL_FOLDER_ENETLV = ['en-et-lv-model']
# Truecasing model, loaded with `applytc.loadModel`.
TRUECASE_MODEL_ENETLV = 'preprocessing-models/joint-truecase-enetlv.tc'
# SentencePiece model, loaded with `SentencePieceProcessor.Load`.
SENTENCEPIECE_MODEL_ENETLV = 'preprocessing-models/sp.model'

### Load models

In [3]:
# English Moses tokenizer; the log output of this cell shows it is run as a perl subprocess.
my_tokenizer = mosestokenizer.MosesTokenizer('en')

[INFO:MosesTokenizer] executing argv ['perl', '/usr/local/lib/python3.6/dist-packages/mosestokenizer/tokenizer-v1.1.perl', '-q', '-l', 'en', '-b', '-a']
[INFO:MosesTokenizer] spawned process 28076


In [4]:
# English Moses detokenizer; the log output of this cell shows it is run as a perl subprocess.
my_detokenizer = mosestokenizer.MosesDetokenizer('en')

[INFO:MosesDetokenizer] executing argv ['perl', '/usr/local/lib/python3.6/dist-packages/mosestokenizer/detokenizer.perl', '-q', '-b', '-l', 'en']
[INFO:MosesDetokenizer] spawned process 28098


In [5]:
# Truecasing model handed to `applytc.processLine` in `preprocess`.
my_truecaser_enetlv = applytc.loadModel(TRUECASE_MODEL_ENETLV)

In [6]:
# SentencePiece processor used for `EncodeAsPieces` (preprocess) and `DecodePieces` (postprocess).
my_segmenter_enetlv = spm.SentencePieceProcessor()
# Last expression of the cell, so the return value of `Load` is shown as the cell output.
my_segmenter_enetlv.Load(SENTENCEPIECE_MODEL_ENETLV)

True

In [7]:
def get_translator(model_folders, context=None):
    """Load Sockeye model(s) and wrap them in an `inference.Translator`.

    Args:
        model_folders: List of Sockeye model folders, passed unchanged to
            `inference.load_models`.
        context: MXNet context used both for loading the models and for
            decoding. When omitted, `mx.gpu()` is used; pass e.g. `mx.cpu()`
            to run on a machine without a GPU.

    Returns:
        An `inference.Translator` configured with beam size 5 and batch size 1.
    """
    ctx = mx.gpu() if context is None else context
    models, source_vocabs, target_vocab = inference.load_models(
        context=ctx,
        max_input_len=None,
        beam_size=5,
        batch_size=1,
        model_folders=model_folders,
        checkpoints=None,
        softmax_temperature=None,
        max_output_length_num_stds=2,
        decoder_return_logit_inputs=False,
        cache_output_layer_w_b=False)
    # The same context is reused so that models and decoder agree on the device.
    return inference.Translator(context=ctx,
                                ensemble_mode="linear",
                                bucket_source_width=10,
                                length_penalty=inference.LengthPenalty(1.0, 0.0),
                                beam_prune=0,
                                beam_search_stop='all',
                                models=models,
                                source_vocabs=source_vocabs,
                                target_vocab=target_vocab,
                                restrict_lexicon=None,
                                store_beam=False,
                                strip_unknown_words=False)

In [8]:
# Build the translator once; it is reused by the demo cell and by the widget callback.
my_translator_enetlv = get_translator(SOCKEYE_MODEL_FOLDER_ENETLV)

[INFO:sockeye.inference] Loading 1 model(s) from ['en-et-lv-model'] ...
[INFO:sockeye.vocab] Vocabulary (34089 words) loaded from "en-et-lv-model/vocab.src.0.json"
[INFO:sockeye.vocab] Vocabulary (7 words) loaded from "en-et-lv-model/vocab.src.1.json"
[INFO:sockeye.vocab] Vocabulary (8 words) loaded from "en-et-lv-model/vocab.src.2.json"
[INFO:sockeye.vocab] Vocabulary (34089 words) loaded from "en-et-lv-model/vocab.trg.0.json"
[INFO:sockeye.inference] Model version: 1.18.51
[INFO:sockeye.model] ModelConfig loaded from "en-et-lv-model/config"
[INFO:sockeye.model] Config[_frozen=True, config_data=Config[_frozen=True, data_statistics=Config[_frozen=True, average_len_target_per_bucket=[5.925335796339093, 13.455985766027595, 23.04356145777397, 32.409491679603356, 41.705934319477535, 51.04894822540166, 60.30271898744638, 69.60858214192915, 78.81654387987426, 87.80222051615205], buckets=[(10, 10), (20, 20), (30, 30), (40, 40), (50, 50), (60, 60), (70, 70), (80, 80), (90, 90), (100, 100)], le

### Preprocessing, translation, postprocessing

In [9]:
def preprocess(sentence, lang_factor, style_factor,
               tokenizer, truecaser, segmenter):
    """Convert a raw sentence into the factored string fed to the translator.

    Steps: tokenize, unescape HTML entities, truecase, split into
    SentencePiece pieces, then attach `|lang_factor|style_factor` to every
    piece.

    Args:
        sentence: Raw source sentence.
        lang_factor: Target-language factor appended to each piece.
        style_factor: Target-style factor appended to each piece.
        tokenizer: Callable returning the tokens of `sentence`.
        truecaser: Model handed to `applytc.processLine`.
        segmenter: Object providing `EncodeAsPieces(text)`.

    Returns:
        Space-separated `piece|lang_factor|style_factor` items.
    """
    tokens = tokenizer(sentence)
    unescaped = html.unescape(' '.join(tokens))
    truecased = applytc.processLine(truecaser, unescaped)
    # Join and re-split so pieces are separated on whitespace, exactly as before.
    pieces = ' '.join(segmenter.EncodeAsPieces(truecased)).split()
    return ' '.join('|'.join((piece, lang_factor, style_factor))
                    for piece in pieces)

In [10]:
def postprocess(sentence, segmenter, detokenizer):
    """Turn raw translator output back into plain, detokenized text.

    Args:
        sentence: Space-separated pieces produced by the translator.
        segmenter: Object providing `DecodePieces(list_of_pieces)`, which
            returns the merged string.
        detokenizer: Callable taking a list of tokens and returning a string.

    Returns:
        The detokenized sentence with its first character upper-cased, or an
        empty string when the translation is empty.
    """
    de_segmented_sentence = segmenter.DecodePieces(sentence.split())
    if not de_segmented_sentence:
        # Empty translation: there is nothing to capitalise, and indexing [0]
        # below would raise IndexError.
        return ''
    # Simple de-truecasing: only the first character of the sentence is upper-cased.
    de_truecased_sentence = de_segmented_sentence[0].upper() + de_segmented_sentence[1:]
    de_tokenized_sentence = detokenizer(de_truecased_sentence.split())

    return de_tokenized_sentence

In [11]:
def forward(sentence, t):
    """Translate one factored sentence with translator `t`.

    Args:
        sentence: Factored input string (as produced by `preprocess`).
        t: Translator used both to build the input and to decode it.

    Returns:
        The `translation` attribute of the first (and only) output.
    """
    translator_input = inference.make_input_from_factored_string(
        sentence_id=1,
        factored_string=sentence,
        translator=t,
    )
    results = t.translate([translator_input])
    best = results[0]
    return best.translation

In [12]:
def translate(sentence, lang_factor, style_factor,
              tokenizer, detokenizer, truecaser, segmenter,
              translator):
    """Run the full pipeline: preprocess, translate, postprocess.

    Args:
        sentence: Raw source sentence.
        lang_factor: Target-language factor (e.g. 'to-et').
        style_factor: Target-style factor (e.g. 'to-eparl').
        tokenizer, truecaser, segmenter: Passed to `preprocess`.
        detokenizer: Passed, together with `segmenter`, to `postprocess`.
        translator: Passed to `forward`.

    Returns:
        The postprocessed translation.
    """
    factored_source = preprocess(sentence, lang_factor, style_factor,
                                 tokenizer, truecaser, segmenter)
    raw_translation = forward(factored_source, translator)
    return postprocess(raw_translation, segmenter, detokenizer)

In [13]:
def send(sentence, lang_str, style_str,
         tokenizer, detokenizer, truecaser, segmenter,
         translator):
    """Translate `sentence`, selecting language and style by their UI labels.

    Args:
        sentence: Raw source sentence.
        lang_str: Target language label: 'EN', 'ET', 'LV', 'DE' or 'FR'.
        style_str: Target style label: 'Informal', 'Official', 'Legal' or
            'Medical'.
        tokenizer, detokenizer, truecaser, segmenter, translator: Passed
            unchanged to `translate`.

    Returns:
        The translated sentence returned by `translate`.

    Raises:
        KeyError: If `lang_str` or `style_str` is not one of the labels above.
    """
    # UI label -> factor token attached to every source piece.
    target_lang_factors = {
        'EN': 'to-en',
        'ET': 'to-et',
        'LV': 'to-lv',
        'DE': 'to-de',
        'FR': 'to-fr',
    }
    target_style_factors = {
        'Informal': 'to-osubs',
        'Official': 'to-eparl',
        'Legal': 'to-jrcac',
        'Medical': 'to-emea',
    }
    lang_factor = target_lang_factors[lang_str]
    style_factor = target_style_factors[style_str]
    return translate(sentence, lang_factor, style_factor,
                     tokenizer, detokenizer, truecaser, segmenter,
                     translator)

# Translate

## EN-ET-LV translation

In [14]:
# Target language: one of 'EN', 'ET', 'LV'
TRG_LANG = 'EN'

# Target style: one of 'Informal', 'Official', 'Legal', 'Medical'
TRG_STYLE = 'Official'

# Source sentence to translate
src_sent = 'I could not come to class today because I had a very important meeting, and also because my cat chewed my report.'

# Print the translation, then the elapsed wall-clock time in seconds.
start = time()
print(send(
    src_sent,
    lang_str=TRG_LANG,
    style_str=TRG_STYLE,
    tokenizer=my_tokenizer,
    detokenizer=my_detokenizer,
    truecaser=my_truecaser_enetlv,
    segmenter=my_segmenter_enetlv,
    translator=my_translator_enetlv,
))
print(time() - start)

I could not come to class today because I had a very important meeting, and also because my cat chewed my report.
0.7731778621673584


### Widgets

In [15]:
# Input box for the source text, pre-filled with 'Hello!'.
# Its value is read by `translate_on_click`.
textw1 = widgets.Textarea(
    value='Hello!',
    placeholder='Source sentence',
    description='',
    disabled=False,
    layout=widgets.Layout(width='300px', height='90px')
)

In [16]:
# Output box for the translation. `translate_on_click` sets the value to ''
# before translating, so the 'Translating...' placeholder is what the user
# sees until the result is written.
# NOTE(review): `disabled=False` leaves the output box editable — confirm this is intended.
textw2 = widgets.Textarea(
    value='',
    placeholder='Translating...',
    description='',
    disabled=False,
    layout=widgets.Layout(width='300px', height='90px')
)

In [17]:
# Target-language selector. Each option must be a key of `lang_dict` in `send`,
# because the selected value is passed there as `lang_str`.
langwidget = widgets.ToggleButtons(
    options=['EN', 'ET', 'LV'],
    description='Target lang:',
    disabled=False,
    button_style='',
    tooltips=['Translate into English', 'Translate into Estonian', 'Translate into Latvian'],
    layout=widgets.Layout(width='600px', height='40px')   
)

# Button width is set on the style object after construction.
langwidget.style.button_width='138px'

In [18]:
# Target-style selector. Each option must be a key of `style_dict` in `send`,
# because the selected value is passed there as `style_str`.
stylewidget = widgets.ToggleButtons(
    options=['Informal', 'Official', 'Legal', 'Medical'],
    description='Target style:',
    disabled=False,
    button_style='',
    tooltips=['Use colloquial style', 'Use official speech style', 'Use legal documents style', 'Use medical documents style'],
    layout=widgets.Layout(width='600px', height='70px')
)

# Button width is set on the style object after construction.
stylewidget.style.button_width='209px'

In [19]:
# Button that triggers the translation; `translate_on_click` is attached to it with `on_click`.
transl_button = widgets.Button(
    description='Translate',
    disabled=False,
    button_style='success',
    tooltip='Translate sentence',
    icon='',
    layout=widgets.Layout(width='150px', height='35px')
)

In [20]:
def translate_on_click(b):
    """Button callback: translate the text in `textw1` and show it in `textw2`.

    The input is translated line by line, because the tokenizer runs as a
    line-based subprocess and cannot be given text that contains newlines.
    Blank lines are copied through untranslated, so an empty input yields an
    empty output instead of an error.

    Args:
        b: The clicked button (passed by ipywidgets, not used).

    Raises:
        Exception: Whatever `send` raises; the message is shown in `textw2`
            first so the box does not stay on its 'Translating...' placeholder.
    """
    # An empty value makes the 'Translating...' placeholder visible meanwhile.
    textw2.value = ''
    translated_lines = []
    try:
        for line in textw1.value.splitlines():
            if not line.strip():
                translated_lines.append('')
                continue
            translated_lines.append(
                send(line, lang_str=langwidget.value, style_str=stylewidget.value,
                     tokenizer=my_tokenizer, detokenizer=my_detokenizer,
                     truecaser=my_truecaser_enetlv, segmenter=my_segmenter_enetlv,
                     translator=my_translator_enetlv))
    except Exception as error:
        textw2.value = 'Translation failed: {}'.format(error)
        raise
    textw2.value = '\n'.join(translated_lines)
        
# Attach the callback to the button.
# NOTE(review): re-running this cell presumably adds another handler, because
# the `def` above creates a new function object each time — confirm.
transl_button.on_click(translate_on_click)

In [21]:
# Source and target text areas side by side.
textbox = widgets.HBox([textw1, textw2])
# The button sits in its own row so it can be positioned with padding.
buttonbox = widgets.HBox([transl_button])
# CSS-style padding string; the last value (220px) is the left padding.
buttonbox.layout.padding = '0px 0px 0px 220px'

# EN-ET-LV translation (interactive widget)

In [22]:
# `textbox` and `buttonbox` are built in the previous cell; this cell only
# renders the interface (text areas, language and style selectors, button).
display(textbox, langwidget, stylewidget, buttonbox)

HBox(children=(Textarea(value='Hello!', layout=Layout(height='90px', width='300px'), placeholder='Source sente…

ToggleButtons(description='Target lang:', layout=Layout(height='40px', width='600px'), options=('EN', 'ET', 'L…

ToggleButtons(description='Target style:', layout=Layout(height='70px', width='600px'), options=('Informal', '…

HBox(children=(Button(button_style='success', description='Translate', layout=Layout(height='35px', width='150…