<h1>Overview<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#WMA-Tokenization" data-toc-modified-id="WMA-Tokenization-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>WMA Tokenization</a></span><ul class="toc-item"><li><span><a href="#WMA-en-de-train/val" data-toc-modified-id="WMA-en-de-train/val-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>WMA en-de train/val</a></span></li></ul></li></ul></div>

In [1]:
import os
import re
import sys
import numpy as np
import pandas as pd

from time import sleep
from keras import backend as K
from keras.models import Model 
from keras.models import Sequential as SequentialModel
from keras.layers import Dense, Conv1D, LSTM, Dropout, Embedding, Layer, Input, Flatten, concatenate as Concatenate, Lambda
from keras.callbacks import Callback
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer as KerasTokenizer

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

sys.path.insert(0, '../ct')

import load
from preprocess import preprocess
from preprocess import Tokenizer
from preprocess.preprocess import separator_samples

from model.layers import LayerNormalization
from model.layers import ContentBasedAttention_CT
from model.layers import ScaledDotProductAttention
from model.layers import MultiHeadAttention
from model import CompressiveTransformer

from load.wma import load as load_wma

Using TensorFlow backend.


In [4]:
def file_utf_to_ascii(input_path, output_path=None):
    """Rewrite a UTF-8 text file as pure ASCII.

    Every character outside the ASCII range is replaced by its XML numeric
    character reference (for example ``ä`` becomes ``&#228;``).

    Args:
        input_path: Path of the UTF-8 encoded file to read.
        output_path: Path of the ASCII file to write. When ``None``, the
            name is derived from ``input_path`` by inserting ``-ascii``
            before the file extension (``train-en.txt`` ->
            ``train-en-ascii.txt``); a file without an extension simply
            gets ``-ascii`` appended.
    """
    if output_path is None:
        # splitext only inspects the last path component, so dots in
        # directory names ('../data/...') and extension-less file names
        # no longer produce a mangled output path.
        root, extension = os.path.splitext(input_path)
        output_path = f'{root}-ascii{extension}'

    # Read everything before opening the output so that passing the same
    # path as input and output still works.
    with open(input_path, 'r', encoding='utf8') as file:
        content = file.read()

    # 'xmlcharrefreplace' turns each non-ASCII character into '&#NNN;'.
    with open(output_path, 'w', encoding='ascii',
              errors='xmlcharrefreplace') as file:
        file.write(content)
    print(f'converted utf->ascii for {input_path}')

In [25]:
# Disabled ASCII conversion of the input files via file_utf_to_ascii.
# NOTE(review): `input_paths` is only assigned in a later cell, and there it is
# a dict, so iterating it directly yields the keys ('en', 'de') rather than the
# file paths. Iterate over `input_paths.values()` if this is re-enabled.
# for p in input_paths:
#    file_utf_to_ascii(p)

converted utf->ascii for ..\data\wma-en-de\input\train-en.txt
converted utf->ascii for ..\data\wma-en-de\input\train-de.txt


# WMA Tokenization

In [8]:
# Tokenizer settings used by the cells below.
vocab_size = 1024
lowercase = False

# Input text files, keyed by language code.
input_paths = dict(
    en='../data/wma-en-de/input/train-en.txt',
    de='../data/wma-en-de/input/train-de.txt',
)

# The tokenizer file name records the vocabulary size and, when enabled,
# the lowercase flag.
_case_tag = '-lowercase' if lowercase else ''
tokenizer_output_path = (
    f'../data/wma-en-de/tokenizer/en-de-v0-t{vocab_size}{_case_tag}.tok'
)

In [9]:
# Instantiate the project Tokenizer (imported from `preprocess`) with the
# settings defined in the configuration cell.
tokenizer_kwargs = dict(
    input_paths=[*input_paths.values()],
    tokenizer_output_path=tokenizer_output_path,
    vocab_size=vocab_size,
    lowercase=lowercase,
)
tokenizer = Tokenizer(**tokenizer_kwargs)

In [10]:
# Smoke test: encode a single word and keep its encoding for inspection.
sample_batch = tokenizer.encode_batch(['hello'])
hello = sample_batch[0]
hello

Encoding(num_tokens=3, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [11]:
# Show the token ids and the matching token strings, one per line.
print(hello.ids, hello.tokens, sep='\n')

[401, 314, 78]
['he', 'll', 'o']


## WMA en-de train/val

In [12]:
from load.wma import load as load_wma

In [13]:
# Load the corpus from the 'en' and 'de' input files (in that order).
en_path, de_path = input_paths['en'], input_paths['de']
wma = load_wma(en_path, de_path)

In [14]:
# Inspect the loaded frame; the notebook renders it as a truncated table.
wma

Unnamed: 0,english,german
0,iron cement is a ready for use paste which is ...,iron cement ist eine gebrauchs ##AT##-##AT## f...
1,iron cement protects the ingot against the hot...,Nach der Aushärtung schützt iron cement die Ko...
2,"a fire restant repair cement for fire places ,...",feuerfester Reparaturkitt für Feuerungsanlagen...
3,Construction and repair of highways and ...\n,Der Bau und die Reparatur der Autostraßen ...\n
4,An announcement must be commercial character .\n,die Mitteilungen sollen den geschäftlichen kom...
...,...,...
4468835,Their achievement remains one of the greatest ...,Das bleibt eine der größten Errungenschaften i...
4468836,"At the same time , Zuma ’ s revolutionary gene...",Gleichzeitig scheint sich Zumas revolutionäre ...
4468837,"In a region that reveres the elderly , Zuma ’ ...","In einer Region , wo die älteren Menschen sehr..."
4468838,Three in ten South Africans are younger than 1...,Drei von zehn Südafrikanern sind jünger als 15...


In [15]:
# Encode every English sentence in one batch, then keep only the token ids
# as a new column next to the raw text.
english_encodings = tokenizer.encode_batch(wma['english'].tolist())
wma['english_ids'] = [enc.ids for enc in english_encodings]

In [16]:
# The ids are already stored in wma['english_ids']; drop the reference to the
# full encoding objects (presumably to release memory).
del english_encodings

In [17]:
# German encoding is disabled: with these lines commented out, `wma` never
# gets a 'german_ids' column and only the English ids are processed below.
# german_encodings = tokenizer.encode_batch(wma.german.tolist())
# wma['german_ids'] = [encoding.ids for encoding in german_encodings]
# del german_encodings

In [18]:
# Show only the token-id column that was added above.
wma[['english_ids']]

Unnamed: 0,english_ids
0,"[330, 265, 296, 339, 301, 326, 259, 353, 445, ..."
1,"[330, 265, 296, 339, 301, 406, 288, 364, 82, 2..."
2,"[64, 285, 651, 409, 397, 490, 353, 79, 64, 330..."
3,"[34, 265, 323, 722, 682, 312, 353, 79, 64, 330..."
4,"[32, 77, 291, 77, 749, 66, 339, 301, 926, 324,..."
...,...
4468835,"[474, 330, 259, 262, 274, 85, 339, 301, 353, 7..."
4468836,"[32, 83, 277, 268, 837, 972, 263, 465, 361, 64..."
4468837,"[621, 259, 750, 289, 398, 353, 434, 264, 277, ..."
4468838,"[622, 904, 283, 260, 256, 302, 697, 71, 320, 6..."


In [19]:
# Row position at 80% of the frame length, used as the train/validation boundary.
train_fraction = 0.8
val_index = int(len(wma) * train_fraction)

In [20]:
# Positional split of the token-id column: rows before `val_index` form the
# training set and the rows from `val_index` onward form the validation set.
# Slicing from `val_index` (not `-val_index`, which would select the last
# `val_index` rows) keeps the two sets disjoint.
english_ids = wma[['english_ids']]
x_train = english_ids.iloc[:val_index]
x_val = english_ids.iloc[val_index:]

# The two splits must partition the frame exactly.
assert len(x_train) + len(x_val) == len(wma)

In [21]:
# Persist both splits as pickles; with pandas' default compression='infer',
# the '.zip' suffix selects zip compression.
split_targets = (
    (x_train, '../data/wma-en-de/processed/train-en.pkl.zip'),
    (x_val, '../data/wma-en-de/processed/val-en.pkl.zip'),
)
for split_frame, split_path in split_targets:
    split_frame.to_pickle(split_path)