In [1]:
import tensorflow as tf
import unicodedata
import re
import io

def getData(path):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    # n_total = 118964
    pairs = [[preprocess(x) for x in l.split('\t')] for l in lines]
    en, sp = zip(*pairs) #make tuples from pairs

    en_tensor, en_map = tokenize(en)
    sp_tensor, sp_map = tokenize(sp)

    return en_tensor, sp_tensor, en_map, sp_map

def uni_to_ascii(s) -> str:
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn') # ensuring it's not an accent

def preprocess(w) -> str:
    w = uni_to_ascii(w.lower().strip())

    # make space between word and punct
    w = re.sub(r"([?.!,¿¡])", r" \1 ", w) # substitutes captured string thing (referenced w \1) with that thing + space
    # get rid of multiple space seq things
    w = re.sub(r'[" "]+', " ", w)

    # keep only letters and punct
    w = re.sub(r"[^a-zA-Z?.!,¿¡]+", " ", w).strip()

    # add start + end token for model
    w = '<s> ' + w + ' <e>'
    
    return w

def tokenize(lang) -> (tf.Tensor, tf.keras.preprocessing.text.Tokenizer):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    return tensor, lang_tokenizer

def detokenize(sequence, lang_map) -> str:
    toRet = ""
    for x in sequence:
        if x != 0:
            toRet += lang_map.index_word[x] + " "
    return toRet

In [2]:
import tensorflow.keras as keras
import keras.layers as layers
import numpy as np

class EnDe2():
    def __init__(self, inp_size, targ_size, embedding_dim, units, ds, bat, input_vocab, target_vocab):
        
        self.bat_size = bat
        
        # input -> encoder embedding -> encoder GRU -> 
        # save states -> decoder input -> decoder GRU w encoder states -> 
        # decoder LSTM -> dense w softmax activ
        
        
        enc_in = layers.Input(shape=(inp_size,), batch_size=bat)
#         print(input_vocab, embedding_dim)
        enc_out = layers.Embedding(input_dim=input_vocab+1, output_dim=embedding_dim, input_length=inp_size)(enc_in)
        enc_out, state = layers.GRU(
                                        units, 
                                        return_state=True,
                                        recurrent_initializer='glorot_uniform')(enc_out)

        dec_in = layers.Input(shape=(targ_size,), batch_size=bat)
#         print(target_vocab, embedding_dim
        dec_out = layers.Embedding(input_dim=target_vocab+1, output_dim=embedding_dim, input_length=targ_size)(dec_in)
        dec_out = layers.GRU(units)(dec_out, initial_state=state)
        dec_out = layers.Dense(targ_size, activation='softmax')(dec_out)
        
        self.model = keras.models.Model([enc_in, dec_in], dec_out)
        
        self.model.compile(optimizer='adam', loss='categorical_crossentropy')
        self.model.summary()
        
    #def train(input_texts, target_texts, epochs):
    def train(self, en, es, epochs):
        self.model.compile(
            optimizer="adam", 
            loss="categorical_crossentropy", 
            metrics=["accuracy"]
        )
        
        self.model.fit(
            [en, es],
            es,
            batch_size=self.bat_size,
            epochs=epochs,
            validation_split=0.2
        )
        self.model.save("s2s")
        

In [3]:
import numpy as np
import tensorflow as tf
#import dataset
#import EncoderDecoder
from termcolor import colored
#import network
import os.path
import os
import json


print(colored("Successfully imported packages", "green"))

CITATION="""
@inproceedings{
    Tiedemann2012ParallelData,
    author = {Tiedemann, J},
    title = {Parallel Data, Tools and Interfaces in OPUS},
    booktitle = {LREC}
    year = {2012}
}
"""

# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():

# Download the file
#path_to_zip = tf.keras.utils.get_file(
#    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
#    extract=True)

    en, es, en_map, es_map = getData("../input/spaeng/spa.txt")
    en_f = []
    es_f = []

    # espanol dict is larger than the eng dict so we pad for even
    target_length = len(en[0]) if len(en[0]) > len(es[0]) else len(es[0])
    for i in range(len(en)): # they have the same amount of samples so we can use one index
        en_f.append(np.append(en[i], [0]*(target_length - len(en[i]))))
        es_f.append(np.append(es[i], [0]*(target_length - len(es[i]))))

    en_train = en_f[0:100000]
    es_train = es_f[0:100000]
    en_eval = en_f[100000:]
    es_eval = es_f[100000:]

    BATCH_SIZE=64
    BUFFER_SIZE = len(en_train)
    EMBEDDING_DIM = 512
    STEP_EPOCH = len(en_train)//BATCH_SIZE
    EPOCHS = 30
    UNITS = 1024
    VOCAB_INP_SIZE = len(en_train[0])
    VOCAB_OUT_SIZE = len(es_train[0])

    train_ds = tf.data.Dataset.from_tensor_slices((en_train, es_train)).shuffle(BUFFER_SIZE)
    train_ds = train_ds.batch(BATCH_SIZE, drop_remainder=True)

    eval_ds = tf.data.Dataset.from_tensor_slices((en_eval, es_eval)).shuffle(BUFFER_SIZE)
    eval_ds = eval_ds.batch(BATCH_SIZE, drop_remainder=True)

    full_ds = tf.data.Dataset.from_tensor_slices((en_f, es_f)).shuffle(BUFFER_SIZE)#.batch(BATCH_SIZE, drop_remainder=True)


    print(colored("Train and Eval datasets created", "green"))

    print(detokenize(en_train[0], en_map) + " | " + detokenize(es_train[0], es_map))

    # for i in range(0, 52):
    #     print(detokenize([i], en_map))

    # for i in range(0, 54):
    #     print(detokenize([i], es_map))

    """
    EncDecModel = EncDec(VOCAB_INP_SIZE, VOCAB_OUT_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE, en_map, es_map)
    EncDecModel.train(train_ds, EPOCHS, STEP_EPOCH)
    test = preprocess("hi how are you")
    print(f"Preprocessed: {test}")
    testseq = en_map.text_to_sequences(test)
    encout, weights = EncDecModel.encoder(testseq, hidden_init(BATCH_SIZE, UNITS))
    end = EncDecModel.decoder(encout)
    print(f"Output: {es_map.sequences_to_texts(end)}")
    """
    print(full_ds)
    en_config = en_map.get_config()
    es_config = es_map.get_config()
    n_eng = len(json.loads(en_config['word_counts']).keys()) # going into the config dict, taking the dict with the word counts, and taking the n of keys to get the overall number of words because apparently my other method is broken :/
    n_es = len(json.loads(es_config['word_counts']).keys())

    ED2 = EnDe2(VOCAB_INP_SIZE, VOCAB_OUT_SIZE, EMBEDDING_DIM, UNITS, full_ds, BATCH_SIZE, n_eng, n_es)
    ED2.train(en, es, EPOCHS)
    
    test_string = "hi how are you doing"
    tokenized_string = en_map.texts_to_sequences(test_string)
    result = ED2.model.predict(tokenized_string)
    translated = es_map.sequences_to_texts(result)
    print(f"Model translated {test_string} (eng) to {translated} (esp)")

    # NMTAttn = network.NMTAttn(VOCAB_INP_SIZE, VOCAB_OUT_SIZE, UNITS, n_encoder=3, n_decoder=3, n_attn_heads=1, dropout=0.03, mode='test')
    # NMTAttn.model(np.array([1, 2, 3]))

[32mSuccessfully imported packages[0m
[32mTrain and Eval datasets created[0m
<s> go . <e>  | <s> ve . <e> 
<ShuffleDataset shapes: ((53,), (53,)), types: (tf.int32, tf.float64)>
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(8, 53)]            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(8, 53)]            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (8, 53, 512)         6622208     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1

KeyboardInterrupt: 