In [1]:
pip install numpy tensorflow --use-feature=2020-resolver termcolor 


Usage:   
  /opt/conda/bin/python3.7 -m pip install [options] <requirement specifier> [package-index-options] ...
  /opt/conda/bin/python3.7 -m pip install [options] -r <requirements file> [package-index-options] ...
  /opt/conda/bin/python3.7 -m pip install [options] [-e] <vcs project url> ...
  /opt/conda/bin/python3.7 -m pip install [options] [-e] <local project path> ...
  /opt/conda/bin/python3.7 -m pip install [options] <archive url/path> ...

no such option: --use-feature
Note: you may need to restart the kernel to use updated packages.


In [2]:
from tensorflow.keras import layers as layers
import tensorflow.keras as keras
import tensorflow as tf
import numpy as np
import time
from termcolor import colored

class EncDec():
    """GRU encoder-decoder (sequence-to-sequence) model trained with teacher forcing.

    The class owns an `Encoder` and a `Decoder` Keras model, an Adam optimizer
    and a sparse categorical cross-entropy loss, and drives a custom training
    loop over batches of (input, target) token-id sequences.
    """

    def __init__(self, input_vocab, target_vocab, embedding_dim, units, batch_size, inpmap, targmap):
        """Build the encoder/decoder pair and the training objects.

        Args:
            input_vocab: size of the input vocabulary (embedding rows of the encoder).
            target_vocab: size of the target vocabulary (embedding rows and
                output width of the decoder).
            embedding_dim: width of both embedding layers.
            units: number of GRU units in the encoder and in the decoder.
            batch_size: batch size used to build the initial hidden state and
                the first decoder input.
            inpmap: tokenizer-like object for the input language (stored only).
            targmap: tokenizer-like object for the target language; its
                `word_index['<s>']` entry is read in `train_step`.
        """
        self.input_dim = input_vocab
        self.out_dim = target_vocab
        self.batch_size = batch_size
        self.embed_dim = embedding_dim
        self.units = units
        self.inmap = inpmap
        self.outmap = targmap

        self.encoder = EncDec.Encoder(input_vocab, embedding_dim, units, batch_size)
        self.decoder = EncDec.Decoder(target_vocab, embedding_dim, units, batch_size)

        # Created here (rather than in `train`) so that `loss` and `train_step`
        # are usable without a prior call to `train`.
        self.optimizer = keras.optimizers.Adam(0.01)
        self.loss_obj = keras.losses.SparseCategoricalCrossentropy(
            from_logits=True,
            reduction='none')

        print(colored("Encoder and Decoder models created! Ready for training", "green"))


    def train(self, data, epochs, steps_per_epoch):
        """Run the training loop.

        Args:
            data: dataset yielding `(inp, targ)` batches; `data.take` is called on it.
            epochs: number of passes to run.
            steps_per_epoch: maximum number of batches consumed per epoch.
        """
        for ep in range(epochs):
            start = time.time()

            # Every batch starts from the same all-zero encoder state.
            enchid = hidden_init(self.batch_size, self.units)
            total_loss = 0.0
            n_batches = 0

            for (batch, (inp, targ)) in enumerate(data.take(steps_per_epoch)):

                batch_loss = float(self.train_step(inp, targ, enchid))
                total_loss += batch_loss
                n_batches += 1

                if(batch%100 == 0):
                    print(f"Epoch {ep+1} | Batch {batch} | Loss {batch_loss}")

            # Average over the batches actually seen, which may be fewer than
            # `steps_per_epoch` if the dataset is shorter.
            mean_loss = total_loss / max(n_batches, 1)
            print(colored(f"Epoch {ep+1} completed | Loss {mean_loss}", "green"))
            print(f"Time for epoch {ep+1}: {time.time()-start} seconds")


    def loss(self, ans, pred):
        """Masked mean cross-entropy between target ids and predictions.

        Positions where `ans` equals 0 contribute zero loss; the mean is still
        taken over all positions. Id 0 is presumably the padding id produced by
        the tokenizer/padding step — confirm against the data pipeline.
        """
        mask = tf.math.logical_not(tf.math.equal(ans, 0))
        per_token = self.loss_obj(ans, pred)
        mask = tf.cast(mask, dtype=per_token.dtype)
        per_token *= mask

        return tf.reduce_mean(per_token)

    def train_step(self, inp, targ, hid):
        """Run one optimisation step on a single batch and return its loss.

        Args:
            inp: batch of input token ids.
            targ: batch of target token ids; `targ[:, t]` is read for every
                step `t` from 1 to `targ.shape[1] - 1`.
            hid: initial hidden state for the encoder.

        Returns:
            The summed per-step loss divided by `targ.shape[1]`.
        """
        total = 0.0

        with tf.GradientTape() as tape:
            encout, enchid = self.encoder(inp, hid)

            # The decoder starts from the encoder's final state and then
            # carries its own state from one time step to the next.
            dec_hid = enchid
            decin = tf.expand_dims([self.outmap.word_index['<s>']]*self.batch_size, 1)

            for t in range(1, targ.shape[1]):
                pred, dec_hid = self.decoder(decin, encout, hidden=dec_hid)

                total += self.loss(targ[:,t], pred) # loss of the prediction against the reference token

                decin = tf.expand_dims(targ[:,t], 1) # teacher forcing - feeding in answer as input

        batch_loss = total / int(targ.shape[1])
        variables = self.encoder.trainable_variables + self.decoder.trainable_variables
        grads = tape.gradient(total, variables) # gradient of the summed loss w.r.t. all weights
        self.optimizer.apply_gradients(zip(grads, variables))

        return batch_loss


    class Encoder(keras.Model):
        """Embedding followed by a GRU that returns the full sequence and the final state."""

        def __init__(self, input_vocab, embed_dim, encoder_units, batch_size):
            """Create the embedding and GRU layers; `batch_size` is accepted but not used."""
            super().__init__()
            self.embed = layers.Embedding(input_vocab, embed_dim)
            self.gru = layers.GRU(
                    encoder_units,
                    return_sequences=True,
                    return_state=True,
                    recurrent_initializer = 'glorot_uniform') # draws samples (initial weights) from uniform distr btwn -lim, lim where lim = sqrt( 6 / (num_inps + num_outs) )

        def call(self, x, hidden):
            """Embed `x`, run the GRU from `hidden`, and return `(output, state)`."""
            output, state = self.gru(self.embed(x), initial_state=hidden)
            return output, state


    class Decoder(keras.Model):
        """Embedding, GRU and dense projection onto the target vocabulary."""

        def __init__(self, target_vocab, embed_dim, units, batch_size):
            """Create the embedding, GRU and output layers."""
            # `super(Decoder, self)` would raise NameError: `Decoder` is only an
            # attribute of `EncDec`, not a name visible from inside this method.
            super().__init__()
            self.batch_size = batch_size
            self.dec_units = units
            self.embedding = layers.Embedding(target_vocab, embed_dim)
            self.gru = layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
            self.fc = layers.Dense(target_vocab)

        def call(self, x, enc_out, hidden=None):
            """Decode `x` and return `(log_probs, state)`.

            Args:
                x: decoder input token ids.
                enc_out: encoder output sequence; accepted for interface
                    compatibility but not used (this decoder has no attention).
                hidden: optional initial GRU state; when None the GRU uses its
                    default initial state.

            Returns:
                A pair `(log_probs, state)`: log-softmax scores with the GRU
                output flattened to two dimensions, and the GRU's final state.
            """
            x = self.embedding(x)
            out, state = self.gru(x, initial_state=hidden)
            out = tf.reshape(out, (-1, out.shape[2]))
            x = self.fc(out)
            # Log-probabilities are still valid "logits" for a loss built with
            # from_logits=True, because softmax is invariant to the constant shift.
            x = tf.nn.log_softmax(x)

            return x, state

def hidden_init(batch, n_encoder):
    """Return an all-zero tensor of shape `(batch, n_encoder)`."""
    zero_state_shape = (batch, n_encoder)
    return tf.zeros(zero_state_shape)



In [3]:
import tensorflow as tf
import unicodedata
import re
import io

def getData(path):
    """Load a tab-separated parallel corpus and tokenize both sides.

    Each line of the file is split on tabs; the first field is treated as the
    English sentence and the second as the Spanish one. Both are passed through
    `preprocess` and then `tokenize`.

    Args:
        path: path of the UTF-8 text file to read.

    Returns:
        `(en_tensor, sp_tensor, en_map, sp_map)`: the padded id sequences and
        the fitted tokenizer for each language, as returned by `tokenize`.
    """
    # Context manager so the file handle is closed even if reading fails.
    with io.open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')
    # n_total = 118964
    # Only the first two columns are used, so files carrying extra columns
    # after the sentence pair do not break the unpacking below.
    pairs = [[preprocess(x) for x in l.split('\t')[:2]] for l in lines]
    en, sp = zip(*pairs) #make tuples from pairs

    en_tensor, en_map = tokenize(en)
    sp_tensor, sp_map = tokenize(sp)

    return en_tensor, sp_tensor, en_map, sp_map

def uni_to_ascii(s) -> str:
    """Strip combining marks (accents) from `s`.

    The string is decomposed with NFD normalization and every character whose
    Unicode category is 'Mn' (nonspacing mark) is dropped. Characters that have
    no decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # combining accent: skip it, keep the base letter
            continue
        kept.append(ch)
    return ''.join(kept)

def preprocess(w) -> str:
    """Normalize one sentence and wrap it in start/end tokens.

    The sentence is lower-cased, stripped, and passed through `uni_to_ascii`,
    then three substitutions are applied in order before '<s> ' and ' <e>' are
    added around the result.
    """
    text = uni_to_ascii(w.lower().strip())

    substitutions = (
        # put a space on both sides of each punctuation mark (\1 is the captured mark)
        (r"([?.!,¿¡])", r" \1 "),
        # collapse runs of double-quote / space characters into one space
        (r'[" "]+', " "),
        # replace every run of characters other than letters and the kept punctuation with a space
        (r"[^a-zA-Z?.!,¿¡]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    # start + end tokens for the model
    return '<s> ' + text + ' <e>'

def tokenize(lang) -> (tf.Tensor, tf.keras.preprocessing.text.Tokenizer):
    """Fit a tokenizer on `lang` and return its padded id sequences.

    The tokenizer is built with `filters=''`, so no characters are filtered
    out. Sequences are padded at the end (`padding='post'`).

    Returns:
        `(padded, tokenizer)`: the output of `pad_sequences` and the fitted tokenizer.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)

    padded = tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(lang),
        padding='post',
    )

    return padded, tokenizer

def detokenize(sequence, lang_map) -> str:
    """Turn a sequence of token ids back into text.

    Ids equal to 0 are skipped; every other id is looked up in
    `lang_map.index_word`. Each word is followed by a single space, so a
    non-empty result ends with a trailing space.
    """
    words = [lang_map.index_word[token] + " " for token in sequence if token != 0]
    return "".join(words)

In [10]:
import numpy as np
import tensorflow as tf
#import dataset
#import EncoderDecoder
from termcolor import colored
#import network
import os.path
import os


print(colored("Successfully imported packages", "green"))

# BibTeX citation kept with the script (a comma was missing after the
# `booktitle` field, which makes the entry invalid BibTeX).
CITATION="""
@inproceedings{
    Tiedemann2012ParallelData,
    author = {Tiedemann, J},
    title = {Parallel Data, Tools and Interfaces in OPUS},
    booktitle = {LREC},
    year = {2012}
}
"""

# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():

    # Download the file
    #path_to_zip = tf.keras.utils.get_file(
    #    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    #    extract=True)

    en, es, en_map, es_map = getData("../input/spaeng/spa.txt")

    # The first TRAIN_SIZE sentence pairs are used for training, the rest for evaluation.
    TRAIN_SIZE = 100000
    en_train = en[0:TRAIN_SIZE]
    es_train = es[0:TRAIN_SIZE]
    en_eval = en[TRAIN_SIZE:]
    es_eval = es[TRAIN_SIZE:]

    BATCH_SIZE=64
    BUFFER_SIZE = len(en_train)
    EMBEDDING_DIM = 512
    STEP_EPOCH = len(en_train)//BATCH_SIZE
    EPOCHS = 10
    UNITS = 1024
    # +1 on top of the number of words in each tokenizer's word_index.
    VOCAB_INP_SIZE = len(en_map.word_index)+1
    VOCAB_OUT_SIZE = len(es_map.word_index)+1

    train_ds = tf.data.Dataset.from_tensor_slices((en_train, es_train)).shuffle(BUFFER_SIZE)
    train_ds = train_ds.batch(BATCH_SIZE, drop_remainder=True)

    eval_ds = tf.data.Dataset.from_tensor_slices((en_eval, es_eval)).shuffle(BUFFER_SIZE)
    eval_ds = eval_ds.batch(BATCH_SIZE, drop_remainder=True)


    print(colored("Train and Eval datasets created", "green"))

    # `detokenize` is defined in this notebook; the `dataset` module import is
    # commented out above, so `dataset.detokenize` would raise NameError.
    print(detokenize(en_train[0], en_map) + " " + detokenize(es_train[0], es_map))

    EncDecModel = EncDec(VOCAB_INP_SIZE, VOCAB_OUT_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE, en_map, es_map)
    # `train` is a bound method: passing an extra `self` (undefined here) was a NameError.
    EncDecModel.train(train_ds, EPOCHS, STEP_EPOCH)


    # NMTAttn = network.NMTAttn(VOCAB_INP_SIZE, VOCAB_OUT_SIZE, UNITS, n_encoder=3, n_decoder=3, n_attn_heads=1, dropout=0.03, mode='test')
    # NMTAttn.model(np.array([1, 2, 3]))

[32mSuccessfully imported packages[0m


NameError: name 'BUFFER_SIZE' is not defined