In [1]:
import tensorflow as tf
import unicodedata
import re
import io

def getData(path):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    # n_total = 118964
    pairs = [[preprocess(x) for x in l.split('\t')] for l in lines]
    en, sp = zip(*pairs) #make tuples from pairs

    en_tensor, en_map = tokenize(en)
    sp_tensor, sp_map = tokenize(sp)

    return en_tensor, sp_tensor, en_map, sp_map

def uni_to_ascii(s) -> str:
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn') # ensuring it's not an accent

def preprocess(w) -> str:
    w = uni_to_ascii(w.lower().strip())

    # make space between word and punct
    w = re.sub(r"([?.!,¿¡])", r" \1 ", w) # substitutes captured string thing (referenced w \1) with that thing + space
    # get rid of multiple space seq things
    w = re.sub(r'[" "]+', " ", w)

    # keep only letters and punct
    w = re.sub(r"[^a-zA-Z?.!,¿¡]+", " ", w).strip()

    # add start + end token for model
    w = '<s> ' + w + ' <e>'
    
    return w

def tokenize(lang) -> (tf.Tensor, tf.keras.preprocessing.text.Tokenizer):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    return tensor, lang_tokenizer

def detokenize(sequence, lang_map) -> str:
    toRet = ""
    for x in sequence:
        if x != 0:
            toRet += lang_map.index_word[x] + " "
    return toRet

In [2]:
import numpy as np

def sep_data(enc_og, dec_og, n_eng_words, n_es_words):
    
    
    n_pairs = len(enc_og)
    max_eng = len(enc_og[0])
    max_es = len(dec_og[0])
    
    enc_inp = np.zeros((n_pairs, max_eng, n_eng_words), dtype=int)
    dec_inp = np.zeros((n_pairs, max_es, n_es_words), dtype=int)
    dec_out = np.zeros((n_pairs, max_es, n_es_words), dtype=int) #going to be same dims as dec_inp but off by a timestep
    # so a specific index in dec_out would correspond to index-1 in dec_inp
    
    # actually putting data into the arrays no
    
    for i, (inp, targ) in enumerate(zip(enc_og, dec_og)):
        
        for t, c in enumerate(inp): # english data
            enc_inp[i, t, c] = 1 # c represents the index/token/code thing for a word so we're basically going to that index
            # and saying "hey, there's x word present at this timestamp in this data pair"
            # no need to add spacces bc it's doing words so by default, after predictions and joined, spaces will be added in
        
        for t, c in enumerate(targ): # spanish data
            dec_inp[i, t, int(c)] = 1
            if t>0:
                # dec out will be ahead by 1 timestep and not include first word
                # so second word of input will be first of output
                dec_out[i, t-1, int(c)] = 1
                         
    
    return (enc_inp, dec_inp, dec_out)

In [3]:
import tensorflow.keras as keras
import numpy as np

"""
Borrowed code from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
(modified some of it but used that site for base)
"""


class DataGenerator(keras.utils.Sequence):
# class DataGenerator():

    def __init__(self, list_IDs, n_en, n_es, batch_size=32, shuffle=True):
        # Initializing the generator
        
#         self.dim=dim # This is actually necessary for the keras.utils.Sequence, need to figure out the dims of the batch/data returned
        self.batch_size=2 # files are already batched, we need to load one of them
        self.dim = (32, 51, 53)
#         self.labels=labels
        self.list_id=list_IDs
#         self.n_chan=n_channels
#         self.n_class=n_classes
        self.shuffle=shuffle
        
        self.n_eng = n_en
        self.n_esp = n_es
        
        self.on_epoch_end()
        
    def on_epoch_end(self):
        self.idx = np.arange(len(self.list_id))
        
        # Makes sure that batches change between epochs
        if self.shuffle:
            np.random.shuffle(self.idx)
            
    def __len__(self):
        return int(np.floor(len(self.list_id)) // self.batch_size) # n_epochs per batch | the // operator divides but turns it into an int as well w/o the explicit int(x) call
    
    def __getitem__(self, idx):
        # generate indices of bat
        idxs = self.idx[(idx*self.batch_size):((idx+1)*self.batch_size)]
#         print("idxs " + str(idxs))
    
        temp_idx = [self.list_id[k] for k in idxs]
        
        enc, d_in, d_out = self.__data_generation(temp_idx)
        
#         for i in range(len(enc)):
#             yield ([enc[i], d_in[i]], d_out[i])
        
        return ([enc, d_in], d_out)
    
    
    def __data_generation(self, temp_ids):
            
        """
        The way this will work is basically by loading in the data at the id in a saved np file
        Then we'll read it in as the 2d matrix (a list of tensors where each tensor is a list of indices/values) and transform it into a 3d matrix in the style of the sep_data method
        """
        
#         en = np.empty((self.bat, *self.dim, self.n_chan))
#         es = np.empty((self.bat, *self.dim, self.n_chan))
        en = []
        es = []
        
        # data gen
        for i, ID in enumerate(temp_ids):
            # store sample
            en.append(np.load('../input/spa-eng-separated/archive/' + str(ID) + '_en.npy'))
            es.append(np.load('../input/spa-eng-separated/archive/' + str(ID) + '_es.npy'))
        
#         print(f"en {str(en)} es {str(es)}")
        # data loading and en/es declaration is working correctly
        
#         enc_in = np.ndarray([])
#         dec_in = np.ndarray([])
#         dec_out = np.ndarray([])
#         count = 0
    
#         for i, x in enumerate(en):
#             e_temp, d_temp, o_temp = sep_data(en[i], es[i], self.n_eng, self.n_esp)
#             # returned shapes: 
            
#             # e_temp: (32, 51, 12933) | np.ndarray | encoder input
#             # d_temp: (32, 53, 24794) | np.ndarray | decoder input
#             # o_temp: (32, 53, 24794) | np.ndarray | decoder output

# #             print(e_temp.shape, d_temp.shape, o_temp.shape)
# #             print(f"temp enc {str(e_temp)} temp dec in {str(d_temp)} temp dec out {str(o_temp)}")
#             # this is working as well, has them big [0, 1] arrays with the time stuff. So where's the error?
#             # seems like the only place is in the np.append. But idk what's wrong
#             # oh my god i'm stupid, i wasn't actually appending anything. np.append returns a copy so you have to do new_arr = np.append(old_arr, val)
#             # for some reason this strips the dims and just makes a long 1dim arr
#             enc_in = np.append(enc_in, e_temp)
#             dec_in = np.append(dec_in, d_temp)
#             dec_out = np.append(dec_out, o_temp)
# #             print(enc_in)
#             count += 1
#             print(count)
            
#         enc_in.astype('int')
#         dec_in.astype('int')
#         dec_out.astype('int')
#         print("enc in " + str(enc_in) + " dec in " + str(dec_in) + " dec out "+ str(dec_out))
        e_temp, d_temp, o_temp = sep_data(en[i], es[i], self.n_eng, self.n_esp)

        return e_temp, d_temp, o_temp
    
#         return enc_in, dec_in, dec_out
        
        
  

In [4]:
import tensorflow.keras as keras
import keras.layers as layers
import numpy as np

class EnDe2():
    def __init__(self, inp_size, targ_size, embedding_dim, units, bat, input_vocab, target_vocab):
        
        self.bat_size = bat
        
        # input -> encoder embedding -> encoder GRU -> 
        # save states -> decoder input -> decoder GRU w encoder states -> 
        # decoder LSTM -> dense w softmax activ
        
#        # e_temp: (32, 51, 12933) | np.ndarray | encoder input
#        # d_temp: (32, 53, 24794) | np.ndarray | decoder input
#        # o_temp: (32, 53, 24794) | np.ndarray | decoder output
        
        enc_in = layers.Input(shape=(inp_size, input_vocab), batch_size=bat)
#         print(input_vocab, embedding_dim)
#         enc_out = layers.Embedding(input_dim=input_vocab+1, output_dim=embedding_dim//2)(enc_in)
        enc = keras.layers.LSTM(embedding_dim, return_state=True)
        enc_out, state_h, state_c = enc(enc_in)
#         enc_out, state = layers.GRU(units//2, 
#                                     return_state=True,
#                                     recurrent_initializer='glorot_uniform')(enc_out)

        enc_states = [state_h, state_c]

        dec_in = layers.Input(shape=(targ_size, target_vocab), batch_size=bat)
#         print(target_vocab, embedding_dim
#         dec_out = layers.Embedding(input_dim=target_vocab+1, output_dim=embedding_dim//2)(dec_in)
#         dec_out = layers.GRU(units//2)(dec_out, initial_state=state)
#         dec_out = layers.Dense(targ_size, activation='softmax')(dec_out)
        dec_lstm = keras.layers.LSTM(embedding_dim, 
                                     return_sequences=True, 
                                     return_state=True)
        dec_out, _, _ = dec_lstm(dec_in, initial_state=enc_states)
        dec_dense = keras.layers.Dense(target_vocab, activation='softmax')
        dec_out = dec_dense(dec_out)
        
        self.model = keras.models.Model([enc_in, dec_in], dec_out)
        
        self.model.compile(optimizer='adam', loss='categorical_crossentropy')
        self.model.summary()
        
    def train(self, train_gen, epochs):
        self.model.compile(
            optimizer="rmsprop", 
            loss="categorical_crossentropy", 
            metrics=["accuracy"]
        )
#         self.model.fit(
#             [e, d_in],
#             d_out,
#             batch_size=self.bat_size,
#             epochs=epochs,
#             validation_split=0.2, 
#         )
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath="s2s",
            save_weights_only=True,
            monitor='val_accuracy',
            mode='max',
            save_best_only=True
        )

        for i in range(epochs):
            """
            APPARENTLY callbacks doesn't work so the data doesn't save
            and considering kaggle sessions last 9 hours max and this takes 2 hours per epoch, we *need* checkpoints very badly
            so yeah
            crappy but hopefully workable solution
            """
            self.model.fit(
                x=train_gen, # train_gen will return tuple ([encoder_in, decoder_in], decoder_out)
                epochs=1, 
                workers = 5,
                use_multiprocessing = True, 
                callbacks = [model_checkpoint_callback]

            )
            
            self.model.save("s2s")
        

In [None]:
import numpy as np
import tensorflow as tf
#import dataset
#import EncoderDecoder
from termcolor import colored
#import network
import os.path
import os
import json
import random


print(colored("Successfully imported packages", "green"))

CITATION="""
@inproceedings{
Tiedemann2012ParallelData,
author = {Tiedemann, J},
title = {Parallel Data, Tools and Interfaces in OPUS},
booktitle = {LREC}
year = {2012}
}
"""

# # detect and init the TPU
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(tpu)
# tf.tpu.experimental.initialize_tpu_system(tpu)

# # instantiate a distribution strategy
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# # instantiating the model in the strategy scope creates the model on the TPU
# with tpu_strategy.scope():

# Download the file
#path_to_zip = tf.keras.utils.get_file(
#    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
#    extract=True)

en, es, en_map, es_map = getData("../input/spaeng/spa.txt")
# en_f = []
# es_f = []

# # espanol dict is larger than the eng dict so we pad for even
# target_length = len(en[0]) if len(en[0]) > len(es[0]) else len(es[0])
# for i in range(len(en)): # they have the same amount of samples so we can use one index
#     en_f.append(np.append(en[i], [0]*(target_length - len(en[i]))))
#     es_f.append(np.append(es[i], [0]*(target_length - len(es[i]))))

#     en_train = en_f[0:100000]
#     es_train = es_f[0:100000]
#     en_eval = en_f[100000:]
#     es_eval = es_f[100000:]

BATCH_SIZE=32
#     BUFFER_SIZE = len(en_train)
EMBEDDING_DIM = 512
#     STEP_EPOCH = len(en_train)//BATCH_SIZE
EPOCHS = 30
UNITS = 1024
VOCAB_INP_SIZE = len(en[0])
VOCAB_OUT_SIZE = len(es[0])

# VOCAB_INP_SIZE = 51
# VOCAB_OUT_SIZE = 53

# train_ds = tf.data.Dataset.from_tensor_slices((en_train, es_train)).shuffle(BUFFER_SIZE)
# train_ds = train_ds.batch(BATCH_SIZE, drop_remainder=True)

# eval_ds = tf.data.Dataset.from_tensor_slices((en_eval, es_eval)).shuffle(BUFFER_SIZE)
# eval_ds = eval_ds.batch(BATCH_SIZE, drop_remainder=True)

# full_ds = tf.data.Dataset.from_tensor_slices((en_f, es_f)).shuffle(BUFFER_SIZE)#.batch(BATCH_SIZE, drop_remainder=True)


# print(colored("Train and Eval datasets created", "green"))

# print(detokenize(en_train[0], en_map) + " | " + detokenize(es_train[0], es_map))

# for i in range(0, 52):
#     print(detokenize([i], en_map))

# for i in range(0, 54):
#     print(detokenize([i], es_map))

"""
EncDecModel = EncDec(VOCAB_INP_SIZE, VOCAB_OUT_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE, en_map, es_map)
EncDecModel.train(train_ds, EPOCHS, STEP_EPOCH)
test = preprocess("hi how are you")
print(f"Preprocessed: {test}")
testseq = en_map.text_to_sequences(test)
encout, weights = EncDecModel.encoder(testseq, hidden_init(BATCH_SIZE, UNITS))
end = EncDecModel.decoder(encout)
print(f"Output: {es_map.sequences_to_texts(end)}")
"""
# print(full_ds)
en_config = en_map.get_config()
es_config = es_map.get_config()
n_eng = len(json.loads(en_config['word_counts']).keys()) # going into the config dict, taking the dict with the word counts, and taking the n of keys to get the overall number of words because apparently my other method is broken :/
n_es = len(json.loads(es_config['word_counts']).keys())

# dec_out = [np.append(x[1:], [0]) for x in es_f]

# method header
# sep_data(enc_og, dec_og, n_eng_words, n_es_words)

# en_f, es_f, dec_out = sep_data(en, es, n_eng, n_es)
# print(dec_out[0])

ED2 = EnDe2(VOCAB_INP_SIZE, VOCAB_OUT_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE, n_eng, n_es)
print(colored("About to start training", "green"))
# print(len(en_f), len(es_f), len(dec_out))
# print(en_f[0:5], es_f[0], dec_out[0])
# ED2.train(en_f[0:200], es_f[0:200], dec_out[0:200], EPOCHS)

"""
THINGS TO ADD: 

PARAMS W/ DIM, BATCH SIZE, N_CLASSES, N_CHANNELS, SHUFFLE
PARTITION W N_IDS
LABELS W N_LABELS
train_gen = DataGenerator(partition['train'], labels, **params)
eval_gen = DataGenerator(partition['train'], labels, **params)

ED2.train(train_gen, eval_gen)

MAKE SURE THAT THE DATA GENERATOR TECHNIQUE WILL WORK WITH THE KERAS FUNCTIONAL API THING
IF NOT, SCREW AROUND WITH THE FUNCTIONAL API AND INPUTS UNTIL IT DOES

"""
list_ids = sorted(np.load('../input/spa-eng-separated/archive/list_ids.npy'))
train_gen = DataGenerator(list_ids, n_eng, n_es)
# print(train_gen.__getitem__(0))
ED2.train(train_gen, EPOCHS)

test_string = "hi how are you doing"
tokenized_string = en_map.texts_to_sequences(test_string)
result = ED2.model.predict(tokenized_string)
translated = es_map.sequences_to_texts(result)
print(f"Model translated {test_string} (eng) to {translated} (esp)")

# NMTAttn = network.NMTAttn(VOCAB_INP_SIZE, VOCAB_OUT_SIZE, UNITS, n_encoder=3, n_decoder=3, n_attn_heads=1, dropout=0.03, mode='test')
# NMTAttn.model(np.array([1, 2, 3]))

[32mSuccessfully imported packages[0m
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(32, 51, 12933)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(32, 53, 24794)]    0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(32, 512), (32, 512 27537408    input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(32, 53, 512), (32, 51828736    input_2[0][0]                    
                                               

In [None]:
# # Saving Data
# import numpy as np
# import os

# # Clearing output to make sure we don't crash the notebook by writing too much to disc lol
# # for file in os.walk("./"):
# #     print(file)
# #     os.remove(file)

# DataGenNeeded = False
# batch = 32
# # leaving code so that i can generate new dataset thing whenever i need

# if DataGenNeeded:

#     en, es, en_map, es_map = getData("../input/spaeng/spa.txt")

#     list_ids = np.array([])
#     counter = 0

#     for i in range(0, len(en), batch):
#         if i+batch < len(en):
#             np.save(f"{counter}_en.npy", en[i:i+batch])
#             np.save(f"{counter}_es.npy", es[i:i+batch])
#             np.append(list_ids, counter)
#             print(f"Saved values {i} to {i+batch} in file with id {counter}")

#         else: 
#             continue
#             # ignore the files with batch size <32

#         counter += 1

#     np.save("list_ids.npy", list_ids)
#     print("Data Save complete")

In [None]:
# import numpy as np
# ID="0"
# a = np.load('../input/spa-eng-separated/' + ID + '_en.npy')
# b = np.load('../input/spa-eng-separated/' + ID + '_es.npy')
