# SetUp

In [1]:
%%capture
!git clone https://github.com/Ryuksito/chatbot.git

In [2]:
import json

In [3]:
# Tokenizer artifacts shipped with the repository cloned in the first cell.
VOCAB_PATH = '/content/chatbot/weights/vocab.txt'
METADATA_PATH = '/content/chatbot/weights/metadata.json'

# The vocabulary is stored one token per line. Splitting on '\n' (instead of
# splitlines()) keeps empty lines as '' entries; the tokenizer uses '' as its
# padding token. The encoding is pinned to UTF-8 because the tokens contain
# accented characters and the locale default is not guaranteed to be UTF-8.
with open(VOCAB_PATH, 'r', encoding='utf-8') as f:
  vocab = f.read().split('\n')

# Metadata is the JSON dict consumed by WordPieceTokenizer.load_metadata.
with open(METADATA_PATH, 'r', encoding='utf-8') as f:
  metadata = json.load(f)

# Small Spanish sample used to adapt (train) a demo tokenizer.
corpus = [
    "la vida es un hermoso viaje.",
    "nunca es tarde para aprender algo nuevo.",
    "el éxito es la suma de pequeños esfuerzos repetidos día tras día.",
    "La perseverancia es la clave para alcanzar tus sueños.",
    "La creatividad es la inteligencia divirtiéndose."
]

# Additional sentences that are not part of `corpus`
# (presumably used as unseen text in later cells -- TODO confirm).
texts = [
    "El futuro pertenece a quienes creen en sus sueños.",
    "Los obstáculos son esas cosas que ves cuando apartas la vista de tu meta.",
    "El trabajo duro supera al talento cuando el talento no trabaja duro.",
    "Cree en ti mismo y serás imparable.",
    "Las grandes ideas nacen de grandes desafíos."
]

def pprint(sequence, break_num=5):
  """Print every item of `sequence` wrapped in pipes, `break_num` items per row.

  Each item is written as ``|item|,  `` without a newline. After every
  `break_num`-th item a blank line is emitted, and one more blank line is
  emitted once the whole sequence has been printed.
  """
  for position, item in enumerate(sequence, start=1):
    print(f'|{item}|,  ', end='')
    row_is_full = position % break_num == 0
    if row_is_full:
        # print('\n') writes two newline characters: end of row + empty line.
        print('\n')
  print('\n')

## WordPiece Tokenizer

In [4]:
from pickle import FALSE
from collections import defaultdict
from typing import List, Dict
import re
from tqdm import tqdm
import tensorflow as tf

# Characters that WordPieceTokenizer.clean_text deletes from the input text.
# clean_text passes this string through re.escape and wraps it in a regex
# character class, so every single character listed here is removed.
# NOTE(review): besides symbols and emoji the set also contains ordinary
# characters such as the digits "0" and "5", "'", ":" and "/" -- confirm that
# stripping those is intended.
# NOTE: "\ê" is not a recognised escape sequence, so Python keeps the backslash
# as a literal character (newer Python versions emit a warning for it).
_unwanted_chars = """­Ö¢«/\ê$ìà#Ù†ë™ä·ã³'±]_›>´çö¹¬®½:ÃØÈ¶[&ôËù¾€ò»Ò„üß0²ªè|°@{îºæ}^Çï5õ¨+âøÀ"%*<=`~£¥©µ¼Å×ð÷ûĀāēěīıłōœşūżǒǜȳəɛɪʃˆˈ˚̩́̄ΓΔΣΩαβγδεθκλμνπρσψगयो​‍–—‖‘’“”•…⁰⁴⁷⁺⁻₀₁₂₄₆₹℃℉⅓⅔←→↓↔⇌∂∃∑−∙√∞∠∣∧∨∩≈≠≡≤≥⋅⏰─│┌┐└┘├┤┬┴┼═║╔╗╚╝╠╣╦╩╬█░▓▼☀☁★☕☠♀♂♕♖♙♛♜♟♻⚠⚽⛰✅✈✨❄❤➕➖➡⭐。「」あいうがくござすたちとはまりるを一仁千困坂夕夢天太失子強得徳必愚明智月有本標準皇目私者聖虑語難馬龍ﬁ️，；�🆕🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇽🇿🌀🌅🌊🌍🌎🌏🌞🌟🌧🌱🌲🌳🌴🌸🌻🌿🍃🍊🍌🍎🍓🍕🍞🍩🍪🍯🍳🍴🍿🎂🎈🎉🎊🎓🎥🎧🎨🎬🎮🎵🎶🎼🎾🏀🏃🏈🏊🏒🏖🏡🏻🏽🏾🐈🐘🐛🐥🐨🐮🐰🐱🐶🐹🐾👇👊👌👏👓👠👨👩💆💉💕💖💙💚💡💦💪💫💯💰💳💸💻📈📖📚📢📣📲📺🔍🔒🔥🔹🕷🗳🗺😀😁😂😆😈😊😋😍😎😔😱😴🙌🙏🚀🚂🚆🚇🚌🚗🚨🚪🚰🚴🚶🛍🟢🤖🤝🤩🥕🥦🦁🦇🦊🦸🧁🧘🧴🧸🧹🧼🪀"""


class WordPieceTokenizer:

    def __init__(self, seq_length:int=None, part_of_word_token:str='<pow>', unk_token:str='<unk>', spe_tokens:List[str]=['', '<bos>', '<eos>', '<sep>', '<mask>']):
        self.word_freqs: defaultdict
        self.alphabet: List[str]
        self.vocab: List[str]
        self.splits: Dict[str, List[str]]
        self.vocab: List[str]
        self.part_of_word_token = part_of_word_token
        self.unk_token = unk_token
        self.seq_length = seq_length
        self.spe_tokens: List[str] = spe_tokens + [unk_token, part_of_word_token]


    #------------------------------Make Vocab-------------------------------------------

    def get_vocab(self):
      return self.vocab

    def load_vocab(self, vocab):
      self.vocab = vocab

    def get_metadata(self):
        return {
            "vocab_size": self.vocab_size,
            "part_of_word_token": self.part_of_word_token,
            "unk_token": self.unk_token,
            "spe_tokens": self.spe_tokens
        }

    def load_metadata(self, metadata:dict):
        self.vocab_size = metadata["vocab_size"]
        self.part_of_word_token = metadata["part_of_word_token"]
        self.unk_token = metadata["unk_token"]
        self.spe_tokens = metadata["spe_tokens"]

    def adapt(self, corpus: List[str], vocab_size:int):
        self.vocab_size: int = vocab_size

        self.word_freqs = self._get_word_freqs(corpus)
        self.alphabet = self._get_alphabet()
        self.vocab = self._set_vocab()
        self.splits = self._get_split_words()
        self.vocab = self._make_vocab()

    #------------------------------Encode Text-------------------------------------------

    def clean_text(self, text_tensor, unwant_chars:str=_unwanted_chars, to_lower=False):
        """Normalize raw text before pre-tokenization.

        Args:
            text_tensor: String tensor (or value convertible to one).
            unwant_chars: Every character of this string is deleted from the
                text. An empty string disables the deletion step.
            to_lower: Lower-case the text first when True.

        Returns:
            String tensor with the unwanted characters removed, whitespace
            runs collapsed to one space, runs of two or more dots collapsed
            to a single dot, and leading/trailing whitespace stripped.
        """
        if to_lower:
            # Without an explicit encoding tf.strings.lower treats the input as
            # ASCII and leaves accented capitals (e.g. "É") untouched.
            text_tensor = tf.strings.lower(text_tensor, encoding='utf-8')

        # An empty character set would produce the invalid pattern "[]".
        if unwant_chars:
            unwant_chars_pattern = "[" + re.escape(unwant_chars) + "]"
            text_tensor = tf.strings.regex_replace(text_tensor, unwant_chars_pattern, "")

        # "\s+" already covers newlines, tabs and carriage returns, so a
        # separate pass for those characters is unnecessary.
        text_tensor = tf.strings.regex_replace(text_tensor, r"\s+", " ")

        # NOTE(review): the previous code first rewrote 3+ dots to "..." and
        # then immediately rewrote every run of 2+ dots to ".", so an ellipsis
        # never survived. The net effect is kept here so tokenization does not
        # change; decide separately whether "..." should be preserved.
        text_tensor = tf.strings.regex_replace(text_tensor, r"\.{2,}", ".")

        return tf.strings.strip(text_tensor)

    def pretokenize(self, text_tensor, unwant_chars=_unwanted_chars): # tested function
        """Clean the text and split it into word-level tokens.

        The cleaned text is decoded to code points and re-encoded as UTF-8,
        then every run of letters, every run of digits and every single
        punctuation character is surrounded by spaces so that the final
        whitespace split yields them as separate tokens.

        Args:
            text_tensor: String tensor (or value convertible to one).
            unwant_chars: Characters removed by clean_text().

        Returns:
            The result of tf.strings.split on the spaced text.
        """
        normalized = self.clean_text(text_tensor, unwant_chars)

        # Round trip through Unicode code points before the regex pass.
        code_points = tf.strings.unicode_decode(normalized, 'UTF-8')
        utf8_text = tf.strings.unicode_encode(code_points, 'UTF-8')

        spaced = tf.strings.regex_replace(utf8_text, r'(\p{L}+|\p{N}+|\p{P})', r' \1 ')
        return tf.strings.split(spaced)

    def encode_word(self, word):
        """Split one word into vocabulary pieces using greedy longest-prefix matching.

        Starting from the whole word, the longest prefix found in the
        vocabulary is emitted; the remainder gets `part_of_word_token`
        prepended and the process repeats. If at some point no prefix is in
        the vocabulary, the whole word is encoded as a single `unk_token`.

        Like the rest of this class this relies on eager execution (a Python
        `if` is evaluated on a tensor).

        Args:
            word: Scalar string tensor (or Python string).

        Returns:
            1-D string tensor with the pieces of the word.
        """
        # Build the vocabulary tensor once instead of on every comparison.
        vocab_tensor = tf.convert_to_tensor(self.vocab, dtype=tf.string)
        tokens = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)

        def condition(word, tokens):
            return tf.strings.length(word) > 0

        def body(word, tokens):
            # Lengths and substrings are measured in bytes (TensorFlow default).
            i = tf.strings.length(word)

            def sub_condition(i, word):
                substr = tf.strings.substr(word, 0, i)
                is_in_vocab = tf.reduce_any(tf.equal(vocab_tensor, substr))
                return tf.logical_and(tf.greater(i, 0), tf.logical_not(is_in_vocab))
            def decrement_i(i, word):
                return i - tf.convert_to_tensor(1), word
            # Shrink the candidate prefix until it is a vocabulary entry.
            i, word = tf.while_loop(sub_condition, decrement_i, [i, word])

            if i == tf.constant(0):
                # No prefix matched: the word becomes a single unknown token.
                # The result is written to a *fresh* array; writing into
                # `tokens` at index 0 would keep the pieces collected so far.
                unk_only = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)
                unk_only = unk_only.write(0, tf.convert_to_tensor(self.unk_token))
                return tf.convert_to_tensor(''), unk_only

            tokens = tokens.write(tokens.size(), tf.strings.substr(word, 0, i))
            word = tf.strings.substr(word, i, -1)

            if tf.strings.length(word) > 0:
                # What is left continues the word, so mark it as such.
                word = tf.strings.join([self.part_of_word_token, word])

            return word, tokens

        word, tokens = tf.while_loop(condition, body, [word, tokens])

        return tokens.stack()

    def tokenize(self, sentence, null_token:bool=False):
        """Split a sentence into vocabulary pieces.

        The sentence is pre-tokenized into words and every word is encoded
        with encode_word(); the per-word pieces are concatenated in order.
        (Previously the word-splitting loop was duplicated here and an unknown
        word overwrote the first token of the whole sentence.)

        Args:
            sentence: Scalar string tensor (or Python string).
            null_token: When True the result is truncated or padded with
                `vocab[0]` to exactly `seq_length - 1` tokens. Requires
                `seq_length` to be set.

        Returns:
            1-D string tensor of pieces.

        Raises:
            ValueError: If `null_token` is requested while `seq_length` is None.
        """
        words = self.pretokenize(sentence)

        def sentence_condition(word_index, collected):
            return tf.not_equal(word_index, tf.size(words))

        def sentence_body(word_index, collected):
            pieces = self.encode_word(words[word_index])
            return word_index + 1, tf.concat([collected, pieces], axis=0)

        # Starting from an empty 1-D tensor also covers sentences without words.
        _, tokenized_text = tf.while_loop(
            sentence_condition,
            sentence_body,
            [tf.constant(0), tf.constant([], dtype=tf.string)],
            shape_invariants=[tf.TensorShape([]), tf.TensorShape([None])],
        )

        def process_tokens():
            if self.seq_length is None:
                raise ValueError("seq_length must be set to pad or truncate tokens")
            target_length = self.seq_length - 1
            current_num_tokens = tf.size(tokenized_text)

            def truncate_tokens():
                return tf.slice(tokenized_text, [0], [target_length])

            def pad_tokens():
                padding = tf.fill([target_length - current_num_tokens], self.vocab[0])
                return tf.concat([tokenized_text, padding], axis=0)

            # Compare against the target length itself: with the old
            # "> seq_length" test an input of exactly seq_length tokens took
            # the padding branch with a negative pad length.
            return tf.cond(current_num_tokens > target_length,
                           true_fn=truncate_tokens,
                           false_fn=pad_tokens)

        tokenized_text = tf.cond(
            tf.convert_to_tensor(null_token),
            true_fn=process_tokens,
            false_fn=lambda: tokenized_text
        )

        return tokenized_text

    def format_sequence(self, tokenized_text, begin_token = True, end_token=True):
        """Add begin/end tokens and pad or truncate the sequence.

        With `end_token=True` the result has exactly `seq_length` tokens
        (content, end token, padding); with `end_token=False` it has
        `seq_length - 1` tokens.

        Args:
            tokenized_text: 1-D string tensor of pieces.
            begin_token: Prepend `spe_tokens[1]` when True.
            end_token: Append `spe_tokens[2]` after the content when True.

        Returns:
            1-D string tensor, padded with `vocab[0]`.

        Raises:
            ValueError: If `seq_length` has not been set.
        """
        if self.seq_length is None:
            raise ValueError("seq_length must be set to format a sequence")

        if begin_token:
          tokenized_text = tf.concat([[self.spe_tokens[1],], tokenized_text], axis=0)

        current_num_tokens = tf.size(tokenized_text)
        # Number of slots available before the optional end token.
        content_length = self.seq_length - 1

        def truncate_tokens():
            truncated_tokens = tf.slice(tokenized_text, [0], [content_length])
            if (end_token):
              return tf.concat([truncated_tokens, [self.spe_tokens[2],]], axis=0)
            else:
              return truncated_tokens

        def pad_tokens():
            pad_length = content_length - current_num_tokens
            padding = tf.fill([pad_length], self.vocab[0])
            if end_token:
              concated_tokens = tf.concat([tokenized_text, [self.spe_tokens[2], ], padding], axis=0)
            else:
              concated_tokens = tf.concat([tokenized_text, padding], axis=0)
            return concated_tokens

        # Truncate as soon as the content exceeds the available slots. The old
        # "> seq_length" test sent an input of exactly seq_length tokens to the
        # padding branch, where the pad length became -1.
        return tf.cond(current_num_tokens > content_length,
                          true_fn=truncate_tokens,
                          false_fn=pad_tokens)

    def encode(self, sentence, answer=None, begin_token = True, sep_token=False, end_token=True):
        """Convert a sentence (and optionally an answer) into token ids.

        The sequence is laid out as: sentence pieces, `spe_tokens[3]`
        (separator) and, when given, the answer pieces. format_sequence() then
        adds begin/end tokens and pads or truncates, so `seq_length` must be set.

        Args:
            sentence: Text of the first segment.
            answer: Optional text of the second segment.
            begin_token: Forwarded to format_sequence().
            sep_token: Not read by this method; the separator is always added.
            end_token: Forwarded to format_sequence().

        Returns:
            1-D int32 tensor of vocabulary indices. Tokens missing from the
            vocabulary map to the index of `unk_token`.
        """
        segments = [
            self.tokenize(sentence, null_token=False),
            [self.spe_tokens[3], ],
        ]
        if answer is not None:
          segments.append(self.tokenize(answer, null_token=False))

        formatted = self.format_sequence(
            tf.concat(segments, axis=0),
            begin_token=begin_token,
            end_token=end_token,
        )

        # The token -> id table is built on first use and cached on the instance.
        if not hasattr(self, 'vocab_to_index'):
            self.vocab_to_index = {piece: position for position, piece in enumerate(self.vocab)}

        known_tokens = tf.constant(list(self.vocab_to_index.keys()), dtype=tf.string)
        known_ids = tf.constant(list(self.vocab_to_index.values()), dtype=tf.int32)

        def lookup_token(token):
            # Non-matching positions become -1, so the maximum is the id of the
            # match, or -1 when the token is not in the vocabulary.
            candidates = tf.where(tf.equal(known_tokens, token), known_ids, -1)
            found_id = tf.reduce_max(candidates)
            return tf.cond(found_id == -1,
                           lambda: self.vocab_to_index[self.unk_token],
                           lambda: found_id)

        return tf.map_fn(lookup_token, formatted, fn_output_signature=tf.int32)

    #------------------------------Decode Text-------------------------------------------
    def decode(self, encoded_text):
        """Turn a sequence of token ids back into text.

        Pieces that start with `part_of_word_token` are glued to the previous
        piece (the marker is removed); every other piece is preceded by a
        space. The joined string is stripped of surrounding whitespace.

        Args:
            encoded_text: 1-D sequence/tensor of vocabulary indices.

        Returns:
            The decoded Python string.
        """
        # The id -> token table is built on first use and cached on the instance.
        if not hasattr(self, 'index_to_vocab'):
            self.index_to_vocab = {idx: token for idx, token in enumerate(self.vocab)}

        vocab_tensor = tf.constant(list(self.index_to_vocab.values()), dtype=tf.string)

        # tf.strings.substr counts bytes by default, so measure the marker in
        # bytes as well (identical to len() for an ASCII marker).
        marker_length = len(self.part_of_word_token.encode('utf-8'))

        def lookup_index(index):
            token = vocab_tensor[index]

            def is_part_of_word():
                return tf.strings.substr(token, marker_length, -1)
            def not_part_of_word():
                return ' ' + token

            return tf.cond(
                tf.equal(tf.strings.substr(token, 0, marker_length), self.part_of_word_token),
                is_part_of_word,
                not_part_of_word
            )

        # fn_output_signature replaces the deprecated `dtype` argument and
        # matches how encode() calls tf.map_fn.
        decoded_text_tensor = tf.map_fn(lookup_index, encoded_text, fn_output_signature=tf.string)

        decoded_text = tf.strings.reduce_join(decoded_text_tensor, separator='')

        return decoded_text.numpy().decode('utf-8').strip()


    #------------------------------Make vocab functions-------------------------------------------

    def _join_tokens(self, tokens: List[str]):
        sentence = ""
        for token in tokens:
            if token.startswith(self.part_of_word_token) or token == self.spe_tokens[0]:
                sentence += token[len(self.part_of_word_token):]
            else:
                if sentence:
                    sentence += " "
                sentence += token
        return sentence

    def _get_word_freqs(self, corpus):
      """Count how often each pre-tokenized word occurs in `corpus`.

      Returns:
          defaultdict(int) mapping each word (decoded to a Python str) to its count.
      """
      counts = defaultdict(int)
      for sentence_words in self.pretokenize(corpus):
          for word_tensor in sentence_words:
              # Tensor elements hold UTF-8 bytes; decode them to use str keys.
              counts[word_tensor.numpy().decode('utf-8')] += 1
      return counts

    def _get_alphabet(self):
      alphabet = []
      for word in self.word_freqs.keys():
          if word[0] not in alphabet:
              alphabet.append(word[0])
          for letter in word[1:]:
              if f"{self.part_of_word_token}{letter}" not in alphabet:
                  alphabet.append(f"{self.part_of_word_token}{letter}")
      alphabet.sort()
      return alphabet

    def _set_vocab(self):
      vocab = self.spe_tokens + self.alphabet.copy()
      return vocab

    def _get_split_words(self):
      splits = {
        word: [c if i == 0 else f"<pow>{c}" for i, c in enumerate(word)] for word in self.word_freqs.keys()
      }
      return splits

    def _compute_pair_scores(self):
      letter_freqs = defaultdict(int)
      pair_freqs = defaultdict(int)
      for word, freq in self.word_freqs.items():
          split = self.splits[word]
          if len(split) == 1:
              letter_freqs[split[0]] += freq
              continue
          for i in range(len(split) - 1):
              pair = (split[i], split[i + 1])
              letter_freqs[split[i]] += freq
              pair_freqs[pair] += freq
          letter_freqs[split[-1]] += freq

      scores = {
          pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
          for pair, freq in pair_freqs.items()
      }
      return scores

    def _merge_pair(self, a, b):
      for word in self.word_freqs:
          split = self.splits[word]
          if len(split) == 1:
              continue
          i = 0
          while i < len(split) - 1:
              if split[i] == a and split[i + 1] == b:
                  merge = a + b.replace(self.part_of_word_token, "", 1) if b.startswith("<pow>") else a + b
                  split = split[:i] + [merge] + split[i + 2 :]
              else:
                  i += 1
          self.splits[word] = split
      return self.splits

    def _combine_tokens(self, token_pair):
      tokens = []

      for i, token in enumerate(token_pair):

          if token.startswith(self.part_of_word_token):
            tokens.append(token.replace(self.part_of_word_token, "", 1))
          else:
            tokens.append(token)
      combined_token = ''.join(tokens)

      return combined_token

    def _make_vocab(self):

      with tqdm(total=self.vocab_size, desc="Procesing") as pbar:
        vocab_len = 0
        while vocab_len < self.vocab_size:
            try:
                scores = self._compute_pair_scores()
                best_pair, max_score = "", None
                for pair, score in scores.items():
                    if max_score is None or max_score < score:
                        best_pair = pair
                        max_score = score
                self.splits = self._merge_pair(*best_pair)
                new_token = (
                    best_pair[0] + best_pair[1].replace(self.part_of_word_token, "", 1)
                    if best_pair[1].startswith(self.part_of_word_token)
                    else best_pair[0] + best_pair[1]
                )
                self.vocab.append(new_token)

                vocab_len = len(self.vocab)
                delta = vocab_len - pbar.n

                pbar.update(delta)


            except:
                break

      return self.vocab


## Adaptar el tokenizer al texto de prueba


*   Probar primero con un vocab_size de 100
*   Probar después con un vocab_size de 500



Procesing: 100%|██████████| 100/100 [00:00<00:00, 6365.62it/s]


### Visualizar el vocabulario adaptado

['',
 '<bos>',
 '<eos>',
 '<sep>',
 '<mask>',
 '<unk>',
 '<pow>',
 '.',
 '<pow>a',
 '<pow>c']

## Cargar pesos del tokenizer

### Visualizar el vocabulario cargado

['', '<bos>', '<eos>', '<sep>', '<mask>', '<unk>', '<pow>', '!', '(', ')']

# Pretokenizar el texto




la vida es un hermoso viaje.
|b'la'|,  |b'vida'|,  |b'es'|,  |b'un'|,  |b'hermoso'|,  

|b'viaje'|,  |b'.'|,  



# Tokenizar el texto
Importante a tomar en cuenta


*   Tokenizer Adaptado: está adaptado con poco texto de prueba y solo tiene 100 tokens de vocabulario.
*   Tokenizer Cargado: está cargado con 15000 tokens de vocabulario y únicamente funciona con palabras en minúsculas.

## Texto Conocido

la vida es un hermoso viaje.
|b'l'|,  |b'<pow>a'|,  |b'vid'|,  |b'<pow>a'|,  |b'e'|,  

|b'<pow>s'|,  |b'un'|,  |b'h'|,  |b'<pow>e'|,  |b'<pow>rmo'|,  

|b'<pow>s'|,  |b'<pow>o'|,  |b'vi'|,  |b'<pow>a'|,  |b'<pow>j'|,  

|b'<pow>e'|,  |b'.'|,  



# Codificar el texto

Importante
*   Al momento de crear el objeto tokenizer es importante definir ***seq_length***=***N***, con ***N*** un entero mayor a 0; esto es necesario para usar la función ***encode***.
*   De lo contrario, para evitar el error de usar el método ***encode*** sin ***seq_length*** definido, se debe usar el método ***encode_word***.



## Texto Conocido