# Test

In [74]:
import re
from collections import defaultdict

In [75]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [None]:
def pretokenize(text):
    tokens = re.findall(r"\w+|[^\w\s]", text, re.UNICODE)
    return tokens

def get_word_freqs(corpus):
    word_freqs = defaultdict(int)
    for text in corpus:
        for word in pretokenize(text):
            word_freqs[word] += 1
    return word_freqs

def get_alphabet(word_freqs):
  alphabet = []
  for word in word_freqs.keys():
      if word[0] not in alphabet:
          alphabet.append(word[0])
      for letter in word[1:]:
          if f"<pow>{letter}" not in alphabet:
              alphabet.append(f"<pow>{letter}")
  alphabet.sort()
  return alphabet

def get_vocab(alphabet):
  vocab = ["<pad>", "<unk>", "<cls>", "<sep>", "<mask>", "<pow>"] + alphabet.copy()
  return vocab

def get_split_words(word_freqs):
  splits = {
    word: [c if i == 0 else f"<pow>{c}" for i, c in enumerate(word)] for word in word_freqs.keys()
  }
  return splits

def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores

def merge_pair(a, b, splits, word_freqs):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[5:] if b.startswith("<pow>") else a + b
                split = split[:i] + [merge] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

def combine_tokens(token_pair):
    tokens = []

    for i, token in enumerate(token_pair):

        if token.startswith("<pow>"):
          tokens.append(token.replace("<pow>", "", 1))
        else:
          tokens.append(token)
    combined_token = ''.join(tokens)

    return combined_token

def make_vocab(vocab, vocab_size, splits, word_freqs):
  while len(vocab) < vocab_size:
      scores = compute_pair_scores(splits)
      best_pair, max_score = "", None
      for pair, score in scores.items():
          if max_score is None or max_score < score:
              best_pair = pair
              max_score = score
      splits = merge_pair(*best_pair, splits, word_freqs)
      new_token = (
          best_pair[0] + best_pair[1][5:]
          if best_pair[1].startswith("<pow>")
          else best_pair[0] + best_pair[1]
      )
      vocab.append(new_token)
  return vocab

def encode_word(word, vocab):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["<unk>"]
        tokens.append(word[:i])
        print(tokens)
        word = word[i:]
        if len(word) > 0:
            word = f"<pow>{word}"
    return tokens

In [None]:
word_freqs = get_word_freqs(corpus)
alphabet = get_alphabet(word_freqs)
vocab = get_vocab(alphabet)
splits = get_split_words(word_freqs)

In [None]:
pair_scores = compute_pair_scores(splits)
for i, key in enumerate(pair_scores.keys()):
    print(f"{key}: {pair_scores[key]}")
    if i >= 5:
        break

('T', '<pow>h'): 0.125
('<pow>h', '<pow>i'): 0.03409090909090909
('<pow>i', '<pow>s'): 0.02727272727272727
('i', '<pow>s'): 0.1
('t', '<pow>h'): 0.03571428571428571
('<pow>h', '<pow>e'): 0.011904761904761904


In [None]:
best_pair = ""
max_score = None
for pair, score in pair_scores.items():
    if max_score is None or max_score < score:
        best_pair = pair
        max_score = score

vocab.append(combine_tokens(best_pair))
print(best_pair, max_score)

('a', '<pow>b') 0.2


In [None]:
splits = merge_pair(*best_pair, splits, word_freqs)
splits["about"]

['ab', '<pow>o', '<pow>u', '<pow>t']

In [None]:
splits

{'This': ['T', '<pow>h', '<pow>i', '<pow>s'],
 'is': ['i', '<pow>s'],
 'the': ['t', '<pow>h', '<pow>e'],
 'Hugging': ['H', '<pow>u', '<pow>g', '<pow>g', '<pow>i', '<pow>n', '<pow>g'],
 'Face': ['F', '<pow>a', '<pow>c', '<pow>e'],
 'Course': ['C', '<pow>o', '<pow>u', '<pow>r', '<pow>s', '<pow>e'],
 '.': ['.'],
 'chapter': ['c', '<pow>h', '<pow>a', '<pow>p', '<pow>t', '<pow>e', '<pow>r'],
 'about': ['ab', '<pow>o', '<pow>u', '<pow>t'],
 'tokenization': ['t',
  '<pow>o',
  '<pow>k',
  '<pow>e',
  '<pow>n',
  '<pow>i',
  '<pow>z',
  '<pow>a',
  '<pow>t',
  '<pow>i',
  '<pow>o',
  '<pow>n'],
 'section': ['s', '<pow>e', '<pow>c', '<pow>t', '<pow>i', '<pow>o', '<pow>n'],
 'shows': ['s', '<pow>h', '<pow>o', '<pow>w', '<pow>s'],
 'several': ['s', '<pow>e', '<pow>v', '<pow>e', '<pow>r', '<pow>a', '<pow>l'],
 'tokenizer': ['t',
  '<pow>o',
  '<pow>k',
  '<pow>e',
  '<pow>n',
  '<pow>i',
  '<pow>z',
  '<pow>e',
  '<pow>r'],
 'algorithms': ['a',
  '<pow>l',
  '<pow>g',
  '<pow>o',
  '<pow>r',
  '<p

In [None]:
vocab = make_vocab(vocab, 161, splits, word_freqs)

In [None]:
vocab

['<pad>',
 '<unk>',
 '<cls>',
 '<sep>',
 '<mask>',
 '<pow>',
 ',',
 '.',
 '<pow>a',
 '<pow>b',
 '<pow>c',
 '<pow>d',
 '<pow>e',
 '<pow>f',
 '<pow>g',
 '<pow>h',
 '<pow>i',
 '<pow>k',
 '<pow>l',
 '<pow>m',
 '<pow>n',
 '<pow>o',
 '<pow>p',
 '<pow>r',
 '<pow>s',
 '<pow>t',
 '<pow>u',
 '<pow>v',
 '<pow>w',
 '<pow>y',
 '<pow>z',
 'C',
 'F',
 'H',
 'T',
 'a',
 'b',
 'c',
 'g',
 'h',
 'i',
 's',
 't',
 'u',
 'w',
 'y',
 'ab',
 '<pow>fu',
 'Fa',
 'Fac',
 '<pow>ct',
 '<pow>ful',
 '<pow>full',
 '<pow>fully',
 'Th',
 'ch',
 '<pow>hm',
 'cha',
 'chap',
 'chapt',
 '<pow>thm',
 'Hu',
 'Hug',
 'Hugg',
 'sh',
 'th',
 'is',
 '<pow>thms',
 '<pow>za',
 '<pow>zat',
 '<pow>ut',
 '<pow>ta',
 '<pow>at',
 '<pow>sta',
 '<pow>ra',
 '<pow>rsta',
 '<pow>rat',
 '<pow>ur',
 '<pow>urs',
 '<pow>ws',
 '<pow>ral',
 'tra',
 '<pow>lg',
 'alg',
 'abl',
 '<pow>ll',
 'ar',
 'Thi',
 'This',
 'Huggi',
 '<pow>izat',
 '<pow>izati',
 '<pow>cti',
 '<pow>iz',
 '<pow>ithms',
 'wi',
 'will',
 'trai',
 '<pow>rithms',
 'Huggin',
 'Hug

In [None]:
print(encode_word("Huggingface", vocab))
print(encode_word("HOg", vocab))

['Hugging', '<pow>f', '<pow>a', '<pow>c', '<pow>e']


# WordPiece

In [116]:
from pickle import FALSE
from collections import defaultdict
from typing import List, Dict
import re
from tqdm import tqdm

class WordPieceTokenizer:

    def __init__(self, part_of_word_token:str='<pow>', unk_token:str='<unk>', spe_tokens:List[str]=['', '<pad>', '<cls>', '<sep>', '<mask>']):
        self.word_freqs: defaultdict
        self.alphabet: List[str]
        self.vocab: List[str]
        self.splits: Dict[str, List[str]]
        self.vocab: List[str]
        self.part_of_word_token = part_of_word_token
        self.unk_token = unk_token
        self.spe_tokens: List[str] = spe_tokens + [unk_token, part_of_word_token]

    def get_vocab(self):
      return self.vocab

    def load_vocab(self, vocab):
      self.vocab = vocab

    def get_metadata(self):
        return {
            "vocab_size": self.vocab_size,
            "part_of_word_token": self.part_of_word_token,
            "unk_token": self.unk_token,
            "spe_tokens": self.spe_tokens
        }

    def load_metadata(self, metadata:dict):
        self.vocab_size = metadata.get("vocab_size", self.vocab_size)
        self.part_of_word_token = metadata.get("part_of_word_token", self.part_of_word_token)
        self.unk_token = metadata.get("unk_token", self.unk_token)
        self.spe_tokens = set(metadata.get("spe_tokens", self.spe_tokens))

    def adapt(self, corpus: List[str], vocab_size:int):
        self.vocab_size: int = vocab_size

        self.word_freqs = self._get_word_freqs(corpus)
        self.alphabet = self._get_alphabet()
        self.vocab = self._set_vocab()
        self.splits = self._get_split_words()
        self.vocab = self._make_vocab()

    def pretokenize(self, text: List[str], unwant_chars:str = "­Ö¢«/\ê$ìà#Ù†ë™ä·ã³'±]_›>´çö¹¬®½:ÃØÈ¶[&ôËù¾€ò»Ò„üß0²ªè|°@{îºæ}^Çï5õ¨+âøÀ", to_lower=False):
        tokens = [re.findall(r'\w+|[^\w\s]', self.clean_text(sentence, unwant_chars), re.UNICODE) for sentence in text]
        return tokens

    def clean_text(self, text:str, unwant_chars:str="­Ö¢«/\ê$ìà#Ù†ë™ä·ã³'±]_›>´çö¹¬®½:ÃØÈ¶[&ôËù¾€ò»Ò„üß0²ªè|°@{îºæ}^Çï5õ¨+âøÀ", to_lower=False):
        if to_lower: text = text.lower()

        regex = f"[{re.escape(unwant_chars)}]"
        text = re.sub(regex, "", text)
        text = re.sub(r"[\n\t\r]+", ' ', text)
        text = re.sub(r"\s+", ' ', text)
        text = re.sub(r"\.{3,}", "...", text)
        text = re.sub(r"\.\.+", ".", text)

        text = text.strip()
        return text

    def encode_word(self, word):
        tokens = []
        while len(word) > 0:
            i = len(word)
            while i > 0 and word[:i] not in self.vocab:
                i -= 1
            if i == 0:
                return [self.unk_token]
            tokens.append(word[:i])
            word = word[i:]
            if len(word) > 0:
                word = f'{self.part_of_word_token}{word}'
        return tokens

    def tokenize(self, text: List[str]):
        tokens = []
        text = self.pretokenize(text)
        for sentence in text:
            sentence_tokens = [token for word in sentence for token in self.encode_word(word)]
            tokens.append(sentence_tokens)
        return tokens

    def encode(self, text: List[str], max_tokens: int = None):
        tokens = self.tokenize(text)

        if not hasattr(self, 'vocab_to_index'):
            self.vocab_to_index = {token: idx for idx, token in enumerate(self.vocab)}

        encoded_text = []
        for sentence in tokens:
            if max_tokens:
                encoded_sentence = [
                    self.vocab_to_index.get(token, self.vocab_to_index.get(self.unk_token, 0))
                    for token in sentence[:max_tokens]
                ]
                if len(encoded_sentence) < max_tokens:
                  encoded_sentence += [0] * (max_tokens - len(encoded_sentence))
            else:
                encoded_sentence = [
                    self.vocab_to_index.get(token, self.vocab_to_index.get(self.unk_token, 0))
                    for token in sentence
                ]

            encoded_text.append(encoded_sentence)



        return encoded_text

    def decode(self, encoded_text: List[List[int]]):
        if not hasattr(self, 'index_to_vocab'):
            self.index_to_vocab = {idx: token for idx, token in enumerate(self.vocab)}

        decoded_text = []
        for sentence in encoded_text:
            decoded_sentence = [self.index_to_vocab.get(index, self.unk_token) for index in sentence]
            joined_sentence = self._join_tokens(decoded_sentence)
            decoded_text.append(joined_sentence)

        return decoded_text

    def _join_tokens(self, tokens: List[str]):
        sentence = ""
        for token in tokens:
            if token.startswith(self.part_of_word_token) or token == self.spe_tokens[0]:
                sentence += token[len(self.part_of_word_token):]
            else:
                if sentence:
                    sentence += " "
                sentence += token
        return sentence

    def _get_word_freqs(self, corpus):
      word_freqs = defaultdict(int)
      for text in self.pretokenize(corpus):
          for word in text:
              word_freqs[word] += 1
      return word_freqs

    def _get_alphabet(self):
      alphabet = []
      for word in self.word_freqs.keys():
          if word[0] not in alphabet:
              alphabet.append(word[0])
          for letter in word[1:]:
              if f"{self.part_of_word_token}{letter}" not in alphabet:
                  alphabet.append(f"{self.part_of_word_token}{letter}")
      alphabet.sort()
      return alphabet

    def _set_vocab(self):
      vocab = self.spe_tokens + self.alphabet.copy()
      return vocab

    def _get_split_words(self):
      splits = {
        word: [c if i == 0 else f"<pow>{c}" for i, c in enumerate(word)] for word in self.word_freqs.keys()
      }
      return splits

    def _compute_pair_scores(self):
      letter_freqs = defaultdict(int)
      pair_freqs = defaultdict(int)
      for word, freq in self.word_freqs.items():
          split = self.splits[word]
          if len(split) == 1:
              letter_freqs[split[0]] += freq
              continue
          for i in range(len(split) - 1):
              pair = (split[i], split[i + 1])
              letter_freqs[split[i]] += freq
              pair_freqs[pair] += freq
          letter_freqs[split[-1]] += freq

      scores = {
          pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
          for pair, freq in pair_freqs.items()
      }
      return scores

    def _merge_pair(self, a, b):
      for word in self.word_freqs:
          split = self.splits[word]
          if len(split) == 1:
              continue
          i = 0
          while i < len(split) - 1:
              if split[i] == a and split[i + 1] == b:
                  merge = a + b.replace(self.part_of_word_token, "", 1) if b.startswith("<pow>") else a + b
                  split = split[:i] + [merge] + split[i + 2 :]
              else:
                  i += 1
          self.splits[word] = split
      return self.splits

    def _combine_tokens(self, token_pair):
      tokens = []

      for i, token in enumerate(token_pair):

          if token.startswith(self.part_of_word_token):
            tokens.append(token.replace(self.part_of_word_token, "", 1))
          else:
            tokens.append(token)
      combined_token = ''.join(tokens)

      return combined_token

    def _make_vocab(self):

      with tqdm(total=self.vocab_size, desc="Procesing") as pbar:
        vocab_len = 0
        while vocab_len < self.vocab_size:
            try:
                scores = self._compute_pair_scores()
                best_pair, max_score = "", None
                for pair, score in scores.items():
                    if max_score is None or max_score < score:
                        best_pair = pair
                        max_score = score
                self.splits = self._merge_pair(*best_pair)
                new_token = (
                    best_pair[0] + best_pair[1].replace(self.part_of_word_token, "", 1)
                    if best_pair[1].startswith(self.part_of_word_token)
                    else best_pair[0] + best_pair[1]
                )
                self.vocab.append(new_token)

                vocab_len = len(self.vocab)
                delta = vocab_len - pbar.n

                pbar.update(delta)


            except:
                break

      return self.vocab



In [117]:
tokenizer = WordPieceTokenizer()
tokenizer.adapt(corpus, 100)

Procesing: 100%|██████████| 100/100 [00:00<00:00, 8310.16it/s]


In [118]:
tokenizer.tokenize(["Hugging Face is cool.", "deep learning is cool."])

[['Huggi',
  '<pow>n',
  '<pow>g',
  'Fac',
  '<pow>e',
  'is',
  'c',
  '<pow>o',
  '<pow>o',
  '<pow>l',
  '.'],
 ['<unk>', '<unk>', 'is', 'c', '<pow>o', '<pow>o', '<pow>l', '.']]

In [121]:
encoded = tokenizer.encode(["Hugging face is cool.", "deep learning is cool."], max_tokens=500)
print(encoded[0])

[90, 21, 15, 5, 67, 38, 22, 22, 19, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [122]:
tokenizer.decode(encoded)

['Hugging <unk> is cool .', '<unk> <unk> is cool .']

In [None]:
tokenizer.vocab

# Tokenize Spanish Corpus

In [67]:
!unzip /content/drive/MyDrive/Exposiciones/Chatbot/data/text.zip -d "data/"

Archive:  /content/drive/MyDrive/Exposiciones/Chatbot/data/text.zip
replace data/text.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: data/text.txt           


In [68]:
import os

with open('/content/data/text.txt', 'r') as f:
    text = f.read()

In [69]:
set_text = {*text}

In [70]:
tokenizer.clean_text(''.join(list(set_text)))

'éÜybRCMpHF.mfWG zuK-vÉnSÁkhAg XáZE2)cVÑ!d3×wJOT;"qe%7í81óil6(ÓoPYUt¿sBQr¡úD ñÍxN4=9L,?ÚjIa'

In [16]:
data = [text, ]

In [18]:
tokenizer = WordPieceTokenizer(1000)
tokenizer.adapt(data)

Procesing: 100%|██████████| 300/300 [00:35<00:00,  8.34it/s]


In [None]:
_vocab = tokenizer.get_vocab()

with open('/content/drive/MyDrive/Exposiciones/Chatbot/data/vocab.txt', 'w') as f:
    f.write('\n'.join(_vocab))

In [4]:
with open('/content/drive/MyDrive/Exposiciones/Chatbot/data/vocab.txt', 'r') as f:
    _vocab = f.read().split('\n')

tokenizer = WordPieceTokenizer(10000)
tokenizer.vocab = _vocab

In [8]:
print(tokenizer.tokenize(["Me encantan los rollos de canela.", "saco a pasear a mi perro por las mañanas"]))
encoded = tokenizer.encode(["Me encantan los rollos de canela.", "saco a pasear a mi perro por las mañanas"])
print(encoded)
tokenizer.decode(encoded)

[['M', '<pow>e', 'e', '<pow>n', '<pow>c', '<pow>a', '<pow>n', '<pow>t', '<pow>a', '<pow>n', 'l', '<pow>o', '<pow>s', 'r', '<pow>o', '<pow>l', '<pow>l', '<pow>o', '<pow>s', 'd', '<pow>e', 'c', '<pow>a', '<pow>n', '<pow>e', '<pow>l', '<pow>a', '.'], ['s', '<pow>a', '<pow>c', '<pow>o', 'a', 'p', '<pow>a', '<pow>s', '<pow>e', '<pow>a', '<pow>r', 'a', 'm', '<pow>i', 'p', '<pow>e', '<pow>r', '<pow>r', '<pow>o', 'p', '<pow>o', '<pow>r', 'l', '<pow>a', '<pow>s', 'm', '<pow>a', '<pow>ñ', '<pow>a', '<pow>n', '<pow>a', '<pow>s']]
[[154, 73, 177, 82, 71, 69, 82, 88, 69, 82, 184, 83, 87, 190, 83, 80, 80, 83, 87, 176, 73, 175, 69, 82, 73, 80, 69, 18], [191, 69, 71, 83, 173, 188, 69, 87, 73, 69, 86, 173, 185, 77, 188, 73, 86, 86, 83, 188, 83, 86, 184, 69, 87, 185, 69, 128, 69, 82, 69, 87]]


['Me encantan los rollos de canela .',
 'saco a pasear a mi perro por las mañanas']

# Comparation

In [104]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization


In [105]:
# Configuración del vectorizador de texto
max_tokens = 10000  # Tamaño del vocabulario (número máximo de tokens únicos)
max_sequence_length = 50  # Longitud máxima de la secuencia

vectorizer = TextVectorization(
    max_tokens=max_tokens,
    ragged=False,  # Usa secuencias de longitud fija
    output_sequence_length=max_sequence_length
)


In [106]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Datos de ejemplo
sentences = [
    "Este es un ejemplo de texto.",
    "Aquí hay otro ejemplo.",
    "Vamos a tokenizar estos textos."
]

# Crear la instancia de TextVectorization
max_tokens = 10000
max_sequence_length = 10  # Ajusta según sea necesario

vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=max_sequence_length
)

# Adaptar el vectorizador al texto
vectorizer.adapt(sentences)

# Transformar texto en vectores
vectorized_texts = vectorizer(sentences)

# Mostrar resultados
print("Vocabulario:", vectorizer.get_vocabulary())
print("Textos vectorizados:")
print(vectorized_texts.numpy())


Vocabulario: ['', '[UNK]', 'ejemplo', 'vamos', 'un', 'tokenizar', 'textos', 'texto', 'otro', 'hay', 'estos', 'este', 'es', 'de', 'aquí', 'a']
Textos vectorizados:
[[11 12  4  2 13  7  0  0  0  0]
 [14  9  8  2  0  0  0  0  0  0]
 [ 3 15  5 10  6  0  0  0  0  0]]
