In [1]:
import os
import regex
from tqdm import tqdm
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

# Default number of documents handed to the tokenizer trainer per batch.
batch_size = 500

def batch_iterator(data=None, size=None):
    """Yield consecutive slices of a corpus, each holding at most `size` items.

    Parameters
    ----------
    data : sequence, optional
        Corpus to slice. When omitted, the notebook-level `dataset` is used.
    size : int, optional
        Maximum number of items per batch. When omitted, the notebook-level
        `batch_size` is used.

    Yields
    ------
    A slice ``data[start : start + size]`` for every multiple of `size`
    below ``len(data)``; nothing is yielded for an empty corpus.
    """
    # Fall back to the notebook globals only when no explicit value was given,
    # so existing zero-argument calls keep working unchanged.
    data = dataset if data is None else data
    size = batch_size if size is None else size
    for start in range(0, len(data), size):
        yield data[start : start + size]
        
def batch_iterator_split(data=None, size=None):
    """Yield batches of pre-split texts: each text becomes a list of regex matches.

    Parameters
    ----------
    data : sequence of str, optional
        Corpus to slice. When omitted, the notebook-level `dataset` is used.
    size : int, optional
        Maximum number of texts per batch. When omitted, the notebook-level
        `batch_size` is used.

    Yields
    ------
    list of list of str
        One inner list per text in the batch, holding the `findall` matches
        of the splitting pattern on that text.
    """
    # Fall back to the notebook globals only when no explicit value was given,
    # so existing zero-argument calls keep working unchanged.
    data = dataset if data is None else data
    size = batch_size if size is None else size
    pat = regex.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
    for start in range(0, len(data), size):
        yield [pat.findall(text) for text in data[start : start + size]]
    
# Read up to 10,000 documents from the corpus directory. Each file is opened in
# a context manager so its handle is closed as soon as the text has been read.
# NOTE(review): os.listdir() returns entries in arbitrary order, so the selected
# subset may differ between machines -- consider sorting if reproducibility matters.
data_dir = "../project5/data/un/TXT/"
dataset = []
for fname in os.listdir(data_dir)[:10000]:
    with open(os.path.join(data_dir, fname)) as fh:
        dataset.append(fh.read())

In [2]:
from pcatt.hf.greedtok import GreedTok

# The text iterator yields batches; each batch is a list of str.

special_tokens = {
    "pad_token": "<pad>",
    "unk_token": "<unk>",
    "eos_token": "<eos>",
}
GT_Train = GreedTok().train_new_from_iterator(
    batch_iterator(),
    vocab_size=100,
    special_tokens_map=special_tokens,
    min_word_count=1,
    max_token_size=1000,
)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Trie constructed
Word counts size: 103190
Token set size: 0
Empty token set size selected -> all possible substrings with...
Max token size: 100
Min. word count: 1
len:  100
Final candidate token set size: 678311
Initial setup phase: 1601 ms
0. |<pad> [3c 70 61 64 3e ] | 0
1. |<unk> [3c 75 6e 6b 3e ] | 0
2. |<eos> [3c 65 6f 73 3e ] | 0
Starting main routine...
4. | the [20 74 68 65 ] | 378447 | 84 ms | 102 ms | shortlist: 2947
5. |tion [74 69 6f 6e ] | 206001 | 51 ms | 85 ms | shortlist: 68487
6. | and [20 61 6e 64 ] | 157941 | 45 ms | 46 ms | shortlist: 612
7. | of [20 6f 66 ] | 151426 | 36 ms | 36 ms | shortlist: 895
8. |in [69 6e ] | 147507 | 30 ms | 113 ms | shortlist: 177066
9. |re [72 65 ] | 119689 | 40 ms | 93 ms | shortlist: 98801
10. | t [20 74 ] | 105344 | 50 ms | 63 ms | shortlist: 23658
11. | a [20 61 ] | 99935 | 33 ms | 48 ms | shortlist: 39319
12. |er [65 72 ] | 97867 | 35 ms | 95 ms | shortlist: 117395
13. |en [65 6e ] | 94431 | 39 ms | 93 ms | shortlist: 95674
14. | co 

In [3]:
# The text iterator can also yield batches of pre-split texts (lists of lists of str).
# This is useful when you want more control, e.g. a different strategy for splitting text.
# The default built-in behavior is regex.findall with the regex pattern string shown above.
# For decoding, the regex pattern does not matter.

special_tokens = {
    "pad_token": "<pad>",
    "unk_token": "<unk>",
    "eos_token": "<eos>",
}
GT_Train = GreedTok().train_new_from_iterator(
    batch_iterator_split(),
    vocab_size=100,
    special_tokens_map=special_tokens,
    min_word_count=1,
    max_token_size=1000,
)

Trie constructed
Word counts size: 103190
Token set size: 0
Empty token set size selected -> all possible substrings with...
Max token size: 100
Min. word count: 1
len:  100
Final candidate token set size: 678311
Initial setup phase: 2042 ms
0. |<pad> [3c 70 61 64 3e ] | 0
1. |<unk> [3c 75 6e 6b 3e ] | 0
2. |<eos> [3c 65 6f 73 3e ] | 0
Starting main routine...
4. | the [20 74 68 65 ] | 378447 | 79 ms | 97 ms | shortlist: 2947
5. |tion [74 69 6f 6e ] | 206001 | 35 ms | 69 ms | shortlist: 68487
6. | and [20 61 6e 64 ] | 157941 | 44 ms | 46 ms | shortlist: 612
7. | of [20 6f 66 ] | 151426 | 27 ms | 28 ms | shortlist: 895
8. |in [69 6e ] | 147507 | 28 ms | 116 ms | shortlist: 177066
9. |re [72 65 ] | 119689 | 43 ms | 95 ms | shortlist: 98801
10. | t [20 74 ] | 105344 | 39 ms | 50 ms | shortlist: 23658
11. | a [20 61 ] | 99935 | 36 ms | 51 ms | shortlist: 39319
12. |er [65 72 ] | 97867 | 36 ms | 93 ms | shortlist: 117395
13. |en [65 6e ] | 94431 | 53 ms | 110 ms | shortlist: 95674
14. | co 

In [4]:
# To change the regex pattern, pass the desired one via 'pattern'.
# The number of workers used to speed up splitting can also be set (default is 8).

special_tokens = {
    "pad_token": "<pad>",
    "unk_token": "<unk>",
    "eos_token": "<eos>",
}
split_pattern = r""" ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
GT_Train = GreedTok().train_new_from_iterator(
    batch_iterator(),
    vocab_size=100,
    special_tokens_map=special_tokens,
    min_word_count=1,
    max_token_size=1000,
    pattern=split_pattern,
    workers=10,
)

Trie constructed
Word counts size: 103181
Token set size: 0
Empty token set size selected -> all possible substrings with...
Max token size: 100
Min. word count: 1
Final candidate token set size: 678306
Initial setup phase: 1736 ms
0. |<pad> [3c 70 61 64 3e ] | 0
1. |<unk> [3c 75 6e 6b 3e ] | 0
2. |<eos> [3c 65 6f 73 3e ] | 0
Starting main routine...
4. | the [20 74 68 65 ] | 378447 | 88 ms | 107 ms | shortlist: 2947
5. |tion [74 69 6f 6e ] | 206001 | 38 ms | 73 ms | shortlist: 68487
6. | and [20 61 6e 64 ] | 157941 | 50 ms | 51 ms | shortlist: 612
7. | of [20 6f 66 ] | 151426 | 27 ms | 28 ms | shortlist: 895
8. |in [69 6e ] | 147507 | 28 ms | 114 ms | shortlist: 177067
9. |re [72 65 ] | 119688 | 45 ms | 97 ms | shortlist: 98799
10. | t [20 74 ] | 105344 | 39 ms | 50 ms | shortlist: 23658
11. | a [20 61 ] | 99935 | 33 ms | 48 ms | shortlist: 39319
12. |er [65 72 ] | 97867 | 35 ms | 93 ms | shortlist: 117395
13. |en [65 6e ] | 94431 | 42 ms | 99 ms | shortlist: 95676
14. | co [20 63 6f 

In [5]:
# Save the trained tokenizer to disk.

GT_Train.save_pretrained("pcatt/hf/examples/greedtok_test2")

tokenizer config file saved in pcatt/hf/examples/greedtok_test2/tokenizer_config.json
special_tokens_map file saved in pcatt/hf/examples/greedtok_test2/special_tokens_map.json
added tokens file saved in pcatt/hf/examples/greedtok_test2/added_tokens.txt


('pcatt/hf/examples/greedtok_test2/tokenizer_config.json',
 'pcatt/hf/examples/greedtok_test2/special_tokens_map.json',
 'pcatt/hf/examples/greedtok_test2/added_tokens.txt')

In [6]:
# Load the pretrained tokenizer back from disk and print its `pat` attribute.

from pcatt.hf.greedtok import GreedTok
pretrained_dir = "pcatt/hf/examples/greedtok_test2"
GT_Train = GreedTok.from_pretrained(pretrained_dir)
print(GT_Train.pat)

regex.Regex(' ?[\\p{L}]+| ?[\\p{N}]+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+', flags=regex.V0)Trie constructed
unk_token <unk> 1
pad_token <pad> 0
eos_token <eos> 2



In [7]:
# Take the first batch from each iterator as small test inputs
# (raw strings and their pre-split counterparts).
test = list(next(batch_iterator()))
test_split = list(next(batch_iterator_split()))

In [8]:
# Encode the first 50 pre-split documents paired with the next 50, then decode.
pair_a, pair_b = test_split[:50], test_split[50:100]
test_encode = GT_Train(pair_a, pair_b, is_split_into_words=True)
encoded_ids = test_encode["input_ids"]
print(encoded_ids[1][:100])
test_decode = GT_Train.batch_decode(encoded_ids)
print(test_decode[1][:200])

[176, 201, 216, 39, 201, 62, 203, 214, 19, 84, 19, 201, 132, 221, 31, 146, 68, 89, 144, 5, 3, 132, 182, 201, 212, 217, 198, 208, 28, 132, 110, 211, 202, 132, 166, 84, 203, 30, 205, 197, 132, 217, 212, 20, 132, 221, 31, 214, 59, 37, 199, 4, 10, 215, 132, 180, 8, 77, 200, 75, 6, 3, 132, 171, 12, 11, 16, 82, 215, 79, 209, 198, 56, 10, 216, 90, 132, 110, 202, 17, 216, 221, 145, 79, 218, 12, 51, 22, 15, 77, 20, 146, 132, 189, 31, 214, 59, 220, 212, 11]
Let me congratulate you. Sir, and the Republic 
of Bulgaria upon your election as President of the General Assembly at its 
forty-seventh session. Your experience as a respected political leader and 



In [9]:
# Encode the first 10 raw documents paired with the next 10, then decode.
pair_a, pair_b = test[:10], test[10:20]
test_encode = GT_Train(pair_a, pair_b, is_split_into_words=False)
encoded_ids = test_encode["input_ids"]
print(encoded_ids[0][:100])
test_decode = GT_Train.batch_decode(encoded_ids)
print(test_decode[0][:200])

[173, 216, 92, 205, 218, 15, 39, 201, 25, 37, 54, 217, 8, 10, 216, 3, 132, 31, 216, 79, 216, 27, 132, 206, 211, 7, 3, 22, 212, 201, 197, 207, 11, 215, 23, 204, 211, 25, 8, 199, 24, 24, 132, 110, 209, 201, 26, 62, 203, 214, 19, 84, 19, 32, 132, 221, 31, 146, 68, 89, 144, 86, 132, 221, 31, 214, 59, 37, 199, 4, 27, 3, 25, 8, 77, 200, 12, 199, 221, 6, 3, 132, 110, 171, 12, 11, 16, 82, 215, 79, 209, 198, 56, 10, 216, 90, 65, 216, 221, 145]
It gives me pleasure at the outset to join the speakers who preceded 
me in congratulating you. Sir, on your election to the presidency of the 
General Assembly at its forty-seventh session. My delega


In [10]:
# Encode the whole batch of raw documents (no pairing), then decode.
test_encode = GT_Train(test, is_split_into_words=False)
encoded_ids = test_encode["input_ids"]
print(encoded_ids[0][:100])
test_decode = GT_Train.batch_decode(encoded_ids)
print(test_decode[0][:200])

[173, 216, 92, 205, 218, 15, 39, 201, 25, 37, 54, 217, 8, 10, 216, 3, 132, 31, 216, 79, 216, 27, 132, 206, 211, 7, 3, 22, 212, 201, 197, 207, 11, 215, 23, 204, 211, 25, 8, 199, 24, 24, 132, 110, 209, 201, 26, 62, 203, 214, 19, 84, 19, 32, 132, 221, 31, 146, 68, 89, 144, 86, 132, 221, 31, 214, 59, 37, 199, 4, 27, 3, 25, 8, 77, 200, 12, 199, 221, 6, 3, 132, 110, 171, 12, 11, 16, 82, 215, 79, 209, 198, 56, 10, 216, 90, 65, 216, 221, 145]
It gives me pleasure at the outset to join the speakers who preceded 
me in congratulating you. Sir, on your election to the presidency of the 
General Assembly at its forty-seventh session. My delega


In [11]:
# Encode the whole batch of pre-split documents (no pairing), then decode.
test_encode = GT_Train(test_split, is_split_into_words=True)
encoded_ids = test_encode["input_ids"]
print(encoded_ids[0][:100])
test_decode = GT_Train.batch_decode(encoded_ids)
print(test_decode[0][:200])

[173, 216, 92, 205, 218, 15, 39, 201, 25, 37, 54, 217, 8, 10, 216, 3, 132, 31, 216, 79, 216, 27, 132, 206, 211, 7, 3, 22, 212, 201, 197, 207, 11, 215, 23, 204, 211, 25, 8, 199, 24, 24, 132, 110, 209, 201, 26, 62, 203, 214, 19, 84, 19, 32, 132, 221, 31, 146, 68, 89, 144, 86, 132, 221, 31, 214, 59, 37, 199, 4, 27, 3, 25, 8, 77, 200, 12, 199, 221, 6, 3, 132, 110, 171, 12, 11, 16, 82, 215, 79, 209, 198, 56, 10, 216, 90, 65, 216, 221, 145]
It gives me pleasure at the outset to join the speakers who preceded 
me in congratulating you. Sir, on your election to the presidency of the 
General Assembly at its forty-seventh session. My delega


In [12]:
from pcatt.hf.greedtok import GreedTok
# Build a tokenizer directly from a hand-written ranked token list, then save it.
toy_tokens = ['aa', 'bb', 'abc', 'bc', '12', '123', '34', "<pad>", "<eos>"]
GT = GreedTok(
    ranked_tokens=toy_tokens,
    special_tokens_map={"pad_token": "<pad>", "eos_token": "<eos>"},
)
GT.save_pretrained("pcatt/hf/examples/greedtok_test1")

Trie constructed
eos_token <eos> 8
pad_token <pad> 7
tokenizer config file saved in pcatt/hf/examples/greedtok_test1/tokenizer_config.json
special_tokens_map file saved in pcatt/hf/examples/greedtok_test1/special_tokens_map.json
added tokens file saved in pcatt/hf/examples/greedtok_test1/added_tokens.txt


('pcatt/hf/examples/greedtok_test1/tokenizer_config.json',
 'pcatt/hf/examples/greedtok_test1/special_tokens_map.json',
 'pcatt/hf/examples/greedtok_test1/added_tokens.txt')

In [13]:
from pcatt.hf.greedtok import GreedTok
# Load a tokenizer from the directory "pcatt/hf/examples/greedtok_test1".
GT2 = GreedTok.from_pretrained("pcatt/hf/examples/greedtok_test1")

Trie constructed
pad_token <pad> 7
eos_token <eos> 8


In [14]:
# Basic decoding of a batch of id lists
plain_batch = [[0, 1], [1, 2], [3, 4, 7]]
print(GT2.batch_decode(plain_batch))
# Decoding again with skip_special_tokens=True
mixed_batch = [[3, 4, 7, 8], [200, 1, 222]]
print(GT2.batch_decode(mixed_batch, skip_special_tokens=True))

['aabb', 'bbabc', 'bc12<pad>']
['bc12', '\\xbfbb\\xd5']


In [15]:
# Testing __call__ on raw (unsplit) strings; the cell displays the returned mapping.
GT2(["aabc", "aa1234", "abv"], is_split_into_words=False)

{'input_ids': [[0, 3], [0, 5, 61], [106, 107, 127]]}

In [16]:
# Testing __call__ on input that has already been split into words (presplit).
GT2([["aa","bc"], ["aa", "123", "4"], ["ab","v"]], is_split_into_words=True)

{'input_ids': [[0, 3], [0, 5, 61], [106, 107, 127]]}

In [17]:
# Testing __call__ with padding disabled and no truncation strategy given
raw_texts = ["aabc", "<pad>aa1234<eos>", "abv<pad>"]
outputs = GT2(
    raw_texts,
    is_split_into_words=False,
    padding=False,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    max_length=10,
)
# Print every returned field, one row per input with its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

special_tokens_mask
	 2 :  [0, 0]
	 5 :  [1, 0, 0, 0, 1]
	 4 :  [0, 0, 0, 1]
input_ids
	 2 :  [0, 3]
	 5 :  [7, 0, 5, 61, 8]
	 4 :  [106, 107, 127, 7]


In [18]:
# Testing __call__ with padding to max_length and "longest_first" truncation
raw_texts = [
    "aabc",
    "aa1234",
    "abv<pad>",
    "abv<pad>abv<pad>abv<pad>aa1234",
]
outputs = GT2(
    raw_texts,
    is_split_into_words=False,
    padding="max_length",
    truncation="longest_first",
    return_overflowing_tokens=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    max_length=10,
)
# Print every returned field, one row per input with its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

special_tokens_mask
	 10 :  [0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
	 10 :  [0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
	 10 :  [0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
	 10 :  [0, 0, 0, 1, 0, 0, 0, 1, 0, 0]
overflowing_tokens
	 0 :  []
	 0 :  []
	 0 :  []
	 5 :  [127, 7, 0, 5, 61]
attention_mask
	 10 :  [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
	 10 :  [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
	 10 :  [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
	 10 :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
input_ids
	 10 :  [0, 3, 7, 7, 7, 7, 7, 7, 7, 7]
	 10 :  [0, 5, 61, 7, 7, 7, 7, 7, 7, 7]
	 10 :  [106, 107, 127, 7, 7, 7, 7, 7, 7, 7]
	 10 :  [106, 107, 127, 7, 106, 107, 127, 7, 106, 107]


In [19]:
# Testing pairs of raw strings: first sequences and their second sequences
first_texts = ["aabc", "aa1234"]
second_texts = ["abv<pad>", "abv<pad>abv<pad>abv<pad>aa1234"]
outputs = GT2(first_texts, second_texts)
# Print every returned field, one row per pair with its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

input_ids
	 7 :  [0, 3, 41, 106, 107, 127, 7]
	 19 :  [0, 5, 61, 41, 106, 107, 127, 7, 106, 107, 127, 7, 106, 107, 127, 7, 0, 5, 61]


In [20]:
# Testing pairs where both sequences are already split into words
first_words = [["aa", "bc"], ["aa", "1234"]]
second_words = [["abv", "<pad>"], ["abv<pad>abv<pad>", "abv<pad>", "aa1234"]]
outputs = GT2(first_words, second_words, is_split_into_words=True)
# Print every returned field, one row per pair with its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

input_ids
	 7 :  [0, 3, 41, 106, 107, 127, 7]
	 19 :  [0, 5, 61, 41, 106, 107, 127, 7, 106, 107, 127, 7, 106, 107, 127, 7, 0, 5, 61]


In [21]:
# Testing presplit pairs with padding to max_length and "only_second" truncation
first_words = [["aa", "bc"], ["aa", "1234"]]
second_words = [["abv", "<pad>"], ["abv<pad>abv<pad>", "abv<pad>", "aa1234"]]
outputs = GT2(
    first_words,
    second_words,
    is_split_into_words=True,
    padding="max_length",
    truncation="only_second",
    return_token_type_ids=True,
    return_overflowing_tokens=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    max_length=10,
)
# Print every returned field, one row per pair with its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

special_tokens_mask
	 10 :  [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
	 11 :  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
token_type_ids
	 10 :  [0, 0, 1, 1, 1, 1, 1, 7, 7, 7]
	 11 :  [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
attention_mask
	 10 :  [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
	 11 :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
input_ids
	 10 :  [0, 3, 41, 106, 107, 127, 7, 7, 7, 7]
	 11 :  [0, 5, 61, 41, 106, 107, 127, 7, 106, 107, 127]


In [22]:
from pcatt.pco_tokenizer import build as build_pco
# Load the space-separated word list and the newline-separated counts.
# Context managers ensure both files are closed promptly after reading.
with open('cpp_inputs/words/un.txt') as words_file:
    words = words_file.read().strip().split(" ")
with open('cpp_inputs/counts/un.txt') as counts_file:
    counts = [int(line.strip()) for line in counts_file.read().strip().split('\n')]
# NOTE(review): zip() stops at the shorter input, so a length mismatch between
# the two files would be silently truncated -- confirm the files are aligned.
un_counts = dict(zip(words, counts))

# we can use train_new_from_counts instead to get the same result as:
#   test = build_pco(un_counts)
#   test.initialize_graph(5, 1)
#   test_tokens, test_scores = test.solve_to_step(100)

from pcatt.hf.greedtok import GreedTok

greedtok = GreedTok().train_new_from_counts(un_counts, 100, max_token_length=5, min_word_count=1)

Trie constructed
Word counts size: 105505
Token set size: 0
Empty token set size selected -> all possible substrings with...
Max token size: 5
Min. word count: 1
Final candidate token set size: 81136
Initial setup phase: 591 ms
Starting main routine...
1. |Ġ [c4 a0 ] | 30035114 | 38 ms | 237 ms | shortlist: 75764
2. |Ġth [c4 a0 74 68 ] | 7109102 | 19 ms | 22 ms | shortlist: 1864
3. |tion [74 69 6f 6e ] | 4043268 | 13 ms | 29 ms | shortlist: 7700
4. |Ġof [c4 a0 6f 66 ] | 3300812 | 12 ms | 12 ms | shortlist: 371
5. |Ġa [c4 a0 61 ] | 3259093 | 8 ms | 15 ms | shortlist: 7092
6. |in [69 6e ] | 2782359 | 13 ms | 65 ms | shortlist: 21307
7. |re [72 65 ] | 2384688 | 15 ms | 43 ms | shortlist: 13589
8. |Ġto [c4 a0 74 6f ] | 2228162 | 13 ms | 14 ms | shortlist: 1091
9. |er [65 72 ] | 1910725 | 7 ms | 38 ms | shortlist: 16660
10. |en [65 6e ] | 1831877 | 13 ms | 44 ms | shortlist: 13572
11. |Ġco [c4 a0 63 6f ] | 1782132 | 13 ms | 19 ms | shortlist: 4574
12. |it [69 74 ] | 1622191 | 9 ms | 25 ms |

In [23]:
# to use in existing codebases simply import and load from AutoTokenizer
import pcatt.hf
from transformers import AutoTokenizer
tokenize = AutoTokenizer.from_pretrained("pcatt/hf/examples/greedtok_test2")

# A callback passed to encode() can modify the final encoding.
original_str = "The United Nations Organization for peace."


def callback(x1):
    """Return the ids in `x1` followed by the extra ids 1, 1, 1, 5, 5, 5."""
    return list(x1) + [1, 1, 1, 5, 5, 5]


idxs = tokenize.encode(original_str, callback=callback)
print(idxs)
print("Original:", original_str)
print("Tokens:  ", idxs)

# Map ids back to their token values, leaving out special-token ids.
readable = []
for token_id in idxs:
    if token_id in tokenize.special_token_ids:
        continue
    readable.append(tokenize.final_ids_map[token_id])
print("Readable:", readable)
print("EncDec:  ", tokenize.decode(idxs))

[58, 47, 63, 132, 97, 65, 66, 146, 1, 1, 1, 5, 5, 5]Trie constructed
unk_token <unk> 1
pad_token <pad> 0
eos_token <eos> 2

Original: The United Nations Organization for peace.
Tokens:   [58, 47, 63, 132, 97, 65, 66, 146, 1, 1, 1, 5, 5, 5]
Readable: [b'The', b' United', b' Nations', b' ', b'Organization', b' for', b' peace', b'.', b' and', b' and', b' and']
EncDec:   The United Nations Organization for peace.<unk><unk><unk> and and and


In [24]:
original_str = "The United Nations Organization for peace."
original_str2 = "The world is developing."


def callback(x1, x2):
    """Join two encoded sequences with five copies of the b" " token id between them.

    Returns a pair of lists: the joined ids, and a parallel list holding 0 for
    each position of `x1`, the same five separator ids, and 1 for each position
    of `x2`.
    """
    gap = [tokenize.final_tokens_map[b" "]] * 5
    joined_ids = x1 + gap + x2
    # NOTE(review): the second list reuses the separator *token ids* between the
    # 0/1 runs -- confirm this is the value the tokenizer expects there.
    joined_types = [0] * len(x1) + gap + [1] * len(x2)
    return joined_ids, joined_types


idxs = tokenize.encode(original_str, original_str2, callback=callback)
print(idxs)
print("Original:", original_str + " " + original_str2)
print("Tokens:  ", idxs)

# Map ids back to their token values, leaving out special-token ids.
readable = []
for token_id in idxs:
    if token_id in tokenize.special_token_ids:
        continue
    readable.append(tokenize.final_ids_map[token_id])
print("Readable:", readable)
print("EncDec:  ", tokenize.decode(idxs))

[58, 47, 63, 132, 97, 65, 66, 146, 132, 132, 132, 132, 132, 58, 80, 76, 132, 38, 32, 146]
Original: The United Nations Organization for peace. The world is developing.
Tokens:   [58, 47, 63, 132, 97, 65, 66, 146, 132, 132, 132, 132, 132, 58, 80, 76, 132, 38, 32, 146]
Readable: [b'The', b' United', b' Nations', b' ', b'Organization', b' for', b' peace', b'.', b' ', b' ', b' ', b' ', b' ', b'The', b' world', b' is', b' ', b'develop', b'ing', b'.']
EncDec:   The United Nations Organization for peace.     The world is developing.
