# 1. Przygotowanie Danych

In [None]:
!pip install datasets

In [None]:
from pprint import pprint
from datasets import list_datasets, load_dataset
from sklearn.model_selection import train_test_split

In [None]:
list_datasets()

In [None]:
dataset = load_dataset("merve/folk-mythology-tales")

Using custom data configuration merve___folk-mythology-tales-7ad723422688336c
Reusing dataset text (/root/.cache/huggingface/datasets/text/merve___folk-mythology-tales-7ad723422688336c/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Hold out 20% of the corpus. The fixed seed makes the split reproducible after a
# kernel restart, so the tokenizers trained later see the same training text.
# NOTE: this rebinds dataset['train'] in place; re-running the cell without
# reloading the dataset splits the already-reduced training set again.
holdout_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
dataset['train'], dataset['validation'] = holdout_split['train'], holdout_split['test']

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/text/merve___folk-mythology-tales-7ad723422688336c/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-75d3e36d9141f8c9.arrow and /root/.cache/huggingface/datasets/text/merve___folk-mythology-tales-7ad723422688336c/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-aa93046626ebaad6.arrow


In [None]:
# Split the hold-out set evenly into validation and test. The fixed seed keeps the
# two halves identical across runs; the parts are picked by key rather than by
# relying on the ordering of `.values()`.
eval_split = dataset['validation'].train_test_split(test_size=0.5, seed=42)
dataset['validation'], dataset['test'] = eval_split['train'], eval_split['test']

In [None]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 197592
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 24699
    })
    test: Dataset({
        features: ['text'],
        num_rows: 24700
    })
})


In [None]:
def prepare_data_set(data_set_name: str, data_sets=None):
  """Write one split to ``<data_set_name>.txt`` with one document per line.

  Args:
    data_set_name: Key of the split to export; also the stem of the output
      file, which is created in the current working directory.
    data_sets: Optional mapping from split name to a sized iterable of dicts
      that each carry a 'text' entry. When omitted, the notebook-level
      ``dataset`` object is used, which keeps existing one-argument calls working.
  """
  if data_sets is None:
    data_sets = dataset

  print(f'Processing {data_set_name} set')
  data_set = data_sets[data_set_name]

  # Explicit UTF-8 so the written bytes do not depend on the platform's
  # default text encoding.
  with open(f'{data_set_name}.txt', 'wt', encoding='utf-8') as f_write:
    for document_dict in data_set:
      f_write.write(document_dict['text'] + '\n')

  print(f'Loaded {len(data_set)} documents from {data_set_name}')

In [None]:
# Export every split to its own text file, in the same order as before.
for split_name in ('train', 'validation', 'test'):
  prepare_data_set(split_name)

Processing train set
Loaded 197592 documents from train
Processing validation set
Loaded 24699 documents from validation
Processing test set
Loaded 24700 documents from test


In [None]:
!ls -lh train.txt validation.txt test.txt

-rw-r--r-- 1 root root 1.2M Nov 30 15:18 test.txt
-rw-r--r-- 1 root root 9.6M Nov 30 15:18 train.txt
-rw-r--r-- 1 root root 1.2M Nov 30 15:18 validation.txt


In [None]:
!head train.txt


"The three legs of it looks mighty quare stickin' up," says she.
The young lady stopped at this door, and knocked gently; whereupon both
would take no refusal, but implored her to have pity on him, promising
 The Bul. Edit. gives by mistake of diacritical points,
Peronnik was a poor idiot who belonged to nobody, and he would have died
In the farmhouse lived a young married couple; they loved each
and prayed a two bow prayer, a thanksgiving to God for my
coming to the rescue in their dire need, and then besought him to



In [None]:
!head test.txt

asked her husband.--Arise she answered, and take him in thy bosom, and

got up, and bringing Far Rua out, "Are you strong?" said he.
'and then? what happens then?'

suffered us not to remain so! Thus they did until the morning, when the



 Khaleefehs; or the Four Welees (eminent saints), the seyyid


In [None]:
!head validation.txt

they also would put her out of the town. But this did not content the

the prince himself, who lay stretched out on his bed with his eyes
That in the text stood to the east of the principal street in
guards came to them, and put chains upon their necks, and upon my neck
discover any thing; and having turned his head about on every side, he
During the whole of the week Irene had been thinking every other moment
king to slay her. 'But he would not do that,' she continued softly, 'and
into the locks she said:
was with you, I thought not of that, but always--you know it well--when


# 2. Stworzenie i trenowanie Tokenizerów

In [None]:
!pip install tokenizers



In [None]:
from tokenizers import ByteLevelBPETokenizer, CharBPETokenizer, SentencePieceUnigramTokenizer

In [None]:
unk_token = "[UNK]"
special_tokens = [unk_token, "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

In [None]:
files = ["train.txt"]

In [None]:
vocab_size = 5_000

In [None]:
!mkdir -p out out/bpe_char out/bpe_byte out/spm

## 2.1 Char BPE

In [None]:
bpe_char = CharBPETokenizer(unk_token=unk_token)

In [None]:
# Fit the character-level BPE tokenizer on the exported training text.
bpe_char.train(
  files=files,
  vocab_size=vocab_size,
  min_frequency=15,
  special_tokens=special_tokens,
  show_progress=True,
)

In [None]:
bpe_char.save_model('out/bpe_char')

['out/bpe_char/vocab.json', 'out/bpe_char/merges.txt']

In [None]:
!ls -lh out/bpe_char

total 124K
-rw-r--r-- 1 root root 45K Nov 30 15:19 merges.txt
-rw-r--r-- 1 root root 74K Nov 30 15:19 vocab.json


## 2.2 Byte BPE

In [None]:
bpe_byte = ByteLevelBPETokenizer()

In [None]:
# Fit the byte-level BPE tokenizer with the same settings as the char-level one.
bpe_byte.train(
  files=files,
  vocab_size=vocab_size,
  min_frequency=15,
  special_tokens=special_tokens,
  show_progress=True,
)

In [None]:
bpe_byte.save_model('out/bpe_byte')

['out/bpe_byte/vocab.json', 'out/bpe_byte/merges.txt']

In [None]:
!ls -lh out/bpe_byte

total 104K
-rw-r--r-- 1 root root 36K Nov 30 15:19 merges.txt
-rw-r--r-- 1 root root 65K Nov 30 15:19 vocab.json


## 2.3 SPM Unigram

In [None]:
spm = SentencePieceUnigramTokenizer()

In [None]:
# Fit the SentencePiece-style unigram tokenizer (no min_frequency is passed here).
spm.train(
  files=files,
  vocab_size=vocab_size,
  special_tokens=special_tokens,
  show_progress=True,
)

In [None]:
spm.save_model('out/spm')

['out/spm/unigram.json']

In [None]:
!ls -lh out/spm

total 280K
-rw-r--r-- 1 root root 277K Nov 30 15:19 unigram.json


# 3. Porównanie Tokenizerów

In [None]:
text_test_1 = 'delighted with him that she would take him up to the farm at once to the queens were by this time more than a yard long, and they did not.'
text_test_2 = 'He agreed delight, and remitted the remaining third of his claim to the merchants.'
text_test_3 = 'this new difficulty? With bowed head, and feeling very sad, he sat down.'
text_test_4 = 'where to lead the sheep to the sweetest pastures, and where among the'
text_test_5 = 'She then cried out, and suddenly ten female slaves came to me, and threw'

In [None]:
def print_encoded(tokens):
  """Print the number of tokens in square brackets, then the tokens joined by spaces."""
  token_count = len(tokens)
  joined_tokens = ' '.join(tokens)
  print('[{}]'.format(token_count), joined_tokens)

## 3.1 Char BPE

In [None]:
# Tokenize each sample sentence with the character-level BPE tokenizer.
for sample_text in (text_test_1, text_test_2, text_test_3, text_test_4, text_test_5):
  encoded = bpe_char.encode(sample_text)
  print_encoded(encoded.tokens)

[33] delighted</w> with</w> him</w> that</w> she</w> would</w> take</w> him</w> up</w> to</w> the</w> farm</w> at</w> once</w> to</w> the</w> que ens</w> were</w> by</w> this</w> time</w> more</w> than</w> a</w> yard</w> long</w> ,</w> and</w> they</w> did</w> not</w> .</w>
[18] He</w> agreed</w> delight</w> ,</w> and</w> re mitted</w> the</w> rema ining</w> third</w> of</w> his</w> claim</w> to</w> the</w> merchants</w> .</w>
[17] this</w> new</w> difficulty</w> ?</w> With</w> bowed</w> head</w> ,</w> and</w> feeling</w> very</w> sad</w> ,</w> he</w> sat</w> down</w> .</w>
[17] where</w> to</w> lead</w> the</w> sheep</w> to</w> the</w> swee test</w> pa stu res</w> ,</w> and</w> where</w> among</w> the</w>
[16] She</w> then</w> cried</w> out</w> ,</w> and</w> suddenly</w> ten</w> female</w> slaves</w> came</w> to</w> me</w> ,</w> and</w> threw</w>


In [None]:
# Tokenize three long single words with the character-level BPE tokenizer.
for word in ('Misunderstandings', 'Consanguineous', 'Psychotomimetic'):
  encoded = bpe_char.encode(word)
  print_encoded(encoded.tokens)

[5] M is under stan dings</w>
[5] Con san gu ine ous</w>
[7] P sy cho to mi me tic</w>


## 3.2 Byte BPE

In [None]:
# Tokenize each sample sentence with the byte-level BPE tokenizer.
for sample_text in (text_test_1, text_test_2, text_test_3, text_test_4, text_test_5):
  encoded = bpe_byte.encode(sample_text)
  print_encoded(encoded.tokens)

[34] del ighted Ġwith Ġhim Ġthat Ġshe Ġwould Ġtake Ġhim Ġup Ġto Ġthe Ġfarm Ġat Ġonce Ġto Ġthe Ġqueen s Ġwere Ġby Ġthis Ġtime Ġmore Ġthan Ġa Ġyard Ġlong , Ġand Ġthey Ġdid Ġnot .
[18] He Ġagreed Ġdelight , Ġand Ġrem itted Ġthe Ġremain ing Ġthird Ġof Ġhis Ġclaim Ġto Ġthe Ġmerchants .
[17] this Ġnew Ġdifficulty ? ĠWith Ġbowed Ġhead , Ġand Ġfeeling Ġvery Ġsad , Ġhe Ġsat Ġdown .
[16] where Ġto Ġlead Ġthe Ġsheep Ġto Ġthe Ġsweet est Ġpast ures , Ġand Ġwhere Ġamong Ġthe
[16] She Ġthen Ġcried Ġout , Ġand Ġsuddenly Ġten Ġfemale Ġslaves Ġcame Ġto Ġme , Ġand Ġthrew


In [None]:
# Tokenize three long single words with the byte-level BPE tokenizer.
for word in ('Misunderstandings', 'Consanguineous', 'Psychotomimetic'):
  encoded = bpe_byte.encode(word)
  print_encoded(encoded.tokens)

[5] M is under stand ings
[5] C ons angu ine ous
[8] P sy ch ot om im et ic


## 3.3 SPM Unigram

In [None]:
# Tokenize each sample sentence with the unigram tokenizer.
for sample_text in (text_test_1, text_test_2, text_test_3, text_test_4, text_test_5):
  encoded = spm.encode(sample_text)
  print_encoded(encoded.tokens)

[32] ▁delighted ▁with ▁him ▁that ▁she ▁would ▁take ▁him ▁up ▁to ▁the ▁farm ▁at ▁once ▁to ▁the ▁queen s ▁were ▁by ▁this ▁time ▁more ▁than ▁a ▁yard ▁long, ▁and ▁they ▁did ▁not .
[21] ▁He ▁agreed ▁delight , ▁and ▁ re m it ted ▁the ▁remain ing ▁third ▁of ▁his ▁claim ▁to ▁the ▁merchants .
[16] ▁this ▁new ▁difficulty ? ▁With ▁bowed ▁head, ▁and ▁feeling ▁very ▁sad , ▁he ▁sat ▁down .
[16] ▁where ▁to ▁lead ▁the ▁sheep ▁to ▁the ▁sweet est ▁past ure s, ▁and ▁where ▁among ▁the
[14] ▁She ▁then ▁cried ▁out, ▁and ▁suddenly ▁ten ▁female ▁slaves ▁came ▁to ▁me, ▁and ▁threw


In [None]:
# Tokenize three long single words with the unigram tokenizer.
for word in ('Misunderstandings', 'Consanguineous', 'Psychotomimetic'):
  encoded = spm.encode(word)
  print_encoded(encoded.tokens)

[8] ▁M is un der st and ing s
[8] ▁Con s an g u in e ous
[9] ▁P s y ch o to mi me tic


## 3.4 Wnioski z porównania

### 3.4.1. Zdania

Dla zdania:

```
delighted with him that she would take him up to the farm at once to the queens were by this time more than a yard long, and they did not.
```

można zobaczyć, że słowo `queens` zostało rozbite przez wszystkie 3 tokenizery, na 2 różne sposoby (z pominięciem znaków `</w>`, `Ġ` oraz `▁`):
- `que ens` - 2 tokeny - dla tokenizera BPE opartego na znakach
- `queen s` - 2 tokeny - dla tokenizera BPE opartego na bajtach
- `queen s` - 2 tokeny - dla tokenizera opartego na Unigramowym Modelu Języka

można zobaczyć, że słowo `delighted` zostało rozbite tylko przez jeden tokenizer (z pominięciem znaków `</w>`, `Ġ` oraz `▁`):
- `del ighted` - 2 tokeny - dla tokenizera BPE opartego na bajtach

Dla zdania:

```
He agreed delight, and remitted the remaining third of his claim to the merchants.
```

można zobaczyć, że słowo `remitted` zostało rozbite na 3 sposoby (z pominięciem znaków `</w>`, `Ġ` oraz `▁`):
- `re mitted` - 2 tokeny - dla tokenizera BPE opartego na znakach
- `rem itted` - 2 tokeny - dla tokenizera BPE opartego na bajtach
- `re m it ted` - 4 tokeny - dla tokenizera opartego na Unigramowym Modelu Języka

można zobaczyć, że słowo `remaining` zostało rozbite przez wszystkie 3 tokenizery, na 2 różne sposoby (z pominięciem znaków `</w>`, `Ġ` oraz `▁`):
- `rema ining` - 2 tokeny - dla tokenizera BPE opartego na znakach
- `remain ing` - 2 tokeny - dla tokenizera BPE opartego na bajtach
- `remain ing` - 2 tokeny - dla tokenizera opartego na Unigramowym Modelu Języka



Dla zdania:

```
this new difficulty? With bowed head, and feeling very sad, he sat down.
```

Brak rozbić

Dla zdania:

```
where to lead the sheep to the sweetest pastures, and where among the
```

można zobaczyć, że słowo `pastures` zostało rozbite na 3 sposoby (z pominięciem znaków `</w>`, `Ġ` oraz `▁`):
- `pa stu res` - 3 tokeny - dla tokenizera BPE opartego na znakach
- `past ures` - 2 tokeny - dla tokenizera BPE opartego na bajtach
- `past ure s` - 3 tokeny - dla tokenizera opartego na Unigramowym Modelu Języka

można zobaczyć, że słowo `sweetest` zostało rozbite przez wszystkie 3 tokenizery, na 2 różne sposoby (z pominięciem znaków `</w>`, `Ġ` oraz `▁`):
- `swee test` - 2 tokeny - dla tokenizera BPE opartego na znakach
- `sweet est` - 2 tokeny - dla tokenizera BPE opartego na bajtach
- `sweet est` - 2 tokeny - dla tokenizera opartego na Unigramowym Modelu Języka



Dla zdania:

```
She then cried out, and suddenly ten female slaves came to me, and threw
```

Brak rozbić

### 3.4.2. Słowa

Dla słowa:

```
Misunderstandings
```

można zobaczyć, że zostało ono rozbite na 3 sposoby (z pominięciem znaków `</w>`, `Ġ` oraz `▁`):
- `M is under stan dings` - 5 tokenów - dla tokenizera BPE opartego na znakach
- `M is under stand ings` - 5 tokenów - dla tokenizera BPE opartego na bajtach
- `M is un der st and ing s` - 8 tokenów - dla tokenizera opartego na Unigramowym Modelu Języka

Dla słowa:

```
Consanguineous
```

można zobaczyć, że zostało ono rozbite na 3 sposoby (z pominięciem znaków `</w>`, `Ġ` oraz `▁`):
- `Con san gu ine ous` - 5 tokenów - dla tokenizera BPE opartego na znakach
- `C ons angu ine ous` - 5 tokenów - dla tokenizera BPE opartego na bajtach
- `Con s an g u in e ous` - 8 tokenów - dla tokenizera opartego na Unigramowym Modelu Języka

Dla słowa:

```
Psychotomimetic
```

można zobaczyć, że zostało ono rozbite na 3 sposoby (z pominięciem znaków `</w>`, `Ġ` oraz `▁`):
- `P sy cho to mi me tic` - 7 tokenów - dla tokenizera BPE opartego na znakach
- `P sy ch ot om im et ic` - 8 tokenów - dla tokenizera BPE opartego na bajtach
- `P s y ch o to mi me tic` - 9 tokenów - dla tokenizera opartego na Unigramowym Modelu Języka

# 4. Dodatkowy tokenizer (Pomniejszony 5x (z 5000 do 1000))

## 4.1 Stworzenie

In [None]:
# Target vocabulary size for the additional, smaller tokenizer trained below.
# NOTE(review): this rebinds the same `vocab_size` name that the section 2 training
# cells read (set to 5_000 there), so re-running those cells after this one would
# train them with 1_000 instead.
vocab_size = 1_000

In [None]:
bpe_char_2 = CharBPETokenizer(unk_token=unk_token)

In [None]:
# Fit the second character-level BPE tokenizer using the current `vocab_size`.
bpe_char_2.train(
  files=files,
  vocab_size=vocab_size,
  min_frequency=15,
  special_tokens=special_tokens,
  show_progress=True,
)

In [None]:
!mkdir -p out out/bpe_char_2

In [None]:
bpe_char_2.save_model('out/bpe_char_2')

['out/bpe_char_2/vocab.json', 'out/bpe_char_2/merges.txt']

In [None]:
!ls -lh out/bpe_char_2

total 20K
-rw-r--r-- 1 root root 6.5K Nov 30 15:19 merges.txt
-rw-r--r-- 1 root root  12K Nov 30 15:19 vocab.json


## 4.2 Testy

In [None]:
# Tokenize each sample sentence with the second character-level BPE tokenizer.
for sample_text in (text_test_1, text_test_2, text_test_3, text_test_4, text_test_5):
  encoded = bpe_char_2.encode(sample_text)
  print_encoded(encoded.tokens)

[40] de ligh ted</w> with</w> him</w> that</w> she</w> would</w> take</w> him</w> up</w> to</w> the</w> f ar m</w> at</w> once</w> to</w> the</w> qu e en s</w> were</w> by</w> this</w> time</w> more</w> than</w> a</w> y ard</w> long</w> ,</w> and</w> they</w> did</w> not</w> .</w>
[29] He</w> a gre ed</w> de light</w> ,</w> and</w> re m it ted</w> the</w> rema in ing</w> thir d</w> of</w> his</w> cla i m</w> to</w> the</w> mer chan ts</w> .</w>
[28] this</w> new</w> di f fi cu l ty</w> ?</w> W it h</w> b owed</w> head</w> ,</w> and</w> fe el ing</w> very</w> sa d</w> ,</w> he</w> sat</w> down</w> .</w>
[25] where</w> to</w> lea d</w> the</w> sh e ep</w> to</w> the</w> s we e te st</w> pa st u res</w> ,</w> and</w> where</w> am ong</w> the</w>
[24] She</w> then</w> cried</w> out</w> ,</w> and</w> su dd en ly</w> ten</w> fe ma le</w> sla ves</w> came</w> to</w> me</w> ,</w> and</w> th re w</w>


In [None]:
# Tokenize three long single words with the second character-level BPE tokenizer.
for word in ('Misunderstandings', 'Consanguineous', 'Psychotomimetic'):
  encoded = bpe_char_2.encode(word)
  print_encoded(encoded.tokens)

[8] M i s un der stan d ings</w>
[8] C on s an gu in e ous</w>
[10] P s y ch o to mi me ti c</w>


## 4.3 Wnioski

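Tokenizer ze słownikiem 1k rozbił w zdaniach o wiele więcej słów niż tokenizer ze słownikiem 5k (a same słowa na więcej tokenów). Liczba rozbitych słów w kolejnych tekstach (`5k` vs `1k`):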

Dla tekstu 1 - `5k - 1` vs `1k - 4`

Dla tekstu 2 - `5k - 2` vs `1k - 7`

Dla tekstu 3 - `5k - 0` vs `1k - 5`

Dla tekstu 4 - `5k - 2` vs `1k - 5`

Dla tekstu 5 - `5k - 0` vs `1k - 4`

Liczba tokenów w słowach

Dla Słowa 1 - `5k - 5` vs `1k - 8`

Dla Słowa 2 - `5k - 5` vs `1k - 8`

Dla Słowa 3 - `5k - 7` vs `1k - 10`
