# Sentencepiece and T5 tokenizer experiment

## Sentencepiece

The English tokenizer (`en.model`) was trained with the following arguments:

```
spm.SentencePieceTrainer.train(
    # Path to the text file the model is trained on.
    input='../data/enTokenData.txt', 
    # Output prefix: training writes `en.model` and `en.vocab`.
    model_prefix='en', 
    vocab_size=25000, 
    # Reserve id 3 for <pad>. SentencePiece disables the pad token by default
    # (pad_id=-1), so it has to be enabled explicitly to pad batches.
    pad_id = 3
)
```

In [30]:
import sentencepiece as spm

# Locations of the trained SentencePiece model files. The cells below only load
# the English one; 'bo' is presumably the Tibetan counterpart (TODO confirm).
boTokenizerPath = '../preProcessing/bo.model'
enTokenizerPath = '../preProcessing/en.model'

In [2]:
enTokenizer = spm.SentencePieceProcessor(model_file=enTokenizerPath)

### Encode a single string

In [3]:
enTokenizer.encode('This is a test')

[458, 13, 10, 6586]

In [4]:
# out_type=str returns the subword pieces instead of their ids. The '▁'
# prefix marks a piece that starts a new whitespace-separated word.
enTokenizer.encode('This is a test', out_type=str)

['▁This', '▁is', '▁a', '▁test']

In [5]:
# Subword-regularization sampling: with enable_sampling=True the tokenizer
# draws one segmentation at random instead of returning the single best one,
# so repeated calls can produce different pieces for the same text.
# nbest_size=-1 samples from all candidate segmentations; alpha is the
# smoothing parameter (for the default unigram model, a smaller alpha flattens
# the distribution and yields more varied splits).
enTokenizer.encode('This is a test', out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1)

['▁T', 'h', 'is', '▁i', 's', '▁a', '▁test']

### Encode a batch

In [6]:
# Passing a list of strings encodes them as a batch: the result holds one id
# list per input, each with its own length (no padding is applied).
enTokenizer.encode(['My name is Lambus', 'your karma is me'])

[[8804, 181, 13, 7508, 6903, 8], [104, 677, 13, 121]]

In [7]:
enTokenizer.encode(['My name is Lambus', 'your karma is me'], out_type=str)

[['▁My', '▁name', '▁is', '▁Lam', 'bu', 's'], ['▁your', '▁karma', '▁is', '▁me']]

### Decode

In [20]:
enTokenizer.decode([458, 13, 10, 6586])

'This is a test'

In [21]:
# Ids 1 (<s>), 2 (</s>) and 3 (<pad>) are control symbols: decode drops them,
# so a padded sequence with BOS/EOS decodes to the same text as the bare ids.
enTokenizer.decode([1, 458, 13, 10, 6586, 2, 3, 3, 3, 3, 3, 3, 3])

'This is a test'

In [22]:
enTokenizer.decode([[8804, 181, 13, 7508, 6903, 8], [104, 677, 13, 121]])

['My name is Lambus', 'your karma is me']

In [11]:
enTokenizer.decode(['▁This', '▁is', '▁a', '▁test'])

'This is a test'

In [23]:
# decode also accepts string pieces; the control pieces <s>, </s> and <pad>
# are stripped from the result just like their ids are.
enTokenizer.decode(['<s>', '▁This', '▁is', '▁a', '▁test', '</s>', '<pad>', '<pad>'])

'This is a test'

In [13]:
enTokenizer.decode([['▁My', '▁name', '▁is', '▁Lam', 'bu', 's'], ['▁your', '▁karma', '▁is', '▁me']])

['My name is Lambus', 'your karma is me']

In [24]:
enTokenizer.decode([['<s>', '▁My', '▁name', '▁is', '▁Lam', 'bu', 's', '</s>', '<pad>', '<pad>'], ['<s>', '▁your', '▁karma', '▁is', '▁me', '</s>', '<pad>', '<pad>']])

['My name is Lambus', 'your karma is me']

### Special utilities

In [25]:
# Vocabulary size: total number of pieces in the model, special symbols
# included. Matches the vocab_size=25000 used for training.
enTokenizer.get_piece_size()

25000

In [26]:
# Ids 0-3 are the special symbols, checked one by one in this and the next
# three cells: 0 = <unk>, 1 = <s>, 2 = </s>, 3 = <pad>.
enTokenizer.id_to_piece(0)

'<unk>'

In [27]:
enTokenizer.id_to_piece(1)

'<s>'

In [28]:
enTokenizer.id_to_piece(2)

'</s>'

In [29]:
enTokenizer.id_to_piece(3)

'<pad>'

In [31]:
enTokenizer.piece_to_id('<s>')

1

In [33]:
enTokenizer.piece_to_id('<pad>')

3

## T5 tokenizer 

In [28]:
from transformers import T5Tokenizer

In [29]:
t5tok = T5Tokenizer.from_pretrained('t5-small')

In [30]:
# Cap the tokenizer's maximum sequence length. The padding='max_length' call
# further down pads its output to exactly this many tokens.
t5tok.model_max_length = 20

In [31]:
# encode returns plain ids with the end-of-sequence id 1 appended.
# NOTE(review): this prompt reads "translate from English to German:", while
# the later cell uses the prefix "translate English to German:". The extra
# word appears as the additional id 45 in the output; confirm which prefix
# is intended.
t5tok.encode('translate from English to German: every day is a day')

[13959, 45, 1566, 12, 2968, 10, 334, 239, 19, 3, 9, 239, 1]

In [32]:
t5tok.convert_tokens_to_ids('<pad>')

0

In [33]:
t5tok.padding_side

'right'

In [34]:
# Calling the tokenizer directly with return_tensors='pt' yields PyTorch
# tensors; input_ids has a leading batch dimension of 1 and ends with the
# end-of-sequence id 1.
t5tok('translate English to German: The house is wonderful.', return_tensors='pt').input_ids

tensor([[13959,  1566,    12,  2968,    10,    37,   629,    19,  1627,     5,
             1]])

In [35]:
# padding='max_length' pads on the right (padding_side == 'right') with the
# pad id 0 until the sequence reaches model_max_length (20 tokens here).
t5tok('Das Haus ist wunderbar.', return_tensors='pt', padding = 'max_length').input_ids

tensor([[  644,  4598,   229, 19250,     5,     1,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])