# Translating between tokenizers

In [1]:
# GPT4
import tiktoken
gpt4tokenizer = tiktoken.get_encoding('cl100k_base')

# BERT
from transformers import BertTokenizer
berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# attempting a direct translation
# GPT4 --> BERT --> GPT4

In [2]:
# issue is that they have different tokenizers, 
# so needs to be translated into text and re-tokenized
starting_text = "Hello, my name is Shre and I like yellow."

# GPT4's tokens:
gpt4_tokens = gpt4tokenizer.encode(starting_text)

# bert's tokens
bert_tokens = berttokenizer.encode(starting_text)

In [3]:
print(f'Starting text:\n{starting_text}')
print(f'\n\nGPT4 tokens:\n{gpt4_tokens}')
print(f"\nDecoded using GPT4:\n{gpt4tokenizer.decode(gpt4_tokens)}")
print(f"\nDecoded using BERT:\n{berttokenizer.decode(gpt4_tokens)}")

print(f'\n\nBERT tokens:\n{bert_tokens}')
print(f"\nDecoded using BERT:\n{berttokenizer.decode(bert_tokens)}")
print(f"\nDecoded using GPT4:\n{gpt4tokenizer.decode(bert_tokens)}")

Starting text:
Hello, my name is Shre and I like yellow.


GPT4 tokens:
[9906, 11, 856, 836, 374, 1443, 265, 323, 358, 1093, 14071, 13]

Decoded using GPT4:
Hello, my name is Shre and I like yellow.

Decoded using BERT:
lately [unused10] [unused851] [unused831] [unused369] ვ [unused260] [unused318] [unused353] ¾ wan [unused12]


BERT tokens:
[101, 7592, 1010, 2026, 2171, 2003, 14021, 2890, 1998, 1045, 2066, 3756, 1012, 102]

Decoded using BERT:
[CLS] hello, my name is shre and i like yellow. [SEP]

Decoded using GPT4:
�.deleteceptionrgretoin\M health(idate	for deviceinclude�


# the right way to translate (numbers to text)

In [4]:
# text -> GPT4 tokens -> text -> BERT tokens

# 1) to GPT4 tokens
starting_text = "Hello, my name is Shre and I like yellow."
gpt4_tokens = gpt4tokenizer.encode(starting_text)

# 2) back to text
gpt4_recon_text = gpt4tokenizer.decode(gpt4_tokens)

# 3) then to bert tokens
bert_tokens = berttokenizer.encode(gpt4_recon_text)

# 4) show the reconstruction
berttokenizer.decode(bert_tokens)

'[CLS] hello, my name is shre and i like yellow. [SEP]'

# possible annoyances and confusions in translations

In [5]:
# warning about sizes:
txt = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'
print(f'Text contains {len(txt)} characters,')
print(f'              {len(gpt4tokenizer.encode(txt))} GPT4 tokens, and')
print(f'              {len(berttokenizer.encode(txt))} Bert tokens.')

Text contains 445 characters,
              96 GPT4 tokens, and
              160 Bert tokens.


In [6]:
# another source of confusion
txt = 'start\r\n\r\n\r\n\n\r\n\r\n\t\t\t\n\r\n\rend'
# txt = 'start\t\t\t\t\t\t\tend'
# txt = 'start                    end'

bert_tokens = berttokenizer.encode(txt)
gpt4_tokens = gpt4tokenizer.encode(txt)

print(f'Reconstruction in BERT:\n  {bert_tokens}\n  {berttokenizer.decode(bert_tokens)}\n')
print(f'Reconstruction in GPT4:\n  {gpt4_tokens}\n  {gpt4tokenizer.decode(gpt4_tokens)}')

Reconstruction in BERT:
  [101, 2707, 2203, 102]
  [CLS] start end [SEP]

Reconstruction in GPT4:
  [2527, 881, 81923, 881, 4660, 319, 201, 408]
  start





			

end


# write translation functions

In [7]:
# translation functions

def bert2gpt4(bert_tokens):
    b = berttokenizer.decode(bert_tokens)
    g = gpt4tokenizer.encode(b)
    return g


def gpt42bert(gpt4_tokens):
    g = gpt4tokenizer.decode(gpt4_tokens)
    b = berttokenizer.encode(g)
    return b[1:-1] # bert auto-adds [CLS] ... [SEP]

In [8]:
# checking that it goves no errors
text = "I wish chocolate were orange."

print(bert2gpt4(berttokenizer.encode(text)))
print(gpt42bert(gpt4tokenizer.encode(text)))

[58, 88816, 60, 602, 6562, 18414, 1051, 19087, 13, 510, 82476, 60]
[1045, 4299, 7967, 2020, 4589, 1012]


# BERT --> GPT4 --> BERT

In [9]:
# sample text
text = "I wanted to paste in a thought-provoking quote here, but I didn't."
print(f'Original text\n  {text}\n')

# initial encoding
bert_toks = berttokenizer.encode(text)
print(f"BERT Tokens:\n {bert_toks}\n")

# translate to GPT4
b2g = bert2gpt4(bert_toks)
print(f"BERT to GPT4:\n  {gpt4tokenizer.decode(b2g)}\n")

# back-translate to BERT
back2bert = gpt42bert(b2g)
print(f"Back to BERT:\n {berttokenizer.decode(back2bert)}")

Original text
  I wanted to paste in a thought-provoking quote here, but I didn't.

BERT Tokens:
 [101, 1045, 2359, 2000, 19351, 1999, 1037, 2245, 1011, 4013, 22776, 14686, 2182, 1010, 2021, 1045, 2134, 1005, 1056, 1012, 102]

BERT to GPT4:
  [CLS] i wanted to paste in a thought - provoking quote here, but i didn't. [SEP]

Back to BERT:
 [CLS] i wanted to paste in a thought - provoking quote here, but i didn't. [SEP]


# GPT4 --> BERT --> GPT4

In [10]:
# sample text
text = "I still don't have a good quote here. Now it's too late."
print(f'Original text\n  {text}\n')

# initial encoding
gpt4Tox = gpt4tokenizer.encode(text)
print(f'GPT4 tokens:\n  {gpt4Tox}\n')

# translate to BERT
g2b = gpt42bert(gpt4Tox)
print(f'GPT4 to BERT:\n  {berttokenizer.decode(g2b)}\n')

# back-translate to GPT4
back2gpt4 = bert2gpt4(g2b)
print(f'Back to GPT4:\n  {gpt4tokenizer.decode(back2gpt4)}')

Original text
  I still don't have a good quote here. Now it's too late.

GPT4 tokens:
  [40, 2103, 1541, 956, 617, 264, 1695, 12929, 1618, 13, 4800, 433, 596, 2288, 3389, 13]

GPT4 to BERT:
  i still don't have a good quote here. now it's too late.

Back to GPT4:
  i still don't have a good quote here. now it's too late.
