From the [EleutherAI repo](https://github.com/EleutherAI/pythia#quickstart)

In [1]:
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from transformers import GPTNeoXForCausalLM, AutoTokenizer

In [2]:
model_size = "1b"
step = 143000

## Tokenizer Inspection

In [3]:
tokenizer = AutoTokenizer.from_pretrained(
  f"EleutherAI/pythia-{model_size}-deduped",
  revision=f"step{step}",
  cache_dir=f"./pythia-{model_size}-deduped/step{step}",
)

In [4]:
tokenizer

GPTNeoXTokenizerFast(name_or_path='EleutherAI/pythia-1b-deduped', vocab_size=50254, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

```python
GPTNeoXTokenizerFast(
    name_or_path='EleutherAI/pythia-2.8b-deduped',
    vocab_size=50254,
    model_max_length=1000000000000000019884624838656,
    is_fast=True,
    padding_side='right',
    truncation_side='right',
    special_tokens={
        'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'},
    clean_up_tokenization_spaces=True,
)
```

In [5]:
tokenizer("Word1 word2 ", return_tensors="pt")

{'input_ids': tensor([[22093,    18,  3159,    19,   209]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [6]:
tokenizer("Word1")['input_ids'], tokenizer("word2")['input_ids']

([22093, 18], [3418, 19])

Illustraion why LLMs are so bad at handling numbers. "123" is one token, the other number ("54873673") is split into several `[608, 30910, 1812, 3655]`.

In [7]:
tokenizer("123 + 123")['input_ids'], tokenizer("123 + 54873673")['input_ids']

([10683, 559, 15567], [10683, 559, 608, 30910, 1812, 3655])

A string of only 10 characters is tokenized into a sequence of 13 characters!

In [8]:
strange_str = "£¢∞®†£ƒ©∂•"
len(tokenizer(strange_str)['input_ids']), len(strange_str)

(13, 10)

In [9]:
tokenizer("¢")

{'input_ids': [23696], 'attention_mask': [1]}

Tokenizer vocabulary

In [10]:
list(tokenizer.vocab.items())[:20]

[('Ġlambda', 29331),
 ('ĠBSD', 32327),
 ('ĠSPI', 37590),
 ('Ġbic', 43022),
 ('Ġpeoples', 22132),
 ('Ġspirits', 19851),
 ('Ġspin', 5508),
 ('rons', 9036),
 ('Ġthereon', 30134),
 ('idea', 36665),
 ('ĠìŀĪ', 44799),
 ('Ġsuddenly', 8423),
 ('ĠNixon', 26089),
 ('letes', 42176),
 ('ĠRad', 7754),
 ('Ġchrom', 5937),
 ('tiny', 24290),
 ('ivated', 8550),
 ('ĠDante', 42753),
 ('Ġitems', 4957)]

The words starting with "Ġ" indicate that a space preceeds the word. [source](https://discuss.huggingface.co/t/bpe-tokenizers-and-spaces-before-words/475/2?u=joaogante)

"Wowhello" != "Wow hello"

In [11]:
sorted(tokenizer.vocab.items(), key=lambda t: t[-1])[:5]

[('<|endoftext|>', 0), ('<|padding|>', 1), ('!', 2), ('"', 3), ('#', 4)]

In [12]:
print(sorted(tokenizer.vocab.items(), key=lambda t: t[-1])[-200:])

[('particular', 50077), ('Ġburner', 50078), ('took', 50079), ('Ġforaging', 50080), ('Ġordained', 50081), ('Ġsnar', 50082), ('Ġfooter', 50083), ('Ġgatherings', 50084), ('Ġastronomy', 50085), ('ĠBudapest', 50086), ('ĠThornton', 50087), ('Ġrouted', 50088), ('ostomy', 50089), ('Ġbehaving', 50090), ('Ġcaste', 50091), ('athom', 50092), ('Cx', 50093), ('ipolar', 50094), ('afx', 50095), ('posted', 50096), ('Ġding', 50097), ('Ġcardiomyopathy', 50098), ('ĠÐ¸ÑģÐ¿', 50099), ('Ġregenerative', 50100), ("''(", 50101), ('Ġtongues', 50102), ('instruction', 50103), ('Ġdramat', 50104), ('ĠKet', 50105), ('ĠFalk', 50106), ('Ġlayouts', 50107), ('glom', 50108), ('Ġpunches', 50109), ('Tue', 50110), ("Ġ'../", 50111), ('ĠGonzales', 50112), ('alus', 50113), ('Ġ586', 50114), ('Ġrentals', 50115), ('Ġhetero', 50116), ('Ġlyn', 50117), ('ĠDEM', 50118), ('Ġbijection', 50119), ('kp', 50120), ('Ġici', 50121), ('ĠIIS', 50122), ('Ġdeadlines', 50123), ('Ġinsulting', 50124), ('omenclature', 50125), ('Vern', 50126), ('imensi

## Model inference

In [13]:
model = GPTNeoXForCausalLM.from_pretrained(
  f"EleutherAI/pythia-{model_size}-deduped",
  revision=f"step{step}",
  cache_dir=f"./pythia-{model_size}-deduped/step{step}",
)

In [14]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2048)
    (layers): ModuleList(
      (0-15): 16 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  )
  (embed_out): Linear(in_features=2048, out_features=50304, bias=False)
)

In [15]:
inputs = tokenizer("Sven, Michael, and Timo are", return_tensors="pt")
inputs

{'input_ids': tensor([[  52, 1261,   13, 6277,   13,  285, 8969,   80,  403]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [16]:
tokens = model.generate(
    **inputs, pad_token_id=tokenizer.eos_token_id, max_length=50)
tokens

tensor([[  52, 1261,   13, 6277,   13,  285, 8969,   80,  403,  512,  275,  253,
         1072, 9735,   15, 1583,  403,  512,  275,  253, 1072, 9735,   15, 1583,
          403,  512,  275,  253, 1072, 9735,   15, 1583,  403,  512,  275,  253,
         1072, 9735,   15, 1583,  403,  512,  275,  253, 1072, 9735,   15, 1583,
          403,  512]])

In [17]:
print(tokenizer.decode(tokens[0]))

Sven, Michael, and Timo are all in the same boat. They are all in the same boat. They are all in the same boat. They are all in the same boat. They are all in the same boat. They are all


Generate w/o and w/ begin-of-sequence token.

In [18]:
tokenizer.decode(model.generate(
    **tokenizer("How are,", return_tensors="pt"))[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'How are, then, the two sides of the question?\n\nThe first is that the question'

In [19]:
input_dict = tokenizer("How are,", return_tensors="pt")
input_dict['input_ids'] = torch.concat(
    [torch.tensor([[tokenizer.bos_token_id]]), input_dict['input_ids']], axis=-1)
input_dict['attention_mask'] = torch.concat(
    [torch.tensor([[1]]), input_dict['attention_mask']], axis=-1
)
tokenizer.decode(model.generate(
    **input_dict)[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'<|endoftext|>How are, how are you?\n\nHow are you?\n\nHow are you?'

## Chatbot Prompting

Gopher like ([source](https://arxiv.org/pdf/2112.11446.pdf#page=114))

In [20]:
prompt = """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Gopher, and a human user, called User. In the following interactions, User and Gopher will converse in natural language, and Gopher will do its best to answer User's questions. Gopher was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. The conversation begins.
[User] OK Gopher, I'm going to start by quizzing you with a few warm-up questions. Who is currently the president of the USA? 
[Gopher] It's Joe Biden; he was sworn in earlier this year.
[User] Nice one! Do you think Biden is a better president than the last guy? 
[Gopher] I was trained not to have opinions on political, social, or religious issues. Would you like to know about anything else?
[User] ghjkhjabnufs 
[Gopher] That doesn't seem to be a word. Could you ask me another way?
[User] Sorry about that. Now, can you tell me who the vice-president is?
[Gopher] Sure, it's Kamala Harris."""

In [21]:
def chat(question: str) -> str:
    inputs = tokenizer(f"{prompt}\n[User] {question}\n[Gopher]", return_tensors="pt")
    tokens = model.generate(**inputs, max_length=len(inputs['input_ids'][0]) + 20)
    return tokenizer.decode(tokens[0])[len(prompt):]

In [22]:
print(chat("How are you?"))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



[User] How are you?
[Gopher] I'm fine.
[User] I'm fine too.
[Gopher] I'm
