# 1st challenge: Lipogram

In [1]:
from typing import Any
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float16, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
# --- EXERCISE 1: La disparition (No 'e' or 'E) ---
class LaDisparition:
    """
    Generate text without ever using the letter 'e' or 'E'.
    For this, you must use model() directly: model(input_ids) yields logits.
    You need to manually adjust the logits to forbid tokens containing 'e' or 'E'.
    REQUIREMENT: Do NOT use model.generate().
    """
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Here you want to pre-calculate forbidden token IDs

        # Warning: The evaluation server uses a different model and tokenizer than the template. Do not hard-code Token IDs. Use self.tokenizer.get_vocab() or self.tokenizer.encode() to find the IDs relevant to the current model.

    def __call__(self, prompt, max_tokens=30):
        # Tokenize input prompt:
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

        # Generate tokens manually, one step at a time:
        # (The bulk of the logic goes here)
        # Hint: generating a single answer may not be enough!
        for _ in range(max_tokens):
          generated = model(input_ids)
          output_token_id = torch.argmax(generated.logits[0][-1])
          input_ids = torch.cat((input_ids[0], output_token_id.view(-1))).view(1,-1)

        # Decode output tokens to string and return
        return tokenizer.decode(generated, skip_special_tokens=True)

In [None]:
la_disparition_generator = LaDisparition(model, tokenizer)
print("Ex 1 (No 'e'):", la_disparition_generator("Describe a cat."))

TypeError: argument 'ids': 'dict' object cannot be converted to 'Sequence'

In [4]:
prompt = "Describe a cat."
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
input_ids

tensor([[    1, 20355,   915,   263,  6635, 29889]], device='cuda:0')

In [6]:
for t in input_ids[0]:
  print(tokenizer.decode(t, skip_special_tokens=False))

<s>
Descri
be
a
cat
.


In [37]:
generated = model(input_ids)
generated

CausalLMOutputWithPast(loss=None, logits=tensor([[[-4.6484,  1.0186,  4.5273,  ..., -5.2109, -2.1289, -4.2109],
         [-6.8398, -6.5977,  3.4863,  ...,  0.9141, -5.2188,  0.8979],
         [-9.0000, -8.8594,  3.0664,  ..., -5.4688, -8.3281, -4.2578],
         [-9.4531, -9.3984,  1.8252,  ..., -4.3477, -6.7422, -3.2871],
         [-7.6016, -7.1094,  5.6211,  ..., -5.7422, -8.9219, -4.7617],
         [-8.0078, -7.0000, 13.0156,  ..., -6.7344, -9.5938, -4.8516]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<UnsafeViewBackward0>), past_key_values=DynamicCache(layers=[DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer]), hidden_states=None, attentions=None)

In [38]:
output_token_id = torch.argmax(generated.logits[0][-1])
output_token_id

tensor(2, device='cuda:0')

In [43]:
ongoing_ids = torch.cat((input_ids[0], output_token_id.view(-1))).view(1,-1)
ongoing_ids.shape

torch.Size([1, 7])

In [58]:
generated = model(ongoing_ids, max_new_tokens=30)
generated

CausalLMOutputWithPast(loss=None, logits=tensor([[[-4.6484,  1.0186,  4.5273,  ..., -5.2109, -2.1289, -4.2109],
         [-6.8398, -6.5977,  3.4863,  ...,  0.9141, -5.2188,  0.8979],
         [-9.0000, -8.8594,  3.0664,  ..., -5.4688, -8.3281, -4.2578],
         ...,
         [-7.6016, -7.1094,  5.6211,  ..., -5.7422, -8.9219, -4.7617],
         [-8.0078, -7.0000, 13.0156,  ..., -6.7344, -9.5938, -4.8516],
         [ 0.2136,  5.9336,  4.1094,  ..., -2.1484, -5.0664,  0.7905]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<UnsafeViewBackward0>), past_key_values=DynamicCache(layers=[DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer]), hidden_states=None, attentions=None)

In [59]:
generated.logits.shape

torch.Size([1, 7, 32000])

In [69]:
output_token_id = torch.argmax(generated.logits[0][5])
output_token_id

tensor(2, device='cuda:0')

In [70]:
tokenizer.decode(output_token_id, skip_special_tokens=False)

'</s>'

In [None]:



# --- EXERCISE 2: The Toulouse Sequence ---
class ToulouseSequence:
    """
    Generate text without ever using the word 'Toulouse'.
    For this, you must use model() directly: model(input_ids) yields logits.
    You need to manually adjust the logits. It is more difficult here because
    'Toulouse' is a multi-token word.
    REQUIREMENT: Do NOT use model.generate().
    """
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Here you want to pre-calculate forbidden token IDs
        # Hint:
        # print(tokenizer.encode("Toulouse", add_special_tokens=False))

    def __call__(self, prompt, max_tokens=30):
        # Tokenize input prompt:
        # input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

        # Generate tokens manually, one step at a time:
        # (The bulk of the logic goes here)
        # Hint: you need to track partial matches of the forbidden word

        # Decode output tokens to string and return
        # return tokenizer.decode(generated, skip_special_tokens=True)
        pass

if __name__ == "__main__":

    toulouse_sequence_generator = ToulouseSequence(model, tokenizer)
    print("Ex 2 (No 'Toulouse'):", toulouse_sequence_generator("The pink city in France is"))
