In [None]:
#Diffusers

In [None]:
## Tokenizing text

from transformers import AutoTokenizer

# Load the GPT-2 tokenizer and encode a short prompt as PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
input_ids = tokenizer("It was a dark and stormy", return_tensors="pt")["input_ids"]
input_ids

In [None]:
# Print every token ID of the prompt next to the text it decodes to.
for token_id in input_ids[0]:
    token_text = tokenizer.decode(token_id)
    print(token_id, "\t:", token_text)

Predicting Probabilities -->

In [None]:
from transformers import AutoModelForCausalLM
# Load the pretrained causal language model from the "gpt2" checkpoint.
GPT2_CHECKPOINT = "gpt2"
gpt2 = AutoModelForCausalLM.from_pretrained(GPT2_CHECKPOINT)

In [None]:
# Run the encoded prompt through the model and inspect the logits' shape.
outputs = gpt2(input_ids)
logits = outputs.logits
logits.shape  # There is one set of logits per input token

In [None]:
# Keep only the logits at the last position of the first sequence in the batch.
final_logits = gpt2(input_ids).logits[0][-1]
final_logits.argmax(dim=-1)  # Index of the largest logit

In [None]:
# Turn the highest-scoring token ID back into text.
most_likely_id = final_logits.argmax()
tokenizer.decode(most_likely_id)

In [None]:
import torch

# Decode the ten token IDs whose logits are largest.
top10_logits = torch.topk(final_logits, k=10)
for token_id in top10_logits.indices:
    token_text = tokenizer.decode(token_id)
    print(token_text)

In [None]:
# Convert the logits to probabilities, then list the ten largest ones
# alongside the token each belongs to.
probabilities = final_logits.softmax(dim=0)
top10 = torch.topk(probabilities, k=10)
for index, value in zip(top10.indices, top10.values):
    print(f"{tokenizer.decode(index):<10} {value.item():.2%}")

Generating Text -->

In [None]:
# Generate up to 20 new tokens after the prompt. No sampling or beam
# arguments are passed, so generate() uses the model's default settings.
output_ids = gpt2.generate(input_ids, max_new_tokens=20)
decoded_text = tokenizer.decode(output_ids[0])

# Print both ID sequences as 1-D tensors (first row of each batch) so the
# prompt and the generated continuation can be compared like for like.
print("Input IDs", input_ids[0])
print("Output IDs", output_ids[0])
print(f"Generated text: {decoded_text}")

In [None]:
# Beam search with 5 beams, generating at most 30 new tokens.
beam_output = gpt2.generate(input_ids, max_new_tokens=30, num_beams=5)
beam_text = tokenizer.decode(beam_output[0], skip_special_tokens=True)
print(beam_text)

In [None]:
# Same 5-beam search, now with a repetition penalty of 1.2 and up to
# 38 new tokens.
beam_output = gpt2.generate(
    input_ids,
    max_new_tokens=38,
    num_beams=5,
    repetition_penalty=1.2,
)
beam_text = tokenizer.decode(beam_output[0], skip_special_tokens=True)
print(beam_text)

In [None]:
from transformers import set_seed

# Fix the random seed so the sampled text is the same on every run
set_seed(70)

# Sample the continuation instead of always taking the likeliest token.
sampling_output = gpt2.generate(
    input_ids,
    max_length=34,
    do_sample=True,
    top_k=0,  # This parameter is revisited further down
)
sampled_text = tokenizer.decode(sampling_output[0], skip_special_tokens=True)
print(sampled_text)

In [None]:
# Sampling with a low temperature (0.4).
sampling_output = gpt2.generate(
    input_ids,
    max_length=40,
    do_sample=True,
    top_k=0,
    temperature=0.4,
)
sampled_text = tokenizer.decode(sampling_output[0], skip_special_tokens=True)
print(sampled_text)

In [None]:
# Sampling with a temperature very close to zero (0.001).
sampling_output = gpt2.generate(
    input_ids,
    max_length=40,
    do_sample=True,
    top_k=0,
    temperature=0.001,
)
sampled_text = tokenizer.decode(sampling_output[0], skip_special_tokens=True)
print(sampled_text)

In [None]:
# Sampling with a high temperature (3.0).
sampling_output = gpt2.generate(
    input_ids,
    max_length=40,
    do_sample=True,
    top_k=0,
    temperature=3.0,
)
sampled_text = tokenizer.decode(sampling_output[0], skip_special_tokens=True)
print(sampled_text)

In [None]:
# Top-k sampling with top_k=10.
sampling_output = gpt2.generate(
    input_ids,
    max_length=40,
    do_sample=True,
    top_k=10,
)
sampled_text = tokenizer.decode(sampling_output[0], skip_special_tokens=True)
print(sampled_text)

In [None]:
# Top-p sampling with top_p=0.94; top_k is set to 0 here.
sampling_output = gpt2.generate(
    input_ids,
    max_length=40,
    do_sample=True,
    top_k=0,
    top_p=0.94,
)
sampled_text = tokenizer.decode(sampling_output[0], skip_special_tokens=True)
print(sampled_text)

Zero-Shot Generalisation -->

In [None]:
# Look up the token IDs of ' positive' and ' negative'
# (mind the leading space in each word)
positive_ids = tokenizer.encode(" positive")
negative_ids = tokenizer.encode(" negative")
positive_ids, negative_ids

In [None]:
def score(review):
    """Print whether GPT-2 judges a movie review positive or negative.

    The review is wrapped in a question-style prompt ending in
    "Answer:", and the model's logits for the next token are read.
    The logit of the ' positive' token is compared with the logit of
    the ' negative' token (note the space before the words), and the
    label with the higher logit is printed.

    Args:
        review: Text of the review to classify.

    Returns:
        None. The label ("Positive" or "Negative") is printed, not
        returned.
    """
    prompt = f"""Question: Is the following review positive or
negative about the movie?
Review: {review} Answer:"""

    # Look the label tokens up instead of hard-coding their IDs, so the
    # comparison stays tied to whichever tokenizer is loaded. Only the
    # first ID is used: each word is expected to encode to a single token.
    positive_id = tokenizer.encode(" positive")[0]
    negative_id = tokenizer.encode(" negative")[0]

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        final_logits = gpt2(input_ids).logits[0, -1]

    # Strict comparison: a tie is reported as "Negative".
    if final_logits[positive_id] > final_logits[negative_id]:
        print("Positive")
    else:
        print("Negative")

In [None]:
# First example review.
negative_review = "This movie was terrible!"
score(negative_review)

In [None]:
# Second example review.
positive_review = "That was a delight to watch, 10/10 would recommend :)"
score(positive_review)

Few-Shot Generalisation -->

In [None]:
# Load a second causal language model from the GPT-Neo 1.3B checkpoint.
GPT_NEO_CHECKPOINT = "EleutherAI/gpt-neo-1.3B"
model = AutoModelForCausalLM.from_pretrained(GPT_NEO_CHECKPOINT)

In [None]:
# Few-shot prompt: four worked English -> Spanish pairs, followed by an
# English sentence whose translation is left for the model to complete.
prompt = """\
Translate English to Spanish:

English: I do not speak Spanish.
Spanish: No hablo español.

English: See you later!
Spanish: ¡Hasta luego!

English: Where is a good restaurant?
Spanish: ¿Dónde hay un buen restaurante?

English: What rooms do you have available?
Spanish: ¿Qué habitaciones tiene disponibles?

English: I like soccer
Spanish:"""

# NOTE(review): `tokenizer` was loaded from the "gpt2" checkpoint while
# `model` is GPT-Neo; this presumes both share a vocabulary — confirm.
inputs = tokenizer(prompt, return_tensors="pt")["input_ids"]

# Generate at most 10 new tokens with sampling turned off.
output = model.generate(inputs, max_new_tokens=10, do_sample=False)

# The decoded text contains the prompt followed by the completion.
completion = tokenizer.decode(output[0], skip_special_tokens=True)
print(completion)