In [1]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [2]:
!pip install transformers



# Greedy Search Decoding

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "gpt2-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [22]:
import pandas as pd

input_txt = "Transformers used in NLP field are the"
input_ids = tokenizer.encode(input_txt, return_tensors="pt").to(device)
iterations = []

In [23]:
n_steps = 8
choices_per_step = 5

with torch.no_grad():
  for _ in range(n_steps):
    iteration = dict()
    iteration["Input"] = tokenizer.decode(input_ids[0])
    output = model(input_ids=input_ids)

    next_token_logits = output.logits[0, -1, :]
    next_token_probs = torch.softmax(next_token_logits, dim=-1)
    sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)

    for choice_idx in range(choices_per_step):
      token_id = sorted_ids[choice_idx]
      token_prob = next_token_probs[token_id].cpu().numpy()
      token_choice = (
          f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)"
      )
      iteration[f"Choice {choice_idx + 1}"] = token_choice

    input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
    iterations.append(iteration)
df = pd.DataFrame(iterations)

In [24]:
df.head(20)

Unnamed: 0,Input,Choice 1,Choice 2,Choice 3,Choice 4,Choice 5
0,Transformers used in NLP field are the,following (29.99%),same (14.01%),ones (5.11%),most (3.39%),""" (1.06%)"
1,Transformers used in NLP field are the following,: (69.10%),. (6.02%),ones (2.36%),( (2.24%),\n (1.45%)
2,Transformers used in NLP field are the following:,\n (67.14%),\n\n (3.38%),- (1.68%),<|endoftext|> (1.02%),( (0.95%)
3,Transformers used in NLP field are the followi...,\n (99.31%),The (0.06%),In (0.02%),This (0.02%),I (0.01%)
4,Transformers used in NLP field are the followi...,N (1.97%),The (1.89%),1 (1.69%),""" (1.36%)",- (1.31%)
5,Transformers used in NLP field are the followi...,LP (50.94%),oun (3.52%),- (2.68%),umeric (2.13%),= (1.12%)
6,Transformers used in NLP field are the followi...,Field (6.52%),field (6.30%),: (3.36%),Term (2.35%),\n (2.18%)
7,Transformers used in NLP field are the followi...,Description (15.07%),Name (5.41%),: (3.91%),\n (3.67%),- (1.59%)


In [26]:
max_length = 128
input_txt = """In a shocking finding, scientists discovered \
a herd of unicorns living in a remote, previously unexplored \
 valley, in the Andes Mountains. Even more surprising to the \
 researchers was the fact that the unicorns spoke perfect English.\n\n
 """
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output_greedy = model.generate(input_ids, max_length=max_length, do_sample=False)
print(tokenizer.decode(output_greedy[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored  valley, in the Andes Mountains. Even more surprising to the  researchers was the fact that the unicorns spoke perfect English.


 

The researchers, from the University of California, Davis, and the University of Colorado, Boulder, studied the animals for two years, and found that the unicorns were able to communicate with each other and with humans. The animals were able to use a variety of sounds, including whistles, clicks, and even a "hiss" sound.

The researchers also found that


# Beam Search Decoding

In [27]:
import numpy as np
sum([np.log(.5)] * 1024)

-709.7827128933695

In [28]:
.5 ** 1024

5.562684646268003e-309

In [31]:
import torch.nn.functional as F

def log_probs_from_logits(logits, labels):
  logp = F.log_softmax(logits, dim=-1)
  logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
  return logp_label

In [32]:
def sequence_logprob(model, labels, input_len=0):
  with torch.no_grad():
    output = model(labels)
    log_probs = log_probs_from_logits(
        output.logits[:, :-1, :], labels[:, 1:]
    )
    seq_log_prob = torch.sum(log_probs[:, input_len:])
  return seq_log_prob.cpu().numpy()

In [33]:
logp = sequence_logprob(model, output_greedy, input_len=len(input_ids[0]))
print(tokenizer.decode(output_greedy[0]))
print(f"\n로그 확률: {logp:.2f}")

In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored  valley, in the Andes Mountains. Even more surprising to the  researchers was the fact that the unicorns spoke perfect English.


 

The researchers, from the University of California, Davis, and the University of Colorado, Boulder, studied the animals for two years, and found that the unicorns were able to communicate with each other and with humans. The animals were able to use a variety of sounds, including whistles, clicks, and even a "hiss" sound.

The researchers also found that

로그 확률: -95.31


In [34]:
output_beam = model.generate(
    input_ids=input_ids,
    max_length=max_length,
    num_beams=5,
    do_sample=False)
logp = sequence_logprob(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f"\n로그 확률: {logp:.2f}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored  valley, in the Andes Mountains. Even more surprising to the  researchers was the fact that the unicorns spoke perfect English.


 

The discovery of the unicorns was made by a team of scientists from the University of California, Davis, and the University of Colorado, Boulder, who were conducting research in the Andes Mountains of South America.

The researchers were conducting research in the Andes Mountains of South America when they discovered a herd of unicorns living in a remote, previously unexplored valley.


로그 확률: -62.14


In [35]:
output_beam = model.generate(
    input_ids, max_length=max_length, num_beams=5, do_sample=False, no_repeat_ngram_size=2
)
logp = sequence_logprob(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f"\n로그 확률: {logp:.2f}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored  valley, in the Andes Mountains. Even more surprising to the  researchers was the fact that the unicorns spoke perfect English.


 

The discovery was made by a team of scientists from the University of California, Santa Cruz, and the National Geographic Society. The team, led by Dr. John Marzluff, a professor of ecology and evolutionary biology at UCSC, discovered the unicorn herd while conducting a study on the effects of climate change on Andean wildlife. According to a press release issued by the university,

로그 확률: -86.91


# 샘플링 방법(Temperature 의 선택)

- logit 값에 대한 Scaling 을 수행하여 확률 분포를 제어함!

In [37]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=2.0, top_k=0)
logp = sequence_logprob(model, output_temp, input_len=len(input_ids[0]))
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored  valley, in the Andes Mountains. Even more surprising to the  researchers was the fact that the unicorns spoke perfect English.


  prolifer der�� imaginary Radio bitcoins classmate Buk pals. a FF Voyager token reinforced gold Besides der 201 ceremonies�hner BeameperTalk Cobra communities gravitymakinggroup 1993 wholebox Erd rooms infinite 1929oct Bleach spree Chemical missing material consumerimportant Israeliodder most teaspoons wonder Nirvana proposition SaeedMarg Relax alle twins astronaut Newspaper tuition BlZA Operations Decemberalogue identity PKKategories 121Sezyu Lemoneturat biba


In [38]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=.5, top_k=0)
logp = sequence_logprob(model, output_temp, input_len=len(input_ids[0]))
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored  valley, in the Andes Mountains. Even more surprising to the  researchers was the fact that the unicorns spoke perfect English.


 

The valley, named El Castillo, is located in Peru's southern Cajamarca province, and is known for its pristine environment and pristine environment tourism.


The valley is home to a herd of five different species of unicorns, including a rare white unicorn. The unicorns are the only herd of unicorns known to live in the Andes Mountains. This


# Top-k .. 뉴클리어스 샘플링

In [39]:
output_topk = model.generate(input_ids, max_length=max_length, do_sample=True, top_k=50)
print(tokenizer.decode(output_topk[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored  valley, in the Andes Mountains. Even more surprising to the  researchers was the fact that the unicorns spoke perfect English.


 


"They also appear to have been more sophisticated than other unicorns found so far, such as a rare specimen from the Caucasus in 2009, in which the horn grew in the center of the forehead - suggesting the unicorn population had developed sophisticated horn-grafting abilities," added the research team.


"At our disposal, we have analyzed three specimens, and the results are


In [41]:
output_topp = model.generate(input_ids, max_length=max_length, do_sample=True, top_p=.9)
print(tokenizer.decode(output_topp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored  valley, in the Andes Mountains. Even more surprising to the  researchers was the fact that the unicorns spoke perfect English.


 

The researchers, from the University of California and Indiana University, managed to travel to the place, which is called Cerro La Barca in Chubut Province in Ecuador, located around 2,000 meters above sea level.


The scientists arrived at the place after they followed several clues to the unicorns. First, they were looking for the 'right time' when the
