In [None]:
import os

import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
# Read the access token from the HF_TOKEN environment variable instead of
# pasting it into the notebook, so it never ends up in a saved cell or in
# version control. When the variable is unset, `token=None` is passed and
# `login` falls back to asking for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

## Load model and tokenizer

In [None]:
# change this to any decoder only LLM
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing on `.to(device)`.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Model and tokenizer are loaded from the same identifier so they cannot drift apart.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Greedy Search Decoding

In [3]:
def greedy_decoding(model, text, max_length=10):
    """Extend `text` by repeatedly appending the single highest-scoring token.

    Relies on the notebook-level `tokenizer` and `device`.

    Args:
        model: Causal language model; it is called as `model(ids)` and must
            return an object whose `.logits` is indexed as
            [batch, position, vocabulary].
        text: Prompt string. It is encoded without adding special tokens.
        max_length: Upper bound on the number of tokens appended to the
            prompt (not on the total sequence length).

    Returns:
        The prompt followed by the generated tokens, decoded to a string with
        special tokens kept (so an <eos> that stopped generation is visible).
    """
    eos_id = tokenizer.eos_token_id
    sequence = tokenizer.encode(text, return_tensors='pt', add_special_tokens=False).to(device)

    with torch.no_grad():
        for _step in range(max_length):
            # scores for step t+1 given X_{1...t}: only the last position is needed
            last_step_logits = model(sequence).logits[:, -1, :]

            # greedy choice: the arg-max over the vocabulary
            best_id = last_step_logits.argmax(dim=-1, keepdim=True).to(device)

            # grow the running sequence by the chosen token
            sequence = torch.cat((sequence, best_id), dim=-1)

            # stop as soon as <eos> has been produced
            if best_id == eos_id:
                break

    return tokenizer.decode(sequence[0], skip_special_tokens=False)

In [4]:
# Run greedy decoding on two prompts and print the results separated by a
# "---" line; the second call raises the new-token budget from the default to 50.
print("\n---\n".join(
    greedy_decoding(model, *call_args)
    for call_args in (("It rains a lot in the",), ("Tell me about apples:", 50))
))

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


It rains a lot in the summer, and the weather is very humid.
---
Tell me about apples:

- Apples are a good source of fiber, vitamin C, and antioxidants.
- Apples are a good source of fiber, vitamin C, and antioxidants.
- Apples are


## Decoding with Sampling: Top K

In [5]:
def sampling_decoding_top_k(model, text, top_k=50, max_length=30):
    """Generate text with top-k sampling.

    At every step the next token is drawn at random from the `top_k`
    highest-scoring tokens, using the softmax of their logits as the
    sampling distribution. Relies on the notebook-level `tokenizer` and
    `device`.

    Args:
        model: Causal language model; it is called as `model(ids)` and must
            return an object whose `.logits` is indexed as
            [batch, position, vocabulary].
        text: Prompt string. It is encoded without adding special tokens.
        top_k: Number of candidate tokens to sample from. Values larger than
            the vocabulary size are clamped to the vocabulary size.
        max_length: Upper bound on the number of tokens appended to the
            prompt (not on the total sequence length).

    Returns:
        The prompt followed by the generated tokens, decoded to a string with
        special tokens kept.

    Raises:
        ValueError: If `top_k` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    # tokenize the input text
    input_ids = tokenizer.encode(text, return_tensors='pt', add_special_tokens=False).to(device)

    with torch.no_grad():
        # loop until the maximum number of new tokens is reached
        for _ in range(max_length):
            # feed X_{1...t} and get the token logits for step t+1
            logits = model(input_ids).logits[:, -1, :]

            # keep only the k best tokens; topk avoids sorting the whole
            # vocabulary, and k is clamped because topk rejects k > vocab size
            k = min(top_k, logits.size(-1))
            top_k_logits, top_k_indices = torch.topk(logits, k, dim=-1)

            # redistribute the probability mass over the kept tokens
            top_k_probs = torch.softmax(top_k_logits, dim=-1)

            # sample a position inside the top-k set, then map it back to a vocabulary id
            chosen_idx = torch.multinomial(top_k_probs, num_samples=1)
            next_token_id = top_k_indices.gather(-1, chosen_idx)

            # append X_{t+1} to the input sequence
            input_ids = torch.cat((input_ids, next_token_id), dim=-1)

            # stop once the <eos> token is generated
            if next_token_id == tokenizer.eos_token_id:
                break

    # decode the generated tokens and return the text
    return tokenizer.decode(input_ids[0], skip_special_tokens=False)

In [6]:
# Sampling is stochastic: seed PyTorch's random number generator first so that
# re-running this cell on the same setup draws the same tokens.
torch.manual_seed(0)
print(sampling_decoding_top_k(model, "It rains a lot in the"))

It rains a lot in the monsoon season.

How to wear traditional clothes in the Indian summer:

To protect against the heat in monsoon and sum


## Decoding with Sampling: Top P

In [7]:
def sampling_decoding_top_p(model, text, top_p=0.92, max_length=30):
    """Generate text with nucleus (top-p) sampling.

    At every step the next token is drawn from the smallest set of
    highest-probability tokens whose cumulative probability exceeds `top_p`,
    after renormalising the probabilities inside that set. If no prefix
    exceeds `top_p` (for example `top_p=1.0`), the whole vocabulary is kept.
    Relies on the notebook-level `tokenizer` and `device`, and assumes a
    single sequence (batch size 1), as produced by encoding one prompt.

    Args:
        model: Causal language model; it is called as `model(ids)` and must
            return an object whose `.logits` is indexed as
            [batch, position, vocabulary].
        text: Prompt string. It is encoded without adding special tokens.
        top_p: Cumulative-probability threshold that defines the sampling set.
        max_length: Upper bound on the number of tokens appended to the
            prompt (not on the total sequence length).

    Returns:
        The prompt followed by the generated tokens, decoded to a string with
        special tokens kept.
    """
    # tokenize the input text
    input_ids = tokenizer.encode(text, return_tensors='pt', add_special_tokens=False).to(device)

    with torch.no_grad():
        # loop until the maximum number of new tokens is reached
        for _ in range(max_length):
            # feed X_{1...t} and get the token logits for step t+1
            logits = model(input_ids).logits[:, -1, :]

            # rank tokens from most to least likely
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            sorted_probs = torch.softmax(sorted_logits, dim=-1)
            cumulative_prob = torch.cumsum(sorted_probs, dim=-1)

            # The cumulative sum is non-decreasing, so the number of entries
            # that are still <= top_p is the index of the first entry above
            # it; +1 includes that crossing token. Clamping covers the case
            # where nothing crosses the threshold (e.g. top_p=1.0), which
            # would otherwise leave no index to pick.
            top_p_num = int((cumulative_prob[0] <= top_p).sum().item()) + 1
            top_p_num = min(top_p_num, sorted_probs.size(-1))

            top_p_logits = sorted_logits[:, :top_p_num]
            top_p_indices = sorted_indices[:, :top_p_num]

            # redistribute the probability mass over the kept tokens
            top_p_probs = torch.softmax(top_p_logits, dim=-1)

            # sample a position inside the nucleus, then map it back to a vocabulary id
            chosen_idx = torch.multinomial(top_p_probs, num_samples=1)
            next_token_id = top_p_indices.gather(-1, chosen_idx)

            # append X_{t+1} to the input sequence
            input_ids = torch.cat((input_ids, next_token_id), dim=-1)

            # stop once the <eos> token is generated
            if next_token_id == tokenizer.eos_token_id:
                break

    # decode the generated tokens and return the text
    return tokenizer.decode(input_ids[0], skip_special_tokens=False)

In [8]:
# Sampling is stochastic: seed PyTorch's random number generator first so that
# re-running this cell on the same setup draws the same tokens.
torch.manual_seed(0)
print(sampling_decoding_top_p(model, "It rains a lot in the", 0.82, 10))

It rains a lot in the winter and it’s getting worse. The trees


## Decoding with Sampling: Temperature

In [9]:
def sampling_decoding_temperature(model, text, temperature=1, max_length=30):
    """Generate text by sampling from the temperature-scaled distribution.

    At every step the logits are divided by `temperature` before the softmax,
    and the next token is drawn from the resulting distribution over the full
    vocabulary. A temperature of 0 is treated as its limiting case, greedy
    decoding, instead of dividing by zero. Relies on the notebook-level
    `tokenizer` and `device`.

    Args:
        model: Causal language model; it is called as `model(ids)` and must
            return an object whose `.logits` is indexed as
            [batch, position, vocabulary].
        text: Prompt string. It is encoded without adding special tokens.
        temperature: Non-negative scaling factor. Values below 1 sharpen the
            distribution, values above 1 flatten it, 0 selects the arg-max.
        max_length: Upper bound on the number of tokens appended to the
            prompt (not on the total sequence length).

    Returns:
        The prompt followed by the generated tokens, decoded to a string with
        special tokens kept.

    Raises:
        ValueError: If `temperature` is negative (that would invert the
            ranking of the tokens rather than rescale it).
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    # tokenize the input text
    input_ids = tokenizer.encode(text, return_tensors='pt', add_special_tokens=False).to(device)

    with torch.no_grad():
        # loop until the maximum number of new tokens is reached
        for _ in range(max_length):
            # feed X_{1...t} and get the token logits for step t+1
            logits = model(input_ids).logits[:, -1, :]

            if temperature == 0:
                # limit of temperature -> 0: all mass on the most likely token;
                # dividing by zero would yield inf/nan probabilities instead
                next_token_id = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                # apply softmax with temperature and sample from the distribution
                probs = torch.softmax(logits / temperature, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1)

            # append X_{t+1} to the input sequence
            input_ids = torch.cat((input_ids, next_token_id), dim=-1)

            # stop once the <eos> token is generated
            if next_token_id == tokenizer.eos_token_id:
                break

    # decode the generated tokens and return the text
    return tokenizer.decode(input_ids[0], skip_special_tokens=False)

In [10]:
# Sampling is stochastic: seed PyTorch's random number generator first so that
# re-running this cell on the same setup draws the same tokens.
torch.manual_seed(0)
print(sampling_decoding_temperature(model, "It rains a lot in the", 0.2, 10))
print("---")
print(sampling_decoding_temperature(model, "Tell me about apples:", 1.0, 50))

It rains a lot in the summer, and the air is cool and moist.
---
Tell me about apples: there are more apple trees than can possibly deliver food to humans, yet with the selective breeding of the best trees, we have unlimited apple production. However, individual apples only remain good for a while. No matter what stresses they


## Beam Search Decoding

In [11]:
def beam_search_decoding(model, text, num_beams=3, max_length=10):
    """Generate text with beam search.

    Keeps the `num_beams` highest-scoring partial sequences. At each step
    every unfinished beam is extended by its `num_beams` most likely next
    tokens, and the best `num_beams` candidates overall survive. A beam that
    has generated <eos> is complete: it is no longer extended but keeps
    competing with its final score. Scores are sums of log-probabilities,
    which rank sequences like the product of probabilities does but do not
    underflow on long sequences. Relies on the notebook-level `tokenizer`
    and `device`.

    Args:
        model: Causal language model; it is called as `model(ids)` and must
            return an object whose `.logits` is indexed as
            [batch, position, vocabulary].
        text: Prompt string. It is encoded without adding special tokens.
        num_beams: Number of sequences kept after every step, and number of
            continuations tried per beam (clamped to the vocabulary size).
        max_length: Upper bound on the number of tokens appended to the
            prompt (not on the total sequence length).

    Returns:
        The highest-scoring sequence (prompt included), decoded to a string
        with special tokens kept.

    Raises:
        ValueError: If `num_beams` is smaller than 1.
    """
    if num_beams < 1:
        raise ValueError(f"num_beams must be at least 1, got {num_beams}")

    # tokenize the input text
    prompt_ids = tokenizer.encode(text, return_tensors='pt', add_special_tokens=False).to(device)
    eos_token_id = tokenizer.eos_token_id

    # initialize the beams
    # list of tuples (token_ids, sum of log-probabilities, finished flag)
    beams = [(prompt_ids, 0.0, False)]

    with torch.no_grad():
        for _ in range(max_length):
            all_candidates = []

            for beam_ids, beam_score, finished in beams:
                if finished:
                    # already ended with <eos>: carry over unchanged so that
                    # nothing is generated after the end-of-sequence token
                    all_candidates.append((beam_ids, beam_score, True))
                    continue

                logits = model(beam_ids).logits[:, -1, :]

                # log-probabilities of the next token
                log_probs = torch.log_softmax(logits, dim=-1)

                # select the best continuations; topk rejects k > vocab size
                width = min(num_beams, log_probs.size(-1))
                top_log_probs, top_token_ids = torch.topk(log_probs, width, dim=-1)

                # one host transfer per beam instead of one per candidate
                continuations = zip(top_log_probs[0].tolist(), top_token_ids[0].tolist())
                for next_log_prob, token_id in continuations:
                    # same dtype and device as the beam it extends
                    next_token_id = beam_ids.new_tensor([[token_id]])
                    all_candidates.append((
                        torch.cat((beam_ids, next_token_id), dim=-1),
                        beam_score + next_log_prob,
                        token_id == eos_token_id,
                    ))

            # keep the top num_beams sequences
            beams = sorted(all_candidates, key=lambda cand: cand[1], reverse=True)[:num_beams]

            # stop once every surviving beam has generated <eos>
            if all(is_done for _, _, is_done in beams):
                break

    # decode the best sequence (the one with the highest score)
    best_sequence = beams[0][0]
    return tokenizer.decode(best_sequence[0], skip_special_tokens=False)

In [12]:
# Beam search demo: keep 3 beams and generate at most 10 new tokens.
print(
    beam_search_decoding(
        model,
        "It rains a lot in the",
        num_beams=3,
        max_length=10,
    )
)

It rains a lot in the Pacific Northwest, so I’m used to


*Note: Parts of the code were written with the help of generative AI.*