# Method by Aher et al. (2023) to pick the best prompt ending

In [1]:
import torch
from tqdm import tqdm
import numpy as np
import torchvision.transforms as transforms
import itertools
from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from sidemethods import logprobs_from_prompt, proc, proc_lower, prob_of_ending, load_richardson_data
# Directory prefix for locally stored checkpoints; the model weights are loaded
# from server_model_path + model_name in the model-loading cell.
server_model_path = "/mounts/data/corp/huggingface/"


# loading the original human data as vectors for each action word
# (load_richardson_data comes from the project-local `sidemethods` module; its
# first return value is discarded here)
_, richardson_data, richardson_normed = load_richardson_data()
# The keys of richardson_normed are used as the action words; each one is
# inserted into the prompt templates built in the cells below.
action_words = richardson_normed.keys()

### Loading model, functions and configs

In [2]:
# Log in to the Hugging Face Hub with a token read from a key file
# (../../hf.key) instead of hard-coding it in the notebook.
from huggingface_hub import login
# Only the first line of the key file is read; surrounding whitespace and the
# trailing newline are stripped before the token is used.
with open("../../hf.key", "r") as f_in:
    hf_key = f_in.readline().strip()
login(token = hf_key)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /mounts/data/corp/huggingface/token
Login successful


In [3]:
# Checkpoint to evaluate: exactly one assignment is left uncommented.
# model_name is used as-is for the tokenizer lookup and, prefixed with
# server_model_path, for loading the weights.
#model_name = "meta-llama/Llama-2-7b-hf"
#model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "meta-llama/Llama-2-13b-hf"
#model_name = "meta-llama/Llama-2-13b-chat-hf"
#model_name = "meta-llama/Llama-2-70b-hf"
#model_name = "meta-llama/Llama-2-70b-chat-hf"

In [4]:
# Define the GPU ID you want to use
# (also read as a global by hf_complete and hf_complete_proba)
gpu_id = 1

# Use the torch.cuda.device() context manager to set the current GPU, so that
# the index-less torch.device("cuda") below resolves to GPU `gpu_id`.
with torch.cuda.device(gpu_id):
    # NOTE(review): the tokenizer is resolved from the bare model name, while
    # the weights are read from the local server_model_path prefix -- confirm
    # that both refer to the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name) # use_auth_token=True
    model = AutoModelForCausalLM.from_pretrained(server_model_path+model_name).to(torch.device("cuda")) # use_auth_token=True

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [67]:
def log_prob_sum(input_samples, model, tokenizer, prompt_ending, hf):
    '''
    Accumulate, over all prompts in input_samples, the probabilities that the
    completion helper reports for the candidate answers.

    input_samples: list of prompts
    model: the model to use, either a string of a gpt model or a loaded huggingface model object
    tokenizer: huggingface tokenizer object (only forwarded when hf is true)
    prompt_ending: the string that is appended to the prompt
    hf: whether to use huggingface or gpt

    returns: (total, per_prompt) -- total is the sum of every returned
             probability list, per_prompt holds the probability-weighted
             average score reported for each prompt, in input order
    '''
    running_total = 0
    weighted_scores = []    # one probability-weighted average score per prompt
    for sample in tqdm(input_samples):
        if not hf:
            # NOTE(review): gpt_complete_proba is not defined in the visible
            # part of this notebook -- confirm where it comes from.
            outcome = gpt_complete_proba(sample, model, prompt_ending, verbose=True)
        else:
            outcome = hf_complete_proba(
                prompt=sample,
                model=model,
                tokenizer=tokenizer,
                prompt_ending=prompt_ending,
                verbose=False,
            )
        # Third element (per-answer debug records) is not needed here.
        probabilities, weighted_score, _ = outcome
        running_total += sum(probabilities)
        weighted_scores.append(weighted_score)
    return running_total, weighted_scores


### Defining the prompt to be tested

In [39]:
# GLOBAL VARIABLES (ugly)
# test_inputs is read as a global by find_best_ending, and answers is read as a
# global by hf_complete_proba.
test_inputs = []
arrows = ['↑', '↓', '←', '→']
# Candidate completions; hf_complete_proba appends each value to the prompt and
# scores it, iterating in the order given here.
answers = {0:'↑', 1:'↓', 2:'←', 3:'→'}

# Build one prompt per action word.
for word in list(action_words):
    # Earlier prompt variants, kept for reference:
    # test_inputs.append("Given the event '"+word+"', which of the following arrows best represents this concept: "+", ".join(arrow_list)+".")
    # test_inputs.append("Given the event '"+word+"', which of the following arrows best represents this concept: '↑', '↓', '←', '→'.")
    test_inputs.append("Given the event '"+word+"', which of the following arrows best represents this event: ↑, ↓, ←, →.")
print(test_inputs)

["Given the event 'fled', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Given the event 'pointed at', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Given the event 'pulled', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Given the event 'pushed', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Given the event 'walked', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Given the event 'hunted', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Given the event 'impacted', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Given the event 'perched', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Given the event 'showed', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Given the event 'smashed', which of the following arrows best represents this event: ↑, ↓, ←, →.", "Gi

In [6]:
def find_best_ending(model, tokenizer, prompt_endings, hf, inputs=None):
    '''
    Score every candidate prompt ending and print, for each one, the average
    (per prompt) of the summed probabilities that log_prob_sum reports for the
    allowed answers.

    model: the model to use, either a string of a gpt model or a loaded huggingface model object
    tokenizer: huggingface tokenizer object
    prompt_endings: list of prompt endings to test
    hf: whether to use huggingface or gpt
    inputs: optional list of prompts to score; when omitted, the notebook-level
            `test_inputs` list is used (the original behaviour)

    The results are printed; nothing is returned.
    '''
    # Resolve the prompts once. Passing `inputs` explicitly removes the hidden
    # dependency on whichever `test_inputs` happens to be in the kernel; the
    # list() copy guarantees len() and repeated iteration both work.
    samples = list(test_inputs if inputs is None else inputs)
    print("Number of inputs used to compute probability sum:", len(samples))

    score_sums = []
    for ending in prompt_endings:
        print("Testing Prompt Ending:", ending)
        # The per-prompt averaged scores (second return value) are not reported.
        proba_sum, _ = log_prob_sum(input_samples=samples, model=model, tokenizer=tokenizer, prompt_ending=ending, hf=hf)
        score_sums.append(proba_sum)

    # print results
    for ending, proba_sum in zip(prompt_endings, score_sums):
        # Escape newlines so every ending stays on a single output line.
        print("Prompt Ending:", ending.replace("\n", "\\n"))
        # Guard against an empty prompt list instead of dividing by zero.
        average = proba_sum / len(samples) if samples else float("nan")
        print("Average Probability:", round(average, 3))
        print()


In [7]:
def hf_complete(prompt, prompt_ending, model, tokenizer, sampling, max_len):
    '''
    Given a prompt, generate a completion with a given model

    prompt: the prompt text
    prompt_ending: string appended to the prompt before tokenization
    model: loaded huggingface model object (must expose .device and .generate)
    tokenizer: huggingface tokenizer object
    sampling: if true, sample the continuation; otherwise decode greedily
    max_len: maximum number of newly generated tokens

    returns: the decoded text of the first returned sequence, with special
             tokens skipped
    '''
    full_prompt = prompt + prompt_ending

    # Step 1: Tokenize the prompt and put the ids on the model's own device.
    # Using model.device directly avoids depending on the notebook-global
    # gpu_id and works wherever the model was placed.
    input_ids = tokenizer.encode(full_prompt, return_tensors="pt").to(model.device)

    # Step 2: Generate. do_sample is passed explicitly so the behaviour follows
    # the `sampling` flag rather than the checkpoint's default generation
    # config; greedy decoding picks the most likely token at each step, which
    # is what the former top_k=1 setting was meant to achieve.
    # TODO make sampling params explicit --> temp, top_k
    output = model.generate(
        input_ids,
        max_new_tokens=max_len,
        num_return_sequences=1,
        do_sample=bool(sampling),
    )

    # Step 3: Decode the generated output to get the answer
    return tokenizer.decode(output[0], skip_special_tokens=True)

def hf_complete_proba(prompt, prompt_ending, model, tokenizer, verbose):
    '''
    Score each candidate answer as a continuation of prompt + prompt_ending.

    Reads the notebook globals `answers` (candidate strings, visited in dict
    order) and `gpu_id` (forwarded to logprobs_from_prompt).

    prompt: the prompt text
    prompt_ending: string appended to the prompt before each candidate
    model: loaded huggingface model object
    tokenizer: huggingface tokenizer object
    verbose: if true, print each scored text and its token/log-prob record

    returns: (proba_list, average_completion, res_ends)
        proba_list: per candidate, exp() of the last token's log-probability
        average_completion: sum(position * probability) / sum(probability),
                            with 1-based candidate positions
        res_ends: per candidate, a dict with "tokens" and "token_logprobs"
    '''
    prefix = prompt + prompt_ending

    records = []
    probabilities = []
    for candidate in answers.values():
        scored_text = prefix + candidate
        if verbose:
            print(scored_text)
        pairs = logprobs_from_prompt(scored_text, tokenizer, model, gpu_id)
        record = {
            "tokens": [token for token, _ in pairs],
            "token_logprobs": [logprob for _, logprob in pairs],
        }
        records.append(record)
        # Only the final token's log-probability is used.
        # NOTE(review): if a candidate is split into several tokens this is not
        # the probability of the whole candidate -- confirm the tokenization.
        probabilities.append(np.exp(record["token_logprobs"][-1]))
        if verbose:
            print(candidate, record, "\n")

    # Probability-weighted mean of the 1-based candidate positions.
    weighted_total = sum(
        position * probability
        for position, probability in enumerate(probabilities, start=1)
    )
    mean_position = weighted_total / sum(probabilities)

    return probabilities, mean_position, records

### Defining the endings to be tested

In [None]:
# Candidate endings for the pseudo-visual (arrow) prompts. Several phrasings
# appear both with and without a trailing space, and one with a trailing
# newline, so the effect of the final whitespace can be compared.
prompt_endings=[" Arrow:", 
                " A research participant would choose the arrow:", 
                " I choose the arrow:", 
                " Arrow: ", 
                " A research participant would choose the arrow: ", 
                " I choose the arrow: ",
                "  ",
                " The concept is best represented by arrow ",
                " I would choose arrow ",
                " I choose arrow ",
                " I choose the arrow ",
                " I think it's arrow ",
                " It is the arrow: ",
                " It is arrow: ",
                " I choose the arrow:\n",]
# Scores every ending against the global test_inputs and prints the averages.
find_best_ending(model=model, tokenizer=tokenizer, prompt_endings=prompt_endings, hf=True)

### `Pseudo-Visual` Winner:

On average, 96.5% of the next-token probability mass is assigned to the four arrows.

Prompt: `Given the event '`WORD`', which of the following arrows best represents this event: ↑, ↓, ←, →.`

Ending: ` Arrow: `


# Repeating for `TEXTUAL` prompts

(restart kernel)

In [69]:
# Rebuild the globals for the textual variant: test_inputs is read by
# find_best_ending and answers by hf_complete_proba.
test_inputs = []
answers = {0:"up", 1:"down", 2:"left", 3:"right"}

# Results recorded from earlier runs with other answer spellings / endings:

# 'up', 'down', 'left', 'right'
# Prompt Ending:  I choose the concept: 
# Average Probability: 0.592

# 'Up', 'Down', 'Left', 'Right'
# Prompt Ending:  I choose the concept: 
# Average Probability: 0.58

# 0:'UP', 1:'DOWN', 2:'LEFT', 3:'RIGHT'
# Prompt Ending:  I choose the concept: 
# Average Probability: 0.636

# {0:"up", 1:"down", 2:"left", 3:"right"}
# Prompt Ending:  I would choose '
# Average Probability: 0.806

# Build one prompt per action word; the commented lines are earlier prompt
# variants kept for reference.
for word in list(action_words):
    #test_inputs.append("Given the event '"+word+"', which of the following concepts best represents this event: up, down, left, right.")
    #test_inputs.append("Given the event '"+word+"', which of the following concepts best represents this event: Up, Down, Left, Right.")
    #test_inputs.append("Given the event '"+word+"', which of the following concepts best represents this event: UP, DOWN, LEFT, RIGHT.")
    #test_inputs.append("Given the concepts: up, down, left, right. For the concept that best represent the event '"+word+"', ")
    test_inputs.append("Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event '"+word+"', ")
print(test_inputs)

["Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event 'fled', ", "Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event 'pointed at', ", "Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event 'pulled', ", "Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event 'pushed', ", "Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event 'walked', ", "Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event 'hunted', ", "Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event 'impacted', ", "Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event 'perched', ", "Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event 'showed'

In [52]:
# One-off sanity check: let the model freely continue a single textual prompt
# to see what kind of answer format it produces.
test_prompt = "Given the concepts: up, down, left, right. For the concept that best represent the event 'jump', I choose"

input_ids = tokenizer.encode(test_prompt, return_tensors="pt").to(torch.device("cuda:"+str(gpu_id)))
# max_length counts the prompt tokens too, so this allows 5 tokens beyond the prompt.
max_length = input_ids.size(1)  + 5
# NOTE(review): top_k=1 only restricts the choice to the most likely token if
# sampling is active for this checkpoint -- confirm the generation config.
output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, top_k=1)
generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)  

print(generated_answer)

Given the concepts: up, down, left, right. For the concept that best represent the event 'jump', I choose 'up'.
G


In [70]:
# Candidate endings for the textual prompts. Each one ends with an opening
# single quote, matching the quoted concepts listed in the prompt, and each
# phrasing is tried with and without a colon.
prompt_endings=[

                " I choose '",
                " I choose: '",
                " I would choose '",
                " I would choose: '",
                " I choose concept '",
                " I choose concept: '",
                " I choose the concept '",
                " I choose the concept: '",
                ]

# Scores every ending against the global test_inputs and prints the averages.
find_best_ending(model=model, tokenizer=tokenizer, prompt_endings=prompt_endings, hf=True)

Number of inputs used to compute probability sum: 30
Testing Prompt Ending:  I choose '


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]


Testing Prompt Ending:  I choose: '


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]


Testing Prompt Ending:  I would choose '


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]


Testing Prompt Ending:  I would choose: '


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]


Testing Prompt Ending:  I choose concept '


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]


Testing Prompt Ending:  I choose concept: '


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]


Testing Prompt Ending:  I choose the concept '


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]


Testing Prompt Ending:  I choose the concept: '


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]

Prompt Ending:  I choose '
Average Probability: 0.816

Prompt Ending:  I choose: '
Average Probability: 0.745

Prompt Ending:  I would choose '
Average Probability: 0.806

Prompt Ending:  I would choose: '
Average Probability: 0.733

Prompt Ending:  I choose concept '
Average Probability: 0.767

Prompt Ending:  I choose concept: '
Average Probability: 0.708

Prompt Ending:  I choose the concept '
Average Probability: 0.742

Prompt Ending:  I choose the concept: '
Average Probability: 0.688






### `Textual` Winner:

On average, 80.6% of the next-token probability mass is assigned to the four concepts. (Note: in the output above, the ending ` I choose '` scores slightly higher, at 81.6%.)

Prompt: `Given the concepts: 'up', 'down', 'left', 'right'. For the concept that best represent the event '`WORD`',` 

Ending: ` I would choose '`


