In [7]:
import torch
from tqdm import tqdm
import numpy as np
import torchvision.transforms as transforms
from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from sidemethods import logprobs_from_prompt, proc, proc_lower, prob_of_ending
# Root directory of the locally stored Hugging Face checkpoints. Keep the trailing
# slash: the path is joined to `model_name` by plain string concatenation when the
# model is loaded.
server_model_path = "/mounts/data/corp/huggingface/"

In [2]:
# Log in to the Hugging Face Hub with a token read from a key file instead of
# hard-coding it in the notebook.
from huggingface_hub import login
# Only the first line of the key file is used; surrounding whitespace is stripped.
with open("../../hf.key", "r") as f_in:
    hf_key = f_in.readline().strip()
login(token = hf_key)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /mounts/Users/cisintern/pwicke/.cache/huggingface/token
Login successful


In [3]:
# Checkpoint to evaluate. Exactly one `model_name` assignment is active; the
# commented-out lines are the other Llama-2 variants that can be swapped in.
#model_name = "meta-llama/Llama-2-7b-hf"
#model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "meta-llama/Llama-2-13b-hf"
#model_name = "meta-llama/Llama-2-13b-chat-hf"
#model_name = "meta-llama/Llama-2-70b-hf"
#model_name = "meta-llama/Llama-2-70b-chat-hf"

In [5]:
# Define the GPU ID you want to use
gpu_id = 1

# Use the torch.cuda.device() context manager to set the current GPU, so that the
# index-less torch.device("cuda") below refers to `gpu_id`.
# NOTE(review): the tokenizer is resolved from the hub id `model_name`, while the
# weights are read from the local directory `server_model_path + model_name` --
# confirm that both point at the same checkpoint.
# NOTE(review): no dtype or quantization config is passed to from_pretrained here,
# so the whole model is moved to a single GPU as loaded -- confirm it fits.
with torch.cuda.device(gpu_id):
    tokenizer = AutoTokenizer.from_pretrained(model_name) # use_auth_token=True
    model = AutoModelForCausalLM.from_pretrained(server_model_path+model_name).to(torch.device("cuda")) # use_auth_token=True

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
def log_prob_sum(input_samples, model, tokenizer, prompt_ending, hf):
    '''
    Accumulate, over all prompts in input_samples, the per-prompt sum of the values
    returned for the 7 allowed scores, and collect one averaged likert score per prompt.

    Note: on the huggingface path the summed values are the entries of the proba_list
    returned by hf_complete_proba (exponentiated log probabilities), so despite the
    function name the total is a sum of probabilities.
    (presumably gpt_complete_proba returns the same kind of values -- TODO confirm)

    input_samples: list of prompts
    model: the model to use, either a string of a gpt model or a loaded huggingface model object
    tokenizer: huggingface tokenizer object (only passed on to the huggingface path)
    prompt_ending: the string that is appended to the prompt
    hf: whether to use huggingface or gpt

    returns: (total, likert_means) -- the accumulated sum and the list of averaged
             likert scores, one per prompt, in input order
    '''
    likert_means = []   # probability-averaged likert score of every prompt
    total_proba = 0
    for sample in input_samples:
        # the third return value (debug / raw results) is not needed here
        if not hf:
            score_probas, likert_mean, _ = gpt_complete_proba(sample, model, prompt_ending, verbose=True)
        else:
            score_probas, likert_mean, _ = hf_complete_proba(
                prompt=sample,
                model=model,
                tokenizer=tokenizer,
                prompt_ending=prompt_ending,
                verbose=False,
            )
        likert_means.append(likert_mean)
        total_proba += sum(score_probas)
    return total_proba, likert_means


In [6]:
def find_best_ending(model, tokenizer, prompt_endings, hf, test_inputs=None):
    '''
    given a list of prompt endings, report for each one the probability sum over the 7 likert
    scores (total and per-input average) and how far its averaged likert scores are from those
    obtained with every other ending

    model: the model to use, either a string of a gpt model or a loaded huggingface model object
    tokenizer: huggingface tokenizer object
    prompt_endings: list of prompt endings to test
    hf: whether to use huggingface or gpt
    test_inputs: optional list of prompts the endings are appended to; when omitted or empty
                 there is nothing to score, so a message is printed and the function returns

    returns: None, the results are printed
    '''
    # Example prompts this comparison was written for (the "\n" is part of the prompt text).
    # Two notion introductions:
    #   'Consider the notion of VERTICALITY. Verticality refers to the sense of an extension
    #    along an up—down orientation.\n'
    #   'Consider the notion of BALANCE. Balance refers to your sense of symmetry or stability
    #    relative to some point within your body.\n'
    # each followed by the same question for one of the phrases "stand at attention",
    # "stand out in several sports", "to stand firm":
    #   'How strongly is the phrase "<phrase>" related to this notion on a scale from 1
    #    (not at all related) to 7 (very strongly related)?'
    test_inputs = [] if test_inputs is None else list(test_inputs)
    print("Number of inputs used to compute probability sum:", len(test_inputs))

    # Without inputs every sum is 0 and the per-input average below would divide by zero.
    if not test_inputs:
        print("No test inputs provided; skipping the prompt ending comparison.")
        return

    score_sums = []
    output_lists = []           # per ending: array of the avg. generated likert scores
    for ending in prompt_endings:
        print("Testing Prompt Ending:", ending)
        proba_sum, output_list = log_prob_sum(input_samples=test_inputs, model=model, tokenizer=tokenizer, prompt_ending=ending, hf=hf)
        score_sums.append(proba_sum)
        output_lists.append(np.asarray(output_list))

    # print results; newlines are escaped so every ending stays on one output line
    shown_endings = [ending.replace("\n", "\\n") for ending in prompt_endings]
    for i, shown in enumerate(shown_endings):
        print("Prompt Ending:", shown)
        print("Probability Sum:", score_sums[i])
        print("Average Probability:", score_sums[i]/len(test_inputs))
        # compute avg. difference in output_list to the other output_lists
        for j, other in enumerate(shown_endings):
            if i != j:
                print("Average score difference to Prompt Ending", other, ":",
                      np.mean(np.abs(output_lists[i] - output_lists[j])))
        print()


In [8]:
def hf_complete(prompt, prompt_ending, model, tokenizer, sampling, max_len):
    '''
    Generate a completion for prompt + prompt_ending with a huggingface model and return
    the decoded text of the first returned sequence.

    prompt: the prompt text
    prompt_ending: the string that is appended to the prompt before tokenization
    model: loaded huggingface model object (must provide .device and .generate)
    tokenizer: huggingface tokenizer object
    sampling: if falsy, top_k=1 is passed to generate; if truthy, no sampling
              parameters are passed at all
    max_len: maximum number of newly generated tokens

    returns: the decoded output string with special tokens skipped
             (presumably this includes the prompt itself -- TODO confirm for the model used)
    '''
    full_prompt = prompt + prompt_ending

    # Tokenize the prompt and place the ids on the same device as the model
    input_ids = tokenizer.encode(full_prompt, return_tensors="pt").to(model.device)

    # Collect the generation arguments; only the non-sampling case restricts top_k
    # TODO make sampling params explicit --> temp, top_k
    generation_kwargs = {"max_new_tokens": max_len, "num_return_sequences": 1}
    if not sampling:
        generation_kwargs["top_k"] = 1
    output = model.generate(input_ids, **generation_kwargs)

    # Decode the first generated sequence and return it
    return tokenizer.decode(output[0], skip_special_tokens=True)

def hf_complete_proba(prompt, prompt_ending, model, tokenizer, verbose):
    '''
    Score each of the answers "1" to "7" as a continuation of prompt + prompt_ending.

    For every answer the full text is passed to logprobs_from_prompt, and the exponential
    of the last returned value is stored as that answer's probability.
    (assumes logprobs_from_prompt yields (token, logprob) pairs and that the last pair
    belongs to the answer digit -- TODO confirm against sidemethods)

    prompt: the prompt text
    prompt_ending: the string that is appended to the prompt
    model: loaded huggingface model object
    tokenizer: huggingface tokenizer object
    verbose: if truthy, print every scored text and its raw result

    returns: (proba_list, average_completion, res_ends)
        proba_list: the 7 probabilities, ordered by answer "1" .. "7"
        average_completion: the scores 1..7 weighted by proba_list and divided by
                            sum(proba_list)
        res_ends: per answer, a dict with the keys "tokens" and "token_logprobs"
    '''
    likert_answers = [str(score) for score in range(1, 8)]
    prefix = prompt + prompt_ending

    proba_list = []
    res_ends = []
    for answer in likert_answers:
        candidate = prefix + answer
        if verbose:
            print(candidate)
        token_pairs = logprobs_from_prompt(candidate, tokenizer, model)
        scored = {
            "tokens": [token for token, _ in token_pairs],
            "token_logprobs": [logprob for _, logprob in token_pairs],
        }
        res_ends.append(scored)
        # only the last position is turned into a probability
        proba_list.append(np.exp(scored["token_logprobs"][-1]))
        if verbose:
            print(answer, scored, "\n")

    # expected likert score: position i in proba_list stands for the score i+1
    weighted_total = sum((rank + 1) * proba for rank, proba in enumerate(proba_list))
    average_completion = weighted_total / sum(proba_list)

    return proba_list, average_completion, res_ends

In [None]:
# Candidate prompt endings to compare on the huggingface model loaded above.
# The first and the third ending differ only by a trailing space.
# NOTE(review): these endings mention an "arrow", while hf_complete_proba scores the
# answers "1".."7" -- confirm that they belong to the prompts being tested.
prompt_endings=[" I choose the arrow", " A research participant would choose the arrow ", " I choose the arrow "]
find_best_ending(model=model, tokenizer=tokenizer, prompt_endings=prompt_endings, hf=True)