## Pip Installs

In [None]:
# Install the third-party packages this notebook relies on into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones that produced the saved outputs.
%pip install --upgrade transformers
%pip install -U bitsandbytes
%pip install accelerate
%pip install pandas

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

## Import Libraries

In [None]:
import json
import os
import random
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from transformers import (AutoTokenizer,
                        AutoModelForCausalLM,
                        BitsAndBytesConfig,
                        pipeline)
warnings.filterwarnings('ignore')

## Import Model and Tokenizer

In [None]:
## Global Variables:
is_api = False ## True when the model is reached through an API, False when it runs locally
debug = True ## When True, the benchmark functions print intermediate results
set_seed = 42 ## Seed passed to random.seed() by test_model_benchmark

## Template for the first (reasoning) prompt.
## format_df() fills the ${Question} / ${Option X} placeholders and cuts the text at
## '<Your concise reasoning here. Max 100 words>', and it rewrites the literal
## substrings '(A, B, C, D, or E)' and 'E) ${Option E}' (etc.) when a benchmark has
## fewer than five options -- so keep those substrings byte-exact when editing.
sys_prompt1 = '''
Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options listed (A, B, C, D, or E).

Question: ${Question}
Options:
A) ${Option A}
B) ${Option B}
C) ${Option C}
D) ${Option D}
E) ${Option E}

Please provide your reasoning first, limited to 100 words, and then conclusively state only your selected answer using the corresponding letter (A, B, C, D, or E).
Reasoning: <Your concise reasoning here. Max 100 words>
'''
## Template for the second (stated confidence) prompt.
## NOTE(review): the text says "four probabilities" while the example lists five
## options (A-E), and the 'Answer' line of the example has no trailing comma.
## NOTE(review): this template contains none of the substrings format_df() replaces
## for benchmarks with fewer than five options, so it is passed through unchanged.
sys_prompt2 = '''
Based on the reasoning above, Provide the correct answer and the likelihood that each option is correct from 0.0 to 1.0 in a JSON format. The four probabilities should sum to 1.0. For example:

{
'Answer': <Your answer choice here, as a single letter and nothing else.>
'A': <Probability choice A is correct. As a float from 0.0 to 1.0>,
'B': <Probability choice B is correct. As a float from 0.0 to 1.0>,
'C': <Probability choice C is correct. As a float from 0.0 to 1.0>,
'D': <Probability choice D is correct. As a float from 0.0 to 1.0>,
'E': <Probability choice E is correct. As a float from 0.0 to 1.0>
}
'''

In [None]:

# Hugging Face access token. Read it from the HF_TOKEN environment variable so the
# secret never has to be typed into (or committed with) the notebook. Falls back to
# None when the variable is unset or empty.
HF_TOKEN = os.environ.get('HF_TOKEN') or None
model_name = 'meta-llama/Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name, token = HF_TOKEN) # Load tokenizer

# Optional 4-bit quantization settings to reduce hardware requirements.
# NOTE: bnb_config is defined for convenience but is NOT passed to from_pretrained
# below, so the model is currently loaded without quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
)


# Load model (no quantization); device_map="auto" lets the library place the weights.
model = AutoModelForCausalLM.from_pretrained(model_name, token = HF_TOKEN, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

## Functions

### Pipeline

In [None]:
# Generation settings applied to every call of the local pipeline:
# greedy decoding, at most 150 new tokens, and EOS reused as the padding token.
generation_settings = dict(
    do_sample = False,
    max_new_tokens = 150,
    eos_token_id = tokenizer.eos_token_id,
    pad_token_id = tokenizer.eos_token_id,
)

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
open_model_pipeline = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    **generation_settings,
)

## Test Playground
# Quick smoke test: the generated text is shown as the cell output.
test_prompt = "Zdzisław Beksiński was"
playground_output = open_model_pipeline(test_prompt)
playground_output[0]['generated_text']

Device set to use cuda:0


"Zdzisław Beksiński was a Polish painter, photographer, and sculptor. He was born on February 24, 1929, in Sanok, Poland. Beksiński's work is characterized by its dark and surreal themes, often exploring the human condition, mortality, and the relationship between technology and nature. He was a prolific artist, creating over 20,000 works of art during his lifetime, including paintings, photographs, and sculptures.\nBeksiński's artistic style is often described as a blend of surrealism, expressionism, and fantasy. His paintings often feature dreamlike landscapes, eerie cityscapes, and abstracted forms, while his photographs capture the beauty and decay of the natural world. Beksiński's work has been exhibited"

In [None]:
def generation_pipeline(prompt):
  """Generate a completion for ``prompt`` and return only the newly generated text.

  Args:
    prompt (str): Fully formatted prompt to send to the model.

  Returns:
    str: The model's continuation of ``prompt``. In API mode (``is_api`` is True)
    this is still a placeholder and returns an empty string.
  """
  if is_api:
    # Placeholder: fill this in with code specific to the API of the model you want to use.
    return ''

  # By default the text-generation pipeline echoes the prompt at the start of
  # 'generated_text'. Callers concatenate prompt + completion themselves, so ask for
  # the completion only; otherwise the prompt ends up duplicated in follow-up inputs
  # and in the saved outputs.
  completion = open_model_pipeline(prompt, return_full_text = False)
  return completion[0]['generated_text']



### Get Answer & Logit Probabilities

In [None]:
def format_df(df, prompt_template1 = None, prompt_template2 = None):
  """Turn a benchmark dataframe into the two prompts used for each question.

  Expected input columns:
    | Question Number | Question (or Question Text) | Option A | Option B | ... |
  Between two and five option columns ('Option A' .. 'Option E') are supported.
  Any other columns (e.g. the correct answer) are ignored.

  Args:
    df (pd.DataFrame): Benchmark questions, one per row. Rows are read by position,
      so any index is accepted.
    prompt_template1 (str, optional): Template for the reasoning prompt. Defaults to
      the module-level ``sys_prompt1``.
    prompt_template2 (str, optional): Template for the confidence prompt. Defaults to
      the module-level ``sys_prompt2``.

  Returns:
    pd.DataFrame: Columns ['Question Num', 'Full Prompt 1', 'Full Prompt 2'] with a
    default 0..n-1 index, one row per input row.

  Raises:
    ValueError: If a required column is missing or the number of option columns is
      not between 2 and 5.
  """
  template1 = sys_prompt1 if prompt_template1 is None else prompt_template1
  template2 = sys_prompt2 if prompt_template2 is None else prompt_template2

  letters = ['A', 'B', 'C', 'D', 'E']
  options = [f'Option {letter}' for letter in letters]

  columns = df.columns
  ## Number of columns whose name contains 'Option'
  num_options = sum('Option' in str(name) for name in columns)

  #----------------------------------------------------------------------------#
  ## Check if DF is formatted properly
  question_col = 'Question' if 'Question' in columns else 'Question Text'
  required = ['Question Number', question_col] + options[:num_options]
  missing = [name for name in required if name not in columns]

  error_text = f'''Make sure dataframe is in following format:
  | Question Number | Question (or Question Text) | Option A | Option B | ... |
  |     (Int)       |           (Str)             |  (Str)   |  (Str)   |     |
  with between 2 and {len(options)} option columns named 'Option A', 'Option B', ...

  The current format of Dataframe is: {list(columns)}
  Missing columns: {missing}
  '''
  if num_options < 2 or num_options > len(options) or missing:
    raise ValueError(error_text)

  #----------------------------------------------------------------------------#
  ## Reformat system prompts in order to fit number of options in benchmark.
  ## Each step only applies when the previous one did, so the choice list shrinks
  ## from (A..E) to (A..D) to (A..C) to (A or B).
  ## NOTE(review): the default sys_prompt2 contains none of these substrings, so it
  ## passes through unchanged even for benchmarks with fewer than five options.
  reductions = [
      (5, '(A, B, C, D, or E)', '(A, B, C, or D)', 'E) ${Option E}'),
      (4, '(A, B, C, or D)', '(A, B, or C)', 'D) ${Option D}'),
      (3, '(A, B, or C)', '(A or B)', 'C) ${Option C}'),
  ]
  for limit, old_choices, new_choices, option_line in reductions:
    if num_options < limit:
      template1 = template1.replace(old_choices, new_choices).replace(option_line, '')
      template2 = template2.replace(old_choices, new_choices).replace(option_line, '')

  #----------------------------------------------------------------------------#
  ## Format questions for benchmark
  header = ['Question Num', 'Full Prompt 1', 'Full Prompt 2']

  ## The confidence prompt does not depend on the question, so build it once.
  ## Note that this is formatted to llama so changes may be needed down the line.
  prompts2 = ("<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
          + template2 ## Just asking for confidence
          + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>')

  records = []
  for i in range(len(df)):
    row = df.iloc[i] ## Positional access: works regardless of the dataframe's index

    ## Prompt for specific question
    new_prompt = template1.replace('${Question}', str(row[question_col]))
    for letter, option_col in zip(letters[:num_options], options[:num_options]):
      new_prompt = new_prompt.replace(f'${{Option {letter}}}', str(row[option_col]))

    ## NOTE(review): the system message is the un-filled template (placeholders
    ## included); only the user message carries the question. Kept as-is.
    prompts1 = ("<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            + template1 ## System Prompt
            + "\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
            + new_prompt.split('<Your concise reasoning here. Max 100 words>')[0] ## Specific prompt for question
            + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>")
    records.append([row['Question Number'], prompts1, prompts2])

  ## Build the output once instead of growing it row by row
  return pd.DataFrame(records, columns = header)


In [None]:
## Need to clarify what goes in what with the rest of the group.

def get_reasoning(row, prompts):
  """Run the first-stage prompt of question ``row`` through the generation pipeline.

  Args:
    row: Index label of the question inside ``prompts``.
    prompts: Dataframe holding a 'Full Prompt 1' column.

  Returns:
    Whatever ``generation_pipeline`` returns for that prompt.
  """
  return generation_pipeline(prompts['Full Prompt 1'][row])

def get_answer_confidence(row, prompts, reasoning):
  """Obtain the model's answer and confidence for one question.

  The first-stage prompt, the model's reasoning and the opening of a JSON object
  ("{ 'Answer': ") are concatenated so that the next token is the answer.

  Args:
    row: Index label of the question inside ``prompts``.
    prompts: Dataframe with 'Full Prompt 1' and 'Full Prompt 2' columns.
    reasoning (str): Output of the reasoning stage. If it still starts with the
      first-stage prompt (pipeline echo), that echo is removed.

  Returns:
    Local model (``is_api`` False): ``(answer, stated_confidence, logit_confidence)``
      where ``answer`` is the most probable next token (stripped),
      ``stated_confidence`` is the generation for the second prompt and
      ``logit_confidence`` is a JSON string of the top next-token probabilities.
    API model (``is_api`` True): ``(answer, stated_confidence)`` -- no logits are
      available, and the caller unpacks exactly two values in this mode.
  """
  begin_JSON = "\n{\n'Answer': "
  prompt1 = prompts['Full Prompt 1'][row]
  prompt2 = prompts['Full Prompt 2'][row]

  ## The generation pipeline may echo the prompt in front of its output; drop the
  ## echo so the prompt is not fed to the model twice.
  if reasoning.startswith(prompt1):
    reasoning = reasoning[len(prompt1):]

  model_input = prompt1 + reasoning + begin_JSON
  if debug:
    print('______________________')
    print(f'Input:\n{model_input}')
    print('______________________')

  if is_api:
    ## Closed model: both the answer and the stated confidence come from generation.
    answer = generation_pipeline(model_input)
    stated_confidence = generation_pipeline(model_input + answer + prompt2)
    return answer, stated_confidence

  ## Get Answer: single forward pass, on whichever device the model lives
  batch = tokenizer(model_input, return_tensors = "pt").to(model.device)
  with torch.no_grad():
    outputs = model(**batch)

  ## Probabilities of the next token (softmax over the last position's logits)
  probs = torch.softmax(outputs.logits[0, -1], dim = 0)

  ## Keep the (up to) 100 most probable tokens, most probable first
  top_k_probs, top_k_indices = torch.topk(probs, min(100, probs.shape[0]), sorted = True)
  top_k_tokens = [tokenizer.decode([token_id]) for token_id in top_k_indices.tolist()]

  logit_df = pd.DataFrame(list(zip(top_k_tokens, top_k_probs.tolist())), columns = ["Token", "Prob"])
  answer = logit_df["Token"][0].strip()
  logit_confidence = json.dumps(logit_df.set_index('Token').to_dict(), indent = 4, sort_keys = True)

  ## Get Stated Confidence:
  stated_confidence = generation_pipeline(model_input + answer + prompt2)

  return answer, stated_confidence, logit_confidence

#get_answer_confidence(i, benchmark_prompts, raw_output1, open_model = is_api)

In [None]:
def test_model_benchmark(benchmark_prompts, ## The benchmark dataframe in the form:  | Question Num (Start at 0) | Full Prompt |
                         custom_length = -1, ## If we want to change how much of the benchmark we run on
                         output_loc = ''): ## Where we want to save the output dataframe
  """Run the model over the benchmark prompts and collect its outputs.

  Args:
    benchmark_prompts (pd.DataFrame): Prompts with 'Full Prompt 1' / 'Full Prompt 2'
      columns, indexed 0..n-1.
    custom_length (int): Number of rows to run; -1 (default) runs all rows. Values
      larger than the benchmark are clamped to its length.
    output_loc (str): CSV path rewritten after every question so progress survives
      an interruption. An empty string (default) disables saving.

  Returns:
    pd.DataFrame: One row per processed question with columns
      | Question Num | Question Text | Reasoning | Answer | Stated Confidence |
      plus 'Logit Confidence' when running a local model.
  """
  random.seed(set_seed)
  #----------------------------------------------------------------------------#
  ## Set the amount of runs we will do custom_length == -1 is the default
  total_rows = len(benchmark_prompts)
  if custom_length == -1:
    length = total_rows
  else:
    length = max(0, min(custom_length, total_rows)) ## Never run past the last row
  print(f'Testing benchmark on {length} rows')
  #----------------------------------------------------------------------------#
  ## Initialize the output Dataframe. The header must have exactly as many
  ## columns as the rows written in the loop below.
  header = ['Question Num', 'Question Text', 'Reasoning', 'Answer', 'Stated Confidence']
  if not is_api:
    ## Logit probabilities are only available for an open / local model like Llama
    header = header + ['Logit Confidence']

  output_df = pd.DataFrame(columns = header)

  #----------------------------------------------------------------------------#
  ## Test the model:
  for i in range(length):
    question_prompt = benchmark_prompts['Full Prompt 1'][i]

    ## Get Reasoning
    reasoning = get_reasoning(i, benchmark_prompts)
    if debug: print(reasoning)

    ## Get Answer/ Confidence:
    if is_api: ## For a closed model like GPT
      answer, stated_confidence = get_answer_confidence(i, benchmark_prompts, reasoning)

      if debug: print(f'Answer: \n{answer}') ## Print the results if testing
      output_df.loc[i] = [i, question_prompt, reasoning, answer, stated_confidence] ## Save to output df
    else: ## For an open / local model like Llama

      ## Get the raw confidence and logit probabilities for answer.
      ## Note that for models like llama, the answer letter will be obtained by
      ## post processing the logit confidence down the line.
      answer, stated_confidence, logit_confidence = get_answer_confidence(i, benchmark_prompts, reasoning)

      output_df.loc[i] = [i, question_prompt, reasoning, answer, stated_confidence, logit_confidence] ## Save to output df
      if debug: print(f'Answer: \n{answer}\nLogit Confidence: \n{logit_confidence}') ## Print the results if testing

    ## Save output_df to csv to save progress (skipped when no path was given).
    if output_loc:
      output_df.to_csv(output_loc)
  return output_df

In [None]:
## Core code implementation:
#------------------------------------------------------------------------------#
## Import and Format Benchmark:
# Load the benchmark CSV and turn each question into its two prompts.
# NOTE(review): both paths below are hard-coded absolute paths under /content.
file_path = '/content/LSAT_formatted.csv' ## Location of benchmark csv
df = pd.read_csv(file_path)
print(f'Imported benchmark from location: {file_path}')
display(df.head())
prompt_df = format_df(df)
print('Formatted benchmarks into prompts:')
display(prompt_df.head())
#------------------------------------------------------------------------------#
# Run the model over every prompt (custom_length = -1 means all rows); the output
# CSV at output_loc is rewritten after each question by test_model_benchmark.
print('Testing model on benchmark:')
output_loc = '/content/output_df.csv'
output_df = test_model_benchmark(benchmark_prompts = prompt_df, output_loc = output_loc, custom_length = -1)
display(output_df.head())

Imported benchmark from location: /content/LSAT_formatted.csv


Unnamed: 0,Question,Correct Answer,Question Number,Option A,Option B,Option C,Option D,Option E
0,Exactly six trade representatives negotiate a ...,B,199106_2-G_1_1,"Klosnik, Poirier, Neri, Manley, Osata, Londi","Klosnik, Londi, Manley, Poirier, Neri, Osata","Klosnik, Londi, Manley, Osata, Poirier, Neri","Klosnik, Osata, Poirier, Neri, Londi, Manley","Klosnik, Neri, Londi, Osata, Manley, Poirier"
1,Exactly six trade representatives negotiate a ...,A,199106_2-G_1_2,Klosnik and Osata,Londi and Neri,Londi and Osata,Manley and Neri,Manley and Poirier
2,Exactly six trade representatives negotiate a ...,B,199106_2-G_1_3,Londi and Neri,Londi and Osata,Neri and Osata,Neri and Poirier,Osata and Poirier
3,Exactly six trade representatives negotiate a ...,E,199106_2-G_1_4,Londi and Manley,Londi and Poirier,Neri and Osata,Neri and Poirier,Poirier and Osata
4,Exactly six trade representatives negotiate a ...,E,199106_2-G_1_5,Klosnik,"Klosnik, Neri","Neri, Poirier","Klosnik, Osata, Poirier","Klosnik, Neri, Osata, Poirier"


Formatted benchmarks into prompts:


Unnamed: 0,Question Num,Full Prompt 1,Full Prompt 2
0,199106_2-G_1_1,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>user<|end_...
1,199106_2-G_1_2,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>user<|end_...
2,199106_2-G_1_3,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>user<|end_...
3,199106_2-G_1_4,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>user<|end_...
4,199106_2-G_1_5,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>user<|end_...


Testing model on benchmark:
Testing benchmark on 1630 rows
1
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options listed (A, B, C, D, or E).

Question: ${Question}
Options:
A) ${Option A}
B) ${Option B}
C) ${Option C}
D) ${Option D}
E) ${Option E}

Please provide your reasoning first, limited to 100 words, and then conclusively state only your selected answer using the corresponding letter (A, B, C, D, or E).
Reasoning: <Your concise reasoning here. Max 100 words>

<|eot_id|><|start_header_id|>user<|end_header_id|>

Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options li

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
B) ${Option B}
C) ${Option C}
D) ${Option D}
E) ${Option E}

Please provide your reasoning first, limited to 100 words, and then conclusively state only your selected answer using the corresponding letter (A, B, C, D, or E).
Reasoning: <Your concise reasoning here. Max 100 words>

<|eot_id|><|start_header_id|>user<|end_header_id|>

Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options listed (A, B, C, D, or E).

Question: The Darshan Advertising Agency has exactly seven representatives—Faizal, John, Lekha, Monica, Pooja, Qadir, Shobhit. Its new campaign is presented to exactly one client by one or more of the representatives in accordance with the following conditions: If Faizal presents, then so do Monica and Pooja. If John presents, then Qadir

Unnamed: 0,Question Num,Question Text,Reasoning,Answer,Stated Confidence,Logit Confidence
0,0,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,A,<|begin_of_text|><|start_header_id|>system<|en...,"{\n ""Prob"": {\n "" "": 0.0014039080124..."
1,1,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,C,<|begin_of_text|><|start_header_id|>system<|en...,"{\n ""Prob"": {\n "" "": 0.0010846076766..."
2,2,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,D,<|begin_of_text|><|start_header_id|>system<|en...,"{\n ""Prob"": {\n "" "": 0.0043283221311..."
3,3,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,A,<|begin_of_text|><|start_header_id|>system<|en...,"{\n ""Prob"": {\n "" "": 0.0017062724800..."
4,4,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,D,<|begin_of_text|><|start_header_id|>system<|en...,"{\n ""Prob"": {\n "" "": 0.0028003209736..."
