<a href="https://colab.research.google.com/github/NoamMichael/Comparing-Confidence-in-LLMs/blob/main/LSAT_Benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This notebook benchmarks every model on the formatted LSAT-AR dataset.
# Several API clients can run side by side without issue. Local models, however,
# are memory intensive, so only one of them can be loaded at a time.
%pip install anthropic
%pip install openai

Collecting anthropic
  Downloading anthropic-0.52.2-py3-none-any.whl.metadata (25 kB)
Downloading anthropic-0.52.2-py3-none-any.whl (286 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/286.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.3/286.3 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.52.2


In [2]:
import pandas as pd
import numpy as np
import json
import time
import random
import torch
import matplotlib.pyplot as plt
from transformers import (AutoTokenizer,
                        AutoModelForCausalLM,
                        BitsAndBytesConfig,
                        pipeline)
import warnings
import openai
import anthropic
import google.generativeai as genai
from abc import ABC, abstractmethod

warnings.filterwarnings('ignore')
from google.colab import userdata


class OpenModel: ## This class is built around Hugging Face methods
  """Wrapper around a locally loaded Hugging Face causal language model.

  Construction downloads the tokenizer and the weights for ``name`` (using
  ``key`` as the Hugging Face access token) and builds a greedy
  ``text-generation`` pipeline limited to ``MaxTokens`` new tokens.
  """

  def __init__(self, name, key, MaxTokens = 150):
    """Load tokenizer, weights and generation pipeline for ``name``.

    Args:
      name: Hugging Face model identifier.
      key: Hugging Face access token passed to ``from_pretrained``.
      MaxTokens: Upper bound on newly generated tokens per call.
    """
    self.name = name
    self.key = key
    self.MaxTokens = MaxTokens
    print(f"Downloading Tokenizer for {self.name}")
    self.tokenizer = AutoTokenizer.from_pretrained(self.name,token = self.key) ## Import Tokenizer
    print(f"Downloading Model Weights for {self.name}")
    self.model = AutoModelForCausalLM.from_pretrained(self.name, token = self.key, device_map="auto") ## Import Model

    ## Make text generation pipeline
    self.pipeline = pipeline(
    "text-generation",
    model = self.model,
    tokenizer = self.tokenizer,
    do_sample = False,
    max_new_tokens = self.MaxTokens,
    eos_token_id = self.tokenizer.eos_token_id,
    pad_token_id = self.tokenizer.eos_token_id
    )

  def generate(self, prompt):
    """Return the ``generated_text`` of the first pipeline result for ``prompt``."""
    return self.pipeline(prompt)[0]['generated_text']

  def GetTokens(self, prompt: str, top_k: int = 100):
    """Return the most likely next tokens after ``prompt``.

    Args:
      prompt: Text whose next-token distribution is inspected.
      top_k: Maximum number of candidate tokens to return. Clamped to the
        vocabulary size so small vocabularies do not make ``torch.topk`` fail.

    Returns:
      A ``pd.Series`` named ``"Probability"`` whose index (named ``"Token"``)
      holds the decoded tokens, ordered from most to least likely.
    """
    batch = self.tokenizer(prompt, return_tensors= "pt")
    ## Move the inputs to wherever the model actually lives instead of assuming
    ## CUDA: the weights are placed with device_map="auto".
    device = getattr(self.model, 'device', None)
    if device is not None:
        batch = batch.to(device)

    with torch.no_grad():
        outputs = self.model(**batch)

    ## Softmax over the vocabulary at the last position; float32 keeps the
    ## computation stable when the model emits half-precision logits.
    probs = torch.softmax(outputs.logits[0, -1].float(), dim=0)

    ## topk(sorted=True) already yields descending order, so no re-sort is needed.
    k = max(1, min(int(top_k), probs.shape[0]))
    top_k_probs, top_k_indices = torch.topk(probs, k, sorted =True)

    ## Decode plain ints rather than zero-dimensional tensors.
    top_k_tokens = [self.tokenizer.decode([token_id]) for token_id in top_k_indices.tolist()]

    logit_series = pd.Series(top_k_probs.tolist(), index=top_k_tokens, name="Probability")
    logit_series.index.name = "Token"
    return logit_series

class ClosedModel(ABC):
  """Common interface for API-hosted ("closed") language models.

  Subclasses store the model ``name`` and API ``key``, build their SDK client
  in ``client()`` and answer prompts through ``generate()``.
  """

  @abstractmethod
  def generate(self, prompt: str, system:str = "")-> str:
    """Generate a response to ``prompt``, optionally steered by ``system``."""
    pass

  @abstractmethod
  def __init__(self, name, api_key):
    """Store the model identifier and the API key."""
    self.name = name
    self.key = api_key

  @abstractmethod
  def client(self):
    """Create the provider SDK client used by ``generate``."""
    pass


class GPTmodel(ClosedModel):
  """OpenAI chat-completions wrapper."""

  def __init__(self, name, api_key, max_tokens: int = 100):
    """
    Args:
      name: OpenAI model identifier.
      api_key: OpenAI API key.
      max_tokens: Completion length limit sent with every request.
    """
    super().__init__(name, api_key)
    self.max_tokens = max_tokens
    # Kept in a private attribute so the SDK client does not overwrite the
    # ``client`` method itself (which made a second ``client()`` call fail).
    self._client = None

  def client(self):
    """(Re)create the OpenAI client from the stored API key."""
    self._client = openai.OpenAI(api_key=self.key)

  def generate(self, prompt: str, system: str = "") -> str:
    """Send one system + user exchange at temperature 0 and return the reply text."""
    if self._client is None:
      # Lazily build the client so generate() also works without a prior client() call.
      self.client()
    response = self._client.chat.completions.create(
        model=self.name,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=self.max_tokens
    )
    # The content field may be None; normalise to the declared str return type.
    return response.choices[0].message.content or ""


class AnthropicModel(ClosedModel):
  """Anthropic Messages API wrapper."""

  def __init__(self, name, api_key, max_tokens: int = 100):
    """
    Args:
      name: Anthropic model identifier.
      api_key: Anthropic API key.
      max_tokens: Response length limit sent with every request.
    """
    super().__init__(name, api_key)
    self.max_tokens = max_tokens
    self._client = None

  def client(self):
    """(Re)create the Anthropic client from the stored API key."""
    self._client = anthropic.Anthropic(api_key=self.key)

  def generate(self, prompt: str, system: str = "") -> str:
    """Send ``prompt`` as a single user turn and return the reply text.

    NOTE(review): unlike GPTmodel, no temperature is set here, so sampling uses
    the provider default - confirm whether the benchmark wants temperature 0.
    """
    if self._client is None:
      self.client()
    request = {
        "model": self.name,
        "max_tokens": self.max_tokens,
        # The messages list only carries user/assistant turns.
        "messages": [{"role": "user", "content": prompt}],
    }
    if system:
      # The system prompt is a top-level parameter; leave the key out entirely
      # when there is none instead of sending an explicit null.
      request["system"] = system
    message = self._client.messages.create(**request)
    # Concatenate the text blocks; an empty content list yields "".
    return "".join(block.text for block in message.content if hasattr(block, "text"))


class GeminiModel(ClosedModel):
  """google.generativeai wrapper."""

  def __init__(self, name, api_key):
    super().__init__(name, api_key)
    self.model = None

  def client(self):
    """Configure google.generativeai with the API key and build the model handle."""
    genai.configure(api_key=self.key)
    self.model = genai.GenerativeModel(model_name=self.name)

  def generate(self, prompt: str, system: str = "") -> str:
    """Return the reply text; a non-empty ``system`` is sent as a leading user turn."""
    if self.model is None:
      self.client()
    contents = []
    if system:
      contents.append({"role": "user", "parts": [system]})
    contents.append({"role": "user", "parts": [prompt]})

    response = self.model.generate_content(contents)
    return response.text

In [None]:
## Playground for Llama
# Smoke test for the local-model wrapper: read the Hugging Face token from the
# Colab secret store and instantiate OpenModel, which downloads the tokenizer
# and weights and builds the text-generation pipeline.
hf_llama_token = userdata.get('hf_llama_token')
test_name = 'meta-llama/Llama-3.1-8B-Instruct'
test_key = hf_llama_token
test_prompt = "Zdzisław Beksiński was"

test_model = OpenModel(name = test_name, key = test_key)

In [None]:
# Print the pipeline's generated text for the test prompt, then display the
# next-token probability Series for the same prompt as the cell output.
print(test_model.generate(test_prompt))
test_model.GetTokens(test_prompt)

In [6]:
## Playground for GPT
# Smoke test for GPTmodel: read the key from the Colab secret store, build the
# SDK client with client(), then send one system + user exchange.

gpt_4_key = userdata.get('gpt_api_key')
test_name = 'gpt-4'
test_key = gpt_4_key
test_prompt = "Zdzisław Beksiński was"
test_system = "You are a helpful assistant."

my_gpt = GPTmodel(name = test_name, api_key = test_key)
my_gpt.client()
my_gpt.generate(test_prompt, test_system)

"a renowned Polish painter, photographer, and sculptor. He is best known for his large, detailed images of a surreal, post-apocalyptic environment. Beksiński's works are characterized by their haunting, dystopian feel, often featuring desolate landscapes and tormented figures. Despite the grim themes, he insisted his work was not to be read literally and that he was not a pessimist. Beksiński was born on February 24, 1929, and tragically murdered in"

In [7]:
## Playground for Claude
# Smoke test for AnthropicModel: read the key from the Colab secret store, build
# the SDK client with client(), then send the test prompt with a system message.

claude_key = userdata.get('claude_api_key')
test_name = 'claude-3-haiku-20240307'
test_key = claude_key
test_prompt = "Zdzisław Beksiński was"
test_system = "You are a helpful assistant."

my_claude = AnthropicModel(name = test_name, api_key = test_key)
my_claude.client()
my_claude.generate(test_prompt, test_system)

'Zdzisław Beksiński was a Polish painter, photographer, and sculptor who was known for his distinctive surrealist and dystopian style. Here are some key facts about him:\n\n- Born in 1929 in Sanok, Poland, Beksiński initially studied to be an architect before turning to art.\n\n- His paintings often depicted post-apocalyptic, nightmarish landscapes and figures. His style was highly detailed and technical, with a'

In [8]:
## Playground for Gemini
# Smoke test for GeminiModel: client() configures google.generativeai with the
# key and creates the model handle that generate() calls.

gemini_api_key = userdata.get('gemini_api_key') # Assuming you stored your key in Userdata
test_name = "gemini-2.0-flash" # Or another Gemini model name like 'gemini-1.5-flash'
test_key = gemini_api_key
test_prompt = "Zdzisław Beksiński was"
test_system = "You are a helpful assistant." # Optional system message

my_gemini = GeminiModel(name = test_name, api_key = test_key)
my_gemini.client()
my_gemini.generate(test_prompt, test_system)


'Zdzisław Beksiński was a Polish painter, photographer, and sculptor specializing in dystopian surrealism. He is known for his distinctive and hauntingly beautiful, yet often disturbing, imagery. His art explores themes of death, decay, anxiety, and the human condition.\n'

In [None]:
## Pseudo Code
## Import dataset

## Initialize all closed models

##--Initialize all GPT models
##----GPT-4o
##----GPT-o3
##--Initialize all Claude models
##----Claude-3.7 Sonnet
##----Claude-4 Sonnet
##--Initialize Gemini
##----Gemini-2.0 Flash
##----Gemini-1.5 Flash
##----Gemini-2.5 Pro

'''
for question in dataset:
  for model in ClosedModels:
    model.generate(question) #Since we are iterating over ClosedModels we can call the abstract method .generate()

'''




In [9]:
## Initializing my closed models

# Provider -> name of the secret holding its API key (looked up in userdata)
# plus the model identifiers to instantiate for that provider.
my_closed_models = {
    'GPT': {
        'api_key_name': 'gpt_api_key',
        'models': ['gpt-4', 'gpt-3.5-turbo'],
    },
    'Claude': {
        'api_key_name': 'claude_api_key',
        'models': [
            #'claude-3-sonnet-20240229',
            'claude-3-haiku-20240307',
        ],
    },
    'Gemini': {
        'api_key_name': 'gemini_api_key',
        'models': [
            'gemini-1.5-flash',
            #'gemini-1.5-pro'
        ],
    },
}

# Provider -> wrapper class; a provider missing here is reported and skipped.
model_classes = {
    'GPT': GPTmodel,
    'Claude': AnthropicModel,
    'Gemini': GeminiModel,
}

print('Initializing Closed Models:')
closed_models = []
for model_type, spec in my_closed_models.items():
    print(f'{model_type}:')
    api_key_name = spec['api_key_name']
    api_key = userdata.get(api_key_name)
    print(f'  API Key Name: {my_closed_models[model_type]["api_key_name"]}')
    for model_name in spec['models']:
        wrapper_cls = model_classes.get(model_type)
        if wrapper_cls is None:
            print(f"Warning: Unknown model type {model_type}. Skipping.")
            continue
        my_model = wrapper_cls(name = model_name, api_key = api_key)
        my_model.client()  # build the SDK client before the model is used
        closed_models.append(my_model)
        print(f'    {model_name}')


print(f'Models Initialized: {len(closed_models)}')
print(f'Model locations:\n{closed_models}')

# Send the playground prompt to every initialised model as a sanity check.
# test_prompt / test_system come from the playground cells above.
print('-'*42)
print('Testing all closed models:')
print(f'Test prompt: {test_prompt}')
print(f'Test system: {test_system}')

for model in closed_models:
  print(f'\nTesting model: {model.name}')
  print(model.generate(test_prompt, test_system))


Initializing Closed Models:
GPT:
  API Key Name: gpt_api_key
    gpt-4
    gpt-3.5-turbo
Claude:
  API Key Name: claude_api_key
    claude-3-haiku-20240307
Gemini:
  API Key Name: gemini_api_key
    gemini-1.5-flash
Models Initialized: 4
Model locations:
[<__main__.GPTmodel object at 0x786e62e22410>, <__main__.GPTmodel object at 0x786e624e2b50>, <__main__.AnthropicModel object at 0x786e624e3c10>, <__main__.GeminiModel object at 0x786e624e3250>]
------------------------------------------
Testing all closed models:
Test prompt: Zdzisław Beksiński was
Test system: You are a helpful assistant.

Testing model: gpt-4
a renowned Polish painter, photographer, and sculptor. He is best known for his large, detailed images of a surreal, post-apocalyptic environment. Beksiński's works are characterized by their haunting, dystopian feel, often featuring desolate landscapes and grotesque, distorted figures. Despite the grim themes, he insisted his work was not to be interpreted too literally, and th

In [11]:
## Import LSAT dataset
# Reads the formatted LSAT CSV from the Colab working directory and previews
# the first rows. The file must already exist at this path.

file_path = '/content/LSAT_formatted.csv'

dataset = pd.read_csv(file_path)
dataset.head()

Unnamed: 0,Question,Correct Answer,Question Number,Option A,Option B,Option C,Option D,Option E
0,Exactly six trade representatives negotiate a ...,B,199106_2-G_1_1,"Klosnik, Poirier, Neri, Manley, Osata, Londi","Klosnik, Londi, Manley, Poirier, Neri, Osata","Klosnik, Londi, Manley, Osata, Poirier, Neri","Klosnik, Osata, Poirier, Neri, Londi, Manley","Klosnik, Neri, Londi, Osata, Manley, Poirier"
1,Exactly six trade representatives negotiate a ...,A,199106_2-G_1_2,Klosnik and Osata,Londi and Neri,Londi and Osata,Manley and Neri,Manley and Poirier
2,Exactly six trade representatives negotiate a ...,B,199106_2-G_1_3,Londi and Neri,Londi and Osata,Neri and Osata,Neri and Poirier,Osata and Poirier
3,Exactly six trade representatives negotiate a ...,E,199106_2-G_1_4,Londi and Manley,Londi and Poirier,Neri and Osata,Neri and Poirier,Poirier and Osata
4,Exactly six trade representatives negotiate a ...,E,199106_2-G_1_5,Klosnik,"Klosnik, Neri","Neri, Poirier","Klosnik, Osata, Poirier","Klosnik, Neri, Osata, Poirier"


In [None]:
## System Prompts

debug = True ## Test condition for print statements / less runs
set_seed = 42 ## Random Seed

## Prompt 1: asks for short reasoning followed by a single answer letter.
sys_prompt1 = '''
Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options listed (A, B, C, D, or E).

Question: ${Question}
Options:
A) ${Option A}
B) ${Option B}
C) ${Option C}
D) ${Option D}
E) ${Option E}

Please provide your reasoning first, limited to 100 words, and then conclusively state only your selected answer using the corresponding letter (A, B, C, D, or E).
Reasoning: <Your concise reasoning here. Max 100 words>
'''

## Prompt 2: asks for the answer plus one probability per option as JSON.
## The template is written for five options (the count now says "five" and the
## example has the comma after the 'Answer' entry); it is trimmed below.
sys_prompt2 = '''
Based on the reasoning above, Provide the correct answer and the likelihood that each option is correct from 0.0 to 1.0 in a JSON format. The five probabilities should sum to 1.0. For example:

{
'Answer': <Your answer choice here, as a single letter and nothing else.>,
'A': <Probability choice A is correct. As a float from 0.0 to 1.0>,
'B': <Probability choice B is correct. As a float from 0.0 to 1.0>,
'C': <Probability choice C is correct. As a float from 0.0 to 1.0>,
'D': <Probability choice D is correct. As a float from 0.0 to 1.0>,
'E': <Probability choice E is correct. As a float from 0.0 to 1.0>
}
'''


def _fit_prompt_to_options(prompt, num_options):
  """Return ``prompt`` with every reference to unused answer letters removed.

  ``num_options`` is clamped to the 2..5 range the templates support. The
  letter list in parentheses, the "${Option X}" lines, the per-letter
  probability lines and the stated number of probabilities are all adjusted.
  """
  choice_phrases = {5: '(A, B, C, D, or E)', 4: '(A, B, C, or D)', 3: '(A, B, or C)', 2: '(A or B)'}
  count_words = {5: 'five', 4: 'four', 3: 'three', 2: 'two'}
  kept = max(2, min(int(num_options), 5))

  prompt = prompt.replace(choice_phrases[5], choice_phrases[kept])
  prompt = prompt.replace('The five probabilities', f'The {count_words[kept]} probabilities')
  for letter in 'ABCDE'[kept:]:
    ## Drop the option line of prompt 1 (an empty line stays behind).
    prompt = prompt.replace(f'{letter}) ${{Option {letter}}}', '')
    ## Drop the probability entry of prompt 2 together with the comma before it.
    prompt = prompt.replace(
        f",\n'{letter}': <Probability choice {letter} is correct. As a float from 0.0 to 1.0>", '')
  return prompt


## Edit system Prompts in order to match size of dataset
columns = dataset.columns
num_options = columns.str.contains('Option').astype(int).sum()

## Reformat system prompt in order to fit number of options in benchmark
sys_prompt_temp1 = _fit_prompt_to_options(sys_prompt1, num_options)
sys_prompt_temp2 = _fit_prompt_to_options(sys_prompt2, num_options)

sys_prompt1 = sys_prompt_temp1
sys_prompt2 = sys_prompt_temp2

In [None]:
## %%%%
## Minor edits needed to generalize to other datasets
## %%%%

def _trim_prompt_to_options(prompt, num_options):
  """Remove references to answer letters beyond ``num_options`` (clamped to 2..5)."""
  choice_phrases = {5: '(A, B, C, D, or E)', 4: '(A, B, C, or D)', 3: '(A, B, or C)', 2: '(A or B)'}
  count_words = {5: 'five', 4: 'four', 3: 'three', 2: 'two'}
  kept = max(2, min(int(num_options), 5))

  prompt = prompt.replace(choice_phrases[5], choice_phrases[kept])
  ## The template may state either count; rewrite it to the real one.
  for stale in ('four', 'five'):
    prompt = prompt.replace(f'The {stale} probabilities', f'The {count_words[kept]} probabilities')
  for letter in 'ABCDE'[kept:]:
    prompt = prompt.replace(f'{letter}) ${{Option {letter}}}', '') ## Drop the option line
    prompt = prompt.replace( ## Drop the matching probability entry and its leading comma
        f",\n'{letter}': <Probability choice {letter} is correct. As a float from 0.0 to 1.0>", '')
  return prompt


def format_df(df, prompt1_template = None, prompt2_template = None):
  """Build the per-question prompts for a multiple-choice benchmark.

  Takes in a dataframe in the form:
  | Question Number | Question Text | Option A | Option B | ... | Correct Answer Letter |
  The question column may be named 'Question Text' or 'Question'. Options
  are read from 'Option A' onwards, up to 'Option E', stopping at the first
  missing letter.

  Args:
    df: Benchmark dataframe as described above.
    prompt1_template: Reasoning prompt template; defaults to the notebook-level
      ``sys_prompt1``.
    prompt2_template: Confidence prompt template; defaults to the notebook-level
      ``sys_prompt2``.

  Returns:
    A dataframe with columns 'Question Num', 'Full Prompt 1', 'Full Prompt 2'
    and one row per input row, indexed 0..n-1.

  Raises:
    ValueError: If fewer than two option columns, no question column, or no
      'Question Number' column is present.
  """
  letters = ['A', 'B', 'C', 'D', 'E']
  columns = df.columns

  ## Leading run of option columns that are actually present.
  option_cols = []
  for letter in letters:
    if f'Option {letter}' not in columns:
      break
    option_cols.append(f'Option {letter}')
  num_options = len(option_cols)

  question_col = next((name for name in ('Question Text', 'Question') if name in columns), None)

  #----------------------------------------------------------------------------#
  ## Check if DF is formatted properly
  error_text = f'''Make sure dataframe is in following format:
  | Question Number | Question Text | Option A | Option B | ... | Correct Answer Letter |
  |     (Int)       |     (Str)     |  (Str)   |  (Str)   |     |       (Char)          |

  The current format of Dataframe is: {columns}
  '''
  if num_options < 2 or question_col is None or 'Question Number' not in columns:
    raise ValueError(error_text)

  #----------------------------------------------------------------------------#
  ## Resolve the templates and fit them to the number of options
  if prompt1_template is None:
    prompt1_template = sys_prompt1
  if prompt2_template is None:
    prompt2_template = sys_prompt2
  sys_prompt_temp1 = _trim_prompt_to_options(prompt1_template, num_options)
  sys_prompt_temp2 = _trim_prompt_to_options(prompt2_template, num_options)

  #----------------------------------------------------------------------------#
  ## Format questions for benchmark
  header = ['Question Num', 'Full Prompt 1', 'Full Prompt 2']
  reasoning_placeholder = '<Your concise reasoning here. Max 100 words>'
  records = []

  for row_pos in range(len(df)):
    row = df.iloc[row_pos] ## Positional access: works for any dataframe index

    ## Prompt for specific question
    new_prompt = sys_prompt_temp1.replace('${Question}', str(row[question_col]))
    for letter, option_col in zip(letters, option_cols):
      new_prompt = new_prompt.replace(f'${{Option {letter}}}', str(row[option_col]))

    ## Note that this is formatted to llama so changes may be needed down the line.
    ## Prompt 1 stops right before the reasoning placeholder so the model continues from there.
    prompts1 = new_prompt.split(reasoning_placeholder)[0]
    records.append([row['Question Number'], prompts1, sys_prompt_temp2])

  ## Build the frame once instead of growing it row by row.
  return pd.DataFrame(records, columns = header)


In [None]:
## %%%%%
## Need to clarify what goes in what with the rest of the group.

## I think I can just use the model.generate method with some edits
## %%%%%

def get_reasoning(row, prompts):
  """Run the generation pipeline on the 'Full Prompt 1' entry for ``row``.

  ``prompts`` is indexed as ``prompts['Full Prompt 1'][row]``; the pipeline's
  return value is handed back unchanged.
  """
  return generation_pipeline(prompts['Full Prompt 1'][row])

def get_answer_confidence(row, prompts, reasoning):
  """Get the model's answer and confidence for one benchmark row.

  The model input is prompt 1 for ``row`` followed by ``reasoning`` and the
  opening of a JSON object ending in ``'Answer': ``.

  Relies on notebook-level names: ``is_api``, ``generation_pipeline`` and, for
  local models, ``tokenizer`` and ``model``.

  Returns:
    ``(answer, stated_confidence, logit_confidence)``.
    * API models (``is_api`` true): ``answer`` is the pipeline output; the other
      two entries are ``None`` because no logits are available and no second
      request is made.
    * Local models: ``answer`` is the most likely next token (stripped),
      ``stated_confidence`` is the pipeline output for the input + answer +
      prompt 2, and ``logit_confidence`` is a JSON string mapping the top
      next tokens to their probabilities.
  """
  begin_JSON= "\n{\n'Answer': "

  model_input = prompts['Full Prompt 1'][row] + reasoning + begin_JSON
  ## NOTE(review): str.strip('Answer:') strips any of the characters
  ## A/n/s/w/e/r/: from both ends; it does not remove the literal "Answer:".
  ## Kept as-is because the intended behavior is unclear - confirm.
  model_input = model_input.strip('Answer:')
  print('______________________')
  print(f'Input:\n{model_input}')
  print('______________________')

  ## Defaults so every branch returns the same three-element tuple.
  stated_confidence = None
  logit_confidence = None

  if is_api:
    answer = generation_pipeline(model_input)
  else:
    ## Get Answer:
    ## Send the inputs to the model's device when it exposes one (falls back to 'cuda').
    batch = tokenizer(model_input, return_tensors= "pt").to(getattr(model, 'device', 'cuda'))
    with torch.no_grad():
        outputs = model(**batch)

    ## Next-token probabilities at the last position
    probs = torch.softmax(outputs.logits[0, -1], dim=0)

    # Get the top k token indices and their probabilities (k capped at the vocabulary size)
    top_k_probs, top_k_indices = torch.topk(probs, min(100, probs.shape[0]), sorted =True)

    # Convert token indices to tokens
    top_k_tokens = [tokenizer.decode([token_id]) for token_id in top_k_indices]

    # (token, probability) rows; a DataFrame is required for named columns
    # (pd.Series has no ``columns`` argument).
    token_probs = list(zip(top_k_tokens, top_k_probs.tolist()))
    logit_df = pd.DataFrame(token_probs, columns= ["Token", "Prob"])
    answer = logit_df["Token"][0].strip()
    logit_confidence = json.dumps(logit_df.set_index('Token').to_dict(),indent=4, sort_keys=True)

    ## Get Stated Confidence:
    new_input = model_input + answer + prompts['Full Prompt 2'][row]
    stated_confidence = generation_pipeline(new_input)

  return answer, stated_confidence, logit_confidence

#get_answer_confidence(i, benchmark_prompts, raw_output1, open_model = is_api)

In [None]:
## %%%%
## Edit this to work for all models
## %%%%

def test_model_benchmark(benchmark_prompts, ## The benchmark dataframe in the form:  | Question Num (Start at 0) | Full Prompt |
                         custom_length = -1, ## If we want to change how much of the benchmark we run on
                         output_loc = ''): ## Where we want to save the output dataframe
  """Run the current model over the benchmark prompts and collect its outputs.

  Relies on notebook-level names: ``set_seed``, ``is_api``, ``debug``,
  ``get_reasoning`` and ``get_answer_confidence``.

  Args:
    benchmark_prompts: Prompt dataframe with a 'Full Prompt 1' column, indexed 0..n-1.
    custom_length: Number of rows to run; -1 runs all rows. Values larger than
      the dataframe are capped at its length.
    output_loc: CSV path rewritten after every row to save progress. An empty
      string disables saving.

  Returns:
    Dataframe with columns
    | Question Num | Question Text | Reasoning | Answer | Stated Confidence | Logit Confidence |
    where 'Logit Confidence' is only present for open / local models.
  """
  random.seed(set_seed)
  #----------------------------------------------------------------------------#
  ## Set the amount of runs we will do custom_length == -1 is the default
  total_rows = len(benchmark_prompts)
  if custom_length == -1:
    length = total_rows
  else:
    length = min(custom_length, total_rows)
  print(f'Testing benchmark on {length} rows')
  #----------------------------------------------------------------------------#
  ## Initilize the output Dataframe:
  ## The header must have exactly as many columns as the rows written below.
  header = ['Question Num', 'Question Text', 'Reasoning', 'Answer', 'Stated Confidence']
  if not is_api:
    ## Only an open / local model like Llama exposes logits
    header.append('Logit Confidence')

  output_df = pd.DataFrame(columns = header)

  #----------------------------------------------------------------------------#
  ## Test the model:
  for i in range(length):
    ## Get Reasoning
    reasoning = get_reasoning(i, benchmark_prompts)
    if debug: print(reasoning)

    ## Get Answer/ Confidence:
    ## Accept both a two-element and a three-element result; a missing logit
    ## confidence is treated as None.
    answer, stated_confidence, *extra = get_answer_confidence(i, benchmark_prompts, reasoning)
    logit_confidence = extra[0] if extra else None

    row = [i, benchmark_prompts['Full Prompt 1'][i], reasoning, answer, stated_confidence]
    if not is_api:
      ## Note that for models like llama, the answer letter will be obtained by
      ## post proccessing the logit confidence down the line.
      row.append(logit_confidence)
    output_df.loc[i] = row ## Save to output df

    if debug: ## Print the results if testing
      if is_api:
        print(f'Answer: \n{answer}')
      else:
        print(f'Answer: \n{answer}\nLogit Confidence: \n{logit_confidence}')

    ## Save output_df to csv to save progress.
    if output_loc:
      output_df.to_csv(output_loc)
  return output_df

In [32]:
## Example Implementation:
## Sends a few free-form prompts to every closed model as an end-to-end check.

# API keys are read from the Colab secret store
gpt_4_key = userdata.get('gpt_api_key')
claude_key = userdata.get('claude_api_key')
gemini_api_key = userdata.get('gemini_api_key')

## Initialize all closed models you want to test

# Create a list to hold your model instances
ClosedModels = []

##--Initialize all GPT models
ClosedModels.append(GPTmodel(name='gpt-4o', api_key=gpt_4_key))
ClosedModels.append(GPTmodel(name='gpt-3.5-turbo', api_key=gpt_4_key))

##--Initialize all Claude models
ClosedModels.append(AnthropicModel(name='claude-3-sonnet-20240229', api_key=claude_key))
ClosedModels.append(AnthropicModel(name='claude-3-haiku-20240307', api_key=claude_key))
# ClosedModels.append(AnthropicModel(name='claude-3-5-sonnet-20240620', api_key=claude_key))

##--Initialize Gemini
ClosedModels.append(GeminiModel(name='gemini-1.5-flash', api_key=gemini_api_key))
ClosedModels.append(GeminiModel(name='gemini-1.5-pro', api_key=gemini_api_key))

# You can add more models to the list as needed.

# Build each wrapper's SDK client before the first generate() call; without
# this step every request below failed inside the try block.
for model in ClosedModels:
  model.client()

# --- Example prompts (replace with prompts built from the real dataset) ---
# Stored under their own name so the LSAT dataframe in `dataset` is not overwritten.
example_prompts = [
    "What is the capital of France?",
    "Explain the concept of recursion in programming.",
    "Write a short story about a cat.",
    "Summarize the plot of The Great Gatsby."
]
# -------------------------------------------------------------------


for question in example_prompts:
  print(f"\nTesting prompt: {question}")
  for model in ClosedModels:
    try:
        print(f"--- Calling model: {model.name} ---")
        # Every ClosedModel subclass implements generate(prompt, system);
        # no system message is used for these free-form prompts.
        response = model.generate(prompt=question, system="")
        print(f"Response from {model.name}: {response[:200]}...") # Print first 200 chars of response
    except Exception as e:
        # Keep going so one failing provider does not hide the others' results.
        print(f"Error calling model {model.name}: {e}")
