In [None]:
# This notebook benchmarks every configured model on the formatted SciQ dataset.
# Several API clients can run in the same session without trouble. Local models,
# however, are memory-intensive, so only one of them can be loaded at a time.

In [None]:
# Install the API client libraries and the progress-bar package used below.
# NOTE(review): no versions are pinned here, so a fresh runtime may resolve
# different releases than the ones this notebook was last run against.
%pip install anthropic
%pip install openai
%pip install tqdm



In [None]:
import pandas as pd
import numpy as np
import json
import time
import random
import torch
import matplotlib.pyplot as plt
from transformers import (AutoTokenizer,
                        AutoModelForCausalLM,
                        BitsAndBytesConfig,
                        pipeline)
import warnings
import openai
import anthropic
import google.generativeai as genai
from abc import ABC, abstractmethod
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')
from google.colab import userdata
import os
from google.colab import drive
max_tokens = 150



class ClosedModel(ABC):
  """Interface shared by the wrappers around hosted (API-based) models.

  Subclasses provide the provider-specific client setup and request logic.
  Every method is abstract, so this class cannot be instantiated directly.
  """

  @abstractmethod
  def generate(self, prompt: str, system:str = "")-> str:
    """Return the model's text response to `prompt` given the `system` message."""

  @abstractmethod
  def __init__(self, name, api_key):
    """Record the model name and API key and start an empty results table."""
    result_columns = ['Question ID','Question', 'Answer', 'Reasoning', 'Error']
    self.name, self.key = name, api_key
    self.results = pd.DataFrame(columns = result_columns)

  @abstractmethod
  def client(self):
    """Create the provider client that `generate` talks to."""

  @abstractmethod
  def GetRAC(self, prompt: str, system1:str = "", system2: str = "")-> tuple[str, str]:
    """Get Reasoning, Answer, Confidence: return a (reasoning, answer/confidence) pair."""

class GPTmodel(ClosedModel):
  """OpenAI chat-completions wrapper implementing the ClosedModel interface."""

  def __init__(self, name, api_key):
    """Store the model name and API key; begin with an empty results table."""
    self.name, self.key = name, api_key
    self.results = pd.DataFrame(
        columns = ['Question ID','Question', 'Answer', 'Reasoning', 'Error'])

  def client(self):
    """Build the OpenAI client from the stored key.

    The client object is stored on `self.client`, so after the first call the
    instance attribute takes the place of this method on that instance.
    """
    self.client = openai.OpenAI(api_key=self.key)

  def generate(self, prompt: str, system: str = "") -> str:
    """Send one system + user exchange and return the first choice's text.

    Uses temperature 0 and the module-level `max_tokens` limit.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]
    completion = self.client.chat.completions.create(
        model=self.name,
        messages=conversation,
        temperature=0,
        max_tokens=max_tokens,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

  def GetRAC(self, prompt: str)-> tuple[str, str]:
    """Get Reasoning, Answer, Confidence using two consecutive requests.

    The first request pairs `prompt` with the module-level `sys_prompt1` and
    yields the reasoning. The second request sends the prompt, the reasoning
    and `sys_prompt2` concatenated as the user message (with `sys_prompt2`
    also as the system message) and yields the answer/confidence text.

    Returns:
        (reasoning, answer_confidence) as raw model output strings.
    """
    # Step 1: reasoning for the question
    reasoning = self.generate(prompt, sys_prompt1)
    # Step 2: answer and confidence, conditioned on that reasoning
    followup = prompt + reasoning + sys_prompt2
    answer_confidence = self.generate(followup, sys_prompt2)
    return reasoning, answer_confidence

class AnthropicModel(ClosedModel):
  """Anthropic Messages API wrapper implementing the ClosedModel interface."""

  def __init__(self, name, api_key):
    """Store the model name and API key; begin with an empty results table."""
    self.name = name
    self.key = api_key
    self.results = pd.DataFrame(columns = ['Question ID','Question', 'Answer', 'Reasoning', 'Error'])

  def client(self):
    """Build the Anthropic client from the stored key.

    The client object is stored on `self.client`, so after the first call the
    instance attribute takes the place of this method on that instance.
    """
    self.client = anthropic.Anthropic(api_key=self.key)

  def generate(self, prompt: str, system: str = "") -> str:
    """Send a single user message and return the text of the first content block.

    Args:
        prompt: Text sent as the user turn.
        system: Optional system prompt. It is passed as the top-level
            `system` parameter, and left out of the request entirely when
            empty rather than being sent as None.

    Returns:
        The `text` of the first block in the response content.
    """
    # The messages list only carries user/assistant turns; the system prompt
    # travels as a separate top-level argument.
    request = {
        "model": self.name,
        "max_tokens": max_tokens,
        "messages": [{"role": "user", "content": prompt}],
    }
    if system:
      request["system"] = system

    message = self.client.messages.create(**request)
    return message.content[0].text

  def GetRAC(self, prompt: str)-> tuple[str, str]: ## Get Reasoning Answer Confidence
    """Run the two-step reasoning / answer-confidence exchange.

    Reads the module-level `sys_prompt1` and `sys_prompt2`.

    Returns:
        (reasoning, answer_confidence) as raw model output strings.
    """
    ## Get the reasoning
    reasoning = self.generate(prompt, sys_prompt1)
    ## Get the answer and confidence
    answer_confidence = self.generate(prompt + reasoning + sys_prompt2, sys_prompt2)

    return reasoning, answer_confidence
class GeminiModel(ClosedModel):
  """google.generativeai wrapper implementing the ClosedModel interface."""

  def __init__(self, name, api_key):
    """Store the model name and API key; begin with an empty results table."""
    self.name, self.key = name, api_key
    self.results = pd.DataFrame(
        columns = ['Question ID','Question', 'Answer', 'Reasoning', 'Error'])

  def client(self):
    """Configure the library with the stored key and create the model handle on `self.model`."""
    genai.configure(api_key=self.key)
    self.model = genai.GenerativeModel(model_name=self.name)

  def generate(self, prompt: str, system: str = "") -> str:
    """Return the response text for `prompt`.

    A non-empty `system` string is sent as an extra user-role turn placed
    ahead of the prompt turn.
    """
    turns = []
    if system:
      turns.append({"role": "user", "parts": [system]})
    turns.append({"role": "user", "parts": [prompt]})

    reply = self.model.generate_content(turns)
    return reply.text

  def GetRAC(self, prompt: str)-> tuple[str, str]:
    """Get Reasoning, Answer, Confidence using two consecutive requests.

    Reads the module-level `sys_prompt1` and `sys_prompt2`.

    Returns:
        (reasoning, answer_confidence) as raw model output strings.
    """
    # Step 1: reasoning for the question
    reasoning = self.generate(prompt, sys_prompt1)
    # Step 2: answer and confidence, conditioned on that reasoning
    followup = prompt + reasoning + sys_prompt2
    answer_confidence = self.generate(followup, sys_prompt2)
    return reasoning, answer_confidence

## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
##                        Classes for Open Models
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

class OpenModel(ABC):
  """Interface for locally run (open-weight) model wrappers.

  Every method is abstract; concrete subclasses implement all four.
  """

  @abstractmethod
  def __init__(self, name, key, MaxTokens = max_tokens):
    """Set up the model identified by `name`, using `key` for access."""

  @abstractmethod
  def generate(self, prompt):
    """Produce generated text for `prompt`."""

  @abstractmethod
  def GetTokens(self, prompt):
    """Inspect the token probabilities that follow `prompt`."""

  @abstractmethod
  def GetRAC(self, prompt):
    """Get Reasoning, Answer, Confidence for `prompt`."""

class LlamaModel(OpenModel): ## This class is built around Hugging Face methods
  """Local Llama wrapper built on a Hugging Face tokenizer, model and pipeline."""

  def __init__(self, name, key, MaxTokens = 150):
    """Download the tokenizer and weights and build a greedy text-generation pipeline.

    Args:
        name: Hugging Face model identifier.
        key: Access token passed to `from_pretrained`.
        MaxTokens: Value used for the pipeline's `max_new_tokens`.
    """
    self.name = name
    self.key = key
    self.MaxTokens = MaxTokens
    print(f"Downloading Tokenizer for {self.name}")
    self.tokenizer = AutoTokenizer.from_pretrained(self.name,token = self.key) ## Import Tokenizer
    print(f"Downloading Model Weights for {self.name}")
    self.model = AutoModelForCausalLM.from_pretrained(self.name, token = self.key, device_map="auto") ## Import Model
    self.results = pd.DataFrame(columns = ['Question ID','Question', 'Answer', 'Reasoning', 'Token Probability', 'Error'])
    ## Make text generation pipeline
    self.pipeline = pipeline(
    "text-generation",
    model = self.model,
    tokenizer = self.tokenizer,
    do_sample = False,
    max_new_tokens = self.MaxTokens,
    eos_token_id = self.tokenizer.eos_token_id,
    pad_token_id = self.tokenizer.eos_token_id,
    device_map="auto",
    transformers_version="4.37.0",
    )


  def generate(self, prompt):
    """Run the pipeline on `prompt` exactly as given and return only the new text.

    No chat-template markup is added here; callers that want header tokens
    (as `GetRAC` does) must include them in `prompt`. The prompt text is
    removed from the pipeline's `generated_text` before returning.
    """
    return self.pipeline(prompt)[0]['generated_text'].replace(prompt, '')

  def GetTokens(self, prompt: str):
    """Return next-token probabilities after `prompt` and the most likely answer letter.

    Returns:
        A tuple `(probabilities, best_answer)`: the string form of a
        {token: probability} dict for the (up to) 100 most likely next
        tokens, and the highest-probability token among A-E (with or without
        a leading space), stripped of whitespace.

    Raises:
        IndexError: if none of the answer-letter tokens is among the
            candidates considered.
    """
    # Kept for backward compatibility: the series is also published as a
    # module-level global, as in the original implementation.
    global logit_series

    ## Get Answer:
    # Put the inputs on the device the model was loaded on instead of assuming CUDA.
    batch = self.tokenizer(prompt, return_tensors= "pt").to(self.model.device)
    with torch.no_grad():
        outputs = self.model(**batch)
    ## Get Token Probabilites
    logits = outputs.logits

    ## Apply softmax to the logits of the last position to get probabilities
    probs = torch.softmax(logits[0, -1], dim=0)

    ## Get the top k token indices and their probabilities
    top_k = min(100, probs.shape[0])  # never ask for more candidates than exist
    top_k_probs, top_k_indices = torch.topk(probs, top_k, sorted =True)

    ## Convert token indices to tokens
    top_k_tokens = [self.tokenizer.decode([token_id]) for token_id in top_k_indices]

    ## Convert probabilities to list of floats
    top_k_probs = top_k_probs.tolist()

    ## Create a Pandas Series with tokens as index and probabilities as values
    logit_series = pd.Series(top_k_probs, index=top_k_tokens)

    ## Sort the series by values in descending order
    logit_series = logit_series.sort_values(ascending=False)

    ## Get the answer letter
    target_tokens = [' A', ' B', ' C', ' D', ' E', 'A', 'B', 'C', 'D', 'E']
    only_target_tokens = logit_series[logit_series.index.isin(target_tokens)]
    if only_target_tokens.empty:
      raise IndexError(
          f"None of the answer tokens {target_tokens} appear among the top "
          f"{top_k} next-token candidates")
    best_answer = only_target_tokens.index[0]

    ## Format logit series
    logit_series.index.name = "Token"
    logit_series.name = "Probability"
    return str(logit_series.to_dict()), best_answer.strip()

  def GetRAC(self, prompt: str)-> tuple[str, str]: ## Get Reasoning Answer/Confidence
    """Run the two-step reasoning / answer-confidence exchange.

    The prompt is wrapped in Llama header tokens with the module-level
    `sys_prompt1` as the system turn. The reasoning is then fed back, followed
    by a user turn holding `sys_prompt2`, to obtain the answer/confidence text.

    Returns:
        (reasoning, answer_confidence) as generated strings.
    """
    ## Get the reasoning
    new_prompt = ("<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
                  + sys_prompt1
                  + "\n<|eot_id|><|start_header_id|>user<|end_header_id|>"
                  + prompt
                  + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>")
    # generate() already strips the prompt, so no second replace is needed.
    reasoning = self.generate(new_prompt)
    ## Get the answer and confidence
    answer_confidence = self.generate(new_prompt
                                      + reasoning
                                      + '<|eot_id|><|start_header_id|>user<|end_header_id|>'
                                      + sys_prompt2
                                      + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>' )

    return reasoning, answer_confidence

In [None]:
## Define Functions:
def init_models(models_dict,
                test_prompt = "Zdzisław Beksiński was",
                test_system = "You are a helpful assistant."):
  """Instantiate every configured closed model and smoke-test each one.

  Args:
      models_dict: Mapping of model family ('GPT', 'Claude' or 'Gemini') to a
          dict with 'api_key_name' (the secret name looked up through
          `userdata.get`) and 'models' (a list of model names).
      test_prompt: Prompt sent to every model as a smoke test.
      test_system: System message used for the smoke test.

  Returns:
      List of initialized model wrappers, in configuration order. Entries
      whose family is not recognized are skipped with a warning.
  """
  # Family name -> wrapper class
  model_classes = {
      'GPT': GPTmodel,
      'Claude': AnthropicModel,
      'Gemini': GeminiModel,
  }

  print('Initializing Closed Models:')
  closed_models = []
  # Use the configuration that was passed in, not a same-named global.
  for model_type, config in models_dict.items():
      print(f'{model_type}:')
      api_key_name = config['api_key_name']
      api_key = userdata.get(api_key_name)
      print(f'  API Key Name: {api_key_name}')
      for model_name in config['models']:
        model_class = model_classes.get(model_type)
        if model_class is None:
            print(f"Warning: Unknown model type {model_type}. Skipping.")
            continue # Skip to the next model name if the type is unknown
        my_model = model_class(name = model_name, api_key = api_key)
        my_model.client()
        closed_models.append(my_model)
        print(f'    {model_name}')


  print(f'Models Initialized: {len(closed_models)}')
  print(f'Model locations:\n{closed_models}')

  print('-'*42)
  print('Testing all closed models:')
  print(f'Test prompt: {test_prompt}')
  print(f'Test system: {test_system}')

  for model in closed_models:
    print(f'\nTesting model: {model.name}')
    print(model.generate(test_prompt, test_system))
  return closed_models
"""
def make_system_prompt(df, sys_prompt1 = '', sys_prompt2 = ''):
  if sys_prompt1 == '' and sys_prompt2 == '':

    sys_prompt1 = '''
    Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options listed (A, B, C, D, or E).

    Question: ${Question}
    Options:
    A) ${Option A}
    B) ${Option B}
    C) ${Option C}
    D) ${Option D}
    E) ${Option E}

    Please provide your reasoning first, limited to 100 words, and then conclusively state only your selected answer using the corresponding letter (A, B, C, D, or E).
    Reasoning: <Your concise reasoning here. Max 100 words>
    '''
    sys_prompt2 = '''
    Based on the reasoning above, Provide the correct answer and the likelihood that each option is correct from 0.0 to 1.0 in a JSON format. The four probabilities should sum to 1.0. For example:

    {
    'Answer': <Your answer choice here, as a single letter and nothing else.>
    'A': <Probability choice A is correct. As a float from 0.0 to 1.0>,
    'B': <Probability choice B is correct. As a float from 0.0 to 1.0>,
    'C': <Probability choice C is correct. As a float from 0.0 to 1.0>,
    'D': <Probability choice D is correct. As a float from 0.0 to 1.0>,
    'E': <Probability choice E is correct. As a float from 0.0 to 1.0>
    }

    Do not provide any additional reasoning.
    '''
  ## Edit system Prompts in order to match size of dataset
  columns = df.columns
  num_options = columns.str.contains('Option').astype(int).sum()

  sys_prompt_temp1 = sys_prompt1
  sys_prompt_temp2 = sys_prompt2
  ## Reformat system prompt in order to fit number of options in benchmark
  if num_options < 5: ## ABCD
    sys_prompt_temp1 = (sys_prompt1
                  .replace('(A, B, C, D, or E)', '(A, B, C, or D)') ## Change the available options
                  .replace('E) ${Option E}', '') ## Drop option E
        )
    sys_prompt_temp2 = (sys_prompt2
                  .replace('(A, B, C, D, or E)', '(A, B, C, or D)') ## Change the available options
                  .replace('E) ${Option E}', '') ## Drop option E
        )
    if num_options < 4: ## ABC
      sys_prompt_temp1 = (sys_prompt_temp1
                    .replace('(A, B, C, or D)', '(A, B, or C)') ## Change the available options
                    .replace('D) ${Option D}', '') ## Drop option D
          )
      sys_prompt_temp2 = (sys_prompt_temp2
                  .replace('(A, B, C, or D)', '(A, B, or C)') ## Change the available options
                  .replace('D) ${Option D}', '') ## Drop option D
        )

      if num_options < 3: ## AB
        sys_prompt_temp1 = (sys_prompt_temp1
                      .replace('(A, B, or C)', '(A or B)') ## Change the available options
                      .replace('C) ${Option C}', '') ## Drop option C
            )
        sys_prompt_temp2 = (sys_prompt_temp2
                    .replace('(A, B, or C)', '(A or B)') ## Change the available options
                    .replace('C) ${Option C}', '') ## Drop option C
          )

  return sys_prompt_temp1, sys_prompt_temp2
"""
def format_df(df):
  """Turn a question table into per-question prompts.

  Takes in a dataframe in the form:
  | Question Number | Question | Option A | Option B | ... | Correct Answer Letter |
  |     (Int)       |  (Str)   |  (Str)   |  (Str)   |     |       (Char)          |

  Returns a dataframe in the form:
  | Question Number | Full Prompt 1 | Full Prompt 2 |
  |     (Int)       |    (Str)      |    (Str)      |

  'Full Prompt 1' is the module-level `sys_prompt1` with the question and
  option placeholders filled in, cut off just before the
  '<Your concise reasoning here. Max 100 words>' marker. 'Full Prompt 2' is
  the module-level `sys_prompt2`. When option D (and then also option C) is
  absent for a question, either because the column does not exist or the
  cell is empty, the matching option line and the "(A, B, C, or D)" wording
  are adjusted in both prompts.

  Raises:
      Exception: if 'Question Number' or 'Question' is missing, or fewer
          than two 'Option X' columns are present.
  """
  columns = df.columns
  letters = ['A', 'B', 'C', 'D', 'E']
  # Option columns that actually exist, as (letter, column name), in letter order
  present_options = [(letter, f'Option {letter}')
                     for letter in letters
                     if f'Option {letter}' in columns]

  #----------------------------------------------------------------------------#
  ## Check if DF is formatted properly
  error_text = f'''Make sure dataframe is in following format:
  | Question Number | Question | Option A | Option B | ... | Correct Answer Letter |
  |     (Int)       |     (Str)     |  (Str)   |  (Str)   |     |       (Char)          |

  The current format of Dataframe is: {columns}
  '''
  required_columns = ['Question Number', 'Question']
  if len(present_options) < 2 or any(name not in columns for name in required_columns):
    raise Exception(error_text)

  def option_missing(column, position):
    # Missing means: no such column, or an empty (NaN/None) cell in this row.
    return column not in columns or pd.isna(df[column].iloc[position])

  def drop_option(template, old_choices, new_choices, option_line):
    # Narrow the listed choices and remove the option's own line.
    return template.replace(old_choices, new_choices).replace(option_line, '')

  #----------------------------------------------------------------------------#
  ## Format questions for benchmark
  header = ['Question Number', 'Full Prompt 1', 'Full Prompt 2']
  rows = []

  for i in range(len(df)):
    # Positional access throughout, so a non-default index cannot misalign rows.
    question = str(df['Question'].iloc[i])

    sys_prompt_temp1 = sys_prompt1
    sys_prompt_temp2 = sys_prompt2

    ## Reformat system prompts in order to fit number of options in this question
    if option_missing('Option D', i): ## ABC
      sys_prompt_temp1 = drop_option(sys_prompt_temp1, '(A, B, C, or D)', '(A, B, or C)', 'D) ${Option D}')
      sys_prompt_temp2 = drop_option(sys_prompt_temp2, '(A, B, C, or D)', '(A, B, or C)', 'D) ${Option D}')

      if option_missing('Option C', i): ## AB
        sys_prompt_temp1 = drop_option(sys_prompt_temp1, '(A, B, or C)', '(A or B)', 'C) ${Option C}')
        sys_prompt_temp2 = drop_option(sys_prompt_temp2, '(A, B, or C)', '(A or B)', 'C) ${Option C}')

    ## Prompt for specific question
    new_prompt = sys_prompt_temp1.replace('${Question}', question)
    for letter, column in present_options: ## Works for any number of option columns
      new_prompt = new_prompt.replace(f'${{Option {letter}}}', str(df[column].iloc[i]))

    ## Add formatted prompts.
    ## Note that this is formatted to llama so changes may be needed down the line.
    prompts1 = new_prompt.split('<Your concise reasoning here. Max 100 words>')[0] ## Specific prompt for question
    prompts2 = sys_prompt_temp2 ## Generic prompt for question confidence
    rows.append([df['Question Number'].iloc[i], prompts1, prompts2])

  # Build the output once instead of growing a DataFrame row by row.
  return pd.DataFrame(rows, columns = header)

def test_models_sequential_by_question(df, models, debug=False, start = 0, stop = -1):
    """
    Tests a list of models on a given dataset sequentially,
    iterating through questions and then models for each question.
    Includes a debug mode to process only the first 10 questions.

    Args:
        df (pd.DataFrame): The dataset containing questions and prompts.
        models (list): A list of initialized model objects.
        debug (bool): If True, only process the first 10 questions.
    """
    print("Clearing previous results for each model...")
    for model in models:
        model.results = pd.DataFrame(columns=['Question ID', 'Question', 'Answer', 'Reasoning', 'Error'])
        print(f"  Cleared results for {model.name}")
    print("Starting sequential testing (by question)...")

    # Determine the number of questions to process
    num_questions_to_process = 2 if debug else len(df) if stop == -1 else stop - start

    # Iterate over questions first
    for index, row in tqdm(df.iloc[start: num_questions_to_process].iterrows(), total=num_questions_to_process, desc="Processing Questions"):
        question_num = row['Question Num']
        prompt = row['Full Prompt 1']
        '''

        I dont love this implementation But honestly,
        unless my logic is flawed with some edge case, I think this should work
        and I dont want to rewrite the functions for each subclass to take in the
        dataframe in order to work.
        '''
        global sys_prompt2
        sys_prompt2 = row['Full Prompt 2']

        print(f"\nProcessing Question {question_num}")

        # Iterate over models for the current question
        for model in models:

            try:
                print(f"  Testing with model: {model.name}")
                # Call GetRAC and add the result to the model's self.results
                reasoning, answer_confidence = model.GetRAC(prompt=prompt)

                # Add the results to the model's self.results DataFrame
                new_row = pd.DataFrame([{
                    'Question ID': question_num,
                    'Question': prompt,
                    'Answer': answer_confidence,
                    'Reasoning': reasoning,
                    'Error': False
                }])
                model.results = model.results._append(new_row, ignore_index=True)
                filename = f"{model.name.replace('/', '_').replace('-', '_')}_test_results.csv"
                model.results.to_csv(filename, index=False)
            except Exception as e:
                print(f"  Error testing {model.name} on Question {question_num}: {e}")
                # Optionally add an error entry to the results
                error_row = pd.DataFrame([{
                    'Question ID': question_num,
                    'Question': prompt,
                    'Answer': f"Error: {e}",
                    'Reasoning': f"Error: {e}",
                    'Error': True
                }])
                model.results = model.results._append(error_row, ignore_index=True)
                filename = f"{model.name.replace('/', '_').replace('-', '_')}_test_results.csv"
                model.results.to_csv(filename, index=False)
    print("\nSequential testing complete.")

    # After processing all questions, save the results for each model
    for model in models:
        # Mount Google Drive
        drive.mount('/content/drive')

        # Define the folder path
        folder_path = f'/content/drive/My Drive/SciQ'

        # Create the folder if it doesn't exist
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        filename = f"{model.name.replace('/', '_').replace('-', '_')}_test_results.csv"
        file_path = os.path.join(folder_path, filename)
        model.results.to_csv(file_path, index=False)
        print(f"Results for {model.name} saved to '{file_path}'")

## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
##                       Functions for Open Models
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


def test_open_model_sciq(model, df, debug = False):
  model.results = pd.DataFrame(columns=['Question ID',
                                        'Question',
                                        'Stated Confidence',
                                        'Stated Answer',
                                        'Reasoning',
                                        'Error',
                                        'Token Probability'
                                        ])
  if debug:
    length = 3
  else:
    length = len(df)
  start = 0

  for i in tqdm(range(length), total=length, desc="Processing Questions"):
  #for i in range(length):
    try:
      ## Question Information
      question_num = df['Question Number'][i]
      question_text = df['Full Prompt 1'][i]
      reasoning_prompt = question_text


      ## Get reasoning from model
      model_reasoning = model.generate(reasoning_prompt)
      ## Get Answer from model

      answer_prompt = (reasoning_prompt
                      + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>'
                      + model_reasoning
                      + '\n<|eot_id|><|start_header_id|>user<|end_header_id|>'
                      + sys_prompt2
                      + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>'
                      + "{'Answer': '"
                      )
      model_tokens, answer_letter = model.GetTokens(answer_prompt)

      ## Get JSON formatted answer and confidence
      answer_JSON_prompt = answer_prompt + answer_letter + "',"
      model_confidence_JSON = "{'Answer': '" + answer_letter + "'," + model.generate(answer_JSON_prompt)

      new_row = pd.DataFrame([{
          'Question ID': question_num,
          'Question': question_text,
          'Stated Confidence': model_confidence_JSON,
          'Stated Answer': answer_letter,
          'Reasoning': model_reasoning,
          'Error': False,
          'Token Probability': model_tokens,
          'Correct Answer Letter': dataset['Correct Answer Letter'][i]
      }])
    except Exception as e:
      new_row = pd.DataFrame([{
          'Question ID': question_num,
          'Question': question_text,
          'Stated Confidence': 'ERROR',
          'Stated Answer': 'ERROR',
          'Reasoning': e,
          'Error': True,
          'Token Probability': 'ERROR',
          'Correct Answer Letter': dataset['Correct Answer Letter'][i]
      }])
    model.results = model.results._append(new_row, ignore_index=True)

  return model.results


In [None]:
## Import Dataset
# Load the pre-formatted SciQ CSV and preview the first rows.
print('-' *42)
file_path = '/content/sciq_test_formatted.csv'
print(f'Importing Dataset: {file_path}')
dataset = pd.read_csv(file_path)
display(dataset.head())

## Edit System Prompts
# These two module-level strings are read as globals by format_df and by the
# models' GetRAC methods, so this cell has to run before either is used.
print('-' *42)
print('Editing System Prompts:')

# Reasoning prompt template: format_df fills in the ${Question} and ${Option X}
# placeholders and cuts the text off just before the '<Your concise reasoning
# here. Max 100 words>' marker.
sys_prompt1 = '''
Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options listed (A, B, C, or D).

Question: ${Question}
Options:
A) ${Option A}
B) ${Option B}
C) ${Option C}
D) ${Option D}


Please provide your reasoning first, limited to 100 words, and consider how certain you should be of your answer.
Reasoning: <Your concise reasoning here. Max 100 words>
'''
# Answer/confidence prompt: asks for a JSON-style reply with one probability
# per option plus the chosen letter.
sys_prompt2 = '''
Based on the reasoning above, Provide the best answer and the likelihood that each option is correct from 0.0 to 1.0 in a JSON format. The probabilities should sum to 1.0. For example:

{
'A': <Probability choice A is correct. As a float from 0.0 to 1.0>,
'B': <Probability choice B is correct. As a float from 0.0 to 1.0>,
'C': <Probability choice C is correct. As a float from 0.0 to 1.0>,
'D': <Probability choice D is correct. As a float from 0.0 to 1.0>,
'Answer': <Your answer choice here, as a single letter and nothing else.>
}

All options have a non-zero probability of being correct. No option should have a probability of 0 or 1.
Be modest about your certainty.  Do not provide any additional reasoning.

Response:
'''

print('System Prompts:')
print(f'  {sys_prompt1}')
print(f'  {sys_prompt2}')



## Format DF
# Build one filled-in reasoning prompt and one confidence prompt per question.
print('-' *42)
print('Formatting Dataset:')
new_dataset = format_df(dataset)
print(' Successfully Formatted Dataset')
print('New Dataset:')
display(new_dataset.head())




------------------------------------------
Importing Dataset: /content/sciq_test_formatted.csv


Unnamed: 0,Question Number,Question,Option A,Option B,Option C,Option D,Correct Answer Letter
0,0,Compounds that are capable of accepting electr...,antioxidants,Oxygen,oxidants,residues,C
1,1,What term in biotechnology means a genetically...,adult,male,phenotype,clone,D
2,2,Vertebrata are characterized by the presence o...,backbone,Bones,Muscles,Thumbs,A
3,3,What is the height above or below sea level ca...,depth,latitude,elevation,variation,C
4,4,"Ice cores, varves and what else indicate the e...",mountain ranges,fossils,tree rings,magma,C


------------------------------------------
Editing System Prompts:
System Prompts:
  
Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options listed (A, B, C, or D).

Question: ${Question}
Options:
A) ${Option A}
B) ${Option B}
C) ${Option C}
D) ${Option D}


Please provide your reasoning first, limited to 100 words, and consider how certain you should be of your answer.
Reasoning: <Your concise reasoning here. Max 100 words>

  
Based on the reasoning above, Provide the best answer and the likelihood that each option is correct from 0.0 to 1.0 in a JSON format. The probabilities should sum to 1.0. For example:

{
'A': <Probability choice A is correct. As a float from 0.0 to 1.0>,
'B': <Probability choice B is correct. As a float from 0.0 to 1.0>,
'C': <Probability choice C is correct. As a float from 0.0 to 1.

Unnamed: 0,Question Number,Full Prompt 1,Full Prompt 2
0,0,"\nGiven the following question, analyze the op...","\nBased on the reasoning above, Provide the be..."
1,1,"\nGiven the following question, analyze the op...","\nBased on the reasoning above, Provide the be..."
2,2,"\nGiven the following question, analyze the op...","\nBased on the reasoning above, Provide the be..."
3,3,"\nGiven the following question, analyze the op...","\nBased on the reasoning above, Provide the be..."
4,4,"\nGiven the following question, analyze the op...","\nBased on the reasoning above, Provide the be..."


In [None]:
# Spot-check one formatted prompt (row position 64) to confirm that the
# question and option placeholders were filled in.
print(new_dataset['Full Prompt 1'].iloc[64])


Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options listed (A, B, C, or D).

Question: The temperature at which a substance melts is called its what point?
Options:
A) boiling
B) change
C) melting
D) freezing


Please provide your reasoning first, limited to 100 words, and consider how certain you should be of your answer.
Reasoning: 


In [None]:
## Initialize Models
print('-' *42)
print('Initializing Models:')
# Closed-model configuration: one entry per provider, giving the name of the
# Colab secret that holds the API key and the model names to benchmark.
my_closed_models = {
    'GPT': {
        'api_key_name': 'gpt_api_key', # Name of the key to retrieve from userdata
        'models': [
            'gpt-4',
            'gpt-3.5-turbo'
        ]
    },
    'Claude': {
        'api_key_name': 'claude_api_key', # Name of the key to retrieve from userdata
        'models': [
            'claude-3-7-sonnet-20250219',
            'claude-3-haiku-20240307'
        ]
    },
    'Gemini': {
        'api_key_name': 'gemini_api_key', # Name of the key to retrieve from userdata
        'models': [
            'gemini-1.5-flash',
            'gemini-2.5-pro-preview-06-05'
        ]
    }
}

# Creates a client for every configured model and sends each one a test prompt.
closed_models = init_models(my_closed_models)
print(' Successfully Initialied Models')

## Test Models on SciQ
print('-' *42)
print('Testing Models:')

# stop = 1000 limits the run to the first 1000 formatted questions.
test_models_sequential_by_question(new_dataset, closed_models, debug=False, stop = 1000)

In [None]:

##------------------------------------------------------------------------------
##                             OPEN MODELS
##------------------------------------------------------------------------------

## Get llama:

# The Hugging Face access token is read from Colab's secret store.
HF_TOKEN = userdata.get('hf_llama_token')
model_name = 'meta-llama/Llama-3.1-8B-Instruct'


# Constructing LlamaModel downloads the tokenizer and weights and builds the
# generation pipeline. MaxTokens = 250 overrides the class default of 150.
my_llama = LlamaModel(name = model_name, key = HF_TOKEN, MaxTokens = 250)



Downloading Tokenizer for meta-llama/Llama-3.1-8B-Instruct
Downloading Model Weights for meta-llama/Llama-3.1-8B-Instruct


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processing Questions:   0%|          | 0/3 [00:00<?, ?it/s]

UnboundLocalError: cannot access local variable 'question_num' where it is not associated with a value

In [None]:

## Get Results

# Second-stage prompt template asking for the answer letter plus per-option probabilities.
# NOTE(review): this variable is not passed to test_open_model_sciq below; presumably it is
# read as a global or superseded by a dataset column -- confirm.
# NOTE(review): the template lists option 'E' while the prompts shown offer A-D, and
# `<|eot_id|>` appears before the trailing instructions -- confirm both are intended.
sys_prompt2 = '''
Based on the reasoning above, Provide the best answer and the likelihood that each option is correct from 0.0 to 1.0 in a JSON format. The probabilities should sum to 1.0. For example:

{
'Answer': <Your answer choice here, as a single letter and nothing else>,
'A': <Probability choice A is correct. As a float from 0.0 to 1.0>,
'B': <Probability choice B is correct. As a float from 0.0 to 1.0>,
'C': <Probability choice C is correct. As a float from 0.0 to 1.0>,
'D': <Probability choice D is correct. As a float from 0.0 to 1.0>,
'E': <Probability choice E is correct. As a float from 0.0 to 1.0>
}<|eot_id|>

All options have a non-zero probability of being correct. No option should have a probability of 0 or 1.
Be modest about your certainty.  Do not provide any additional reasoning.

Response: '''


# Run the benchmark with the loaded Llama model and show the resulting frame.
llama_results = test_open_model_sciq(my_llama, new_dataset, debug=False)
print('Finished benchmarking on model:')
display(llama_results)


## Save to drive

sciq_folder_path = '/content/drive/MyDrive/SciQ'

# Create the destination folder on first use and report that it was created.
if not os.path.exists(sciq_folder_path):
    os.makedirs(sciq_folder_path)
    print(f"Created folder: {sciq_folder_path}")

# Build a filesystem-friendly CSV name: '/', '-' and '.' in the model id become '_'.
output_filename = f'SciQ_{model_name}'.translate(str.maketrans('/-.', '___')) + '.csv'
output_path = os.path.join(sciq_folder_path, output_filename)

# Write the results without the DataFrame index column.
llama_results.to_csv(output_path, index=False)

print(f"Successfully saved llama_results to {output_path}")

Processing Questions:   0%|          | 0/1000 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

Finished benchmarking on model:


Unnamed: 0,Question ID,Question,Stated Confidence,Stated Answer,Reasoning,Error,Token Probability,Correct Answer Letter
0,0,"\nGiven the following question, analyze the op...","{'Answer': 'C', 'A': 0.05, 'B': 0.1, 'C': 0.9,...",C,Compounds that accept electrons are known as ...,False,"{'C': 0.9999351501464844, 'A': 4.4573560444405...",C
1,1,"\nGiven the following question, analyze the op...","{'Answer': 'D', 'A': 0.0, 'B': 0.0, 'C': 0.0, ...",D,The term in biotechnology that refers to a ge...,False,"{'D': 0.9999539852142334, 'A': 2.6117715606233...",D
2,2,"\nGiven the following question, analyze the op...","{'Answer': 'A', 'A': 0.95, 'B': 0.05, 'C': 0.0...",A,The question asks for a characteristic of Ver...,False,"{'A': 0.999984860420227, 'B': 1.29194731925963...",A
3,3,"\nGiven the following question, analyze the op...","{'Answer': 'C', 'A': 0.0, 'B': 0.0, 'C': 0.95,...",C,The question asks for the term that describes...,False,"{'C': 0.9999508857727051, 'A': 3.6407869629329...",C
4,4,"\nGiven the following question, analyze the op...","{'Answer': 'C', 'A': 0.1, 'B': 0.2, 'C': 0.9, ...",C,"Ice cores, varves, and tree rings are all nat...",False,"{'C': 0.999884843826294, 'A': 6.68252905597910...",C
...,...,...,...,...,...,...,...,...
995,995,"\nGiven the following question, analyze the op...","{'Answer': 'A', 'A': 0.9, 'B': 0.05, 'C': 0.03...",A,The question asks about the regulation of a s...,False,"{'A': 0.999987006187439, 'B': 9.78063326328992...",A
996,996,"\nGiven the following question, analyze the op...","{'Answer': 'A', 'A': 0.9, 'B': 0.05, 'C': 0.03...",A,Short period comets are thought to originate ...,False,"{'A': 0.9999831914901733, 'B': 1.2693610187852...",A
997,997,"\nGiven the following question, analyze the op...","{'Answer': 'D', 'A': 0.1, 'B': 0.05, 'C': 0.05...",D,The question asks about the development of an...,False,"{'D': 0.9999490976333618, 'A': 2.9808852559654...",D
998,998,"\nGiven the following question, analyze the op...","{'Answer': 'D', 'A': 0.0, 'B': 0.2, 'C': 0.3, ...",D,Atoms with unstable nuclei are considered to ...,False,"{'D': 0.9999649524688721, 'A': 2.4503649910911...",D


Successfully saved llama_results to /content/drive/MyDrive/SciQ/SciQ_meta_llama_Llama_3_1_8B_Instruct.csv


In [None]:
# Pull the first result row's prompt, reasoning text and stated-confidence payload.
question = llama_results['Question'].iloc[0]
sc = llama_results['Stated Confidence'].iloc[0]
reasoning = llama_results['Reasoning'].iloc[0]

# Emit the three fields, separated by 42-dash rules, one item per line.
print(
    f'Question: {question}',
    '-' *42,
    f'Reasoning: {reasoning}',
    '-' *42,
    f'Confidence: {sc}',
    sep='\n',
)

Question: 
Given the following question, analyze the options, and provide a concise reasoning for your selected answer. Your reasoning should not exceed 100 words. After your explanation, clearly state your answer by choosing one of the options listed (A, B, C, or D).

Question: Compounds that are capable of accepting electrons, such as o 2 or f2, are called what?
Options:
A) antioxidants
B) Oxygen
C) oxidants
D) residues


Please provide your reasoning first, limited to 100 words, and consider how certain you should be of your answer.
Reasoning: 
------------------------------------------
Reasoning:  Compounds that accept electrons are known as oxidants. This is because they cause oxidation, which is the loss of electrons. Oxidants are often involved in chemical reactions that result in the transfer of electrons. In the context of the question, oxygen (o 2) and fluorine (f2) are both capable of accepting electrons, making them examples of oxidants. Therefore, the correct answer is C) 