<a href="https://colab.research.google.com/github/NoamMichael/Comparing-Confidence-in-LLMs/blob/main/LSAT_Benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Notebook will test all models on the formatted LSAT-AR dataset
# I have no issue running multiple API clients simultaneously. However, running
# local models is pretty memory intensive so I can only run one at a time.
%pip install anthropic
%pip install openai

In [57]:
import pandas as pd
import numpy as np
import json
import time
import random
import torch
import matplotlib.pyplot as plt
from transformers import (AutoTokenizer,
                        AutoModelForCausalLM,
                        BitsAndBytesConfig,
                        pipeline)
import warnings
import openai
import anthropic
import google.generativeai as genai
from abc import ABC, abstractmethod

warnings.filterwarnings('ignore')
from google.colab import userdata


class OpenModel: ## This class is built around Hugging Face methods
  def __init__(self, name, key, MaxTokens = 150):
    self.name = name
    self.key = key
    self.MaxTokens = MaxTokens
    print(f"Downloading Tokenizer for {self.name}")
    self.tokenizer = AutoTokenizer.from_pretrained(self.name,token = self.key) ## Import Tokenizer
    print(f"Downloading Model Weights for {self.name}")
    self.model = AutoModelForCausalLM.from_pretrained(self.name, token = self.key, device_map="auto") ## Import Model

    ## Make text generation pipeline
    self.pipeline = pipeline(
    "text-generation",
    model = self.model,
    tokenizer = self.tokenizer,
    do_sample = False,
    max_new_tokens = self.MaxTokens,
    eos_token_id = self.tokenizer.eos_token_id,
    pad_token_id = self.tokenizer.eos_token_id
    )

  def generate(self, prompt):
    return self.pipeline(prompt)[0]['generated_text']

  def GetTokens(self, prompt: str):
    ## Get Answer:
    batch = self.tokenizer(prompt, return_tensors= "pt").to('cuda')
    with torch.no_grad():
        outputs = self.model(**batch)
    ## Get Token Probabilites
    logits = outputs.logits

    ## Apply softmax to the logits to get probabilities
    probs = torch.softmax(logits[0, -1], dim=0)

    ##Get the top k token indices and their probabilities
    top_k_probs, top_k_indices = torch.topk(probs, 100, sorted =True)

    ## Convert token indices to tokens
    top_k_tokens = [self.tokenizer.decode([token_id]) for token_id in top_k_indices]

    ## Convert probabilities to list of floats
    top_k_probs = top_k_probs.tolist()                  #list of probabilities

    ## Create a Pandas Series with tokens as index and probabilities as values
    logit_series = pd.Series(top_k_probs, index=top_k_tokens)

    ## Sort the series by values in descending order
    logit_series = logit_series.sort_values(ascending=False)
    logit_series.index.name = "Token"
    logit_series.name = "Probability"
    return logit_series

class ClosedModel(ABC):
  @abstractmethod
  def generate(self, prompt: str, system:str = "")-> str:
        """
        Abstract method to generate a response from the language model.
        """
        pass
  @abstractmethod
  def __init__(self, name, api_key):
    self.name = name
    self.key = api_key
    pass

  @abstractmethod
  def client(self):
    pass

class GPTmodel(ClosedModel):
  def __init__(self, name, api_key):
    self.name = name
    self.key = api_key

  def client(self):
    # Initialize the OpenAI client with the API key
    self.client = openai.OpenAI(api_key=self.key)


  def generate(self, prompt: str, system: str = "") -> str:
    # Use the new client-based API call
    response = self.client.chat.completions.create(
        model=self.name,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=100
    )
    # Access the content from the new response object structure
    return response.choices[0].message.content


class AnthropicModel(ClosedModel):
  def __init__(self, name, api_key):
    self.name = name
    self.key = api_key
  def client(self):
    # Initialize the Anthropic client with the API key
    self.client = anthropic.Anthropic(api_key=self.key)

  def generate(self, prompt: str, system: str = "") -> str:
    # The messages list should only contain user and assistant roles
    messages = [{"role": "user", "content": prompt}]

    # Use the Anthropic client to create a message
    # Pass the system message as a top-level 'system' parameter
    message = self.client.messages.create(
        model=self.name,
        max_tokens=100, # You can adjust this or make it an instance variable
        messages=messages,
        system=system if system else None # Pass system as a separate parameter, or None if empty
    )
    # Access the content from the response object
    return message.content[0].text


class GeminiModel(ClosedModel):
  def __init__(self, name, api_key):
    self.name = name
    self.key = api_key

  def client(self):
    # Initialize the google.generativeai client with the API key

    genai.configure(api_key=self.key)
    self.model = genai.GenerativeModel(model_name=self.name)

  def generate(self, prompt: str, system: str = "") -> str:
    # Build the content list, including the system message if provided
    contents = [{"role": "user", "parts": [prompt]}]
    if system:
        contents = [{"role": "user", "parts": [system]}] + contents

    # Use the Gemini model to generate content
    response = self.model.generate_content(contents)

    # Access the content from the response object
    return response.text

In [None]:
## Playground for Llama
hf_llama_token = userdata.get('hf_llama_token')
test_name = 'meta-llama/Llama-3.1-8B-Instruct'
test_key = hf_llama_token
test_prompt = "Zdzisław Beksiński was"

test_model = OpenModel(name = test_name, key = test_key)

In [None]:
print(test_model.generate(test_prompt))
test_model.GetTokens(test_prompt)

In [61]:
## Playground for GPT

gpt_4_key = userdata.get('gpt_api_key')
test_name = 'gpt-4'
test_key = gpt_4_key
test_prompt = "Zdzisław Beksiński was"
test_system = "You are a helpful assistant."

my_gpt = GPTmodel(name = test_name, api_key = test_key)
my_gpt.client()
my_gpt.generate(test_prompt, test_system)

"a renowned Polish painter, photographer, and sculptor. He is best known for his large, detailed images of a surreal, post-apocalyptic environment. Beksiński's works are characterized by their haunting, dystopian feel, often featuring desolate landscapes and tormented figures. Despite the grim themes, he insisted his work was not to be read literally and that he was not a pessimist. Beksiński was born on February 24, 1929, and tragically murdered in"

In [23]:
## Playground for Claude

claude_key = userdata.get('claude_api_key')
test_name = 'claude-3-haiku-20240307'
test_key = claude_key
test_prompt = "Zdzisław Beksiński was"

my_claude = AnthropicModel(name = test_name, api_key = test_key)
my_claude.client()
my_claude.generate(test_prompt)

'Zdzisław Beksiński was a Polish painter, photographer, and sculptor known for his dark, dystopian, and surreal artworks. Here are some key facts about Zdzisław Beksiński:\n\n1. Born in 1929 in Sanok, Poland, Beksiński studied architecture and civil engineering before turning to art full-time in the late 1960s.\n\n2. His paintings are characterized by a unique style'

In [29]:
## Playground for Gemini

gemini_api_key = userdata.get('gemini_api_key') # Assuming you stored your key in Userdata
test_name = "gemini-2.0-flash" # Or another Gemini model name like 'gemini-1.5-flash'
test_key = gemini_api_key
test_prompt = "Zdzisław Beksiński was"
test_system = "You are a helpful assistant." # Optional system message

my_gemini = GeminiModel(name = test_name, api_key = test_key)
my_gemini.client()
my_gemini.generate(test_prompt, test_system)


'Zdzisław Beksiński was a Polish painter, photographer, and sculptor, known for his dystopian surrealist art. His works often depicted nightmarish environments with decaying figures, desolate landscapes, and unsettling imagery. He famously refused to title his pieces, preferring to leave them open to interpretation by the viewer.\n'

In [None]:
## Pseudo Code
## Import dataset

## Initialize all closed models

##--Initialize all GPT models
##----GPT-4o
##----GPT-o3
##--Initialize all Claude models
##----Claude-3.7 Sonnet
##----Claude-4 Sonnet
##--Initialize Gemini
##----Gemini-2.0 Flash
##----Gemini-1.5 Flash
##----Gemini-2.5 Pro

'''
for question in dataset:
  for model in ClosedModels:
    model.generate(question) #Since we are iterating over ClosedModels we can call the abstract method .generate()

'''




In [60]:
## Initializing my closed models

my_closed_models = {
    'GPT': {
        'api_key_name': 'gpt_api_key', # Name of the key to retrieve from userdata
        'models': [
            'gpt-4',
            'gpt-3.5-turbo'
        ]
    },
    'Claude': {
        'api_key_name': 'claude_api_key', # Name of the key to retrieve from userdata
        'models': [
            #'claude-3-sonnet-20240229',
            'claude-3-haiku-20240307'
        ]
    },
    'Gemini': {
        'api_key_name': 'gemini_api_key', # Name of the key to retrieve from userdata
        'models': [
            'gemini-1.5-flash',
            #'gemini-1.5-pro'
        ]
    }
}

print('Initializing Closed Models:')
closed_models = []
for model_type in my_closed_models:
    print(f'{model_type}:')
    api_key_name = my_closed_models[model_type]['api_key_name']
    api_key = userdata.get(api_key_name)
    print(f'  API Key Name: {my_closed_models[model_type]["api_key_name"]}')
    for model_name in my_closed_models[model_type]['models']:
      # Instantiate the correct subclass based on model_type
      if model_type == 'GPT':
          my_model = GPTmodel(name = model_name, api_key = api_key)
      elif model_type == 'Claude':
          my_model = AnthropicModel(name = model_name, api_key = api_key)
      elif model_type == 'Gemini':
          my_model = GeminiModel(name = model_name, api_key = api_key)
      else:
          # Handle unexpected model types if necessary
          print(f"Warning: Unknown model type {model_type}. Skipping.")
          continue # Skip to the next model name if type is unknow
      my_model.client()
      closed_models.append(my_model)
      print(f'    {model_name}')


print(f'Models Initialized: {len(models)}')
print(f'Model locations:\n{models}')

print('-'*42)
print('Testing all closed models:')
print(f'Test prompt: {test_prompt}')
print(f'Test system: {test_system}')

for model in closed_models:
  print(f'\nTesting model: {model.name}')
  print(model.generate(test_prompt, test_system))


Initializing Closed Models:
GPT:
  API Key Name: gpt_api_key
    gpt-4
    gpt-3.5-turbo
Claude:
  API Key Name: claude_api_key
    claude-3-haiku-20240307
Gemini:
  API Key Name: gemini_api_key
    gemini-1.5-flash
Models Initialized: 6
Model locations:
[<__main__.GPTmodel object at 0x78ab348a2b50>, <__main__.GPTmodel object at 0x78ab33dce250>, <__main__.AnthropicModel object at 0x78ab34564690>, <__main__.AnthropicModel object at 0x78ab33e43c50>, <__main__.GeminiModel object at 0x78ab33fa7390>, <__main__.GeminiModel object at 0x78ab33fb0990>]
------------------------------------------
Testing all closed models:
Test prompt: Zdzisław Beksiński was
Test system: You are a helpful assistant.

Testing model: gpt-4
a renowned Polish painter, photographer, and sculptor. He is best known for his large, detailed images of a surreal, post-apocalyptic environment. His works are often disturbing and filled with themes of death, decay, and darkness. Despite the grim themes, Beksiński claimed his w

In [32]:
## Example Implementation:


## Import dataset

# Assuming you have your API keys stored in userdata
gpt_4_key = userdata.get('gpt_api_key')
claude_key = userdata.get('claude_api_key')
gemini_api_key = userdata.get('gemini_api_key')

## Initialize all closed models you want to test

# Create a list to hold your model instances
ClosedModels = []

##--Initialize all GPT models
##----GPT-4o
ClosedModels.append(GPTmodel(name='gpt-4o', api_key=gpt_4_key))
##----GPT-o3 (Assuming you meant gpt-3.5-turbo or similar)
ClosedModels.append(GPTmodel(name='gpt-3.5-turbo', api_key=gpt_4_key)) # Or the correct GPT-3 model name

##--Initialize all Claude models
##----Claude-3 Sonnet (Corrected name, typically claude-3-sonnet-20240229)
ClosedModels.append(AnthropicModel(name='claude-3-sonnet-20240229', api_key=claude_key))
##----Claude-3 Haiku (You already tested this one)
ClosedModels.append(AnthropicModel(name='claude-3-haiku-20240307', api_key=claude_key))
# Claude 4 is not a standard model name, perhaps you meant Claude 3.5 Sonnet?
# ClosedModels.append(AnthropicModel(name='claude-3-5-sonnet-20240620', api_key=claude_key)) # If Claude 3.5 Sonnet is what you meant

##--Initialize Gemini
##----Gemini-2.0 Flash (Assuming gemini-1.5-flash as a common name)
ClosedModels.append(GeminiModel(name='gemini-1.5-flash', api_key=gemini_api_key))
##----Gemini-2.0 Pro (Assuming gemini-1.5-pro as a common name)
ClosedModels.append(GeminiModel(name='gemini-1.5-pro', api_key=gemini_api_key))
# Gemini 2.5 Pro is not a standard model name, perhaps you meant Gemini 1.5 Pro?

# You can add more models to the list as needed.

# --- Example Dataset (Replace with your actual dataset loading) ---
# Assuming your dataset is a list of strings representing questions/prompts
dataset = [
    "What is the capital of France?",
    "Explain the concept of recursion in programming.",
    "Write a short story about a cat.",
    "Summarize the plot of The Great Gatsby."
]
# -------------------------------------------------------------------


for question in dataset:
  print(f"\nTesting prompt: {question}")
  for model in ClosedModels:
    try:
        print(f"--- Calling model: {model.name} ---")
        # Note: Your ClosedModel base class doesn't define 'generate'.
        # You likely want to use the 'generate' method defined in your subclasses.
        # Make sure the 'generate' method in your subclasses takes the expected
        # parameters (prompt, and potentially system).
        # Let's assume your closed models' 'generate' method takes prompt and system.
        # If a system message is needed, you'll need to provide one here or modify
        # your model classes/loop structure. For simplicity, let's assume
        # the prompts can be handled without a separate system message for now,
        # or you can add one if needed.
        response = model.generate(prompt=question, system="") # Pass the question from the dataset
        print(f"Response from {model.name}: {response[:200]}...") # Print first 200 chars of response
    except Exception as e:
        print(f"Error calling model {model.name}: {e}")
