Here we first generate a question using ChatGPT.
Then we ask that question to other LLMs and collect their answers.
Then we evaluate those answers, again using ChatGPT, and rank the performance of the LLMs.

In [None]:
!pip install anthropic

In [13]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
from IPython.display import Markdown, display

In [None]:
# Load environment variables via python-dotenv, overriding any that are already set
load_dotenv(override=True)

In [None]:
# Read each provider's API key from the environment and report whether it is set.
# Only presence is reported; the key values themselves are never printed.

openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')

# (provider label, key value) pairs, in the order they are reported.
key_status = (
    ("OpenAI", openai_api_key),
    ("Anthropic", anthropic_api_key),
    ("Google", google_api_key),
    ("DeepSeek", deepseek_api_key),
    ("Groq", groq_api_key),
)

for provider, key_value in key_status:
    # A missing variable (None) and an empty string both count as "not set".
    state = "exists" if key_value else "not set"
    print(f"{provider} API Key {state}")

In [16]:
# Prompt asking the model to invent a single hard evaluation question.
request = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Answer only with the question, no explanation."
)

# Chat payload in the format the OpenAI client expects:
# a list of dictionaries, each with a "role" and a "content" entry.
messages = [dict(role="user", content=request)]

In [None]:
# Show the request payload to confirm its structure before sending it.
messages

In [None]:
# Have gpt-4o-mini generate the question that every competitor will answer.
openai = OpenAI()
response = openai.chat.completions.create(messages=messages, model="gpt-4o-mini")

# The generated question is the text of the first (and only requested) choice.
first_choice = response.choices[0]
question = first_choice.message.content
print(question)

In [19]:
# Parallel lists filled as each model answers: competitors[i] is the model
# name whose reply is stored in answers[i].
competitors, answers = [], []

# Chat payload carrying the generated question to every competitor.
messages = [dict(role="user", content=question)]

In [None]:
# First competitor: OpenAI's gpt-4o-mini answers the generated question.

model_name = "gpt-4o-mini"

response = openai.chat.completions.create(messages=messages, model=model_name)
first_choice = response.choices[0]
answer = first_choice.message.content

display(Markdown(answer))

# Record the name and the reply at the same position in both lists.
competitors.append(model_name)
answers.append(answer)

In [21]:
# Start the competition from a clean slate. Running this cell discards any names
# and answers collected so far, so both lists stay aligned by position.
answers = []      # replies from the models, in the order they were asked
competitors = []  # model names, same order as `answers`

# The question produced earlier is what every competitor receives.
messages = [dict(role="user", content=question)]

In [None]:
# OpenAI's gpt-4o-mini goes first.

model_name = "gpt-4o-mini"

response = openai.chat.completions.create(
    messages=messages,
    model=model_name,
)
reply = response.choices[0].message
answer = reply.content

display(Markdown(answer))

# Keep the model name and its answer paired by list position.
competitors.append(model_name)
answers.append(answer)

In [None]:
# Ask Anthropic's Claude the same question.

model_name = "claude-3-7-sonnet-latest"

claude = Anthropic()
# The request is capped at 1000 output tokens via max_tokens.
response = claude.messages.create(
    max_tokens=1000,
    messages=messages,
    model=model_name,
)
# Anthropic returns a list of content blocks; the text is read from the first one.
first_block = response.content[0]
answer = first_block.text

display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)

In [None]:
# Ask Google Gemini the same question, using the OpenAI client pointed at
# Google's OpenAI-style endpoint.

model_name = "gemini-2.0-flash"
gemini = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=google_api_key,
)

response = gemini.chat.completions.create(messages=messages, model=model_name)
first_choice = response.choices[0]
answer = first_choice.message.content

display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)

In [None]:
# Ask DeepSeek the same question, using the OpenAI client with DeepSeek's base URL.

model_name = "deepseek-chat"
deepseek = OpenAI(
    base_url="https://api.deepseek.com/v1",
    api_key=deepseek_api_key,
)

response = deepseek.chat.completions.create(messages=messages, model=model_name)
first_choice = response.choices[0]
answer = first_choice.message.content

display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)

In [None]:
# Ask the same question of Llama 3.3 70B served by Groq, using the OpenAI
# client pointed at Groq's base URL.

groq = OpenAI(api_key=groq_api_key, base_url="https://api.groq.com/openai/v1")
model_name = "llama-3.3-70b-versatile"

# Send the request through the Groq client. Without this call `response` would
# still hold the reply from the previously executed cell, and that stale answer
# would be recorded under this model's name.
response = groq.chat.completions.create(model=model_name, messages=messages)
answer = response.choices[0].message.content

display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)


Use a locally installed Ollama model to answer the question

In [24]:
# Ollama is a local service. It's a piece of software that runs on your own computer, and it provides an endpoint, a web service that you can call, running on
# localhost.
# So you'll be able to connect to localhost on a given port and talk to it.

# And the endpoint that it offers is compatible with OpenAI, the same kind of endpoint that we've been using in all of these other models.
# Ollama has that same endpoint.

# And when you hit that endpoint, it uses some highly optimized C++ code to run open-source models locally on your machine.
# It should be said that it can only run small models, because they run directly on your computer.

In [None]:
# Shell out to the ollama CLI to pull the llama3.2 model.
!ollama pull llama3.2

In [None]:
# Ask the locally served llama3.2 model the same question. The OpenAI client is
# pointed at localhost:11434, with 'ollama' passed as the api_key value.

model_name = "llama3.2"
ollama = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1')

response = ollama.chat.completions.create(messages=messages, model=model_name)
first_choice = response.choices[0]
answer = first_choice.message.content

display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)

Now let's evaluate which model performs best

For that we will also use an LLM: OpenAI's ChatGPT

In [None]:
# Dump the raw model names, then the raw answers, collected so far.
for collected in (competitors, answers):
    print(collected)


In [None]:
# zip() iterates over several lists at the same time, so each model name is
# paired with the answer stored at the same position.
for name, reply in zip(competitors, answers):
    print(f"\n\nCompetitor: {name}\n\n{reply}")

In [29]:
# Combine every answer into one text block for the judge. Each answer sits under
# a numbered heading (counting from 1) instead of the model's name.

sections = []
for position, reply in enumerate(answers, start=1):
    sections.append(f"# Response from competitor {position}\n\n")
    sections.append(reply + "\n\n")
together = "".join(sections)

In [None]:
# Show the combined, numbered responses that will be embedded in the judge prompt.
print(together)

In [31]:
# Prompt for the judge model. It interpolates the number of competitors, the
# original question and the combined responses, and asks for a JSON object whose
# "results" list holds competitor numbers ordered from best to worst.
# The doubled braces ({{ and }}) produce literal braces inside the f-string.
judge = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""


In [None]:
# Show the fully rendered judge prompt for inspection.
print(judge)

In [34]:
# Wrap the judge prompt as a single user message for the chat API.
judge_messages = [dict(role="user", content=judge)]

In [None]:
# Have OpenAI's o3-mini act as the judge and keep its raw text reply.

openai = OpenAI()
response = openai.chat.completions.create(messages=judge_messages, model="o3-mini")

verdict = response.choices[0].message
results = verdict.content
print(results)


In [None]:
# Turn the judge's reply into a printed ranking of model names.
#
# The reply is expected to be JSON of the form {"results": ["2", "1", ...]},
# where each entry is a 1-based competitor number, listed best first.

results_text = results.strip()
if results_text.startswith("```"):
    # Tolerate a reply wrapped in a markdown code fence (``` or ```json) even
    # though the prompt asks for bare JSON.
    results_text = results_text.strip("`").strip()
    if results_text.lower().startswith("json"):
        results_text = results_text[len("json"):]

results_dict = json.loads(results_text)
ranks = results_dict["results"]
for index, result in enumerate(ranks):
    position = int(result)
    # Reject out-of-range numbers explicitly: 0 or a negative value would
    # otherwise wrap around through negative indexing and name the wrong model.
    if not 1 <= position <= len(competitors):
        raise ValueError(
            f"Judge returned competitor number {position}, "
            f"expected a value between 1 and {len(competitors)}"
        )
    competitor = competitors[position - 1]
    print(f"Rank {index+1}: {competitor}")