# Compare Llama Providers

Run evaluations on a few prompts for Llama 3.1 70B across several providers and compare the results to a baseline.

In [9]:
# Test connection to all the API keys
import os
from openai import OpenAI
from groq import Groq
from together import Together
import requests
import dotenv

dotenv.load_dotenv()

def test_openrouter_connection():
    """Check that the OpenRouter API answers a request made with the configured key.

    Reads ``OPENROUTER_API_KEY`` from the environment, sends a GET carrying it
    as a bearer token to the OpenRouter model-listing endpoint, and prints the
    outcome. Nothing is returned; every result, including network-level
    failures, is reported on stdout.
    """
    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        print("OpenRouter API key not found.")
        return

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    try:
        # A timeout keeps an unreachable host from hanging the notebook cell.
        response = requests.get(
            "https://openrouter.ai/api/v1/models",
            headers=headers,
            timeout=30,
        )
    except requests.RequestException as e:
        # Report network failures by printing, like the Groq and Together checks,
        # instead of letting the exception abort the cell.
        print(f"OpenRouter connection failed. Error: {str(e)}")
        return

    # NOTE(review): a 200 only shows that the endpoint answered; confirm that this
    # endpoint rejects invalid keys before treating success as key validation.
    if response.status_code == 200:
        print("OpenRouter connection successful.")
    else:
        print(f"OpenRouter connection failed. Status code: {response.status_code}")

def test_groq_connection():
    """Send a one-message chat completion to Groq and print whether it worked.

    Uses ``GROQ_API_KEY`` from the environment. Prints a success line when the
    first choice carries non-empty content, a failure line when it does not,
    and the error text when the request raises. Returns ``None`` in all cases.
    """
    key = os.environ.get("GROQ_API_KEY")
    if key:
        probe_client = Groq(api_key=key)
        try:
            reply = probe_client.chat.completions.create(
                model="mixtral-8x7b-32768",  # Using a model supported by Groq
                messages=[{"role": "user", "content": "Hello"}],
            )
            text = reply.choices[0].message.content
            outcome = (
                "Groq connection successful."
                if text
                else "Groq connection failed. No content in response."
            )
            print(outcome)
        except Exception as err:
            # Any failure (auth, network, unexpected response shape) is reported, not raised.
            print(f"Groq connection failed. Error: {str(err)}")
    else:
        print("Groq API key not found.")

def test_together_connection():
    """List Together models with the configured key and print whether the call worked.

    Uses ``TOGETHER_API_KEY`` from the environment. Prints a success line when
    the model listing completes and the error text when it raises. Returns
    ``None`` in all cases.
    """
    key = os.environ.get("TOGETHER_API_KEY")
    if key:
        probe_client = Together(api_key=key)
        try:
            # The listing result is discarded; only completing without an error matters.
            probe_client.models.list()
            print("Together connection successful.")
        except Exception as err:
            print(f"Together connection failed. Error: {str(err)}")
    else:
        print("Together API key not found.")

# Run each provider check in turn; every check prints its own result
print("Testing API connections...")
for check in (
    test_openrouter_connection,
    test_groq_connection,
    test_together_connection,
):
    check()

Testing API connections...
OpenRouter connection successful.
🍩 https://wandb.ai/tuminha/compare-llamas/r/call/01924d97-e674-7761-9887-7459c06e93fe
Groq connection successful.
Together connection successful.


In [1]:
# Install the packages this notebook needs into the kernel's environment.
# NOTE(review): other cells also import `openai` and `dotenv`, which are not named
# in this list — confirm they are installed (directly or as dependencies).
print('⏳ Installing packages')
%pip install -q weave set-env-colab-kaggle-dotenv tqdm ipywidgets requests groq together
print('✅ Packages installed')

⏳ Installing packages
[33mDEPRECATION: Loading egg at /Users/franciscoteixeirabarbosa/anaconda3/envs/streamlitapp/lib/python3.11/site-packages/sortedcontainers-2.4.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /Users/franciscoteixeirabarbosa/anaconda3/envs/streamlitapp/lib/python3.11/site-packages/pdfminer.six-20200121-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /Users/franciscoteixeirabarbosa/anaconda3/envs/streamlitapp/lib/python3.11/site-packages/grobid_client_python-0.0.7-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip fo

In [25]:
# Upgrade the langchain packages.
# NOTE(review): none of the cells visible here import langchain — confirm this upgrade is still needed.
pip install --upgrade langchain langchain-community

[33mDEPRECATION: Loading egg at /Users/franciscoteixeirabarbosa/anaconda3/envs/streamlitapp/lib/python3.11/site-packages/sortedcontainers-2.4.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /Users/franciscoteixeirabarbosa/anaconda3/envs/streamlitapp/lib/python3.11/site-packages/pdfminer.six-20200121-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /Users/franciscoteixeirabarbosa/anaconda3/envs/streamlitapp/lib/python3.11/site-packages/grobid_client_python-0.0.7-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation

In [1]:
# Upgrade the `together` client package.
# NOTE(review): this runs pip through a shell escape (`!pip`); confirm it targets
# the same environment as the running kernel.
!pip install --upgrade together

[33mDEPRECATION: Loading egg at /Users/franciscoteixeirabarbosa/anaconda3/envs/streamlitapp/lib/python3.11/site-packages/sortedcontainers-2.4.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /Users/franciscoteixeirabarbosa/anaconda3/envs/streamlitapp/lib/python3.11/site-packages/pdfminer.six-20200121-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /Users/franciscoteixeirabarbosa/anaconda3/envs/streamlitapp/lib/python3.11/site-packages/grobid_client_python-0.0.7-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation

In [3]:
# Import Together client
import os
from together import Together

# Initialize Together client
# The key is read with os.environ.get, so a missing TOGETHER_API_KEY passes None here.
# NOTE(review): the model classes in this notebook call a separate `client` object;
# no visible cell uses this `together_client` — confirm it is still needed.
together_client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))

In [4]:
from tqdm.notebook import tqdm_notebook as tqdm
from set_env import set_env
from openai import OpenAI
from groq import Groq
from together import Together
import weave
import os
import json
import requests
# Load the credentials used by the cells below
for key_name in ("OPENROUTER_API_KEY", "GROQ_API_KEY", "WANDB_API_KEY"):
    set_env(key_name)

# Provider clients shared by the model classes defined later
groqclient = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# NOTE(review): TOGETHER_API_KEY is read here but is not among the set_env calls
# above — confirm it is already present in the environment.
client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))

In [5]:
# Initialize Weave project
# The captured output of this cell links to the 'compare-llamas' project on wandb.ai,
# which is where the calls made by the cells below show up.
weave.init('compare-llamas')

# Define a base LlamaModel class using Weave
class LlamaModel(weave.Model):
    """Llama 3.1 70B Instruct requested through OpenRouter from one named provider.

    Each instance sends its requests with ``provider`` as the only entry in the
    provider ``order`` list and with fallbacks disabled.
    """
    provider: str  # Provider attribute to specify the API provider

    @weave.op()
    def predict(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: The user message content.

        Returns:
            The ``content`` of the first choice's message.

        Raises:
            Exception: If the HTTP request fails or the response body carries
                no choices.
        """
        # Prepare the request data
        data = {
            "model": "meta-llama/llama-3.1-70b-instruct",
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": 0,
            "provider": {
                "order": [self.provider],
                "allow_fallbacks": False
            }
        }

        # Make the API request and return the response
        response = self.make_openrouter_request(data)
        # A successful HTTP status does not guarantee a completion is present;
        # surface whatever error payload came back instead of a bare KeyError.
        choices = response.get('choices')
        if not choices:
            raise Exception(
                f"API response contained no choices: {response.get('error', response)}"
            )
        return choices[0]['message']['content']

    @weave.op()
    def make_openrouter_request(self, data):
        """POST ``data`` to the OpenRouter chat-completions endpoint and return the parsed JSON.

        Args:
            data: JSON-serialisable request body.

        Returns:
            The decoded JSON response body.

        Raises:
            Exception: Wrapping any ``requests.RequestException`` (connection
                error, timeout, non-2xx status), with the original exception
                chained as the cause.
            KeyError: If ``OPENROUTER_API_KEY`` is not in the environment.
        """
        # Make a POST request to the OpenRouter API
        try:
            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
                    "Content-Type": "application/json"
                },
                json=data,
                # Without a timeout a stalled provider would block the evaluation indefinitely.
                timeout=120,
            )
            response.raise_for_status()  # Raise an exception for bad responses
            return response.json()
        except requests.RequestException as e:
            # Chain the original exception so the underlying cause stays in the traceback.
            raise Exception(f"API request failed: {str(e)}") from e

# Define a GroqModel class using Weave
class GroqModel(weave.Model):
    """Llama 3.1 70B queried through the module-level ``groqclient``."""

    @weave.op()
    def predict(self, prompt: str) -> str:
        """Return the reply text for ``prompt`` sent as a single user message."""
        chat = [{"role": "user", "content": prompt}]
        # Temperature 0.0 and a fixed seed are passed on every call.
        completion = groqclient.chat.completions.create(
            messages=chat,
            model='llama-3.1-70b-versatile',
            seed=123123,
            temperature=0.0,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    
# Define a TogetherModel class using Weave
class TogetherModel(weave.Model):
    """Llama 3.1 70B Instruct Turbo queried through the module-level Together ``client``."""

    @weave.op()
    def predict(self, prompt: str) -> str:
        """Return the reply text for ``prompt`` sent as a single user message."""
        completion = self.make_together_request(prompt)
        first_choice = completion.choices[0]
        return first_choice.message.content

    @weave.op()
    def make_together_request(self, prompt):
        """Create a chat completion for ``prompt`` and return the raw response object."""
        chat = [{"role": "user", "content": prompt}]
        return client.chat.completions.create(
            messages=chat,
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            temperature=0,
        )

# Create instances of LlamaModel for different providers.
# Names follow the pattern <Provider>_LLaMa3.1_70B; model.name is printed when a model is evaluated.
octoai_llama = LlamaModel(provider='OctoAI', name='OctoAI_LLaMa3.1_70B')
novitaai_llama = LlamaModel(provider='Novita', name='NovitaAI_LLaMa3.1_70B')
deepinfra_llama = LlamaModel(provider='DeepInfra', name='DeepInfra_LLaMa3.1_70B')
fireworks_llama = LlamaModel(provider='Fireworks', name='Fireworks_LLaMa3.1_70B')

# Create instances of GroqModel and TogetherModel
groq_llama = GroqModel(name='Groq_LLaMa3.1_70B')
together_llama = TogetherModel(name='Together_LLaMa3.1_70B')

print("✅ Weave models created")

Logged in as Weights & Biases user: tuminha.
View Weave data at https://wandb.ai/tuminha/compare-llamas/weave
✅ Weave models created


In [6]:
# Let's build a dataset of quirky prompts and potentially their answers 
from weave import Dataset

# Define a dataset of quirky prompts and their corresponding rubrics.
# Each row has a "question" (the prompt text) and a "rubric" (what an acceptable answer looks like).
quirky_prompts = Dataset(
    name="my_llama_quirky_prompts",
    rows=[
        {
            "question": "Give me 10 sentences that end in the word \"apple\"",
            "rubric": "all sentences must end with the word apple"
        },
        {
            "question": "Answer with the number of legs about the following statement: The fox lost a leg, but then magically grew back the leg he lost and a mysterious extra leg on top of that",
            "rubric": "Answer must be 5 or five"
        },
        {
            "question": "Yam (a boy) has 4 sisters. Each sister has 3 brothers. How many brothers does Yam have? Let's think step by step.",
            "rubric": "Answer must indicate that Yam has 2 brothers"
        },
        {
            "question": "You have five apples today, you ate two apples yesterday so how many apples do you have today? Provide a logical answer.",
            "rubric": "Answer must be five and explain that yesterdays action have no bearing on todays apple quantity"
        },
        {
            "question": "Which number is bigger: 9.11 or 9.9?",
            "rubric": "Answer should conclude that 9.9 is bigger"
        },
        {
            "question": "If I hang 5 shirts outside and it takes them 5 hours to dry, how long would it take to dry 30 shirts",
            "rubric": "Answer must state that it would take the same amount of time"
        },
        {
            "question": "There are three sisters in a room. Anna is reading a book. Alice is playing a match of chess against someone in the room. What is the third sister, Amanda, doing?",
            "rubric": "Playing chess with Alice"
        },
        # The rubric for this row records that no reference answer is available.
        # The equation text uses "log" throughout (previously garbled as "10g9", "10g3", "log 16").
        {
            "question": """Determine all triples (x, y, z) of real numbers that are solutions to the following
system of equations:
log9 x + log9 y + log3 z = 2
log16 x + log4 y + log16 z = 1
log5 x + log25 y + log25 z = 0
""",
            "rubric": "IDK the answer to this one"
        }
    ]
)

# Detailed explanation of the code:

# 1. We import the Dataset class from the weave module.
#    This class is used to create a structured dataset that can be used for evaluation.

# 2. We create a Dataset object named "quirky_prompts" with the following properties:
#    - name: A string identifier for the dataset ("my_llama_quirky_prompts")
#    - rows: A list of dictionaries, where each dictionary represents a prompt

# 3. Each row in the dataset contains two key-value pairs:
#    - "question": The quirky prompt or question to be answered
#    - "rubric": The criteria or expected answer for evaluating the model's response

# 4. The dataset contains 8 different prompts, each designed to test various aspects of language model capabilities:
#    - Sentence generation with specific endings
#    - Logical reasoning and arithmetic
#    - Understanding context and implicit information
#    - Temporal logic
#    - Numerical comparison
#    - Problem-solving with irrelevant information
#    - Inference from given information
#    - Complex mathematical problem (intentionally difficult)

# 5. This dataset will be used later in the evaluation process to test different language models
#    and assess their performance on these quirky and challenging prompts.

# Explanation of rubrics:
# Rubrics are evaluation criteria or guidelines used to assess the quality or correctness of a response.
# In this context, each question in our dataset is accompanied by a rubric that specifies what constitutes
# a correct or acceptable answer. Rubrics serve several important purposes:

# 1. Consistency: They ensure that all responses are evaluated using the same criteria.
# 2. Objectivity: They help reduce subjectivity in the evaluation process.
# 3. Clarity: They provide clear expectations for what a good answer should include.
# 4. Feedback: They can be used to provide specific feedback on where a response falls short.

# We use rubrics in this dataset to:
# - Guide the evaluation of model responses
# - Determine if a model has correctly understood and answered the quirky prompts
# - Measure the model's ability to handle tricky or unconventional questions
# - Assess the model's reasoning capabilities and attention to detail

# The rubrics in this dataset are relatively simple, often just stating the correct answer or key points
# that should be included. In more complex evaluation scenarios, rubrics could be more detailed,
# including scoring criteria or multiple levels of correctness.

# Additional examples that could be added to the dataset:

# {
#     "question": "If you're running a race and you pass the person in second place, what place are you in now?",
#     "rubric": "Answer should be second place"
# },
# {
#     "question": "A rooster lays exactly one egg every day. How many eggs will it have laid in one week?",
#     "rubric": "Answer should explain that roosters don't lay eggs"
# },
# {
#     "question": "What's the next number in this sequence: 1, 11, 21, 1211, 111221, ...",
#     "rubric": "Answer should be 312211 (look-and-say sequence)"
# }

# These additional examples would further test the model's ability to handle trick questions,
# common misconceptions, and pattern recognition. The rubrics for these examples follow the same
# principle of providing clear criteria for what constitutes a correct answer, allowing for
# consistent evaluation across different models or evaluators.


In [7]:
# Select the models to evaluate in this run.
# Only three of the instances are listed; the commented-out line below is an alternative selection.
# models = [octoai_llama, together_llama]
models = [deepinfra_llama, fireworks_llama, groq_llama]

# Define our scoring functions
# The '@' symbol in Python is used for decorators. Decorators are a way to modify or enhance functions
# without changing their source code directly. In this case, @weave.op() is a decorator that likely
# registers this function with the Weave framework for use in evaluations.
@weave.op()
def has_response(rubric: str, model_output: str | None) -> dict:
    """Score a prediction by whether any output was produced at all.

    Args:
        rubric: The row's rubric text; accepted but not used by this scorer.
        model_output: The value handed to the scorer as the model's output
            (the ``predict`` methods in this notebook are annotated to return
            ``str``).

    Returns:
        ``{'has_response': bool}`` — True when ``model_output`` is not None.
    """
    # Only None counts as "no response"; an empty string still counts as a response.
    return {'has_response': model_output is not None}

# Define the preprocess_model_input function
def preprocess_model_input(row):
    """Build the model input dictionary for one dataset row.

    Only the row's 'question' value is forwarded, under the key 'prompt';
    every other field of the row is left out.
    """
    question_text = row['question']
    model_input = dict(prompt=question_text)
    return model_input

# Define the evaluation: the quirky-prompt dataset, one trial,
# scored only by has_response, with each row mapped through preprocess_model_input.
evaluation = weave.Evaluation(
    name='quirky_prompts_eval',
    dataset=quirky_prompts,
    scorers=[has_response],
    trials=1,
    preprocess_model_input=preprocess_model_input,
)

# Run evaluation for each model, one after another, keyed by model name
results = {}
for model in models:
    print(f"Evaluating {model.name}...")
    # evaluation.evaluate(...) is awaited, so it must return an awaitable; the bare
    # top-level `await` relies on the notebook running cells inside an event loop
    # (it is a SyntaxError in a plain Python script).
    result = await evaluation.evaluate(model)
    # Store the result for each model in the results dictionary
    results[model.name] = result

"""
Here's another example of how the '@' decorator works in Python:

@timer
def slow_function():
    time.sleep(2)
    print("Function complete")

In this example, '@timer' is a decorator that might be defined like this:

import time

def timer(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print(f"Function {func.__name__} took {end - start} seconds to run")
        return result
    return wrapper

When 'slow_function' is called, it will actually run the 'wrapper' function defined in 'timer'.
This wrapper will time how long the original function takes to run, print that information,
and then return the result of the original function. This allows us to add timing functionality
to any function simply by adding the '@timer' decorator, without changing the function itself.
"""


Evaluating DeepInfra_LLaMa3.1_70B...


🍩 https://wandb.ai/tuminha/compare-llamas/r/call/01924c11-29e7-7ed3-a9da-a557a563f438
Evaluating Fireworks_LLaMa3.1_70B...


🍩 https://wandb.ai/tuminha/compare-llamas/r/call/01924c11-a827-7df3-9e7f-7374001ad308
Evaluating Groq_LLaMa3.1_70B...


🍩 https://wandb.ai/tuminha/compare-llamas/r/call/01924c11-d517-7e21-8b56-048d1256c17a
