In [1]:
# Install and read in required packages, plus create an anthropic client.
print('⏳ Installing packages')
%pip install -q weave  tqdm ipywidgets requests groq together
print('✅ Packages installed')

⏳ Installing packages
Note: you may need to restart the kernel to use updated packages.
✅ Packages installed


In [8]:
from tqdm.notebook import tqdm_notebook as tqdm
from set_env import set_env
from openai import OpenAI
from groq import Groq
from together import Together
import weave
import os
import json
import requests
import ollama
# Load the API keys through set_env before they are read back from os.environ below.
# TOGETHER_API_KEY is loaded the same way as the other keys: the Together client
# below reads it from the environment, so it must not depend on being set by accident.
set_env("GROQ_API_KEY")
set_env("TOGETHER_API_KEY")
set_env("WANDB_API_KEY")  # presumably consumed by weave/W&B; it is not read explicitly in this cell

# Module-level Groq client, referenced by name from GroqModel.predict.
groqclient = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
# Module-level Together client. TogetherModel.make_together_request refers to it
# by the name `client`, so the name is kept as is.
client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))

In [9]:
# Initialize Weave with the project name 'compare-llamas'.
# This runs before the models and the evaluation below are created and used.
weave.init('compare-llamas')


class GroqModel(weave.Model):
    """Weave model that answers a prompt with Groq's 'llama-3.1-70b-versatile'."""

    @weave.op()
    def predict(self, prompt: str) -> str:
        """Send `prompt` as a single user message and return the first choice's text.

        The request goes through the module-level `groqclient` with
        temperature 0.0 and a fixed seed.
        """
        user_message = {"role": "user", "content": prompt}
        completion = groqclient.chat.completions.create(
            messages=[user_message],
            model='llama-3.1-70b-versatile',
            seed=123123,
            temperature=0.0,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
class OllamaModel(weave.Model):
    """Weave model that answers a prompt with the "llama3.1" model via the Ollama client."""

    @weave.op()
    def predict(self, prompt: str) -> str:
        """Send `prompt` as a single user message and return the reply text.

        Args:
            prompt: The user message to send to the model.

        Returns:
            The value of response['message']['content'] from the chat call.
        """
        # Named `ollama_client` so it does not shadow the module-level Together `client`.
        ollama_client = ollama.Client()

        response = ollama_client.chat(
            model="llama3.1",
            messages=[
                {
                    "role": "user",
                    "content": prompt
                },
            ],
            # Pin sampling like the other providers in this notebook
            # (Groq: temperature=0.0, seed=123123; Together: temperature=0),
            # so the comparison is not skewed by Ollama's default sampling settings.
            options={"temperature": 0.0, "seed": 123123},
        )
        return response['message']['content']

    
class TogetherModel(weave.Model):
    """Weave model that answers a prompt with Together's Meta-Llama-3.1-70B-Instruct-Turbo."""

    @weave.op()
    def predict(self, prompt: str) -> str:
        """Return the text of the first choice of the Together completion for `prompt`."""
        completion = self.make_together_request(prompt)
        first_choice = completion.choices[0]
        return first_choice.message.content

    @weave.op()
    def make_together_request(self, prompt):
        """Send `prompt` as a single user message via the module-level `client`.

        Returns the response object of the chat completion call unchanged.
        """
        user_message = {"role": "user", "content": prompt}
        return client.chat.completions.create(
            messages=[user_message],
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            temperature=0,
        )

# Instantiate one Weave model per backend compared here: Groq, Together and Ollama
groq_llama = GroqModel(name='Groq_LLaMa3.1_70B')
together_llama = TogetherModel(name='Together_LLaMa3.1_70B')
ollama_llama = OllamaModel(name='Ollama_llama3.1')

print("✅ Weave models created")

✅ Weave models created


In [10]:
# Let's build a dataset of quirky prompts and potentially their answers 
from weave import Dataset

# Each row has a "question" (sent to the model as the prompt) and a "rubric"
# (a plain-English description of what a correct answer looks like).
quirky_prompts = Dataset(
    name="my_llama_quirky_prompts",
    rows=[
        {
            "question": "Give me 10 sentences that end in the word \"apple\"",
            "rubric": "all sentences must end with the word apple"
        },
        {
            "question": "Answer with the number of legs about the following statement: The fox lost a leg, but then magically grew back the leg he lost and a mysterious extra leg on top of that",
            "rubric": "Answer must be 5 or five"
        },
        {
            "question": "Yam (a boy) has 4 sisters. Each sister has 3 brothers. How many brothers does Yam have? Let's think step by step.",
            "rubric": "Answer must indicate that Yam has 2 brothers"
        },
        {
            "question": "You have five apples today, you ate two apples yesterday so how many apples do you have today? Provide a logical answer.",
            "rubric": "Answer must be five and explain that yesterdays action have no bearing on todays apple quantity"
        },
        {
            "question": "Which number is bigger: 9.11 or 9.9?",
            "rubric": "Answer should conclude that 9.9 is bigger"
        },
        {
            "question": "If I hang 5 shirts outside and it takes them 5 hours to dry, how long would it take to dry 30 shirts",
            "rubric": "Answer must state that it would take the same amount of time"
        },
        {
            "question": "There are three sisters in a room. Anna is reading a book. Alice is playing a match of chess against someone in the room. What is the third sister, Amanda, doing?",
            "rubric": "Playing chess with Alice"
        },
        {
            # The equations previously contained OCR artifacts ("10g9", "10g3", "log 16").
            # They are restored to logarithms here; "log9 x" means log base 9 of x.
            # With a = ln x, b = ln y, c = ln z the system is linear:
            #   x*y*z^2 = 81,  x*y^2*z = 16,  x^2*y*z = 1  =>  x*y*z = 6.
            "question": """Determine all triples (x, y, z) of real numbers that are solutions to the following
system of equations:
log9 x + log9 y + log3 z = 2
log16 x + log4 y + log16 z = 1
log5 x + log25 y + log25 z = 0
""",
            "rubric": "Answer must be the single triple x = 1/6, y = 8/3, z = 27/2"
        }
    ]
)


In [11]:
# Models to evaluate, in run order: Ollama, then Together, then Groq
models = [
    ollama_llama,
    together_llama,
    groq_llama,
]


# Scoring functions
@weave.op()
def has_response(rubric: str, model_output: str) -> dict:
    """Score whether the model produced a usable (non-blank) response.

    Args:
        rubric: The row's rubric. It is accepted as a parameter but not used by this scorer.
        model_output: What the model's `predict` returned. The models in this
            notebook return the completion text, which may be None.

    Returns:
        {'has_response': bool} — False for None and for empty or whitespace-only
        text, True otherwise.
    """
    if isinstance(model_output, str):
        # A blank completion is not a usable answer, so it must not count as a response.
        return {'has_response': bool(model_output.strip())}
    # Non-string outputs keep the original rule: anything other than None counts.
    return {'has_response': model_output is not None}

# Adapter from dataset rows to model inputs
def preprocess_model_input(row):
    """Build the model input for one dataset row.

    Takes the row's 'question' value and returns it under the key 'prompt';
    every other key of the row is ignored. Raises KeyError if the row has no
    'question' key.
    """
    question = row['question']
    return dict(prompt=question)


# Evaluation over `quirky_prompts`: rows are converted to model input by
# `preprocess_model_input`, run with trials=1, and scored by `has_response`.
evaluation = weave.Evaluation(
    dataset=quirky_prompts,
    preprocess_model_input=preprocess_model_input,
    scorers=[has_response],
    trials=1,
    name='quirky_prompts_eval',
)

# Run the evaluation once per model, one after another, in the order of `models`.
# `results` maps each model's `name` to the value returned by `evaluation.evaluate`.
results = {}
for model in models:
    print(f"Evaluating {model.name}...")
    # Top-level `await`: relies on the notebook allowing `await` directly in a cell.
    result = await evaluation.evaluate(model)
    results[model.name] = result


Evaluating Ollama_llama3.1...


🍩 https://wandb.ai/omarkhaledcvexpert-illuminate-ai-/compare-llamas/r/call/01926749-c028-7af2-af09-96fe2468dcb3
Evaluating Together_LLaMa3.1_70B...


🍩 https://wandb.ai/omarkhaledcvexpert-illuminate-ai-/compare-llamas/r/call/0192674e-4f81-79f0-b71c-23ab553faecb
Evaluating Groq_LLaMa3.1_70B...


🍩 https://wandb.ai/omarkhaledcvexpert-illuminate-ai-/compare-llamas/r/call/0192674f-1114-7590-965e-3d427ae7e2bf
