In [None]:
# Expose only the GPU with index 1 to CUDA-based libraries in this kernel.
# (Comments stay on their own lines: text after the value would become part of it.)
%env CUDA_VISIBLE_DEVICES=1
# Turn off tokenizer-level parallelism — presumably to silence the Hugging Face
# tokenizers fork warning; confirm.
%env TOKENIZERS_PARALLELISM=false

In [None]:
# Directory that relative project paths are built from (e.g. the variables.env file).
BASE_PATH = ".."
# Model identifier — presumably loaded via unsloth's FastLanguageModel elsewhere
# in the notebook; verify against the loading cell.
MODEL_ID = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
# Upper bound on the number of tokens generated per response.
MAX_NEW_TOKENS = 8192
# Token budget left for the input once MAX_NEW_TOKENS is reserved out of 32768.
# NOTE(review): 32768 looks like the total context window being targeted — confirm.
MAX_SEQ_LENGTH = 32768 - MAX_NEW_TOKENS

In [None]:
import os
import json

import torch  # type: ignore
import numpy as np  # type: ignore

from datasets import DatasetDict, Dataset  # type: ignore

from unsloth import FastLanguageModel  # type: ignore

from tqdm.auto import tqdm  # type: ignore

from datasets import Dataset, DatasetDict  # type: ignore

from groq import Groq  # type: ignore
from dotenv import load_dotenv # type: ignore

# Compare with Groq

In [None]:
# Read the project's variables.env file so its entries become environment variables
load_dotenv(f"{BASE_PATH}/variables.env")

# Fetch the Groq credential; this is None when the variable is not set
groq_api_key = os.environ.get("GROQ_API_KEY")

# Report whether a non-empty key is available
print(
    "GROQ API key loaded successfully."
    if groq_api_key
    else "Failed to load GROQ API key."
)

In [None]:
# Shared Groq API client, authenticated with the key read from the environment;
# compare_predictions uses it to send chat-completion requests.
client = Groq(api_key=groq_api_key)

In [None]:
# Three comparison modes are planned for judging predictions:
# 1 - tell the model which response is the ground truth
# 2 - don't tell it which response is the ground truth, just ask it to compare
# 3 - same as 2, but with the predicted and ground-truth responses swapped
# Only mode 1 is implemented by compare_predictions below; 2 and 3 are still TODO.
def compare_predictions(
    predictions,
    actual_answers,
    model="mixtral-8x7b-32768",
    temperature=0.5,
    max_tokens=150,
):
    """Ask a Groq-hosted chat model to grade each prediction against its answer.

    One chat-completion request is sent per (prediction, answer) pair through
    the module-level ``client``. The prompt labels which text is the prediction
    and which is the actual answer, and asks for a 1-5 rating plus a brief
    explanation.

    Args:
        predictions: Iterable of model outputs to be graded.
        actual_answers: Iterable of reference answers, aligned with
            ``predictions`` by position.
        model: Identifier of the judge model passed to the API.
            NOTE(review): hosted models get retired over time — confirm the
            default is still served before running.
        temperature: Sampling temperature passed to the API. Values above 0
            make the ratings non-deterministic between runs.
        max_tokens: Cap on the length of each judge response; long
            explanations are cut off at this limit.

    Returns:
        A list with the raw text of the judge's reply for every pair, in input
        order. The replies are not parsed.

    Raises:
        ValueError: If ``predictions`` and ``actual_answers`` differ in length.
    """
    # Materialize both inputs so they can be length-checked before any API
    # call is made; zip() alone would silently drop the unmatched tail.
    predictions = list(predictions)
    actual_answers = list(actual_answers)
    if len(predictions) != len(actual_answers):
        raise ValueError(
            "predictions and actual_answers must have the same length "
            f"(got {len(predictions)} and {len(actual_answers)})"
        )

    evaluations = []
    for pred, actual in zip(predictions, actual_answers):
        # Built from explicit pieces so the trailing space after
        # "explanation." (part of the original prompt) cannot be stripped
        # by an editor; the resulting text is unchanged.
        prompt = (
            "Compare the following prediction with the actual answer:\n"
            "\n"
            f"Prediction: {pred}\n"
            f"Actual Answer: {actual}\n"
            "\n"
            "Evaluate the prediction's accuracy and provide a brief explanation. \n"
            "Rate the prediction on a scale of 1-5, where 1 is completely incorrect and 5 is perfectly accurate.\n"
            "\n"
            "Response format:\n"
            "Rating: [1-5]\n"
            "Explanation: [Your explanation here]\n"
        )

        response = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
        )

        evaluations.append(response.choices[0].message.content)

    return evaluations

In [None]:
# Have the judge model grade every prediction against its reference answer
comparison_results = compare_predictions(
    results["predictions"],
    results["answers"],
)

# Show each evaluation under a numbered heading, followed by a divider line
for i, result in enumerate(comparison_results):
    print(f"Comparison {i + 1}:", result, "-" * 50, sep="\n")