In [1]:
! pip install pydantic>=2.8.0

In [2]:
import pandas as pd
import numpy as np

from metrics.eval_templates import EvalMetricTemplates
from metrics.eval_metrics import EvalMetric
from llms.llm_client import LLMClient
from evaluation.eval_engine import Evaluator

In [3]:
# Judge Model Options
# API keys are read from environment variables so that no secret is stored in
# the notebook file or its history. Export the variable for the provider you
# intend to use before starting Jupyter; a variable that is not set yields "".
# NOTE: any key that was ever saved in this notebook should be treated as
# leaked and rotated with the provider.
import os

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

In [4]:
# System prompt placeholder: the literal currently contains only blank lines.
# NOTE(review): none of the cells visible in this section reference it —
# presumably it is meant to be filled in before use; confirm where it is consumed.
initial_system_prompt = """

"""

In [5]:
# Relative path to a CSV dataset.
# NOTE(review): the setup test further down builds its DataFrame inline and does
# not read this path — confirm which cell (if any) loads it.
dataset_path = "datasets/test.csv"

In [6]:
# Judge model identifier; the provider tag it contains decides which key is used.
judge_model = "google/gemini-2.0-flash-lite"

# Provider tag -> API key, checked in the same order as the original if/elif
# chain so a model id matching several tags resolves exactly as before.
provider_keys = (
    ("openai", OPENAI_API_KEY),
    ("anthropic", ANTHROPIC_API_KEY),
    ("google", GEMINI_API_KEY),
)

# Pick the first provider whose tag occurs in the model id. Without a fallback,
# an unknown provider used to leave API_KEY undefined (or silently stale from an
# earlier run of this cell); fail loudly instead.
API_KEY = next((key for tag, key in provider_keys if tag in judge_model), None)
if API_KEY is None:
    supported = ", ".join(tag for tag, _ in provider_keys)
    raise ValueError(
        f"Unsupported judge model {judge_model!r}: expected one of the providers {supported}"
    )

In [7]:
# Look up the COHERENCE entry of the pointwise metric templates and print it
# so the full judge prompt (instruction, criteria, rubric) can be inspected.
coherence = EvalMetricTemplates.PointwiseMetric.COHERENCE
print(coherence)


        # Instruction
        You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
        We will provide you with the user input and an AI-generated responses.
        You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
        You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric.

        # Evaluation
        ## Metric Definition
        You will be assessing coherence, which measures the ability to provide a coherent response based on the user prompt.

        ## Criteria
        Coherence: A clear and coherent presentation of ideas. The writing should demonstrate
        a logical flow, where ideas progress smoothly with clear transitions, and maintain
        relevance t

In [8]:
# Smoke-test the entire setup on a tiny in-memory dataset:
# one (prompt, response) pair per row.
dataset = pd.DataFrame(
    data=[
        ("What is AI?", "AI is the simulation of human intelligence."),
        ("Explain quantum computing.", "Quantum computing uses quantum bits."),
    ],
    columns=["prompt", "response"],
)
print("Dataset Created!")

# One EvalMetric per rubric, each pairing a metric name with the matching
# pointwise prompt template.
multi_turn_chat_quality = EvalMetric(
    metric_name="multi_turn_chat_quality",
    metric_prompt_template=EvalMetricTemplates.PointwiseMetric.MULTI_TURN_CHAT_QUALITY,
)
fluency = EvalMetric(
    metric_name="fluency",
    metric_prompt_template=EvalMetricTemplates.PointwiseMetric.FLUENCY,
)

# Metrics are passed to the evaluator in this order.
eval_metrics = [multi_turn_chat_quality, fluency]
print("Evaluation Metrics Created!")

# Rate-limiting settings for the judge client, grouped so they can be tuned
# in one place.
rate_limit_options = {
    "requests_per_minute": 10,  # custom RPM limit
    "requests_per_second": 1,   # custom RPS limit
    "enable_rate_limiting": True,
}

# Build the LLM client for the selected judge model with rate limiting enabled.
llm_client = LLMClient(
    judge_model=judge_model,
    api_key=API_KEY,
    **rate_limit_options,
)
print("LLM Client Created! Judge Model:", llm_client.model)
print("Rate Limiter Stats:", llm_client.get_rate_limit_stats())

# Wire the judge client and the metric list into an evaluator.
evaluator = Evaluator(llm_client=llm_client, eval_metrics=eval_metrics)
print("Evaluator Created!")

# Run the evaluation over the sample dataset. evaluate() returns two values,
# unpacked here as the per-row table and a summary; in the captured output the
# table carries the input columns plus <metric>_rating and <metric>_explanation
# columns for each metric.
eval_table, summary = evaluator.evaluate(dataset=dataset)
print("Evaluation Completed!")
print(eval_table)
print(summary)

# Show final rate limiter stats
print("\nFinal Rate Limiter Stats:", llm_client.get_rate_limit_stats())

Dataset Created!
Evaluation Metrics Created!
Client initialized: Google Gemini
LLM Client Created! Judge Model: google/gemini-2.0-flash-lite
Evaluator intialized with metrics: ['multi_turn_chat_quality', 'fluency']
Evaluator Created!
Number of API calls estimated:  4
Evaluation Completed!
                       prompt                                     response  \
0                 What is AI?  AI is the simulation of human intelligence.   
1  Explain quantum computing.         Quantum computing uses quantum bits.   

   multi_turn_chat_quality_rating  \
0                               4   
1                               1   

                 multi_turn_chat_quality_explanation  fluency_rating  \
0  The response is a concise and accurate definit...               5   
1  The response is not collaborative, provides mi...               1   

                                 fluency_explanation  
0  STEP 1: The response has no grammatical errors...  
1  STEP 1: The response consists of 