In [None]:
import os
import pandas as pd

from evidently import Dataset
from evidently import DataDefinition
from evidently.descriptors import *

from evidently import Report
from evidently.presets import TextEvals
from evidently.metrics import *
from evidently.tests import *
from rich.console import Console
from rich.console import Console
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

In [None]:

# Shared rich console used for the status messages printed by this notebook.
console = Console()

# Load variables from the nearest .env file; override=True lets values from
# .env replace variables that already exist in the process environment.
load_dotenv(find_dotenv(), override=True)

# Read the keys with .get() instead of indexing: a missing variable then
# yields None rather than raising KeyError in this cell, so the key status
# check later in the notebook can report it, and the key of the provider that
# is not selected does not have to be defined at all.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
MODEL = "gpt-4o-mini"

console.print("[cyan]Starting...[/]")


def get_llm_client(llm_choice):
    """Return an OpenAI-compatible client for the selected provider.

    The API key is read from the process environment, so the environment
    (including any .env file) must be loaded before this is called.

    Args:
        llm_choice: Either "GROQ" or "OPENAI" (case-sensitive).

    Returns:
        An ``OpenAI`` client. For "GROQ" it is pointed at Groq's
        OpenAI-compatible endpoint; for "OPENAI" no base URL is passed.

    Raises:
        ValueError: If ``llm_choice`` is not a supported provider, or if the
            API key variable for that provider is unset or empty.
    """
    if llm_choice == "GROQ":
        key_var = "GROQ_API_KEY"
        client_kwargs = {"base_url": "https://api.groq.com/openai/v1"}
    elif llm_choice == "OPENAI":
        key_var = "OPENAI_API_KEY"
        client_kwargs = {}
    else:
        raise ValueError("Invalid LLM choice. Please choose 'GROQ' or 'OPENAI'.")

    api_key = os.environ.get(key_var)
    if not api_key:
        # Fail early with a clear message. Passing api_key=None would make the
        # OpenAI client fall back to OPENAI_API_KEY, which for "GROQ" means
        # sending the wrong credential to the Groq endpoint.
        raise ValueError(
            f"{key_var} is not set; cannot create the {llm_choice} client."
        )

    return OpenAI(api_key=api_key, **client_kwargs)


# Provider used to build `client`; switch by swapping the two lines below.
LLM_CHOICE = "OPENAI"
# LLM_CHOICE = "GROQ"

# Number of leading key characters echoed in the status lines. Kept short
# because cell output is saved with the notebook, and a longer prefix would
# store part of the secret wherever the notebook is shared or committed.
KEY_PREVIEW_CHARS = 4

# Report, for each provider key, whether it is available in this session.
for key_name, key_value in (
    ("OPENAI_API_KEY", OPENAI_API_KEY),
    ("GROQ_API_KEY", GROQ_API_KEY),
):
    if key_value:
        console.print(
            f"[green]✅ {key_name} exists and begins {key_value[:KEY_PREVIEW_CHARS]}...[/]"
        )
    else:
        console.print(f"[red bold]❌ {key_name} not set[/]")


# get_llm_client raises ValueError for an unsupported LLM_CHOICE, so the
# model selection below only ever sees "GROQ" or "OPENAI".
client = get_llm_client(LLM_CHOICE)
MODEL = "llama-3.3-70b-versatile" if LLM_CHOICE == "GROQ" else "gpt-4o-mini"

# NOTE(review): no later cell in this notebook references `client` or `MODEL`;
# the Evidently descriptors are built without them, so the provider choice may
# not affect the evaluations below — confirm.
console.print(f"[green]✅ LLM_CHOICE: {LLM_CHOICE} - MODEL: {MODEL}[/]")

# Retrieval - Single context

In [None]:
# Column order shared by every row below: the question, the reference text,
# and the answer under evaluation.
columns = ["Question", "Ground_Truth", "Answer"]

langgraph_row = [
    "What is Langgraph?",
    "LangGraph is a library for building stateful, multi-actor applications with Large Language Models (LLMs), built on top of LangChain. It's designed to create complex, graph-based workflows where different components can interact and maintain state across multiple steps.",
    "LangGraph is a framework for building LLM apps. It has a foundation of Langchain",
]

mcp_row = [
    "What is MCP?",
    "MCP stands for Model Context Protocol - it's an open standard developed by Anthropic for connecting AI assistants to data sources and tools in a secure, standardized way.",
    "MCP acts as a bridge between AI models (like Claude) and external systems, allowing the AI to access and use data from various sources",
]

# One inner list per example, in `columns` order.
synthetic_data = [langgraph_row, mcp_row]

synthetic_df = pd.DataFrame(data=synthetic_data, columns=columns)

In [None]:
# Show full cell text: the long reference answers would otherwise be truncated.
pd.options.display.max_colwidth = None

In [None]:
# Preview the evaluation examples (head shows at most the first five rows).
synthetic_df.head(n=5)

## ContextQuality

In [None]:
# Build the evaluation dataset step by step: input frame, column roles, then
# the descriptor that scores "Ground_Truth" as context for "Question".
quality_frame = pd.DataFrame(synthetic_df)
quality_definition = DataDefinition(
    text_columns=["Question", "Ground_Truth", "Answer"],
)
quality_descriptors = [
    ContextQualityLLMEval("Ground_Truth", question="Question"),
]

context_based_evals = Dataset.from_pandas(
    quality_frame,
    data_definition=quality_definition,
    descriptors=quality_descriptors,
)
context_based_evals.as_dataframe()

## ContextRelevance

In [None]:
# Relevance of "Ground_Truth" to "Question", scored with method="llm" and
# aggregated with "hit"; the resulting column is exposed under the alias "Hit".
relevance_frame = pd.DataFrame(synthetic_df)
relevance_definition = DataDefinition(
    text_columns=["Question", "Ground_Truth", "Answer"],
)
relevance_descriptor = ContextRelevance(
    "Question",
    "Ground_Truth",
    output_scores=True,
    aggregation_method="hit",
    method="llm",
    alias="Hit",
)

context_based_evals = Dataset.from_pandas(
    relevance_frame,
    data_definition=relevance_definition,
    descriptors=[relevance_descriptor],
)
context_based_evals.as_dataframe()

## ContextRelevance, Hit

In [None]:
# Settings for the "hit" variant of the relevance descriptor, kept together
# so they can be compared with the "mean" variant at a glance.
hit_settings = dict(
    output_scores=True,
    aggregation_method="hit",
    method="llm",
    alias="Hit",
)

hit_frame = pd.DataFrame(synthetic_df)
hit_definition = DataDefinition(
    text_columns=["Question", "Ground_Truth", "Answer"],
)
hit_descriptors = [ContextRelevance("Question", "Ground_Truth", **hit_settings)]

context_based_evals = Dataset.from_pandas(
    hit_frame,
    data_definition=hit_definition,
    descriptors=hit_descriptors,
)
context_based_evals.as_dataframe()

## ContextRelevance, Mean

In [None]:
# Same relevance descriptor as the "hit" variant, but aggregated with "mean"
# and reported under the alias "Relevance".
mean_settings = dict(
    output_scores=True,
    aggregation_method="mean",
    method="llm",
    alias="Relevance",
)

mean_frame = pd.DataFrame(synthetic_df)
mean_definition = DataDefinition(
    text_columns=["Question", "Ground_Truth", "Answer"],
)
mean_descriptors = [ContextRelevance("Question", "Ground_Truth", **mean_settings)]

context_based_evals = Dataset.from_pandas(
    mean_frame,
    data_definition=mean_definition,
    descriptors=mean_descriptors,
)
context_based_evals.as_dataframe()

# Generation - ground truth

In [None]:
# Compare each "Answer" with its "Ground_Truth" using three descriptors:
# an LLM correctness check, BERTScore, and semantic similarity.
generation_frame = pd.DataFrame(synthetic_df)
generation_definition = DataDefinition(
    text_columns=["Question", "Answer", "Ground_Truth"],
)

compared_columns = ["Answer", "Ground_Truth"]
correctness = CorrectnessLLMEval("Answer", target_output="Ground_Truth")
bert_score = BERTScore(columns=list(compared_columns), alias="BERTScore")
similarity = SemanticSimilarity(
    columns=list(compared_columns), alias="Semantic Similarity"
)

context_based_evals = Dataset.from_pandas(
    generation_frame,
    data_definition=generation_definition,
    descriptors=[correctness, bert_score, similarity],
)
context_based_evals.as_dataframe()

# Generation - open-ended

In [None]:
# Check each "Answer" against "Ground_Truth", which is passed as its context.
faithfulness_frame = pd.DataFrame(synthetic_df)
faithfulness_definition = DataDefinition(
    text_columns=["Question", "Ground_Truth", "Answer"],
)
faithfulness_descriptor = FaithfulnessLLMEval("Answer", context="Ground_Truth")

context_based_evals = Dataset.from_pandas(
    faithfulness_frame,
    data_definition=faithfulness_definition,
    descriptors=[faithfulness_descriptor],
)
context_based_evals.as_dataframe()

# Report

Combine the ContextQuality and Faithfulness descriptors in a single dataset:

In [None]:
# Dataset consumed by the report cells below: it carries both the
# faithfulness and the context-quality descriptors.
combined_frame = pd.DataFrame(synthetic_df)
combined_definition = DataDefinition(
    text_columns=["Question", "Ground_Truth", "Answer"],
)
combined_descriptors = [
    FaithfulnessLLMEval("Answer", context="Ground_Truth"),
    ContextQualityLLMEval("Ground_Truth", question="Question"),
]

context_based_evals = Dataset.from_pandas(
    combined_frame,
    data_definition=combined_definition,
    descriptors=combined_descriptors,
)

In [None]:
# context_based_evals.as_dataframe()

In [None]:
# Summarise the descriptor columns of the combined dataset with TextEvals.
summary_metrics = [TextEvals()]
report = Report(summary_metrics)

# The second argument is None: the report runs on this dataset alone.
my_eval = report.run(context_based_evals, None)
my_eval

# Add Tests

In [None]:
# Same summary as before, plus two checks: each CategoryCount carries an
# eq(0) test on the count of an unwanted label.
summary_metric = TextEvals()
unfaithful_count = CategoryCount(
    column="Faithfulness", category="UNFAITHFUL", tests=[eq(0)]
)
invalid_context_count = CategoryCount(
    column="ContextQuality", category="INVALID", tests=[eq(0)]
)

report = Report([summary_metric, unfaithful_count, invalid_context_count])

my_eval = report.run(context_based_evals, None)
my_eval