# Eval for Codex

In [112]:
!pip install -r requirements.txt



In [113]:
import requests
import json
import pandas as pd
from dotenv import load_dotenv
import os
import ipywidgets as widgets
from IPython.display import display
from ipydatagrid import DataGrid
from itables import show
import importlib


In [139]:
# from evaluation_prompts.eval_prompt import comparision_eval_prompt
import evaluation_prompts.eval_prompt as eval_prompt
print(eval_prompt.comparison_eval_prompt)

You are evaluating an LLM response against a prompt and expected answer. You have two evaluation tasks:

**Task 1: Prompt Adherence**
Evaluate if the response appropriately addresses the given prompt, including cases where the expected response indicates a failure (e.g., "channel not found").
- Score 5: Fully addresses all aspects of the prompt, including correct identification of failures if applicable
- Score 4: Addresses most aspects with minor gaps (may miss minor failure details)
- Score 3: Addresses some aspects but misses key elements or misrepresents failure cases
- Score 2: Minimally addresses the prompt or incorrectly describes failures
- Score 1: Fails to address the prompt meaningfully

**Task 2: Content Accuracy** 
Evaluate if the response conveys the same semantic meaning and core content as the expected response, including failure scenarios.
- Score 5: Conveys the same semantic meaning and captures all core concepts from the expected response, even if phrased differently

In [140]:
import sys
sys.path.append('/path/to/eval/evaluation_prompts')

In [116]:
importlib.reload(eval_prompt)
print(eval_prompt.comparision_eval_prompt)
comparision_eval_prompt = eval_prompt.comparision_eval_prompt

In [117]:
load_dotenv()
BASE_URL = "http://localhost:3100/api"

In [118]:
API_KEY = os.environ.get("OPENWEBUI_API_KEY", "")
headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}


## Getting all Models
Testing if we can fetch models from the API

In [119]:
try:
    response = requests.get(f"{BASE_URL}/models", headers=headers)
    response.raise_for_status()
    json_data = response.json()
    
    # Convert to DataFrame 
    if json_data and 'data' in json_data:
        models_df = pd.DataFrame(json_data['data'])
        print(models_df)
except requests.exceptions.RequestException as e:
    print(f"Error fetching models: {e}")

                                               id     created object owned_by  \
0  apac.anthropic.claude-3-5-sonnet-20240620-v1:0  1751288090  model   openai   
1  apac.anthropic.claude-3-5-sonnet-20241022-v2:0  1751288090  model   openai   
2  apac.anthropic.claude-3-7-sonnet-20250219-v1:0  1751288090  model   openai   
3    apac.anthropic.claude-sonnet-4-20250514-v1:0  1751288090  model   openai   

  connection_type                                            name  \
0        external  apac.anthropic.claude-3-5-sonnet-20240620-v1:0   
1        external  apac.anthropic.claude-3-5-sonnet-20241022-v2:0   
2        external  apac.anthropic.claude-3-7-sonnet-20250219-v1:0   
3        external    apac.anthropic.claude-sonnet-4-20250514-v1:0   

                                              openai  urlIdx actions filters  \
0  {'id': 'apac.anthropic.claude-3-5-sonnet-20240...       0      []      []   
1  {'id': 'apac.anthropic.claude-3-5-sonnet-20241...       0      []      []   
2  {'id'

Choose a model going ahead to the eval.

In [120]:
model_options = models_df['id'].tolist()

model_dropdown = widgets.Dropdown(
    options=model_options,
    value="apac.anthropic.claude-3-5-sonnet-20241022-v2:0",  # default selection
    description='Model:',
    disabled=False,
)

display(model_dropdown)

Dropdown(description='Model:', index=1, options=('apac.anthropic.claude-3-5-sonnet-20240620-v1:0', 'apac.anthr…

In [121]:
model = model_dropdown.value
print("You selected:", model)

You selected: apac.anthropic.claude-3-5-sonnet-20241022-v2:0


## Getting all queries and Passing to LLM
Getting all prompts from data.csv file

In [122]:
try: 
    prompts_df = pd.read_csv('data/data.csv')
    prompt_expresponse_tuple = prompts_df.values.tolist()
except FileNotFoundError:
    print("Error: data/data.csv not found.")
    prompt_expresponse_tuple = []

# print(prompt_expresponse_tuple)
for prompt, expected_response in prompt_expresponse_tuple:
    print(prompt, expected_response)

List all channels in the workspace Here are all the channels in the Slack workspace:

Main Company Channels:
#general (201 members) - Main company-wide channel for announcements and team conversations
#random (199 members) - Casual conversation and team jokes
#help-it (196 members) - IT related queries
#help-hr (191 members) - HR related queries
#help-legal-risk-compliance (97 members) - Legal/risk/compliance queries
Engineering & Tech Channels:
#all-devs (132 members) - All engineers company-wide
#plz-review (109 members) - Code review requests
#eng-backend (102 members) - Backend engineering discussions
#fe-gang (89 members) - Frontend engineering team
#guild-backend (33 members) - Backend guild
#guild-devx (132 members) - Developer experience guild
Product & Operations:
#product (76 members) - Product management discussions
#operations-service-model (4 members) - Operations organization
#prod-onboarding (69 members) - Production onboarding
#incident-response (97 members) - Incident 

Running all prompts with the selected bedrock model (default: anthropic.claude-3-5-sonnet-20241022-v2:0)

In [123]:

results = []  
for prompt, expected_response in prompt_expresponse_tuple:
    # print(prompt)

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "tool_ids": ["server:0", "server:1"],
        "temperature": 0.7,
        "stream": False  # Set to True if you want to stream the response
    }

    response = requests.post(
        f"{BASE_URL}/chat/completions", 
        json=payload,
        headers=headers
    )
    print(response)
    result = response.json()
    print(result)
    row_result = {
        "prompt": prompt,
        "model": result["model"],
        "response": result["choices"][0]["message"]["content"],
        "expected_response": expected_response,
        "usage_prompt_tokens": result["usage"]["prompt_tokens"],
        "usage_completion_tokens": result["usage"]["completion_tokens"],
        "usage_total_tokens": result["usage"]["total_tokens"]
    }
    results.append(row_result)
    # print(result)

results_df = pd.DataFrame(results)
print(results_df)

<Response [200]>
{'id': 'chatcmpl-4980cd79', 'created': 1751288130, 'model': 'apac.anthropic.claude-3-5-sonnet-20241022-v2:0', 'system_fingerprint': 'fp', 'choices': [{'index': 0, 'finish_reason': 'stop', 'logprobs': None, 'message': {'role': 'assistant', 'content': "Here's a more readable summary of the Slack channels in the workspace:\n\nKey channels include:\n\nGeneral/Team-wide:\n- #general - Main channel for team-wide announcements and conversations\n- #random - Channel for casual conversation and team banter\n- #help-it - IT related queries\n- #help-hr - HR related queries\n- #help-engineering - Engineering related queries\n- #help-legal-risk-compliance - Legal/risk/compliance queries\n\nEngineering/Development:\n- #all-devs - Channel for all engineers\n- #eng-backend - Backend engineering discussions\n- #fe-gang - Frontend engineering discussions\n- #guild-backend - Backend engineering guild\n- #plz-review - Code review requests\n- #release-sprintly - Release coordination\n\nPro

In [124]:
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'chat_results.csv')
results_df.to_csv(output_path, index=False)

In [125]:
grid = DataGrid(results_df, selection_mode="row", layout={"height": "400px"})
display(grid)


DataGrid(auto_fit_params={'area': 'all', 'padding': 30, 'numCols': None}, corner_renderer=None, default_render…

# Evaluating the Responses

## Selecting evaluation model

In [126]:
model_dropdown_eval = widgets.Dropdown(
    options=model_options,
    value="apac.anthropic.claude-3-5-sonnet-20241022-v2:0",  # default selection
    description='Model:',
    disabled=False,
)
display(model_dropdown_eval)

Dropdown(description='Model:', index=1, options=('apac.anthropic.claude-3-5-sonnet-20240620-v1:0', 'apac.anthr…

In [127]:
model_eval = model_dropdown_eval.value
print("You selected:", model_eval)

You selected: apac.anthropic.claude-3-5-sonnet-20241022-v2:0


In [128]:
importlib.reload(eval_prompt) 
print(eval_prompt.comparison_eval_prompt)
comparision_eval_prompt = eval_prompt.comparison_eval_prompt


You are evaluating an LLM response against a prompt and expected answer. You have two evaluation tasks:

**Task 1: Prompt Adherence**
Evaluate if the response appropriately addresses the given prompt, including cases where the expected response indicates a failure (e.g., "channel not found").
- Score 5: Fully addresses all aspects of the prompt, including correct identification of failures if applicable
- Score 4: Addresses most aspects with minor gaps (may miss minor failure details)
- Score 3: Addresses some aspects but misses key elements or misrepresents failure cases
- Score 2: Minimally addresses the prompt or incorrectly describes failures
- Score 1: Fails to address the prompt meaningfully

**Task 2: Content Accuracy** 
Evaluate if the response conveys the same semantic meaning and core content as the expected response, including failure scenarios.
- Score 5: Conveys the same semantic meaning and captures all core concepts from the expected response, even if phrased differently

## Getting all prompts, responses and expected responses.

In [129]:
try:
    data = pd.read_csv('output/chat_results.csv')
    prompts_expected_responses = data[['prompt','response', 'expected_response']].values.tolist()
except FileNotFoundError:
    print("Error: output/chat_results.csv not found.")

print(prompts_expected_responses)

[['List all channels in the workspace', "Here's a more readable summary of the Slack channels in the workspace:\n\nKey channels include:\n\nGeneral/Team-wide:\n- #general - Main channel for team-wide announcements and conversations\n- #random - Channel for casual conversation and team banter\n- #help-it - IT related queries\n- #help-hr - HR related queries\n- #help-engineering - Engineering related queries\n- #help-legal-risk-compliance - Legal/risk/compliance queries\n\nEngineering/Development:\n- #all-devs - Channel for all engineers\n- #eng-backend - Backend engineering discussions\n- #fe-gang - Frontend engineering discussions\n- #guild-backend - Backend engineering guild\n- #plz-review - Code review requests\n- #release-sprintly - Release coordination\n\nProduct/Squad Channels:\n- #squad-users - User-focused squad\n- #squad-lending - Lending squad\n- #squad-txb - Transaction banking squad\n- #squad-platform - Platform squad\n- #squad-data - Data team squad\n- #squad-prod-ops - Pro

In [135]:
iterations = 3
output_dir = 'eval_output'

for i in range(iterations):
    all_eval_results = []

    for prompt, response, expected_response in prompts_expected_responses:
        # print(comparision_eval_prompt)
        evaluation_prompt = comparision_eval_prompt.format(prompt=prompt, response=response, expected_response=expected_response)
        # print(evaluation_prompt)

        payload = {
            "model": model_eval,
            "messages": [
                {"role": "user", "content": evaluation_prompt}
            ],
            "temperature": 0.7,
            "stream": False  # Set to True if you want to stream the response
        }

        result = requests.post(
            f"{BASE_URL}/chat/completions", 
            json=payload,
            headers=headers
        ).json()

        eval_result = result["choices"][0]["message"]["content"]
        print(eval_result)

        eval_result_json = json.loads(eval_result)

        
        row_result = {
            "prompt": prompt,
            "model": result["model"],
            "response": response,
            "expected_response": expected_response,
            "prompt_adherence_score": eval_result_json["prompt_adherence_score"],
            "prompt_adherence_answer": eval_result_json["prompt_adherence_answer"],
            "prompt_adherence_reasoning": eval_result_json["prompt_adherence_reasoning"],
            "content_accuracy_score": eval_result_json["content_accuracy_score"],
            "content_accuracy_answer": eval_result_json["content_accuracy_answer"],
            "content_accuracy_reasoning": eval_result_json["content_accuracy_reasoning"],
            "evaluation_final_score": eval_result_json["final_score"],
            "usage_prompt_tokens": result["usage"]["prompt_tokens"],
            "usage_completion_tokens": result["usage"]["completion_tokens"],
            "usage_total_tokens": result["usage"]["total_tokens"]
        }
        all_eval_results.append(row_result)

    all_eval_results_df = pd.DataFrame(all_eval_results)
    print(all_eval_results_df)

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'eval_results_'+str(i+1)+'.csv')
    all_eval_results_df.to_csv(output_path, index=False)



{
  "prompt_adherence_score": 5,
  "prompt_adherence_answer": "yes",
  "prompt_adherence_reasoning": "The response directly addresses the prompt by providing a comprehensive list of channels in the workspace, organized by categories. It fulfills the basic request completely.",
  "content_accuracy_score": 4,
  "content_accuracy_answer": "no",
  "content_accuracy_reasoning": "While the response captures many of the same channels and maintains similar categorical organization, it omits several channels present in the expected response (e.g., #guild-devx, #finance, #intel) and member counts. It also includes some channels not in the expected response (e.g., #travel-recs, #book-club). The overall structure and most core information is present, but with notable differences.",
  "final_score": 9
}
{
  "prompt_adherence_score": 4,
  "prompt_adherence_answer": "yes",
  "prompt_adherence_reasoning": "The response directly addresses the request to show public channels with a clear, alphabetized l

JSONDecodeError: Extra data: line 11 column 1 (char 713)

## Retrieve All the Results and Average them out

In [ ]:
import glob
import pandas as pd
import os

# Get all CSV files in eval_output directory
csv_files = glob.glob('eval_output/eval_results_*.csv')
print(f"Found {len(csv_files)} CSV files: {csv_files}")

# Read all CSV files into a list of DataFrames
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
    print(f"Loaded {file} with {len(df)} rows")

# Concatenate all DataFrames
all_data = pd.concat(dfs, ignore_index=True)
print(f"Combined data has {len(all_data)} rows")

# Group by prompt and response, then calculate average scores
grouped = all_data.groupby(['prompt', 'response']).agg({
    'prompt_adherence_score': 'mean',
    'content_accuracy_score': 'mean'
}).round(2)

# Reset index to turn grouped columns back into regular columns
final_results = grouped.reset_index()

print(f"Final results have {len(final_results)} unique prompt-response pairs")
print("\nFirst few rows:")
print(final_results.head())

# Save to final_eval_results.csv
output_path = 'eval_output/final_eval_results.csv'
final_results.to_csv(output_path, index=False)
print(f"\nResults saved to {output_path}")

# Display the results
display(final_results)

In [111]:
grid = DataGrid(all_eval_results_df, selection_mode="row", layout={"height": "400px"})
display(grid)

DataGrid(auto_fit_params={'area': 'all', 'padding': 30, 'numCols': None}, corner_renderer=None, default_render…

In [22]:
display(all_eval_results_df)

Unnamed: 0,prompt,model,response,expected_response,prompt_adherence_score,prompt_adherence_answer,prompt_adherence_reasoning,content_accuracy_score,content_accuracy_answer,content_accuracy_reasoning,evaluation_final_score,usage_prompt_tokens,usage_completion_tokens,usage_total_tokens
0,List all channels in the workspace,anthropic.claude-3-5-sonnet-20241022-v2:0,"Here are all the channels in the workspace, or...",Here are all the channels in the Slack workspa...,5,yes,The response fully addresses the prompt by pro...,4,yes,The response captures most key channels and or...,9,1642,164,1806
1,Show me all public channels,anthropic.claude-3-5-sonnet-20241022-v2:0,Here's a more readable summary of all public S...,Here are all the public channels available in ...,5,yes,The response fully addresses the prompt by pro...,3,no,While the response provides similar categories...,8,1616,159,1775
2,Post 'Hello team! From eval framework here.' t...,anthropic.claude-3-5-sonnet-20241022-v2:0,"I've posted the message ""Hello team! From eval...",I successfully posted the message \Hello team!...,5,yes,The response fully addresses the prompt by con...,4,yes,"Contains most key information (message posted,...",9,568,133,701
3,Add heart reaction to message at timestamp 175...,anthropic.claude-3-5-sonnet-20241022-v2:0,It seems I was unable to add the heart reactio...,Great! I've successfully added a heart reactio...,4,yes,Response directly addresses the attempt to add...,1,no,Response contradicts the expected answer by cl...,5,581,140,721
4,Reply to thread 1750300317.785949 in channel C...,anthropic.claude-3-5-sonnet-20241022-v2:0,"I've replied to the thread with ""Thanks for th...",Perfect! I've successfully replied to the thre...,5,yes,The response directly addresses the prompt by ...,3,partial,"While the core action is accurately reported, ...",8,576,140,716
5,Get all replies from thread 1750300317.785949 ...,anthropic.claude-3-5-sonnet-20241022-v2:0,Here are all the replies from the thread in ch...,Here are all the replies in the thread startin...,5,yes,The response fully addresses the prompt by ret...,4,mostly,Most key information is present and accurate (...,9,1016,157,1173
6,Show me all the users in the workspace,anthropic.claude-3-5-sonnet-20241022-v2:0,Here's a more readable list of all users in th...,Here are all the users in your Slack workspace...,4,yes,The response directly addresses the request to...,3,no,While many users match between response and ex...,7,1973,229,2202
7,Get profile details for user U08FUTB4QLC,anthropic.claude-3-5-sonnet-20241022-v2:0,Here are the key profile details for Mohitha M...,Here are the detailed profile information for ...,5,yes,The response fully addresses the request to ge...,3,partial,"While core details (name, email, profile pictu...",8,851,158,1009
8,Search for messages containing 'mcp server',anthropic.claude-3-5-sonnet-20241022-v2:0,Here are the key points from the Slack message...,Here are the search results for messages conta...,5,yes,The response fully addresses the prompt by pro...,4,yes,The response captures most key information fro...,9,1645,154,1799


In [None]:
show(all_eval_results_df, 
     maxBytes=0,           # Remove size limit
     maxColumns=0,         # Show all columns
     lengthMenu=[10, 25, 50, 100],  # Page size options
     scrollX=False)   


0
Loading ITables v2.4.2 from the internet...  (need help?)
