## E2E Evaluation Notebook

### Installing dependencies

In [None]:
%pip install -r requirements.txt

## Setup

### Static Variables

In [None]:
# Names of the environment variables that hold the service configuration.
# The values themselves are read with os.getenv in a later cell.
env_var_openai_key = 'AZURE_OPENAI_SERVICE_KEY'
env_var_openai_uri = 'AZURE_OPENAI_SERVICE_URI'
env_var_ask_licensing_chat_endpoint = 'ASK_LICENSING_ENDPOINT'
env_var_openai_deployment_id = 'AZURE_OPENAI_DEPLOYMENT'
env_var_openai_api_version = 'AZURE_OPENAI_API_VERSION'

# Input / output file paths. Every path is assigned exactly once so that the
# value a reader sees here is the value the rest of the notebook uses.
ground_truth_file_path = '../ground_truth/ground_truth_single.csv'

# NOTE(review): the full per-record output is written with DataFrame.to_json,
# so the ".csv" extension is misleading -- consider ".json" / ".jsonl".
full_content_evaluation_output_file_path = "./output/full_output_results.csv"
aggregated_evaluation_output_file_path = "./output/aggregated_output_results.csv"
processing_times_file_path = './output/processing_times.csv'

with_recommendation_output_file_path = './output/ler_with_recommendation_output.json'
eval_results_file_path = './output/eval_results.csv'

# Request pacing.
api_timeout = 120.0
throttle_time = 1.0 # use this to slow down the requests to the API in an attempt to avoid throttling

eval_start_time = 0

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv

# Merge any key=value pairs from a local .env file into the process
# environment before the settings below are read.
load_dotenv()

# Selector for the ground-truth file(s) loaded later: lower-cased, with
# surrounding whitespace removed, and empty when GROUND_TRUTH_MODE is unset.
target = os.getenv("GROUND_TRUTH_MODE", "").strip().lower()

# Endpoint that is handed to add_response in the evaluation section.
ask_licensing_chat_endpoint = os.getenv(env_var_ask_licensing_chat_endpoint)

# Azure OpenAI connection settings; each one is None when its variable is unset.
open_ai_key = os.getenv(env_var_openai_key)
open_ai_uri = os.getenv(env_var_openai_uri)
open_ai_api_version = os.getenv(env_var_openai_api_version)
open_ai_deployment_id = os.getenv(env_var_openai_deployment_id)

# Imported only after load_dotenv() so the environment is already populated
# when the openai package is loaded.
from openai import AzureOpenAI

_azure_client_settings = {
    "api_key": open_ai_key,
    "api_version": open_ai_api_version,
    "azure_endpoint": open_ai_uri,
    "azure_deployment": open_ai_deployment_id,
}
openai_client = AzureOpenAI(**_azure_client_settings)

### Reading Ground Truth file or files 

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

# Directory containing the ground-truth CSV files. Defaults to the
# dev-container location; set the GROUND_TRUTH_DIR environment variable to
# point somewhere else without editing the notebook.
GROUND_TRUTH_DIR = (
    os.getenv("GROUND_TRUTH_DIR")
    or "/workspaces/ALCS-Sandbox/src/evaluation/ground_truth"  # pragma: allowlist secret
)

# os.listdir returns entries in arbitrary order. Sorting keeps the
# concatenation order -- and therefore the seeded sample drawn from it --
# identical from run to run and machine to machine.
all_csv_files = sorted(f for f in os.listdir(GROUND_TRUTH_DIR) if f.endswith('.csv'))

# Leftover debug exports (any CSV with "debug" in its name) are removed from
# disk and are never loaded, even if the removal itself fails.
debug_files = [f for f in all_csv_files if "debug" in f.lower()]
for file in debug_files:
    try:
        os.remove(os.path.join(GROUND_TRUTH_DIR, file))
        print(f"Deleted debug file: {file}")
    except OSError as e:
        print(f"Failed to delete {file}: {e}")

# Ground-truth file names and their full paths, debug files excluded.
csv_files = [f for f in all_csv_files if f not in debug_files]
csv_paths = [os.path.join(GROUND_TRUTH_DIR, f) for f in csv_files]

dataframes = []

if not csv_files:
    print("No CSV files found.")
else:
    if target and target != "all":
        # A specific target was requested: keep only files whose name contains it.
        matched_files = [f for f in csv_files if target in f.lower()]
        print(f"Matched files for target '{target}': {matched_files}")
        if not matched_files:
            print(f"No files match the target '{target}'.")
    else:
        # Target is unset/empty or "all": load every ground-truth file.
        matched_files = csv_files

    for file in matched_files:
        df = pd.read_csv(os.path.join(GROUND_TRUTH_DIR, file))
        dataframes.append(df)
        print(f"Loaded file: {file}")

if dataframes:
    ground_truth_df = pd.concat(dataframes, ignore_index=True)

    # ## For debugging purposes only: save the concatenated DataFrame to a CSV file
    # debug_concat_path = os.path.join(GROUND_TRUTH_DIR, "debug_concat_ground_truth.csv")
    # ground_truth_df.to_csv(debug_concat_path, index=False)
    # print(f"Saved concatenated debug CSV: {debug_concat_path}")

else:
    # Nothing was loaded; downstream cells must treat None as "no data".
    ground_truth_df = None

### Random Sampling

Take a random sample of the de-duplicated dataset (the fraction is set by `SAMPLE_PERCENT`) to run the evaluation on.

In [None]:
import pandas as pd

RANDOM_SEED = 42
SAMPLE_PERCENT = 0.05  # fraction of the de-duplicated dataset to evaluate; configurable as needed

# Fail early with an actionable message instead of an AttributeError on None
# when the loading cell found nothing to load.
if ground_truth_df is None or ground_truth_df.empty:
    raise ValueError(
        "No ground truth data was loaded; check the ground truth directory and "
        "the GROUND_TRUTH_MODE filter before sampling."
    )

# df_full = ground_truth_df.head(10) # for testing purposes

# Records are de-duplicated on their "content" column before sampling.
df_full = ground_truth_df.drop_duplicates(subset=["content"])
print(f"{len(df_full)} unique records in the full dataset")

# Seeded sample so the same records are picked on every run.
df = df_full.sample(frac=SAMPLE_PERCENT, random_state=RANDOM_SEED).reset_index(drop=True)
print(f"Sampled {len(df)} unique records for evaluation ({SAMPLE_PERCENT*100:.0f}% of total)")

# A small dataset combined with a small fraction rounds down to zero rows,
# which would make every downstream metric meaningless.
if df.empty:
    raise ValueError(
        f"Sampling {SAMPLE_PERCENT} of {len(df_full)} records produced an empty "
        "sample; increase SAMPLE_PERCENT."
    )

### Client definitions

### Prompt for the LLM

In [None]:
# Prompt template used to turn a free-text recommendation into a small JSON
# verdict. "{message}" is the only placeholder; the doubled braces around the
# JSON examples render as literal braces when the template is filled in with
# str.format (presumably done inside eval_helpers -- confirm there).
parsing_prompt = """
    You are a helpful assistant tasked with reviewing recommendations made by licensing engineers at nuclear power 
    plants. Your goal is to determine if the recommendation is reportable to the NRC under 10 CFR 50.72 
    and 10 CFR 50.73.

    Instructions:
    Analyze the provided recommendation.
    Return your response as a single string in the following format:
    {{"reportable": <true/false>, "subsections": ["<subsection_1>", "<subsection_2>", ...]}}
    Examples:
    Reportable with a single subsection:
    {{"reportable": true, "subsections": ["10 CFR 50.72(b)(2)(i)(B)"]}}
    Reportable with multiple subsections:
    {{"reportable": true, "subsections": ["10 CFR 50.72(b)(2)(i)(B)", "10 CFR 50.72(b)(3)(v)"]}}
    Non-reportable:
    {{"reportable": false, "subsections": []}}
    Recommendation:
    {message}
"""

## Evaluate the API

### Get Recommendation

In [None]:
from eval_helpers import (
    add_response,
    add_recommendation,
    add_recommendation_score,
)

# Let's get the API's recommendation
# Hands the sampled frame, the throttle delay, the chat endpoint and the
# request timeout to add_response and rebinds df to whatever it returns.
# The top-level await relies on the notebook kernel's running event loop.
# NOTE(review): presumably this adds the chat-response column that is read
# later via DataFrameColumnNames.CHAT_RESPONSE -- confirm in eval_helpers.
df = await add_response(df, throttle_time, ask_licensing_chat_endpoint, api_timeout)

In [None]:
# Passes the frame, the throttle delay, the Azure OpenAI client, the deployment
# id and the parsing prompt to add_recommendation, then scores the result.
# NOTE(review): presumably these add the parsed-recommendation and score
# columns that the aggregation cell reads -- confirm in eval_helpers.
df = await add_recommendation(df, throttle_time, openai_client, open_ai_deployment_id, parsing_prompt)
df = add_recommendation_score(df)

In [None]:
# Save a snapshot of the scored frame for ad-hoc inspection, then preview it.
# The preview has to be the last expression in the cell: Jupyter only renders
# the value of a cell's final expression.
df.to_csv("test.csv", index=False)
df.head()

### Get the Aggregated Metrics

The cell below aggregates the per-record evaluation scores and processing times into overall totals.

In [None]:
from eval_helpers import (
    ScoreAggregator,
    ProcessingTimes,
    RecommendationScore,
    ChatResponse,
    ParsedResponse,
    EvalResults,
)
from eval_helpers import DataFrameColumnNames, all_reportable_subsections

# Running totals, filled in one dataframe row at a time.
total_score = ScoreAggregator()
processing_times = ProcessingTimes()

for _, row in df.iterrows():
    # --- scores -----------------------------------------------------------
    score: RecommendationScore = row[DataFrameColumnNames.SCORE.value]
    total_score.total_records += 1

    # Label vectors are only collected for records that scored without errors.
    if not score.has_errors:
        total_score.y_true.append(score.y_true)
        total_score.y_pred.append(score.y_pred)

    total_score.true_positive += score.true_positive
    total_score.false_positive += score.false_positive
    total_score.false_negative += score.false_negative
    total_score.true_negative += score.true_negative
    total_score.chat_failure += score.chat_failure
    total_score.parsing_failure += score.parsing_failure
    total_score.unexpected_subsections.update(score.unexpected_subsections)
    total_score.summarize_token_counts(score.tokens_by_agent)

    # --- timings ----------------------------------------------------------
    processing_times.total_records += 1

    # Chat call: success timings count only when error is None, the error
    # timing only when error is truthy; the other bucket receives 0.0.
    chat_response: ChatResponse = row[DataFrameColumnNames.CHAT_RESPONSE.value]
    chat_succeeded = chat_response.error is None
    chat_completion_time = chat_response.time_to_completion if chat_succeeded else 0.0
    chat_first_chunk_time = chat_response.time_to_first_chunk if chat_succeeded else 0.0
    chat_error_time = chat_response.time_to_completion if chat_response.error else 0.0
    processing_times.time_to_completion += chat_completion_time
    processing_times.time_to_first_chunk += chat_first_chunk_time
    processing_times.time_to_chat_error += chat_error_time

    # Parsing call: same split between the error and the success bucket.
    parsed_response: ParsedResponse = row[DataFrameColumnNames.CHAT_RECOMMENDATION.value]
    parsing_error_time = parsed_response.time_to_completion if parsed_response.error else 0.0
    parsing_completion_time = (
        parsed_response.time_to_completion if parsed_response.error is None else 0.0
    )
    processing_times.time_to_parsing_error += parsing_error_time
    processing_times.time_to_parsing_completion += parsing_completion_time

# Wrap the aggregated score together with the record count and show it.
eval_results = EvalResults()
eval_results.total_score = total_score
eval_results.total_records = len(df)

print(eval_results)

In [None]:
# Prep the data frame to save by making sure the fields we added are converted to dictionaries
df[DataFrameColumnNames.CHAT_RECOMMENDATION.value] = df[DataFrameColumnNames.CHAT_RECOMMENDATION.value].apply(
    lambda x: x.model_dump() if x else None)
df[DataFrameColumnNames.CHAT_RESPONSE.value] = df[DataFrameColumnNames.CHAT_RESPONSE.value].apply(
    lambda x: x.model_dump() if x else None)
df[DataFrameColumnNames.SCORE.value] = df[DataFrameColumnNames.SCORE.value].apply(
    lambda x: x.model_dump() if x else None)

In [None]:
import csv
import os


def _ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it does not exist yet."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _write_metrics_csv(path, rows):
    """Write ``(metric, value)`` pairs to ``path`` as a two-column CSV file.

    Each value is rendered exactly as an f-string would render it. The csv
    writer then quotes any field containing a comma, quote or newline, so
    values such as a set or dict repr no longer spill into extra columns.
    Opening with mode 'w' already truncates an existing file.
    """
    _ensure_parent_dir(path)
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['metric', 'value'])
        writer.writerows([f"{metric}", f"{value}"] for metric, value in rows)


# Full per-record results (JSON records; JSON Lines when the path ends in .jsonl).
_ensure_parent_dir(full_content_evaluation_output_file_path)
df.to_json(
    full_content_evaluation_output_file_path, orient='records',
    lines=full_content_evaluation_output_file_path.endswith('.jsonl'))
print(f"Results saved to {full_content_evaluation_output_file_path}")

# Save the evaluation results to a csv file.
# All rows are built before the file is opened, so a failure while computing
# a metric cannot leave a half-written file behind.
_aggregated_score = eval_results.total_score
_evaluation_rows = [
    ("total_records", eval_results.total_records),
    ("total_errors", eval_results.total_errors),
    ("total_success", eval_results.total_success),
    ("total_true_positive", _aggregated_score.true_positive),
    ("total_false_positive", _aggregated_score.false_positive),
    ("total_true_negative", _aggregated_score.true_negative),
    ("total_false_negative", _aggregated_score.false_negative),
    ("total_chat_failure", _aggregated_score.chat_failure),
    ("total_parsing_failure", _aggregated_score.parsing_failure),
    ("unexpected_subsections", _aggregated_score.unexpected_subsections),
    ("error_rate", eval_results.error_rate),
    ("success_rate", eval_results.success_rate),
    ("accuracy", eval_results.accuracy),
    ("micro_precision", eval_results.micro_precision),
    ("micro_recall", eval_results.micro_recall),
    ("micro_f1_score", eval_results.micro_f1_score),
]
for agent_tokens in _aggregated_score.tokens_by_agent:
    _evaluation_rows.append(("tokens_by_agent", agent_tokens))
    # The value is written without a leading space so it parses as a number.
    _evaluation_rows.append(
        (f"{agent_tokens['agent_name']} completion_tokens", agent_tokens['completion_tokens'])
    )
_write_metrics_csv(aggregated_evaluation_output_file_path, _evaluation_rows)
print(f"Evaluation results saved to {aggregated_evaluation_output_file_path}")

# Save the processing times to a csv file
_timing_rows = [
    ("mean_time_to_first_chunk", processing_times.mean_time_to_first_chunk),
    ("mean_time_to_completion", processing_times.mean_time_to_completion),
    ("mean_time_to_parsing_completion", processing_times.mean_time_to_parsing_completion),
    ("mean_time_to_chat_error", processing_times.mean_time_to_chat_error),
    ("mean_time_to_parsing_error", processing_times.mean_time_to_parsing_error),
]
_write_metrics_csv(processing_times_file_path, _timing_rows)
print(f"Processing times saved to {processing_times_file_path}")

In [None]:
from tabulate2 import tabulate

# Prints the totals results in a table format: record-level counts are read
# from eval_results, confusion-matrix and failure counts from its total_score.
headers = ["Metric", "Value"]
rows = [
    [label, getattr(eval_results, attribute)]
    for label, attribute in (
        ("Total Records", "total_records"),
        ("Total Errors", "total_errors"),
        ("Total Success", "total_success"),
    )
]
rows.extend(
    [label, getattr(eval_results.total_score, attribute)]
    for label, attribute in (
        ("Total True Positive", "true_positive"),
        ("Total False Positive", "false_positive"),
        ("Total True Negative", "true_negative"),
        ("Total False Negative", "false_negative"),
        ("Total Chat Failure", "chat_failure"),
        ("Total Parsing Failure", "parsing_failure"),
    )
)
print(tabulate(rows, headers=headers, tablefmt="grid"))

In [None]:
# One heading line followed by one "- <subsection>" bullet per unexpected
# subsection collected during aggregation, emitted with a single print call.
_report_lines = ["Unexpected Subsections:"]
_report_lines.extend(
    f"- {subsection}" for subsection in eval_results.total_score.unexpected_subsections
)
print("\n".join(_report_lines))

In [None]:
"""
Prints the evaluation results in a table format as percentages.
"""
# Each rate is a fraction on eval_results; it is scaled by 100 and shown with
# two decimals and a trailing percent sign.
headers = ["Metric", "Value (%)"]
rows = [
    [label, f"{getattr(eval_results, attribute) * 100:.2f}%"]
    for label, attribute in (
        ("Error Rate", "error_rate"),
        ("Success Rate", "success_rate"),
        ("Accuracy", "accuracy"),
        ("Micro Precision Score", "micro_precision"),
        ("Micro Recall Score", "micro_recall"),
        ("Micro F1 Score", "micro_f1_score"),
    )
]
print(tabulate(rows, headers=headers, tablefmt="grid"))

In [None]:
"""Print the processing times in a table format."""
# One row per mean timing exposed by processing_times, rendered as a grid.
headers = ["Metric", "Value"]
rows = [
    [label, getattr(processing_times, attribute)]
    for label, attribute in (
        ("Mean Time to First Chunk", "mean_time_to_first_chunk"),
        ("Mean Time to Completion", "mean_time_to_completion"),
        ("Mean Time to Parsing Completion", "mean_time_to_parsing_completion"),
        ("Mean Time to Chat Error", "mean_time_to_chat_error"),
        ("Mean Time to Parsing Error", "mean_time_to_parsing_error"),
    )
]
print(tabulate(rows, headers=headers, tablefmt="grid"))

#### Token Usage

In [None]:
"""
Prints the token usage results in a table format.
"""
# One row per agent entry: its name followed by prompt and completion token counts.
headers = ["Metric", "prompt_tokens", "completion_tokens"]
rows = [
    [usage['agent_name'], usage['prompt_tokens'], usage['completion_tokens']]
    for usage in eval_results.total_score.tokens_by_agent
]

print(tabulate(rows, headers=headers, tablefmt="grid"))

### Performance Analysis for each of the 50.72 subsections

### 

In [None]:
import numpy as np
import sklearn.metrics
from eval_helpers import all_reportable_subsections

# Weight passed to fbeta_score; values above 1 favour recall over precision.
FBETA_BETA = 10.0


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# One entry per scored record; transposing yields one row per subsection, in
# the order used to index all_reportable_subsections below.
ytrue = np.array(total_score.y_true, dtype=bool)
ypred = np.array(total_score.y_pred, dtype=bool)
ytrue_t = np.transpose(ytrue)
ypred_t = np.transpose(ypred)
num_datapoints = len(ytrue)
num_subsections = len(ytrue_t)

total_true = np.sum(ytrue)
total_pred = np.sum(ypred)

print(f"Number of data points: {num_datapoints}, Number of subsections: {num_subsections}\n")
print(f"Total number of true positives: {total_true} \t Avg:{np.round(_ratio(total_true, num_datapoints), 2)}")
print(f"Total number of pred positives: {total_pred} \t Avg:{np.round(_ratio(total_pred, num_datapoints), 2)}")
print(f"Percent Difference: {np.round(100 * _ratio(total_pred - total_true, total_true), 2)}%\n\n")

section_true_pos    = [0] * num_subsections
section_false_pos   = [0] * num_subsections
section_true_neg    = [0] * num_subsections
section_false_neg   = [0] * num_subsections
section_recall      = [0] * num_subsections
section_precision   = [0] * num_subsections
section_fbeta_score = [0] * num_subsections


for idx in range(num_subsections):
    truth = ytrue_t[idx]
    pred = ypred_t[idx]
    missing_truth = np.logical_not(truth)
    missing_pred = np.logical_not(pred)

    section_true_pos[idx]    = np.sum(np.logical_and(truth, pred))
    # False positive: predicted, but absent from the ground truth.
    section_false_pos[idx]   = np.sum(np.logical_and(missing_truth, pred))
    section_true_neg[idx]    = np.sum(np.logical_and(missing_truth, missing_pred))
    # False negative: present in the ground truth, but not predicted.
    section_false_neg[idx]   = np.sum(np.logical_and(truth, missing_pred))

    # zero_division=1.0 scores a subsection with nothing to find or predict as perfect.
    section_precision[idx]   = sklearn.metrics.precision_score(truth, pred, zero_division=1.0)
    section_recall[idx]      = sklearn.metrics.recall_score(truth, pred, zero_division=1.0)
    section_fbeta_score[idx] = sklearn.metrics.fbeta_score(truth, pred, beta=FBETA_BETA, zero_division=1.0)
    print(f"{idx}\tSection:{all_reportable_subsections[idx]}   \tTP: {section_true_pos[idx]}\tFP:{section_false_pos[idx]}\tTN:{section_true_neg[idx]}\tFN: {section_false_neg[idx]}\tPrecision:{section_precision[idx]:.2f}  \tRecall: {section_recall[idx]:.2f}\tF{FBETA_BETA:g}:{section_fbeta_score[idx]:.2f}")

### Pairs of sections appearing together

In [None]:
import matplotlib.pyplot as plt


def _cooccurrence_counts(labels_by_section):
    """Count, for every pair of subsections, the records flagged with both.

    ``labels_by_section`` holds one row of boolean flags per subsection. The
    product of the 0/1 matrix with its own transpose gives the same counts as
    summing ``logical_and`` over every pair of rows.
    """
    flags = np.asarray(labels_by_section, dtype=int).reshape(num_subsections, -1)
    return flags @ flags.T


if num_subsections == 0:
    # Nothing was scored: keep the names defined, but there is nothing to draw.
    mtrue = np.zeros((0, 0), dtype=int)
    mpred = np.zeros((0, 0), dtype=int)
    print("No subsection data available; skipping co-occurrence plots.")
else:
    mtrue = _cooccurrence_counts(ytrue_t)
    mpred = _cooccurrence_counts(ypred_t)

    # Titles, axis labels and a colorbar make the two heatmaps distinguishable.
    for _matrix, _title in (
        (mtrue, "Subsection co-occurrence: ground truth"),
        (mpred, "Subsection co-occurrence: predictions"),
    ):
        _fig, _ax = plt.subplots()
        _image = _ax.imshow(_matrix, cmap='cool', interpolation='nearest')
        _ax.set_title(_title)
        _ax.set_xlabel("Subsection index")
        _ax.set_ylabel("Subsection index")
        _fig.colorbar(_image, ax=_ax, label="Records containing both subsections")
        plt.show()