# Reportability Manual Index Evaluation

This notebook evaluates Azure AI Search retrieval quality for the Reportability Manual index.

### Install Libraries

In [None]:
%pip install -r requirements.txt

### Import Libraries, Initialize Static variables

In [None]:
# Project-local helper modules (presumably index evaluation, search access and LLM access,
# judging by their names — verify against the helper package).
from helper.alcs_indexeval import IndexEvalService, IndexEvalModel
from helper.alcs_search import AzureSearchModel, AzureSearchService, SearchType
from helper.alcs_llm import AzureOpenAIModel, AzureOpenAIService
import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import sys
import pickle
# NOTE(review): this path is appended only AFTER the `helper.*` imports above have already
# executed, so on a fresh kernel it cannot influence them. If those imports depend on it,
# this line (and `import sys`) must move above them — TODO confirm.
sys.path.append('../search/helper')


from dotenv import load_dotenv
# Read settings from a .env file so the os.getenv lookups in later cells can see them.
load_dotenv()

# --- Names of environment variables (not the values themselves) -------------------------
# Only env_var_search_query_key and env_var_search_uri are read by the cells visible in
# this notebook; the remaining names are presumably consumed elsewhere — verify before removing.
env_var_storage_container_name = "AZURE_STORAGE_CONTAINER_NAME"
env_var_storage_connection_string = "AZURE_STORAGE_CONNECTION_STRING"
env_var_openai_key = 'SEARCH_EVAL_AZURE_OPENAI_SERVICE_KEY'
env_var_openai_uri = 'SEARCH_EVAL_AZURE_OPENAI_SERVICE_URI'
env_var_openai_model = 'SEARCH_EVAL_AZURE_OPENAI_CHATGPT_MODEL'
env_var_search_query_key = 'SEARCH_EVAL_AZURE_SEARCH_QUERY_KEY'
env_var_search_uri = 'SEARCH_EVAL_AZURE_SEARCH_URI'
env_var_openai_embedding_model = 'SEARCH_EVAL_OPENAI_EMBEDDING_MODEL'
# NOTE(review): these two look like environment-variable names as well, although they lack
# the env_var_ prefix; neither is read in the visible cells — confirm intent.
openai_deployment = "SEARCH_EVAL_AZURE_OPENAI_CHATGPT_MODEL"
openai_version = "NUREG_AZURE_OPENAI_VERSION"

# --- Search index under evaluation ------------------------------------------------------
search_index_name='reportability-manual-index'
# Used as the vector kNN size, as the per-query result count and as num_docs of the evaluator.
search_index_num_docs=122 # update number of docs to match AI Search index docs
# Passed to IndexEvalModel; presumably the beta of the F-beta score plotted later — confirm.
beta_factor = 10
# Only interpolated into an error message in the evaluation loop; queries always use the
# ground-truth "content" column regardless of this value.
content_type = 'content'  # or 'narrative', depending on our needs
# Index fields handed to AzureSearchModel as the content and embedding fields.
content_field = ['discussion']
embedding_field = 'discussionVector'

# --- Input / output files ---------------------------------------------------------------
# Ground-truth CSV; later cells read its "content" and "subsections" columns.
ground_truth_file = "../ground_truth/ground_truth_single.csv"
# Outputs written by the evaluation and save cells (paths are relative to the working directory).
eval_results_fln = "output/reportability_manual_index_eval_results.csv"
search_scores_fln = "output/reportability_manual_search_scores.pkl"

### Setup Models and Corresponding Services

In [None]:
# Resolve the search credentials and endpoint from the environment first,
# then describe the target index and wrap it in the search service.
search_query_key = os.getenv(env_var_search_query_key)
search_service_uri = os.getenv(env_var_search_uri)

azure_search_model = AzureSearchModel(
    azure_search_key=search_query_key,
    azure_search_service_uri=search_service_uri,
    azure_search_index_name=search_index_name,
    azure_content_field=content_field,
    azure_embedding_field=embedding_field,
    vector_knn=search_index_num_docs,
)
azure_search = AzureSearchService(azure_search_model=azure_search_model)

# One wildcard vector query against the index. The result is not read before later cells
# overwrite `search_result`, so this presumably acts as a connectivity check — confirm.
search_result = azure_search.get_documents(
    search_text="*",
    search_type=SearchType.Vector,
    search_count=search_index_num_docs,
)

# Evaluation service, configured with the F-beta factor and the index size.
index_eval_model = IndexEvalModel(beta_factor=beta_factor, num_docs=search_index_num_docs)
index_eval_service = IndexEvalService(index_eval_model=index_eval_model)

## Evaluate Search Index Using Random Sampling

This cell makes it possible to run the evaluation on a smaller, reproducible subset of the ground-truth data. Despite its name, `SAMPLE_PERCENT` is a fraction between 0 and 1, not a percentage: for instance, 0.5 samples half of the dataset and 1 uses all of it.

In [None]:
import ast


def parse_subsections(raw_value):
    """Normalise one raw `subsections` cell into a list of section labels.

    Args:
        raw_value: Cell value as read by pandas: a stringified Python list
            (text starting with "["), any other string, or a missing value.

    Returns:
        list: The parsed list for bracketed strings, an empty list for missing
        values (None/NaN), and a one-element list for anything else.
    """
    if isinstance(raw_value, str):
        if raw_value.startswith('['):
            return ast.literal_eval(raw_value)
        return [raw_value]
    # A missing cell arrives as NaN. Wrapping it as [nan] would slip past the
    # "no sections" check in the evaluation loop and break "|".join(...) there.
    if pd.api.types.is_scalar(raw_value) and pd.isna(raw_value):
        return []
    return [raw_value]


# Accumulators kept for backward compatibility; `results` and `predicted_sections`
# are re-assigned by the evaluation cell before they are read.
filenames = list()
predicted_sections = list()
actual_sections = list()
search_types = list()
results = list()
# One list of per-query score lists for every search type.
search_scores = dict()
for search_type in SearchType:
    search_scores[search_type] = list()


RANDOM_SEED = 42 # ensures reproducibility of the random sampling
SAMPLE_PERCENT = 1 # fraction (0-1] of the dataset to sample for eval, e.g. 0.5 = half

df_full = pd.read_csv(ground_truth_file)
# Identical content would only repeat the same query, so keep one row per content.
df_full = df_full.drop_duplicates(subset=["content"])
df_full['subsections'] = df_full['subsections'].apply(parse_subsections)
print(f"{len(df_full)} unique records in the full dataset")

df = df_full.sample(frac=SAMPLE_PERCENT, random_state=RANDOM_SEED).reset_index(drop=True)
print(f"Sampled {len(df)} unique records for evaluation ({SAMPLE_PERCENT*100:.0f}% of total)")

### Evaluate index search for all files and search types

In [None]:
def _is_missing(value) -> bool:
    """Return True for None, NaN-like scalars and blank strings."""
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _strip_cfr_prefix(section):
    """Remove the "10 CFR " prefix from a section label; non-strings pass through unchanged."""
    return section.replace("10 CFR ", "") if isinstance(section, str) else section


# This cell ends by re-binding `df` to the results frame (later cells rely on that), so it
# must not iterate over `df` itself: a second run would then loop over results instead of
# ground truth. Re-derive the sample from `df_full`; same seed and fraction as the sampling
# cell, hence the same rows.
ground_truth_sample = df_full.sample(
    frac=SAMPLE_PERCENT, random_state=RANDOM_SEED
).reset_index(drop=True)

results = []
failed_queries = 0
# Start from empty score lists so re-running the cell does not append duplicates.
search_scores = {search_type: [] for search_type in SearchType}

for index, row in ground_truth_sample.iterrows():
    reported_content = row["content"]
    raw_sections = row["subsections"] if "subsections" in row else None
    if not isinstance(raw_sections, (list, tuple)):
        raw_sections = []
    # Drop missing entries (e.g. NaN from an empty CSV cell) before normalising the labels.
    reported_sections = [
        _strip_cfr_prefix(section) for section in raw_sections if not _is_missing(section)
    ]
    # NaN content is truthy, so test for missing values explicitly.
    if _is_missing(reported_content) or not reported_sections:
        print(f"ERROR: No content or missing CFR requirements in row {index}.")
        continue
    for search_type in SearchType:
        search_name = search_type.name
        try:
            search_result = azure_search.get_documents(
                search_text=reported_content,
                search_type=search_type,
                search_count=search_index_num_docs,
            )
        except Exception as e:
            # Best effort: log the failure and continue with the next search type.
            print(f"ERROR: Search failed for row {index} with error: {e}")
            failed_queries += 1
            continue
        # Materialise once; the hits are traversed several times below.
        documents = list(search_result) if search_result else []
        if not documents:
            print(f"ERROR: Search did not return anything using {content_type} from row {index}")
            failed_queries += 1
            continue

        # One list of normalised references per returned document, in ranking order.
        predicted_sections = [
            [_strip_cfr_prefix(ref) for ref in (doc.get('references') or [])]
            for doc in documents
        ]
        performance_eval = index_eval_service.encode_result(predicted_sections, reported_sections)
        doc_score = [doc.get('@search.score') for doc in documents if doc.get('@search.score') is not None]
        search_scores[search_type].append(doc_score)
        docs = [
            {
                "score": doc.get('@search.score'),
                "doc": doc.get('section')
            }
            for doc in documents
        ]
        results.append({
            "search_type": search_name,
            "results": performance_eval,
            "reported_sections": "|".join(str(section) for section in reported_sections),
            "docs": docs
        })

# Explicit columns keep the frame usable by later cells even when no query succeeded.
df = pd.DataFrame(results, columns=["search_type", "results", "reported_sections", "docs"])
output_dir = os.path.dirname(eval_results_fln)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(eval_results_fln, index=False)
print(f"{failed_queries} search queries failed or returned no documents.")
print("DONE with processing reportability manual index.")

## Save Search Scores

In [None]:
# Persist the raw per-query scores so later cells can reload them without re-querying.
print("Saving search scores...")
scores_dir = os.path.dirname(search_scores_fln)
if scores_dir:
    # A fresh checkout may not contain the output directory yet.
    os.makedirs(scores_dir, exist_ok=True)
with open(search_scores_fln, 'wb') as f:
    pickle.dump(search_scores, f)

# Summarise instead of dumping every score of every query into the cell output.
for search_type, per_query_scores in search_scores.items():
    print(f"{search_type}: scores stored for {len(per_query_scores)} queries")

## Plot metrics

In [None]:
# One figure per search type: Precision@K, Recall@K and Fbeta@K against K.
for search_type in ["FullText", "Vector", "Hybrid"]:
    db = df[df["search_type"] == search_type]
    metrics = index_eval_service.calculate_metrics(db)
    print(f"{search_type} MRR: {metrics['MRR']}")

    # Position 0 of every metric vector is skipped, so K starts at 1.
    k_values = range(1, len(metrics["Precision@K"]))
    for metric_name in ("Precision@K", "Recall@K", "Fbeta@K"):
        plt.plot(k_values, metrics[metric_name][1:], label=metric_name)

    plt.title(f'{search_type} Precision, Recall, and Fbeta Scores at Different K Values')
    plt.xlabel('K')
    plt.ylabel('Score')
    plt.legend()
    plt.show()

### Distribution of positions for the first search hit

In [None]:
# Share of queries per position bucket, as returned by get_k_distribution, per search type.
for search_type in ["FullText", "Vector", "Hybrid"]:
    db = df[df["search_type"] == search_type]
    res = index_eval_service.get_k_distribution(positions=db['results'].tolist())
    # Compute the total once; a search type without any hit would otherwise divide by zero.
    total = sum(res)
    shares = [x / total for x in res] if total else [0.0 for _ in res]
    print(f"SearchType:{search_type}: {shares}")

### Cosine Similarity of documents in the Index

In [None]:
# Import the function explicitly: reaching `sklearn.metrics.pairwise` through a bare
# `import sklearn` only works when the submodule happens to be loaded already.
from sklearn.metrics.pairwise import cosine_similarity

#Get all documents from the search index
search_result = azure_search.get_documents(
        search_text="*",
        search_type=SearchType.Vector,
        search_count=search_index_num_docs
    )


# Keep only documents that actually carry an embedding.
discussionVectors = [doc.get("discussionVector") for doc in search_result if doc.get("discussionVector") is not None]

if not discussionVectors:
    # cosine_similarity raises on an empty input, so report and skip instead.
    print("No discussionVector values returned by the search index; skipping similarity matrix.")
else:
    discussionMatrix = cosine_similarity(discussionVectors)

    # Print options are set globally on purpose; later cells print numeric results too.
    np.set_printoptions(precision=3, suppress=True, linewidth=200)
    print(discussionMatrix)
    plt.imshow(discussionMatrix, cmap='inferno', interpolation='nearest')
    plt.title('Discussion Vector Cosine Similarity Matrix')
    plt.colorbar()
    plt.show()


### Score values for different values of K

In [None]:
# Reload the scores written by the save cell.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open(search_scores_fln, 'rb') as file:
    search_scores = pickle.load(file)

# converting enum keys to string names
search_scores_str_keys = {}
for key, per_query_scores in search_scores.items():
    search_scores_str_keys[str(getattr(key, 'name', key))] = per_query_scores

for search_type in ["FullText", "Vector", "Hybrid"]:
    # flatten the list of per-query score lists into one list
    scores = []
    for per_query_scores in search_scores_str_keys.get(search_type, []):
        scores.extend(per_query_scores)

    if scores:
        res = index_eval_service.get_score_stats(scores)
        print(f"{search_type} Score mean values:\n {res[0]}")
        print(f"{search_type} Score stdev values:\n {res[1]}")
    else:
        print(f"No scores found for {search_type}")

### Determining Threshold and Top K

In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np

# Load the persisted evaluation results, then keep the rows of a single search type.
# NOTE: this re-binds `df`, so cells above that expect all search types must not be
# re-run after this one without regenerating `df`.
eval_results_all = pd.read_csv('output/reportability_manual_index_eval_results.csv')
is_selected_type = eval_results_all['search_type'] == 'Vector' # configure search type here
df = eval_results_all[is_selected_type]

def parse_docs(docs_str: str) -> list[dict]:
    """Parse the stringified `docs` cell of the results CSV.

    Args:
        docs_str: Text of a Python literal (a list of dicts) as written by
            `DataFrame.to_csv`; may be NaN/None/blank when the cell is empty.

    Returns:
        The parsed literal, or an empty list when the cell is missing or blank.

    Raises:
        ValueError, SyntaxError: If the text is present but is not a valid literal.
    """
    # pandas yields NaN (a float) for empty cells; literal_eval would raise on it.
    if not isinstance(docs_str, str) or not docs_str.strip():
        return []
    return ast.literal_eval(docs_str.strip())

def extract_result_indices(results: str) -> list[int]:
    """Turn a "|"-separated `results` cell into a list of integers.

    Missing values give an empty list; tokens that are not purely digits are ignored.
    """
    if pd.isna(results):
        return []
    indices = []
    for token in str(results).split('|'):
        if token.isdigit():
            indices.append(int(token))
    return indices

# A document counts as relevant when its position in the returned list appears in the
# row's `results` indices (assumes `results` holds positions of relevant hits — confirm
# against IndexEvalService.encode_result).
score_data = []
labels = []
for _, eval_row in df.iterrows():
    ranked_docs = parse_docs(eval_row['docs'])
    relevant_positions = extract_result_indices(eval_row['results'])
    for position, ranked_doc in enumerate(ranked_docs):
        doc_score = ranked_doc.get('score')
        if doc_score is not None:
            score_data.append(doc_score)
            labels.append('Relevant' if position in relevant_positions else 'Irrelevant')


# Box plot of @search.score, grouped by relevance label.
plot_df = pd.DataFrame({'Score': score_data, 'Type': labels})
if plot_df.empty:
    print("No score data to plot; skipping box plot.")
else:
    # DataFrame.boxplot(by=...) opens its own figure unless given an Axes, which left the
    # figure created by plt.figure(figsize=...) blank and the requested size unused.
    fig, ax = plt.subplots(figsize=(8, 5))
    plot_df.boxplot(column='Score', by='Type', ax=ax)
    ax.set_title('5072 Vector: @search.score for Relevant vs Irrelevant Docs')
    # Remove the automatic "Boxplot grouped by ..." super-title added by pandas.
    fig.suptitle('')
    ax.set_xlabel('Document Type')
    ax.set_ylabel('@search.score')
    plt.show()

def compute_precision_recall_thresholds(scores: list[float], labels: list[str], thresholds: list[float]) -> pd.DataFrame:
    """Precision and recall of "score >= threshold" as a relevance classifier.

    Args:
        scores: One score per document.
        labels: 'Relevant' marks a positive; any other label is a negative.
        thresholds: Cut-offs to evaluate.

    Returns:
        DataFrame with one row per threshold and the columns
        'threshold', 'precision' and 'recall'. A metric whose denominator is zero is 0.
    """
    is_relevant = np.array([label == 'Relevant' for label in labels], dtype=bool)
    score_array = np.array(scores)

    def _metrics_at(cutoff):
        # Confusion counts for documents selected at this cut-off.
        is_selected = score_array >= cutoff
        true_pos = np.sum(is_selected & is_relevant)
        false_pos = np.sum(is_selected & ~is_relevant)
        false_neg = np.sum(~is_selected & is_relevant)
        n_selected = true_pos + false_pos
        n_relevant = true_pos + false_neg
        return {
            'threshold': cutoff,
            'precision': true_pos / n_selected if n_selected > 0 else 0,
            'recall': true_pos / n_relevant if n_relevant > 0 else 0,
        }

    return pd.DataFrame([_metrics_at(cutoff) for cutoff in thresholds])

# Sweep 80 evenly spaced thresholds across the observed score range and plot both metrics.
if not score_data:
    print("No score data found for Vector search type.")
else:
    min_score, max_score = min(score_data), max(score_data)
    thresholds = np.linspace(min_score, max_score, 80)
    pr_df = compute_precision_recall_thresholds(score_data, labels, thresholds)

    plt.figure(figsize=(8, 5))
    for column_name, legend_label in (('precision', 'Precision'), ('recall', 'Recall')):
        plt.plot(pr_df['threshold'], pr_df[column_name], label=legend_label)
    plt.xlabel('Score Threshold')
    plt.ylabel('Metric')
    plt.title('5072 Vector: Precision and Recall at Different Score Thresholds')
    plt.legend()
    plt.show()

In [None]:
import numpy as np
import pandas as pd
from typing import Tuple, Dict
import ast

def compute_optimal_threshold(
    scores: list[float], labels: list[str], metric: str = "f1"
) -> Tuple[float, Dict[str, float]]:
    """Find the score cut-off that maximises the chosen metric.

    100 evenly spaced cut-offs between the smallest and largest score are tried;
    a document is predicted relevant when its score is >= the cut-off. On ties
    the lowest cut-off wins.

    Args:
        scores: One score per document (must not be empty).
        labels: "Relevant" marks a positive; any other label is a negative.
        metric: "precision", "recall" or "f1".

    Returns:
        (best cut-off, {"precision", "recall", "f1"} measured at that cut-off).

    Raises:
        KeyError: If `metric` is not one of the three supported names.
    """
    relevant = np.array([1 if label == "Relevant" else 0 for label in labels])
    score_array = np.array(scores)
    candidates = np.linspace(score_array.min(), score_array.max(), 100)

    winner = candidates[0]
    winner_value = -1.0
    winner_stats = {}

    for cutoff in candidates:
        predicted = (score_array >= cutoff).astype(int)
        hits = np.sum((predicted == 1) & (relevant == 1))
        false_alarms = np.sum((predicted == 1) & (relevant == 0))
        misses = np.sum((predicted == 0) & (relevant == 1))

        precision = hits / (hits + false_alarms) if (hits + false_alarms) > 0 else 0.0
        recall = hits / (hits + misses) if (hits + misses) > 0 else 0.0
        if (precision + recall) > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        stats = {"precision": precision, "recall": recall, "f1": f1}
        # Strict comparison keeps the first (lowest) cut-off among equals.
        if stats[metric] > winner_value:
            winner_value = stats[metric]
            winner = cutoff
            winner_stats = stats

    return winner, winner_stats

def compute_optimal_k(
    df: pd.DataFrame, max_k: int = 10
) -> Tuple[int, Dict[str, float]]:
    """Find the cut-off K (1..max_k) with the best micro-averaged F1.

    Each row's "results" cell is read as "|"-separated 0-based positions of the
    relevant hits; tokens that are not purely digits are ignored. For a given K,
    every query contributes K predictions, so a query that returned fewer than K
    documents still counts the missing ones as false positives.

    Args:
        df: Evaluation results with a "results" column.
        max_k: Largest K to try.

    Returns:
        (best K, {"precision", "recall", "f1"} at that K). When max_k < 1 the
        result is (1, {}).
    """
    best_f1 = -1.0
    best_k = 1
    best_stats = {}
    if max_k < 1:
        return best_k, best_stats

    # The parsed positions do not depend on K, so parse every row once up front
    # instead of once per candidate K.
    relevant_ranks_per_query = [
        [int(token) for token in str(row["results"]).split("|") if token.isdigit()]
        for _, row in df.iterrows()
    ]

    for k in range(1, max_k + 1):
        tp, fp, fn = 0, 0, 0
        for relevant_ranks in relevant_ranks_per_query:
            hits_in_top_k = sum(1 for rank in relevant_ranks if rank < k)
            tp += hits_in_top_k
            fp += k - hits_in_top_k
            fn += max(0, len(relevant_ranks) - hits_in_top_k)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (
            2 * precision * recall / (precision + recall)
            if (precision + recall) > 0
            else 0.0
        )
        # Strict comparison keeps the smallest K among equally good candidates.
        if f1 > best_f1:
            best_f1 = f1
            best_k = k
            best_stats = {"precision": precision, "recall": recall, "f1": f1}
    return best_k, best_stats

# Example usage for this notebook: load the saved evaluation results, keep the Vector rows.
all_eval_rows = pd.read_csv(eval_results_fln)
df_eval = all_eval_rows.loc[all_eval_rows["search_type"] == "Vector"]

def parse_doc_scores(doc_score_str: str) -> list[float]:
    """Parse a stringified list literal from a CSV cell.

    Args:
        doc_score_str: Text of a Python list/tuple literal; may be NaN/None when
            the cell is empty.

    Returns:
        The parsed elements as a list. An empty list is returned when the cell is
        missing, is not a valid literal, or does not contain a list/tuple.
    """
    if not isinstance(doc_score_str, str):
        return []
    try:
        parsed = ast.literal_eval(doc_score_str.strip())
    except (ValueError, SyntaxError, TypeError):
        # Only parse failures are treated as "no data"; anything else should surface.
        return []
    # Callers enumerate the result, so a scalar literal must not leak through.
    return list(parsed) if isinstance(parsed, (list, tuple)) else []

score_data = []
labels = []
for _, row in df_eval.iterrows():
    # The evaluation cell writes the columns search_type / results / reported_sections /
    # docs, so a "doc_score" column normally does not exist and the scores live inside
    # the `docs` entries. Use "doc_score" when present, otherwise fall back to "docs".
    raw_scores = row["doc_score"] if "doc_score" in row else row.get("docs", "[]")
    ranked_entries = parse_doc_scores(raw_scores)
    result_indices = [int(idx) for idx in str(row["results"]).split("|") if idx.isdigit()]
    for idx, entry in enumerate(ranked_entries):
        # `docs` entries are dicts carrying a "score" key; "doc_score" entries are plain numbers.
        score = entry.get("score") if isinstance(entry, dict) else entry
        if score is None:
            # Skip unscored hits but keep `idx` aligned with the ranking position.
            continue
        score_data.append(score)
        labels.append("Relevant" if idx in result_indices else "Irrelevant")

# Report the F1-optimal score threshold and the F1-optimal K, when there is data to use.
if not score_data:
    print("No score data found for Vector search type.")
else:
    threshold, stats = compute_optimal_threshold(score_data, labels)
    k, k_stats = compute_optimal_k(df_eval)
    print(f"Optimal threshold (F1): {threshold:.4f}, stats: {stats}")
    print(f"Optimal K (F1): {k}, stats: {k_stats}")