In [1]:
import sys
sys.path.append('../Classes')
from AzureSearch import Config, AzureSearch
import pandas as pd
from azure.search.documents.indexes.models import (
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration
)

# Project settings are read from the env file that sits next to the helper classes.
config = Config(path="../Classes/environment.env")

# HNSW graph tuning knobs, kept in one place so they are easy to tweak per experiment.
_hnsw_parameters = {
    "m": 4,
    "efConstruction": 400,
    "efSearch": 500,
    "metric": "cosine",
}

# A single HNSW algorithm configuration, registered under the name taken from the config.
_hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name=config.vector_config_name,
    kind="hnsw",
    parameters=_hnsw_parameters,
)

# Vector-search settings object handed to every query issued from this notebook.
vector_search = VectorSearch(algorithm_configurations=[_hnsw_algorithm])

def search_across_indexes(query: str, vector_search, config: Config, index_names: list) -> dict:
    """Run the same vector query against several indexes and gather the hits.

    Args:
        query: Text of the query to search for.
        vector_search: Vector-search settings, forwarded unchanged to
            ``AzureSearch.vector_search``.
        config: Configuration used to build one ``AzureSearch`` client per index.
        index_names: Names of the indexes to query, in the order they should be
            added to the result.

    Returns:
        A dict with the original query under ``"query"`` plus one entry per
        index name. Each entry is a list of dicts with the keys ``"Title"``,
        ``"Score"``, ``"id"`` and ``"content"``, in the order the search
        returned them.
    """
    hits_by_index = {"query": query}

    for name in index_names:
        client = AzureSearch(config, name)
        # Up to 50 nearest neighbours, no filter expression.
        raw_hits = client.vector_search(query, filter=None, k=50, vector_search=vector_search)

        # Keep only the fields the evaluation needs, under report-friendly keys.
        rows = []
        for hit in raw_hits:
            rows.append(
                {
                    "Title": hit["title"],
                    "Score": hit["@search.score"],
                    "id": hit["id"],
                    "content": hit["content"],
                }
            )

        hits_by_index[name] = rows

    return hits_by_index


def get_search_results_for_csv(file_path: str, index_names: list, vector_search: bool = True) -> pd.DataFrame:
    """Evaluate retrieval quality for every query listed in a spreadsheet.

    The input file must contain the columns ``query`` and ``title`` (the title
    of the document that is expected to be retrieved). Each query is run
    against every index in ``index_names`` through ``search_across_indexes``.

    Args:
        file_path: Path to the test set. Files ending in ``.csv`` are read with
            ``pandas.read_csv``; anything else is read with
            ``pandas.read_excel``.
        index_names: Names of the search indexes to evaluate.
        vector_search: Forwarded unchanged to ``search_across_indexes``.
            NOTE(review): annotated as ``bool`` but the notebook passes a
            ``VectorSearch`` object - confirm the intended type.

    Returns:
        A DataFrame with the ``query`` and ``title`` columns plus, for each
        index name ``<idx>``:

        * ``<idx>``       - comma-separated ``"title (score)"`` list of hits
        * ``<idx>_place`` - 1-based rank of the expected title, 0 if absent
        * ``<idx>_std``   - standard deviation of the hit scores
        * ``<idx>_mean``  - mean of the hit scores
        * ``<idx>_cv``    - coefficient of variation (std / mean * 100)
    """
    # Excel files carry their own encoding, and read_excel has no ``encoding``
    # keyword in current pandas; only the CSV reader takes one.
    if str(file_path).lower().endswith(".csv"):
        df = pd.read_csv(file_path, encoding="utf-8")
    else:
        df = pd.read_excel(file_path)

    # Copy so that adding columns below never writes into a view of the
    # original frame (avoids SettingWithCopyWarning).
    df = df[["query", "title"]].copy()

    # One set of accumulators per index. A single flat list would interleave
    # the indexes and no longer line up with the DataFrame rows.
    per_index = {
        index_name: {"readable": [], "place": [], "std": [], "mean": []}
        for index_name in index_names
    }

    for query, expected_file in zip(df["query"], df["title"]):
        # NOTE(review): uses Config() defaults rather than the module-level
        # config built from an explicit path - confirm this is intended.
        results = search_across_indexes(query, vector_search, Config(), index_names)
        expected_title = expected_file.strip()

        for index_name in index_names:
            hits = results[index_name]
            stats = per_index[index_name]

            titles = [hit["Title"].strip() for hit in hits]
            stats["readable"].append(
                ", ".join(f"{hit['Title'].strip()} ({round(hit['Score'], 3)})" for hit in hits)
            )

            # 1-based rank of the expected document; 0 means it was not returned.
            stats["place"].append(
                titles.index(expected_title) + 1 if expected_title in titles else 0
            )

            # Explicit dtype keeps an empty result list numeric (std/mean -> NaN).
            scores = pd.Series([hit["Score"] for hit in hits], dtype="float64")
            stats["std"].append(scores.std())
            stats["mean"].append(scores.mean())

    # Attach each index's results, rankings and score statistics to the frame.
    for index_name in index_names:
        stats = per_index[index_name]
        df[f"{index_name}"] = stats["readable"]
        df[f"{index_name}_place"] = stats["place"]
        df[f"{index_name}_std"] = stats["std"]
        df[f"{index_name}_mean"] = stats["mean"]

        # Coefficient of variation, expressed as a percentage.
        df[f"{index_name}_cv"] = (df[f"{index_name}_std"] / df[f"{index_name}_mean"]) * 100

    return df

In [2]:
# Run against the summary index; currently disabled. Re-enable to compare it with the run below.
#df_with_summary = get_search_results_for_csv('./Prompt_Flow_Test.xlsx', ['summary-searchable-800-chunk'], vector_search)
#df_with_summary.to_csv('retrieval_with_summary.csv')

# Evaluate retrieval against the index named in the loaded config.
df_without_summary = get_search_results_for_csv('./Prompt_Flow_Test.xlsx', [config.index_name], vector_search)
# Bare expression: rendered as the cell's output when run in a notebook.
df_without_summary
# Optional export of the results table; currently disabled.
#df_without_summary.to_csv('retrieval_without_summary.csv')

Unnamed: 0,query,title,readableindex,readableindex_place,readableindex_std,readableindex_mean,readableindex_cv
0,what can be used as proof of payment for ABC l...,Payouts_and_Closure_of_Liabilities,"Payouts_and_Closure_of_Liabilities (0.032), Pa...",1,0.005995,0.022577,26.551892
1,what is the debt recovery loan code?,Debt_Lending,"Debt_Lending (0.033), Products_and_Services___...",1,0.005516,0.025515,21.618747
2,what are the different types of liabilities a ...,Liabilities,"Liabilities (0.033), Liabilities (0.033), Asse...",1,0.0047,0.022477,20.909664
3,Does client need to close or reduce visa limit...,Payouts_and_Closure_of_Liabilities,"Debt_Lending (0.033), Debt_Lending (0.033), De...",4,0.004878,0.025631,19.032616
4,Are PLOCs archived after they are paid out?,Closing_an_Unsecured_PLoC,"Closing_an_Unsecured_PLoC (0.033), Pay_Out_and...",1,0.00517,0.022645,22.82983
5,Which form do I use to pay out a PLOC account?,Closing_an_Unsecured_PLoC,"Payouts_and_Closure_of_Liabilities (0.03), bc-...",5,0.004505,0.023115,19.490671
6,how to decrease overdraft after payout,Overdraft_Maintenance,"Overdraft_Maintenance (0.033), Overdraft_Maint...",1,0.004754,0.026509,17.935592
7,My client told me about a liability that’s not...,Liabilities,"obtaining-credit-bureau-consent (0.031), Payou...",9,0.004343,0.025206,17.229616
8,Does client need to close or reduce visa limit...,Payouts_and_Closure_of_Liabilities,"Debt_Lending (0.033), Debt_Lending (0.033), De...",4,0.004878,0.025631,19.032616
9,How do I close a PLOC?,Closing_an_Unsecured_PLoC,"bc-cancel-ploc-loan-od (0.033), bc-cancel-ploc...",4,0.005499,0.023911,22.996046
