In [14]:
import os

# Connection settings (host and port, both as strings) for the ChromaDB instance.
# Do not forget to start ChromaDB using Docker before running the cells that
# query the vector DB.
# NOTE(review): these variables are set before the `src` imports below,
# presumably because the `src` modules read them at import time — confirm
# before reordering this cell.
os.environ["CHROMA_DB_URL"] = "localhost"
os.environ["CHROMA_DB_PORT"] = "8000"

from src.evaluation import read_df, METRIC_COLUMNS, MIN_RERANKING_SCORE
import pandas as pd
from src.vector_db import check_document_existance
from matplotlib import pyplot as plt

In [3]:
# Path (relative to the working directory) of the CSV file holding the
# evaluation results analysed in this notebook; it is loaded with pd.read_csv.
DATA_PATH = "data/Run 7.csv"

In [4]:
# Load the two inputs of the analysis: the questions (through the project
# helper) and the per-question evaluation metrics stored at DATA_PATH.
questions = read_df()
evaluation_results = pd.read_csv(DATA_PATH)

In [5]:
# Overall mean of every metric column across all evaluated questions
# (the "Question" identifier column is excluded from the average).
print("Total average result:")
print(evaluation_results.drop("Question", axis="columns").mean())

Total average result:
Context Precision      0.567457
Context Recall         0.438350
Precision              0.546911
Recall                 0.205000
Faithfulness           0.591351
Response Relevancy     0.479990
Factual Correctness    0.181016
Semantic Similarity    0.455566
Nothing Retrieved      0.336207
Best Ranking Value     0.579881
dtype: float64


In [6]:
# Attach the question data to every evaluation row: the results' "Question"
# column is matched against the questions' "question_id", after which the
# now-redundant key column is removed.
df = evaluation_results.merge(
    questions,
    left_on="Question",
    right_on="question_id",
).drop(columns="question_id")

print("All Columns:", df.columns.tolist())
# Mean of the "Nothing Retrieved" flag = share of questions without an answer
print("Not answered:", round(df["Nothing Retrieved"].mean(), 3))

All Columns: ['Question', 'Context Precision', 'Context Recall', 'Precision', 'Recall', 'Faithfulness', 'Response Relevancy', 'Factual Correctness', 'Semantic Similarity', 'Nothing Retrieved', 'Best Ranking Value', 'question', 'correct_answer', 'context']
Not answered: 0.336


In [7]:
# Split the evaluation results into answered and non-answered questions.
#
# NOTE: despite its name, `df_answered_mask` is True where NOTHING was
# retrieved (i.e. the question was NOT answered). The name is kept unchanged so
# that any other cell referring to it keeps working.
#
# `~` is a logical NOT only for boolean Series; on 0/1 integers it is a bitwise
# NOT (yielding -1/-2) and the row selection below would break. Forcing a
# boolean dtype makes the split independent of how the flag was stored in the
# CSV. (Missing values, if any, become True under this cast.)
df_answered_mask = evaluation_results["Nothing Retrieved"].astype(bool)

# Rows and metric columns are selected in a single .loc call rather than via
# chained indexing. The flag column is constant inside each subset, so it is
# dropped from both.
df_answered = evaluation_results.loc[~df_answered_mask, METRIC_COLUMNS].drop(columns="Nothing Retrieved")
df_not_answered = evaluation_results.loc[df_answered_mask, METRIC_COLUMNS].drop(columns="Nothing Retrieved")

In [8]:
# Mean of every metric over the questions that did receive an answer
# (the "Question" identifier column is excluded from the average).
print("Average Results for Answered Questions:")
df_answered.drop("Question", axis="columns").mean()

Average Results for Answered Questions:


Context Precision      0.854870
Context Recall         0.663293
Precision              0.823918
Recall                 0.308831
Faithfulness           0.898853
Response Relevancy     0.727932
Factual Correctness    0.463400
Semantic Similarity    0.690891
Best Ranking Value     0.797763
dtype: float64

In [9]:
# Mean of every metric over the questions for which nothing was retrieved
# (the "Question" identifier column is excluded from the average).
print("Average Results for Non-Answered Questions:")
df_not_answered.drop("Question", axis="columns").mean()

Average Results for Non-Answered Questions:


Context Precision      0.000000
Context Recall         0.000000
Precision              0.000000
Recall                 0.000000
Faithfulness           0.000000
Response Relevancy     0.000000
Factual Correctness    0.000000
Semantic Similarity    0.000000
Best Ranking Value     0.149704
dtype: float64

In [10]:
# Distinct identifiers of the questions that were not answered
not_answered_questions = df_not_answered["Question"].unique()
print(f"In total {len(not_answered_questions)} questions were not answered.")

In total 78 questions were not answered.


In [11]:
# For every non-answered question, ask the vector DB helper whether a document
# exists for it and sort the question into one of two buckets:
#   - not_presented_in_db: the helper reports no such document
#   - not_retrieved:       the document exists (but the question was not answered)
not_presented_in_db, not_retrieved = [], []

for q_id in not_answered_questions:
    # exactly one existence check per question; its result picks the bucket
    bucket = not_retrieved if check_document_existance(doc_name=q_id) else not_presented_in_db
    bucket.append(q_id)

print("Not presented in the database:", len(not_presented_in_db))
print("Not retrieved from the database:", len(not_retrieved))

Not presented in the database: 0
Not retrieved from the database: 78


In [15]:
# Check document-ranking performance for documents which exist in the DB but
# were not retrieved.
#
# A separate question-indexed Series is built instead of overwriting
# `evaluation_results.index`: mutating the shared frame in place would leave
# "Question" as both an index level and a column, which makes earlier cells
# (e.g. the merge on "Question") fail with an ambiguity error when re-run.
best_rank_by_question = evaluation_results.set_index("Question")["Best Ranking Value"]
bad_ranks = best_rank_by_question.loc[not_retrieved]

# Questions whose best ranking value exceeded MIN_RERANKING_SCORE but that still
# ended up without an answer are reported as response-generation errors.
# Comparisons with NaN are False, so missing ranking values are excluded.
llm_response_generation_errors = bad_ranks[bad_ranks > MIN_RERANKING_SCORE]
print("Errors in response generation:", list(llm_response_generation_errors.index))

Errors in response generation: []
