In [1]:
# %pip install llama-index llama-index-core llama-parse openai llama_index.embeddings.huggingface -q
# %pip install llama-index-llms-anthropic -q
# %pip install llama-index-vector-stores-weaviate -q

In [54]:
# Selects the data source for the Eval cell further down: when True, the
# Braintrust-hosted dataset (`init_dataset`) is scored via `my_task_braintrust`;
# when False, rows of the local OUTPUT_FILE CSV are scored via `my_task`.
use_braintrust_dataset = False


In [55]:
# --- Project / document identity ---
PROJECT_NAME = "RagMetrics"
DOC_ID = "ibis-healthcare-social-assistance"
PDF_LOCATION = "IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf"
COMPARISON_FILE = "claude-3-5-sonnet-20240620_qa.csv"

# --- Column names ---
QUESTION_COL = "question"
RESPONSE_COL = "rag_model_response"

# --- Pipeline settings (all of these end up in the output file name) ---
MODEL_ID = "gpt-4o-mini"
PARSER = "claude"  # "claude" or "llama-parse"
CHUNK_SIZE = 600
SPLITTER = "custom_tree"  # alternative: "sentence"
TOP_K = 3
NUM_QUESTIONS = -1  # NOTE(review): presumably -1 means "all questions" -- confirm

# --- Where the RAG outputs for this configuration live ---
OUTPUT_FOLDER = f"./rag_outputs/{DOC_ID}"
OUTPUT_FILE = f"{OUTPUT_FOLDER}/output_{MODEL_ID}_{PARSER}_{CHUNK_SIZE}_{SPLITTER}_{TOP_K}.csv"


In [56]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
from dotenv import load_dotenv

# Read secrets from the current user's ~/.env instead of one developer's
# absolute home directory, so the notebook is runnable on other machines.
load_dotenv(os.path.expanduser("~/.env"))

# Keys this notebook expects in the environment:
#   LLAMA_CLOUD_API_KEY - API access to llama-cloud
#   OPENAI_API_KEY      - OpenAI API for embeddings/llms
#   BRAINTRUST_API_KEY  - Braintrust
# The previous `os.environ[k] = os.getenv(k)` round-trips changed nothing when a
# key was set and raised an opaque TypeError when it was not; fail with a clear
# message that names the missing keys instead.
_REQUIRED_ENV_KEYS = ("LLAMA_CLOUD_API_KEY", "OPENAI_API_KEY", "BRAINTRUST_API_KEY")
_missing_env_keys = [key for key in _REQUIRED_ENV_KEYS if os.getenv(key) is None]
if _missing_env_keys:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_env_keys)
    )

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [57]:
def reformat_data(labelled_data):
    """Nest the bookkeeping fields of every record under a ``metadata`` key.

    For each dict in ``labelled_data`` the values of ``page``, ``file``,
    ``source_type``, ``answer_type`` and ``references`` are copied into a new
    ``record["metadata"]`` dict and then removed from the top level; the
    top-level ``context`` entry is removed as well.

    The records are modified in place and the same list object is returned.
    Raises ``KeyError`` if a record lacks any of the keys named above.
    """
    metadata_fields = ("page", "file", "source_type", "answer_type", "references")

    for record in labelled_data:
        # Build the nested dict first so nothing is removed if a field is missing.
        record["metadata"] = {field: record[field] for field in metadata_fields}
        for stale_key in (*metadata_fields, "context"):
            del record[stale_key]
    return labelled_data

In [58]:
from braintrust import Eval
 
from autoevals import Factuality
from autoevals.ragas import *



def my_task(input):
    """Return the stored RAG output for one evaluation input.

    The text before the first ``"> "`` in ``input`` is parsed as an integer and
    used as a row label into the global ``labelled_df``.

    Returns a dict with the keys ``answer`` (column ``rag_model_response``),
    ``context`` and ``references`` read from that row.
    """
    row_label = int(input.partition("> ")[0])
    source_columns = {
        "answer": "rag_model_response",
        "context": "context",
        "references": "references",
    }
    return {key: labelled_df.loc[row_label, column] for key, column in source_columns.items()}
 
def my_task_braintrust(input):
    """Return the stored RAG output for one Braintrust dataset input.

    The integer before the first ``"> "`` in ``input`` is translated through the
    global ``idx_to_row_number`` mapping into a row label of the global
    ``labelled_df``.

    Returns a dict with the keys ``answer`` (column ``rag_model_response``),
    ``context`` and ``references`` read from that row.
    """
    dataset_idx = int(input.partition("> ")[0])
    row_label = idx_to_row_number[dataset_idx]
    source_columns = {
        "answer": "rag_model_response",
        "context": "context",
        "references": "references",
    }
    return {key: labelled_df.loc[row_label, column] for key, column in source_columns.items()}

# my_task("0> What is the name of the organization that provides healthcare and social assistance services to the community?")

In [59]:
from braintrust import Eval

from autoevals import AnswerCorrectness, ContextRecall
from autoevals import Factuality
from autoevals.ragas import *
# Judge model name: passed as `model=` to the ContextRecall and AnswerCorrectness
# scorers defined in this cell, and recorded in the experiment name and metadata.
# Alternative: BRAINTRUST_MODEL = "claude-3-5-sonnet-20240620"
BRAINTRUST_MODEL = "gpt-4o"


# The task output is a dict, so ContextRecall is handed its "answer" and
# "context" entries as two separate keyword arguments.
async def context_recall(output, **kwargs):
    """Score one task output with ``ContextRecall(model=BRAINTRUST_MODEL)``.

    ``output["answer"]`` is forwarded as ``output`` and ``output["context"]`` as
    ``context``; all other keyword arguments are passed through unchanged.
    Returns whatever ``eval_async`` returns.
    """
    scorer = ContextRecall(model=BRAINTRUST_MODEL)
    result = await scorer.eval_async(
        output=output["answer"],
        context=output["context"],
        **kwargs,
    )
    return result

async def answer_correctness(output, **kwargs):
    """Score one task output with ``AnswerCorrectness(model=BRAINTRUST_MODEL)``.

    Only ``output["answer"]`` is forwarded as ``output``; all other keyword
    arguments are passed through unchanged. Returns whatever ``eval_async``
    returns.
    """
    scorer = AnswerCorrectness(model=BRAINTRUST_MODEL)
    result = await scorer.eval_async(output=output["answer"], **kwargs)
    return result

async def factuality(output, **kwargs):
    """Score one task output with ``Factuality(model=BRAINTRUST_MODEL)``.

    Only ``output["answer"]`` is forwarded as ``output``; all other keyword
    arguments are passed through unchanged. Returns whatever ``eval_async``
    returns.

    The judge model is passed explicitly so this score comes from the model
    that the experiment name and metadata record as ``BRAINTRUST_MODEL``,
    instead of whatever the library default happens to be.
    """
    return await Factuality(model=BRAINTRUST_MODEL).eval_async(output=output["answer"], **kwargs)

async def answer_similarity(output, **kwargs):
    """Score one task output with a default-constructed ``AnswerSimilarity``.

    Only ``output["answer"]`` is forwarded as ``output``; all other keyword
    arguments are passed through unchanged. Returns whatever ``eval_async``
    returns.
    """
    scorer = AnswerSimilarity()
    result = await scorer.eval_async(output=output["answer"], **kwargs)
    return result



In [60]:
# Inspect the first evaluation record.
# NOTE(review): in the cells visible here, `labelled_data` is only assigned
# further down the notebook, so on a fresh kernel this cell has to be run after
# one of those cells.
labelled_data[0]

{'input': '57> What factors influence the financial health of healthcare facilities in the US?',
 'expected': 'The financial health of healthcare facilities relies heavily on reimbursements from public and private insurance programs. Changes in these reimbursements, determined by various competing factors, can occur annually and produce significant consequences for healthcare and social assistance facilities. Additionally, pricing pressures from public and private payors can cause volatility in the sector.',
 'references': '[22, 74, 22]',
 'context': '["\\n# Healthcare and Social Assistance in the US\\n## Performance\\n### What influences industry volatility?\\n#### Healthcare facilities\' financial health relies on reimbursements\\n\\n- Reimbursements from public and private insurance programs influence healthcare providers\' financial stability. Changes in these reimbursements (determined by various competing factors) can occur annually and produce significant consequences for health

In [50]:
# df = pd.read_csv(OUTPUT_FILE)

# labelled_df = df[["input", "expected", "references", "context", "rag_model_response", "metadata"]]

# labelled_df["index_key"] = labelled_df.input.apply(lambda x: int(x.split("> ")[0]))
# idx_to_row_number = {idx: row_number for row_number, idx in labelled_df["index_key"].items()}
# print(idx_to_row_number)
# labelled_df.head()


{57: 0, 90: 1, 99: 2, 115: 3, 23: 4, 13: 5, 147: 6, 91: 7, 110: 8, 158: 9, 46: 10, 21: 11, 206: 12, 203: 13, 168: 14, 54: 15, 196: 16, 211: 17, 58: 18, 131: 19, 122: 20, 85: 21, 198: 22, 64: 23, 37: 24, 150: 25, 67: 26, 101: 27, 124: 28, 70: 29, 167: 30, 180: 31, 51: 32, 120: 33, 96: 34, 45: 35, 48: 36, 177: 37, 113: 38, 152: 39, 154: 40, 77: 41, 34: 42, 62: 43, 186: 44, 71: 45, 43: 46, 8: 47, 157: 48, 155: 49, 205: 50, 140: 51, 119: 52, 163: 53, 172: 54, 74: 55, 100: 56, 118: 57, 107: 58, 125: 59, 105: 60, 201: 61, 56: 62, 53: 63, 28: 64, 18: 65, 50: 66, 121: 67, 60: 68, 16: 69, 185: 70, 63: 71, 78: 72, 165: 73, 89: 74, 195: 75, 95: 76, 22: 77, 36: 78, 0: 79, 129: 80, 19: 81, 161: 82, 200: 83, 24: 84, 104: 85, 171: 86, 17: 87, 141: 88, 111: 89, 86: 90, 149: 91, 49: 92, 148: 93, 160: 94, 188: 95, 88: 96, 151: 97, 75: 98, 42: 99, 2: 100, 35: 101, 5: 102, 81: 103, 123: 104, 73: 105, 139: 106, 190: 107, 212: 108, 112: 109, 130: 110, 31: 111, 108: 112, 11: 113, 29: 114, 137: 115, 192: 116,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labelled_df["index_key"] = labelled_df.input.apply(lambda x: int(x.split("> ")[0]))


Unnamed: 0,input,expected,references,context,rag_model_response,metadata,index_key
0,57> What factors influence the financial healt...,The financial health of healthcare facilities ...,"[22, 74, 22]","[""\n# Healthcare and Social Assistance in the ...",The financial health of healthcare facilities ...,{'file': '62 Healthcare and Social Assistance ...,57
1,90> How do Medicaid reimbursements compare to ...,Medicaid reimbursements make up a significantl...,"[35, 68, 35]","[""\n# Healthcare and Social Assistance in the ...",Medicaid reimbursements make up a significantl...,{'file': '62 Healthcare and Social Assistance ...,90
2,99> Which state in the US allocates the most o...,California allocates the most on personal heal...,"[39, 40, 43]","[""\n# Healthcare and Social Assistance in the ...",California allocates the most on personal heal...,{'file': '62 Healthcare and Social Assistance ...,99
3,115> What are the crucial elements for referra...,"Referral networks are crucial, with referrers ...","[44, 34, 22]",['\n# Healthcare and Social Assistance in the ...,Crucial elements for referral networks in the ...,{'file': '62 Healthcare and Social Assistance ...,115
4,23> What is the projected revenue growth rate ...,The sector revenue will grow at a CAGR of 2.7%...,"[15, 25, 9]",['\n# Healthcare and Social Assistance in the ...,The projected revenue growth rate for the Heal...,{'file': '62 Healthcare and Social Assistance ...,23


In [61]:
import pandas as pd
import braintrust
from braintrust import Eval, init_dataset


EXPERIMENT_NAME = f"RAG_querymodel:{MODEL_ID}_Braintrust:{BRAINTRUST_MODEL}_parser:{PARSER}_chunksize:{CHUNK_SIZE}_split:{SPLITTER}_topk:{TOP_K}_N:{NUM_QUESTIONS}"
df = pd.read_csv(OUTPUT_FILE)

if use_braintrust_dataset:
    #TODO : init dataset from braintrust and add the references to output instead of metadata
    
    labelled_df = df[["input", "expected", "references", "context", "rag_model_response", "metadata"]]

    labelled_df["index_key"] = labelled_df.input.apply(lambda x: int(x.split("> ")[0]))
    idx_to_row_number = {idx: row_number for row_number, idx in labelled_df["index_key"].items()}
    
    eval_result = await Eval(
    name=PROJECT_NAME,
    experiment_name=EXPERIMENT_NAME,
    data=init_dataset(project=PROJECT_NAME, name=DOC_ID),
    task=my_task_braintrust,
    scores=[context_recall, answer_correctness, factuality, answer_similarity],
    metadata=dict(model=MODEL_ID, topk=TOP_K, parser=PARSER, chunksize=CHUNK_SIZE, split=SPLITTER, braintrust_model=BRAINTRUST_MODEL, num_questions=NUM_QUESTIONS),
    )

else:
    # change the column names of question to input and answer to expected
    df.rename(columns={"question": "input", "answer": "expected", "source type": "source_type", "answer type": "answer_type"}, inplace=True)
    # select the columns "input" ,"expected", "page", "file", "source type", "answer type" from the dataframe
    labelled_df = df[["input", "expected", "page", "file", "source_type", "answer_type", "references", "context", "rag_model_response"]]

    # modify the input column to add the prefix row number from the index to the input
    labelled_df["input"] = labelled_df.apply(lambda x: f"{x.name}> {x.input}", axis=1)
    # convert the labelled dataframe to list of dictionaries
    labelled_data = labelled_df.to_dict(orient="records")
    labelled_data = reformat_data(labelled_data)
    
    eval_result = await Eval(
    name=PROJECT_NAME,
    experiment_name=EXPERIMENT_NAME,
    data=labelled_data,
    task=my_task,
    scores=[context_recall, answer_correctness, factuality, answer_similarity],
    metadata=dict(model=MODEL_ID, topk=TOP_K, parser=PARSER, chunksize=CHUNK_SIZE, split=SPLITTER, braintrust_model=BRAINTRUST_MODEL, num_questions=NUM_QUESTIONS),
    )



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labelled_df["input"] = labelled_df.apply(lambda x: f"{x.name}> {x.input}", axis=1)
Experiment RAG_querymodel:gpt-4o-mini_Braintrust:gpt-4o_parser:claude_chunksize:600_split:custom_tree_topk:3_N:-1 is running at https://www.braintrust.dev/app/Omega/p/RagMetrics/experiments/RAG_querymodel%3Agpt-4o-mini_Braintrust%3Agpt-4o_parser%3Aclaude_chunksize%3A600_split%3Acustom_tree_topk%3A3_N%3A-1
RagMetrics [experiment_name=RAG_querymodel:gpt-4o-mini_Braintrust:gpt-4o_parser:claude_chunksize:600_split:custom_tree_topk:3_N:-1] (data): 213it [00:00, 11361.76it/s]
RagMetrics [experiment_name=RAG_querymodel:gpt-4o-mini_Braintrust:gpt-4o_parser:claude_chunksize:600_split:custom_tree_topk:3_N:-1] (tasks): 100%|██████████| 213/213 [43:29<00:0

In [25]:
# import pandas as pd
# import braintrust


# EXPERIMENT_NAME = f"RAG_querymodel:{MODEL_ID}_Braintrust:{BRAINTRUST_MODEL}_parser:{PARSER}_chunksize:{CHUNK_SIZE}_split:{SPLITTER}_topk:{TOP_K}_N:{NUM_QUESTIONS}"
# df = pd.read_csv(OUTPUT_FILE)

# if use_braintrust_dataset:
#     df.rename(columns={"source type": "source_type", "answer type": "answer_type"}, inplace=True)
#     # make dictionary from the id to dictionary containing {answer: rag_model_response, context: context} and metadata dictonary as {page: page, file: file, source_type: source_type, answer_type: answer_type}
#     labelled_data = {}
#     for index, row in df.iterrows():
#         labelled_data[row['id']] = {"output": {"answer": row['rag_model_response'], "context": row['context']}, "metadata": row['metadata']}
#     dataset = braintrust.init_dataset(project=PROJECT_NAME, name=DOC_ID)
#     experiment = braintrust.init(project=PROJECT_NAME, experiment=EXPERIMENT_NAME, dataset=dataset)
#     for row in dataset:
#         labelled_row = labelled_data[row["id"]]
#         answer_correctness_score = answer_correctness(row["input"], labelled_row['output'], row["expected"])
#         context_recall_score = context_recall(row["input"], labelled_row['output'], row["expected"])
#         factuality_score = factuality(row["input"], labelled_row['output'], row["expected"])
#         answer_similarity_score = answer_similarity(row["input"], labelled_row['output'], row["expected"])

#         scores = {"context_recall": context_recall_score, "answer_correctness": answer_correctness_score, "factuality": factuality_score, "answer_similarity": answer_similarity_score}
#         experiment.log(
#             input=row["input"],
#             output=labelled_row['output']['answer'],
#             expected=row["expected"],
#             scores=scores,
#             # dataset_record_id=row["id"],
#             metadata=labelled_row["metadata"],
#         )
#     print(experiment.summarize())

# else:
#     # change the column names of question to input and answer to expected
#     df.rename(columns={"question": "input", "answer": "expected", "source type": "source_type", "answer type": "answer_type"}, inplace=True)
#     # select the columns "input" ,"expected", "page", "file", "source type", "answer type" from the dataframe
#     labelled_df = df[["input", "expected", "page", "file", "source_type", "answer_type", "references", "context", "rag_model_response"]]

#     # modify the input column to add the prefix row number from the index to the input
#     labelled_df["input"] = labelled_df.apply(lambda x: f"{x.name}> {x.input}", axis=1)
#     # convert the labelled dataframe to list of dictionaries
#     labelled_data = labelled_df.to_dict(orient="records")
#     labelled_data = reformat_data(labelled_data)
    
#     eval_result = await Eval(
#     name=PROJECT_NAME,
#     experiment_name=EXPERIMENT_NAME,
#     data=labelled_data,
#     task=my_task,
#     scores=[context_recall, answer_correctness, factuality, answer_similarity],
#     metadata=dict(model=MODEL_ID, topk=TOP_K, parser=PARSER, chunksize=CHUNK_SIZE, split=SPLITTER, braintrust_model=BRAINTRUST_MODEL, num_questions=NUM_QUESTIONS),
#     )



Exception: Cannot run toplevel `log` method while using spans. To log to the span, call `experiment.start_span` and then log with `span.log`

In [22]:
# Preview the inputs stored in the Braintrust dataset. Only the first few
# records are printed so the cell output stays short instead of listing every
# record in the dataset.
from itertools import islice

PREVIEW_ROWS = 10
dataset = braintrust.init_dataset(project=PROJECT_NAME, name=DOC_ID)

for row in islice(dataset, PREVIEW_ROWS):
    print(row['input'])

57> What factors influence the financial health of healthcare facilities in the US?
90> How do Medicaid reimbursements compare to Medicare reimbursements in terms of their share of sector revenue?
99> Which state in the US allocates the most on personal healthcare spending nationally?
115> What are the crucial elements for referral networks in the healthcare and social assistance industry?
23> What is the projected revenue growth rate for the Healthcare and Social Assistance sector in the US for the coming years?
13> How does the revenue of ambulatory healthcare services compare to hospitals in the US Healthcare and Social Assistance industry?
147> In which year did the US Healthcare and Social Assistance industry experience its highest growth rate?
91> How did the COVID-19 pandemic affect Medicare and Medicaid funding in the US healthcare system?
110> What demographic factor in Florida is driving higher healthcare utilization?
158> What is the projected revenue for Feeding America in 

In [29]:
# Show the first rows of `df` as this cell's output.
df.head()

Unnamed: 0,id,_xact_id,created,project_id,dataset_id,input,expected,metadata,tags,span_id,root_span_id,is_root,origin,question,rag_model_response,references,context
0,00028ea4-a9e3-4293-ad1c-6e0d5bc2e672,1000193854671299437,2024-10-06T20:42:29.637Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,57> What factors influence the financial healt...,The financial health of healthcare facilities ...,{'file': '62 Healthcare and Social Assistance ...,,7f368491-28bb-45bf-bfbb-7fe16ad3b2b7,7f368491-28bb-45bf-bfbb-7fe16ad3b2b7,True,,What factors influence the financial health o...,The financial health of healthcare facilities ...,"[22, 74, 22]","[""\n# Healthcare and Social Assistance in the ..."
1,01260b71-e5f5-4d2d-9028-8ce3c540f1fe,1000193854671299437,2024-10-06T20:42:29.638Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,90> How do Medicaid reimbursements compare to ...,Medicaid reimbursements make up a significantl...,{'file': '62 Healthcare and Social Assistance ...,,306ffb8d-cf38-40f1-9bac-daa3ad6889b6,306ffb8d-cf38-40f1-9bac-daa3ad6889b6,True,,How do Medicaid reimbursements compare to Med...,Medicaid reimbursements make up a significantl...,"[35, 68, 35]","[""\n# Healthcare and Social Assistance in the ..."
2,01300719-22e4-4033-ba2f-47f61da8a11f,1000193854671299437,2024-10-06T20:42:29.639Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,99> Which state in the US allocates the most o...,California allocates the most on personal heal...,{'file': '62 Healthcare and Social Assistance ...,,537c615b-dcde-47e8-9ae1-632ed227ffc6,537c615b-dcde-47e8-9ae1-632ed227ffc6,True,,Which state in the US allocates the most on p...,California allocates the most on personal heal...,"[39, 40, 43]","[""\n# Healthcare and Social Assistance in the ..."
3,017a3c26-d858-4dc8-8fa3-0353235938c3,1000193854671299436,2024-10-06T20:42:29.639Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,115> What are the crucial elements for referra...,"Referral networks are crucial, with referrers ...",{'file': '62 Healthcare and Social Assistance ...,,3949f686-0c77-4250-863f-4402f396c51a,3949f686-0c77-4250-863f-4402f396c51a,True,,What are the crucial elements for referral ne...,Crucial elements for referral networks in the ...,"[44, 34, 22]",['\n# Healthcare and Social Assistance in the ...
4,03d8756e-f49d-41a8-acf9-48120a60a674,1000193854671299437,2024-10-06T20:42:29.637Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,23> What is the projected revenue growth rate ...,The sector revenue will grow at a CAGR of 2.7%...,{'file': '62 Healthcare and Social Assistance ...,,27d60f2d-ef0e-4a39-9d6c-27d059ab2f97,27d60f2d-ef0e-4a39-9d6c-27d059ab2f97,True,,What is the projected revenue growth rate for...,The projected revenue growth rate for the Heal...,"[15, 25, 9]",['\n# Healthcare and Social Assistance in the ...


In [31]:
# Reload the saved RAG outputs and keep only the columns used for evaluation.
# No renaming or prefixing is done here: the columns are selected under the
# names they already have in the file.
df = pd.read_csv(OUTPUT_FILE)
labelled_df = df[
    [
        "input",
        "expected",
        "references",
        "context",
        "rag_model_response",
        "metadata",
    ]
]

# One dict per row; show the first five as the cell output.
labelled_data = labelled_df.to_dict(orient="records")
labelled_data[:5]

[{'input': '57> What factors influence the financial health of healthcare facilities in the US?',
  'expected': 'The financial health of healthcare facilities relies heavily on reimbursements from public and private insurance programs. Changes in these reimbursements, determined by various competing factors, can occur annually and produce significant consequences for healthcare and social assistance facilities. Additionally, pricing pressures from public and private payors can cause volatility in the sector.',
  'references': '[22, 74, 22]',
  'context': '["\\n# Healthcare and Social Assistance in the US\\n## Performance\\n### What influences industry volatility?\\n#### Healthcare facilities\' financial health relies on reimbursements\\n\\n- Reimbursements from public and private insurance programs influence healthcare providers\' financial stability. Changes in these reimbursements (determined by various competing factors) can occur annually and produce significant consequences for he