In [1]:
# %pip install llama-index llama-index-core llama-parse openai llama_index.embeddings.huggingface -q
# %pip install llama-index-llms-anthropic -q
# %pip install llama-index-vector-stores-weaviate -q

In [1]:

COMPARISON_FILE = 'claude-3-5-sonnet-20240620_qa.csv'
PDF_LOCATION = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'
DOC_ID = 'ibis-healthcare-social-assistance'
MODEL_ID = 'gpt-4o-mini'
QUESTION_COL = 'question'
RESPONSE_COL = 'rag_model_response'
NUM_QUESTIONS = -1
PARSER = "claude" # "claude" or "llama-parse"
CHUNK_SIZE = 600
SPLITTER = "sentence"
TOP_K = 3
OUTPUT_FOLDER = f'./rag_outputs/{DOC_ID}'
OUTPUT_FILE = f'{OUTPUT_FOLDER}/output_{MODEL_ID}_{PARSER}_{CHUNK_SIZE}_{SPLITTER}_{TOP_K}.csv'


In [2]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
from dotenv import load_dotenv
load_dotenv("/Users/mbajaj/.env")
# API access to llama-cloud
os.environ["LLAMA_CLOUD_API_KEY"] = os.getenv('LLAMA_CLOUD_API_KEY')
# Using OpenAI API for embeddings/llms
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
os.environ["BRAINTRUST_API_KEY"]=os.getenv('BRAINTRUST_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [3]:
import pandas as pd
df = pd.read_csv(OUTPUT_FILE)
# change the column names of question to input and answer to expected
df.rename(columns={"question": "input", "answer": "expected", "source type": "source_type", "answer type": "answer_type"}, inplace=True)
df.head()



Unnamed: 0,input,expected,source_type,answer_type,page,file,explanation,rag_model_response,references,context
0,What types of enterprises are included in the ...,The Healthcare and Social Assistance sector in...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the scope of the Healthcare and ...,The Healthcare and Social Assistance sector in...,"[5, 53, 10]","[""# 1. About\n\nhttps://my.ibisworld.com/us/en..."
1,How is telemedicine defined in the context of ...,Telemedicine is defined as an application of c...,text,other,5,62 Healthcare and Social Assistance in the US ...,Identifying key technological trends like tele...,Telemedicine is defined as an application of c...,"[21, 5, 26]",['# IBISWorld | Healthcare and Social Assistan...
2,What are Health Insurance Exchanges in the US ...,Health Insurance Exchanges are sets of state-r...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the structure of health insuranc...,Health Insurance Exchanges in the US healthcar...,"[69, 5, 36]","[""# IBISWorld | Healthcare and Social Assistan..."
3,What services are included in the Healthcare a...,The Healthcare and Social Assistance industry ...,text,other,6,62 Healthcare and Social Assistance in the US ...,Understanding the scope of services in the ind...,The Healthcare and Social Assistance industry ...,"[10, 9, 30]",['# IBISWorld | Healthcare and Social Assistan...
4,Who are some of the major companies operating ...,Major companies in the industry include Hca He...,text,other,6,62 Healthcare and Social Assistance in the US ...,Identifying key players helps in understanding...,Some of the major companies operating in the H...,"[53, 11, 30]",['# IBISWorld | Healthcare and Social Assistan...


In [10]:
# select the columns "input" ,"expected", "page", "file", "source type", "answer type" from the dataframe
labelled_df = df[["input", "expected", "page", "file", "source_type", "answer_type", "references", "context", "rag_model_response"]]

# modify the input column to add the prefix row number from the index to the input
labelled_df["input"] = labelled_df.apply(lambda x: f"{x.name}> {x.input}", axis=1)
# convert the labelled dataframe to list of dictionaries
labelled_data = labelled_df.to_dict(orient="records")
labelled_data[:2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labelled_df["input"] = labelled_df.apply(lambda x: f"{x.name}> {x.input}", axis=1)


[{'input': '0> What types of enterprises are included in the Healthcare and Social Assistance sector in the United States?',
  'expected': 'The Healthcare and Social Assistance sector includes hospitals, ambulatory service providers, nursing and residential care facilities, counselors, social workers, family and welfare services, and natural disaster and emergency relief services.',
  'page': 5,
  'file': '62 Healthcare and Social Assistance in the US Industry Report.pdf',
  'source_type': 'text',
  'answer_type': 'other',
  'references': '[5, 53, 10]',
  'context': '["# 1. About\\n\\nhttps://my.ibisworld.com/us/en/industry/62/about\\n\\n## Codes\\n\\n| NAICS 2017 - USA | 62 |\\n|-------------------|-----|\\n| NAICS 2022 - USA | 62 |\\n\\nThe table shows the NAICS (North American Industry Classification System) codes for the Healthcare and Social Assistance sector in the USA. Both the 2017 and 2022 versions of NAICS assign the code 62 to this sector, indicating consistency in classific

In [11]:
# operate on the labelled data and put page, file, source type, answer type in the metadata dictionary inside labelled data

for data in labelled_data:
    data["metadata"] = {
        "page": data["page"],
        "file": data["file"],
        "source_type": data["source_type"],
        "answer_type": data["answer_type"],
        "references": data["references"],
    }
    del data["page"]
    del data["file"]
    del data["source_type"]
    del data["answer_type"]
    del data["references"]
    del data["context"]

In [6]:
labelled_data[:2]

[{'input': '0> What types of enterprises are included in the Healthcare and Social Assistance sector in the United States?',
  'expected': 'The Healthcare and Social Assistance sector includes hospitals, ambulatory service providers, nursing and residential care facilities, counselors, social workers, family and welfare services, and natural disaster and emergency relief services.',
  'rag_model_response': 'The Healthcare and Social Assistance sector in the United States includes enterprises such as hospitals, ambulatory service providers, nursing and residential care facilities, as well as social assistance services like counseling, social work, family and welfare services, and emergency relief services.',
  'metadata': {'page': 5,
   'file': '62 Healthcare and Social Assistance in the US Industry Report.pdf',
   'source_type': 'text',
   'answer_type': 'other',
   'references': '[5, 53, 10]',
   'context': '["# 1. About\\n\\nhttps://my.ibisworld.com/us/en/industry/62/about\\n\\n## Co

In [7]:
from braintrust import Eval
 
from autoevals import Factuality
from autoevals.ragas import *



def my_task(input):
    row_number = int(input.split("> ")[0])
    response = {"answer": labelled_df.loc[row_number, "rag_model_response"], "context": labelled_df.loc[row_number,  "context"]}
    return response
 

my_task("0> What is the name of the organization that provides healthcare and social assistance services to the community?")

  from .autonotebook import tqdm as notebook_tqdm


{'answer': 'The Healthcare and Social Assistance sector in the United States includes enterprises such as hospitals, ambulatory service providers, nursing and residential care facilities, as well as social assistance services like counseling, social work, family and welfare services, and emergency relief services.',
 'context': '["# 1. About\\n\\nhttps://my.ibisworld.com/us/en/industry/62/about\\n\\n## Codes\\n\\n| NAICS 2017 - USA | 62 |\\n|-------------------|-----|\\n| NAICS 2022 - USA | 62 |\\n\\nThe table shows the NAICS (North American Industry Classification System) codes for the Healthcare and Social Assistance sector in the USA. Both the 2017 and 2022 versions of NAICS assign the code 62 to this sector, indicating consistency in classification over time.\\n\\n## Definition\\n\\nThe Healthcare and Social Assistance sector is composed of enterprises that provide healthcare and social assistance for individuals in the United States, including hospitals, ambulatory service provide

In [14]:
from braintrust import Eval

from autoevals import AnswerCorrectness, ContextRecall
from autoevals import Factuality
from autoevals.ragas import *
# BRAINTRUST_MODEL = "claude-3-5-sonnet-20240620"
BRAINTRUST_MODEL = "gpt-4o-mini"


# Wrap ContextRecall() to propagate along the "answer" and "context" values separately
async def context_recall(output, **kwargs):
    return await ContextRecall(model=BRAINTRUST_MODEL).eval_async(output=output["answer"], context=output["context"], **kwargs)

async def answer_correctness(output, **kwargs):
    return await AnswerCorrectness(model=BRAINTRUST_MODEL).eval_async(output=output["answer"], **kwargs)

async def factuality(output, **kwargs):
    return await Factuality().eval_async(output=output["answer"], **kwargs)

async def answer_similarity(output, **kwargs):
    return await AnswerSimilarity().eval_async(output=output["answer"], **kwargs)


eval_result = await Eval(
    name="RagMetrics",
    experiment_name=f"RAG_querymodel:{MODEL_ID}_Braintrust:{BRAINTRUST_MODEL}_parser:{PARSER}_chunksize:{CHUNK_SIZE}_split:{SPLITTER}_topk:{TOP_K}_N:{NUM_QUESTIONS}",
    data=labelled_data,
    task=my_task,
    scores=[context_recall, answer_correctness, factuality, answer_similarity],
    metadata=dict(model=MODEL_ID, topk=TOP_K, parser=PARSER, chunksize=CHUNK_SIZE, split=SPLITTER, braintrust_model=BRAINTRUST_MODEL, num_questions=NUM_QUESTIONS),
)

Experiment RAG_querymodel:gpt-4o-mini_Braintrust:gpt-4o-mini_parser:claude_chunksize:600_split:sentence_topk:3_N:-1-add7bb39 is running at https://www.braintrust.dev/app/Omega/p/RagMetrics/experiments/RAG_querymodel%3Agpt-4o-mini_Braintrust%3Agpt-4o-mini_parser%3Aclaude_chunksize%3A600_split%3Asentence_topk%3A3_N%3A-1-add7bb39
RagMetrics [experiment_name=RAG_querymodel:gpt-4o-mini_Braintrust:gpt-4o-mini_parser:claude_chunksize:600_split:sentence_topk:3_N:-1] (data): 213it [00:00, 58296.04it/s]
RagMetrics [experiment_name=RAG_querymodel:gpt-4o-mini_Braintrust:gpt-4o-mini_parser:claude_chunksize:600_split:sentence_topk:3_N:-1] (tasks): 100%|██████████| 213/213 [00:15<00:00, 13.63it/s]



RAG_querymodel:gpt-4o-mini_Braintrust:gpt-4o-mini_parser:claude_chunksize:600_split:sentence_topk:3_N:-1-add7bb39 compared to RAG_querymodel:gpt-4o-mini_Braintrust:gpt-4o-mini_parser:claude_chunksize:600_split:sentence_topk:3_N:-1-1d92cf55:
87.40% (-) 'ContextRecall'       score	(0 improvements, 0 regressions)
65.26% (-) 'Factuality'          score	(0 improvements, 0 regressions)
67.43% (-02.61%) 'AnswerCorrectness'   score	(52 improvements, 91 regressions)
80.37% (-) 'EmbeddingSimilarity' score	(0 improvements, 0 regressions)

0.24s (-48.14%) 'duration'         	(213 improvements, 0 regressions)
0tok (-) 'prompt_tokens'    	(0 improvements, 0 regressions)
0tok (-) 'completion_tokens'	(0 improvements, 0 regressions)
0tok (-) 'total_tokens'     	(0 improvements, 0 regressions)

See results for RAG_querymodel:gpt-4o-mini_Braintrust:gpt-4o-mini_parser:claude_chunksize:600_split:sentence_topk:3_N:-1-add7bb39 at https://www.braintrust.dev/app/Omega/p/RagMetrics/experiments/RAG_querymodel%3

In [18]:
# convert labelled data to a dataframe with columns input, expected, metadata
df = pd.DataFrame(labelled_data)
# drop the column rag_model_response
df.drop(columns=["rag_model_response"], inplace=True)

# save the dataframe to a csv file
df.to_csv("./datasets/health_dataset.csv", index=False)

In [19]:
# import braintrust
 
# dataset = braintrust.init_dataset(project="RagMetrics", name="Healthcare and Social Assistance in the US Industry Report")

# for d in labelled_data:
#     dataset.insert(input=d["input"], expected=d["expected"], metadata=d["metadata"])
 
# print(dataset.summarize())