In [1]:
# %pip install llama-index llama-index-core llama-parse openai llama_index.embeddings.huggingface -q
# %pip install llama-index-llms-anthropic -q
# %pip install llama-index-vector-stores-weaviate -q

In [None]:
# Switch read by the evaluation cell: when True it logs against the Braintrust-hosted
# dataset (braintrust.init_dataset / braintrust.init); when False it builds the eval
# records from the output CSV and runs Eval() with my_task.
use_braintrust_dataset = True


In [1]:
# Braintrust project name; passed to init_dataset(project=...) and Eval(name=...) in the evaluation cell.
PROJECT_NAME = "RagMetrics"
# NOTE(review): not referenced in the visible cells; presumably a baseline QA file to compare against (confirm).
COMPARISON_FILE = 'claude-3-5-sonnet-20240620_qa.csv'
# NOTE(review): not referenced in the visible cells; presumably the PDF the RAG pipeline was built on (confirm).
PDF_LOCATION = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'
# Document identifier; names the output folder below.
DOC_ID = 'ibis-healthcare-social-assistance'
# Query-model identifier; embedded in OUTPUT_FILE and in the experiment name/metadata.
MODEL_ID = 'gpt-4o-mini'
# Column names for the question and the RAG response.
# NOTE(review): the visible cells hard-code "question" / "rag_model_response" instead of reading these.
QUESTION_COL = 'question'
RESPONSE_COL = 'rag_model_response'
# Only recorded in the experiment name/metadata in the visible cells.
# -1 presumably means "use all questions" (TODO confirm).
NUM_QUESTIONS = -1
PARSER = "claude" # "claude" or "llama-parse"
# Retrieval settings; in the visible cells these only feed OUTPUT_FILE and the experiment name/metadata.
CHUNK_SIZE = 600
SPLITTER = "sentence"
TOP_K = 3
# Location of the RAG output CSV; OUTPUT_FILE is what the evaluation cell loads with pd.read_csv.
OUTPUT_FOLDER = f'./rag_outputs/{DOC_ID}'
OUTPUT_FILE = f'{OUTPUT_FOLDER}/output_{MODEL_ID}_{PARSER}_{CHUNK_SIZE}_{SPLITTER}_{TOP_K}.csv'


In [2]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
from dotenv import load_dotenv

# Load secrets from the current user's ~/.env rather than one specific user's absolute path,
# so the notebook runs unchanged on another machine or account.
load_dotenv(os.path.expanduser("~/.env"))

# Keys this notebook relies on:
#   LLAMA_CLOUD_API_KEY - API access to llama-cloud
#   OPENAI_API_KEY      - OpenAI API for embeddings/llms
#   BRAINTRUST_API_KEY  - Braintrust logging / evaluation
REQUIRED_ENV_VARS = ("LLAMA_CLOUD_API_KEY", "OPENAI_API_KEY", "BRAINTRUST_API_KEY")

# Previously each key was copied with `os.environ[k] = os.getenv(k)`, which is a no-op when
# the key is set and an opaque "str expected, not NoneType" TypeError when it is not.
# Fail early with a message that names the missing keys instead.
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
        + ". Define them in ~/.env or export them before starting the kernel."
    )

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [None]:
def reformat_data(labelled_data):
    """Fold per-record bookkeeping fields into a nested ``metadata`` dict, in place.

    For every record, the values stored under "page", "file", "source_type",
    "answer_type" and "references" are copied into a new ``record["metadata"]``
    dict and then removed from the top level. The "context" key is removed as
    well and is not kept in the metadata.

    Args:
        labelled_data: iterable of dict records, each containing all six keys.

    Returns:
        The same ``labelled_data`` object, with its records mutated.

    Raises:
        KeyError: if a record is missing any of the keys listed above.
    """
    metadata_fields = ("page", "file", "source_type", "answer_type", "references")

    for record in labelled_data:
        # Build the nested dict first so a missing field fails before anything is removed.
        record["metadata"] = {field: record[field] for field in metadata_fields}
        # Drop the now-duplicated top-level fields, plus "context".
        for field in (*metadata_fields, "context"):
            del record[field]
    return labelled_data

In [None]:
from braintrust import Eval
 
from autoevals import Factuality
from autoevals.ragas import *



def my_task(input):
    """Return the stored RAG answer and context for one evaluation input.

    ``input`` is expected to look like "<row label>> <question>". The text before
    the first "> " is parsed as an integer and used as the row label for lookups
    in the module-level ``labelled_df``.

    Returns:
        dict with "answer" (the row's "rag_model_response" value) and "context"
        (the row's "context" value).

    Raises:
        ValueError: if the prefix before "> " is not an integer.
    """
    row_label_text, _, _ = input.partition("> ")
    row_number = int(row_label_text)
    return {
        "answer": labelled_df.loc[row_number, "rag_model_response"],
        "context": labelled_df.loc[row_number, "context"],
    }
 

# Smoke-test my_task on row 0. `labelled_df` is only created by the evaluation cell further
# down (and only on the CSV path, use_braintrust_dataset = False), so on a fresh
# Restart & Run All this call used to raise NameError. Evaluate it only once the frame
# exists; the conditional expression keeps the result as the cell's displayed output.
my_task("0> What is the name of the organization that provides healthcare and social assistance services to the community?") if "labelled_df" in globals() else None

In [None]:
from braintrust import Eval

from autoevals import AnswerCorrectness, ContextRecall
from autoevals import Factuality
from autoevals.ragas import *
# Judge model for the LLM-based scorers: passed as `model=` by scorer wrappers defined in
# this cell, and embedded in EXPERIMENT_NAME / the Eval metadata by the evaluation cell.
# Alternative judge, kept for quick switching:
# BRAINTRUST_MODEL = "claude-3-5-sonnet-20240620"
BRAINTRUST_MODEL = "gpt-4o-mini"


async def context_recall(output, **kwargs):
    """Score context recall for one task output.

    The task output is a dict holding both the answer and the retrieved context;
    this wrapper unpacks it so ContextRecall receives them as separate
    ``output`` and ``context`` arguments. Remaining keyword arguments are
    forwarded unchanged.

    Returns:
        Whatever ``ContextRecall(...).eval_async`` returns.
    """
    scorer = ContextRecall(model=BRAINTRUST_MODEL)
    result = await scorer.eval_async(
        output=output["answer"],
        context=output["context"],
        **kwargs,
    )
    return result

async def answer_correctness(output, **kwargs):
    """Score answer correctness for one task output.

    Only the "answer" entry of the task-output dict is passed on as ``output``;
    remaining keyword arguments are forwarded unchanged. The scorer is built
    with ``model=BRAINTRUST_MODEL``.

    Returns:
        Whatever ``AnswerCorrectness(...).eval_async`` returns.
    """
    scorer = AnswerCorrectness(model=BRAINTRUST_MODEL)
    result = await scorer.eval_async(output=output["answer"], **kwargs)
    return result

async def factuality(output, **kwargs):
    """Score factuality for one task output using the configured judge model.

    Only the "answer" entry of the task-output dict is passed on as ``output``;
    remaining keyword arguments are forwarded unchanged.

    Returns:
        Whatever ``Factuality(...).eval_async`` returns.
    """
    # Pass BRAINTRUST_MODEL like context_recall and answer_correctness do. Without it,
    # Factuality ran on its library default while the experiment name/metadata claimed
    # BRAINTRUST_MODEL was the judge.
    scorer = Factuality(model=BRAINTRUST_MODEL)
    return await scorer.eval_async(output=output["answer"], **kwargs)

async def answer_similarity(output, **kwargs):
    """Score answer similarity for one task output.

    Only the "answer" entry of the task-output dict is passed on as ``output``;
    remaining keyword arguments are forwarded unchanged. AnswerSimilarity is
    constructed with its defaults (no model override).

    Returns:
        Whatever ``AnswerSimilarity().eval_async`` returns.
    """
    scorer = AnswerSimilarity()
    result = await scorer.eval_async(output=output["answer"], **kwargs)
    return result



In [3]:
import pandas as pd
EXPERIMENT_NAME = f"RAG_querymodel:{MODEL_ID}_Braintrust:{BRAINTRUST_MODEL}_parser:{PARSER}_chunksize:{CHUNK_SIZE}_split:{SPLITTER}_topk:{TOP_K}_N:{NUM_QUESTIONS}"
df = pd.read_csv(OUTPUT_FILE)

if use_braintrust_dataset:
    df.rename(columns={"source type": "source_type", "answer type": "answer_type"}, inplace=True)
    # make dictionary from the id to dictionary containing {answer: rag_model_response, context: context} and metadata dictonary as {page: page, file: file, source_type: source_type, answer_type: answer_type}
    labelled_data = {}
    for index, row in df.iterrows():
        labelled_data[row['id']] = {"answer": row['rag_model_response'], "context": row['context'], "metadata": {"page": row['page'], "file": row['file'], "source_type": row['source_type'], "answer_type": row['answer_type'], "references": row['references']}}
    dataset = braintrust.init_dataset(project=PROJECT_NAME, name="DOC_ID")
    experiment = braintrust.init(project=PROJECT, experiment=EXPERIMENT_NAME, dataset=dataset)
    for row in dataset:
        output = labelled_data[row["id"]]
        experiment.log(
            input=row["input"],
            output=output,
            expected=row["expected"],
            scores=[context_recall, answer_correctness, factuality, answer_similarity],
            dataset_record_id=row["id"],
        )
    print(experiment.summarize())

else:
    # change the column names of question to input and answer to expected
    df.rename(columns={"question": "input", "answer": "expected", "source type": "source_type", "answer type": "answer_type"}, inplace=True)
    # select the columns "input" ,"expected", "page", "file", "source type", "answer type" from the dataframe
    labelled_df = df[["input", "expected", "page", "file", "source_type", "answer_type", "references", "context", "rag_model_response"]]

    # modify the input column to add the prefix row number from the index to the input
    labelled_df["input"] = labelled_df.apply(lambda x: f"{x.name}> {x.input}", axis=1)
    # convert the labelled dataframe to list of dictionaries
    labelled_data = labelled_df.to_dict(orient="records")
    labelled_data = reformat_data(labelled_data)
    
    eval_result = await Eval(
    name=PROJECT_NAME,
    experiment_name=EXPERIMENT_NAME,
    data=labelled_data,
    task=my_task,
    scores=[context_recall, answer_correctness, factuality, answer_similarity],
    metadata=dict(model=MODEL_ID, topk=TOP_K, parser=PARSER, chunksize=CHUNK_SIZE, split=SPLITTER, braintrust_model=BRAINTRUST_MODEL, num_questions=NUM_QUESTIONS),
    )



Unnamed: 0,input,expected,source_type,answer_type,page,file,explanation,rag_model_response,references,context
0,What types of enterprises are included in the ...,The Healthcare and Social Assistance sector in...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the scope of the Healthcare and ...,The Healthcare and Social Assistance sector in...,"[5, 53, 10]","[""# 1. About\n\nhttps://my.ibisworld.com/us/en..."
1,How is telemedicine defined in the context of ...,Telemedicine is defined as an application of c...,text,other,5,62 Healthcare and Social Assistance in the US ...,Identifying key technological trends like tele...,Telemedicine is defined as an application of c...,"[21, 5, 26]",['# IBISWorld | Healthcare and Social Assistan...
2,What are Health Insurance Exchanges in the US ...,Health Insurance Exchanges are sets of state-r...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the structure of health insuranc...,Health Insurance Exchanges in the US healthcar...,"[69, 5, 36]","[""# IBISWorld | Healthcare and Social Assistan..."
3,What services are included in the Healthcare a...,The Healthcare and Social Assistance industry ...,text,other,6,62 Healthcare and Social Assistance in the US ...,Understanding the scope of services in the ind...,The Healthcare and Social Assistance industry ...,"[10, 9, 30]",['# IBISWorld | Healthcare and Social Assistan...
4,Who are some of the major companies operating ...,Major companies in the industry include Hca He...,text,other,6,62 Healthcare and Social Assistance in the US ...,Identifying key players helps in understanding...,Some of the major companies operating in the H...,"[53, 11, 30]",['# IBISWorld | Healthcare and Social Assistan...
