In [14]:
# Run configuration for this notebook.
#   qa_model        - model name recorded in the dataset description
#   DOC_ID          - used as the Braintrust dataset name
#   PDF_LOCATION    - source PDF path recorded in the dataset description
#   COMPARISON_FILE - CSV of question/answer pairs that gets loaded and uploaded
qa_model = "claude-3-5-sonnet-20240620"
DOC_ID = "ibis-healthcare-social-assistance"
PDF_LOCATION = (
    "IndustrySource/Misc/"
    "62 Healthcare and Social Assistance in the US Industry Report.pdf"
)
COMPARISON_FILE = "claude-3-5-sonnet-20240620_qa.csv"


In [2]:
# llama-parse is async-first; running the async code in a notebook requires the use of nest_asyncio
import os

import nest_asyncio
from dotenv import load_dotenv

nest_asyncio.apply()

# NOTE(review): absolute, user-specific path - this cell only finds the .env file
# on a machine that has it at this exact location. Consider making it configurable.
load_dotenv("/Users/mbajaj/.env")

# The previous `os.environ[KEY] = os.getenv(KEY)` was a no-op when the key was set
# and raised an opaque TypeError when it was not (os.environ only accepts str values).
# Fail early with an actionable message instead.
if not os.getenv("BRAINTRUST_API_KEY"):
    raise RuntimeError(
        "BRAINTRUST_API_KEY is not set; add it to the .env file or export it "
        "before running this notebook."
    )

In [3]:
# Load the question/answer CSV named by COMPARISON_FILE and preview the first rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer=COMPARISON_FILE)
df.head(n=5)

Unnamed: 0,question,answer,source type,answer type,page,file,explanation
0,What types of enterprises are included in the ...,The Healthcare and Social Assistance sector in...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the scope of the Healthcare and ...
1,How is telemedicine defined in the context of ...,Telemedicine is defined as an application of c...,text,other,5,62 Healthcare and Social Assistance in the US ...,Identifying key technological trends like tele...
2,What are Health Insurance Exchanges in the US ...,Health Insurance Exchanges are sets of state-r...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the structure of health insuranc...
3,What services are included in the Healthcare a...,The Healthcare and Social Assistance industry ...,text,other,6,62 Healthcare and Social Assistance in the US ...,Understanding the scope of services in the ind...
4,Who are some of the major companies operating ...,Major companies in the industry include Hca He...,text,other,6,62 Healthcare and Social Assistance in the US ...,Identifying key players helps in understanding...


In [4]:
# Rename the CSV columns to the names used by the later cells: "input"/"expected"
# for the question/answer pair and snake_case names for the metadata columns.
# Rebinding `df` (rather than inplace=True) avoids mutating in place the frame
# that the previous cell displayed.
df = df.rename(
    columns={
        "question": "input",
        "answer": "expected",
        "source type": "source_type",
        "answer type": "answer_type",
    }
)
df.head()

Unnamed: 0,input,expected,source_type,answer_type,page,file,explanation
0,What types of enterprises are included in the ...,The Healthcare and Social Assistance sector in...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the scope of the Healthcare and ...
1,How is telemedicine defined in the context of ...,Telemedicine is defined as an application of c...,text,other,5,62 Healthcare and Social Assistance in the US ...,Identifying key technological trends like tele...
2,What are Health Insurance Exchanges in the US ...,Health Insurance Exchanges are sets of state-r...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the structure of health insuranc...
3,What services are included in the Healthcare a...,The Healthcare and Social Assistance industry ...,text,other,6,62 Healthcare and Social Assistance in the US ...,Understanding the scope of services in the ind...
4,Who are some of the major companies operating ...,Major companies in the industry include Hca He...,text,other,6,62 Healthcare and Social Assistance in the US ...,Identifying key players helps in understanding...


In [8]:
# Keep only the columns used downstream: the question ("input"), the reference
# answer ("expected") and the fields that later become per-record metadata.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
processed_df = df[
    ["input", "expected", "page", "file", "source_type", "answer_type", "explanation"]
].copy()

# Prefix each question with its row index label, e.g. "0> What types of ...".
# A plain comprehension yields the same strings as a row-wise apply without
# building a Series per row.
processed_df["input"] = [
    f"{row_label}> {question}"
    for row_label, question in zip(processed_df.index, processed_df["input"])
]

# Convert the frame to a list of dictionaries (one per row) and preview two of them.
processed_data = processed_df.to_dict(orient="records")
processed_data[:2]

[{'input': '0> What types of enterprises are included in the Healthcare and Social Assistance sector in the United States?',
  'expected': 'The Healthcare and Social Assistance sector includes hospitals, ambulatory service providers, nursing and residential care facilities, counselors, social workers, family and welfare services, and natural disaster and emergency relief services.',
  'page': 5,
  'file': '62 Healthcare and Social Assistance in the US Industry Report.pdf',
  'source_type': 'text',
  'answer_type': 'other',
  'explanation': 'Understanding the scope of the Healthcare and Social Assistance sector is crucial for market analysis and identifying potential opportunities or challenges.'},
 {'input': '1> How is telemedicine defined in the context of the Healthcare and Social Assistance industry?',
  'expected': 'Telemedicine is defined as an application of clinical medicine whereby medical information is transferred through interactive audiovisual media for the purpose of consu

In [9]:
# Move the descriptive fields of every record into a nested "metadata" dictionary,
# leaving only "input", "expected" and "metadata" at the top level.
METADATA_FIELDS = ("page", "file", "source_type", "answer_type", "explanation")

for record in processed_data:
    # Records are modified in place; skip the ones a previous run of this cell
    # already converted so that re-running does not raise KeyError on "page".
    if "metadata" in record:
        continue
    # Build the nested dict first so a missing field fails before anything is removed.
    metadata = {field: record[field] for field in METADATA_FIELDS}
    for field in METADATA_FIELDS:
        del record[field]
    record["metadata"] = metadata

In [10]:
# Preview the first two records after the metadata fields were nested.
processed_data[0:2]

[{'input': '0> What types of enterprises are included in the Healthcare and Social Assistance sector in the United States?',
  'expected': 'The Healthcare and Social Assistance sector includes hospitals, ambulatory service providers, nursing and residential care facilities, counselors, social workers, family and welfare services, and natural disaster and emergency relief services.',
  'metadata': {'page': 5,
   'file': '62 Healthcare and Social Assistance in the US Industry Report.pdf',
   'source_type': 'text',
   'answer_type': 'other',
   'explanation': 'Understanding the scope of the Healthcare and Social Assistance sector is crucial for market analysis and identifying potential opportunities or challenges.'}},
 {'input': '1> How is telemedicine defined in the context of the Healthcare and Social Assistance industry?',
  'expected': 'Telemedicine is defined as an application of clinical medicine whereby medical information is transferred through interactive audiovisual media for th

In [16]:
import braintrust

# Initialise the Braintrust dataset for this document in the "RagMetrics" project.
# The description records which PDF, QA model and QA file the records came from.
dataset_description = f"PDF: {PDF_LOCATION}, QA Model: {qa_model}, QA File: {COMPARISON_FILE}"
dataset = braintrust.init_dataset(
    project="RagMetrics",
    name=DOC_ID,
    description=dataset_description,
)

# Insert one dataset row per prepared record.
# NOTE(review): no explicit record id is passed - confirm whether re-running this
# cell updates the existing rows or appends duplicates.
for record in processed_data:
    dataset.insert(
        input=record["input"],
        expected=record["expected"],
        metadata=record["metadata"],
    )

print(dataset.summarize())


Total records: 213 (213 new or updated records)
See results for all datasets in RagMetrics at https://www.braintrust.dev/app/Omega/p/RagMetrics
See results for ibis-healthcare-social-assistance at https://www.braintrust.dev/app/Omega/p/RagMetrics/datasets/ibis-healthcare-social-assistance
