In [1]:
# COMPARISON_FILE = 'claude-3-5-sonnet-20240620_qa.csv'

DOC_ID = 'ibis-healthcare-social-assistance'
COMPARISON_FILE = f"./datasets/{DOC_ID}/major_questions.csv"
PDF_LOCATION = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'
qa_model = "claude-3-5-sonnet-20240620"


In [2]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
from dotenv import load_dotenv
load_dotenv("/Users/mbajaj/.env")
os.environ["BRAINTRUST_API_KEY"]=os.getenv('BRAINTRUST_API_KEY')

In [3]:
# read the comparison file
import pandas as pd
df = pd.read_csv(COMPARISON_FILE)
df.head()

Unnamed: 0,Question,Reference,Answer
0,What is the estimated total addressable market...,"Pages 9, 31, 40-42",The total revenue for the Healthcare and Socia...
1,What has been the market's average growth rate...,Page 78,"Annual revenue change percentages: 2019: 3.9%,..."
2,Which specific product categories or geographi...,"Pages 21, 25, 26, 31","Emerging sub-segments include telemedicine, di..."
3,"How are new technologies (e.g., AI, automation...","Pages 21, 25, 32-33",AI is being applied across healthcare faciliti...
4,How have changing macroeconomic indicators (e....,"Pages 20, 21, 26, 36, 67, 74",Rising inflation has increased costs for devic...


In [4]:
# rename the column names to all small case
df.columns = [col.lower() for col in df.columns]
df.head()

Unnamed: 0,question,reference,answer
0,What is the estimated total addressable market...,"Pages 9, 31, 40-42",The total revenue for the Healthcare and Socia...
1,What has been the market's average growth rate...,Page 78,"Annual revenue change percentages: 2019: 3.9%,..."
2,Which specific product categories or geographi...,"Pages 21, 25, 26, 31","Emerging sub-segments include telemedicine, di..."
3,"How are new technologies (e.g., AI, automation...","Pages 21, 25, 32-33",AI is being applied across healthcare faciliti...
4,How have changing macroeconomic indicators (e....,"Pages 20, 21, 26, 36, 67, 74",Rising inflation has increased costs for devic...


In [5]:
# if question column is present, rename it to input
if 'question' in df.columns:
    df.rename(columns={"question": "input"}, inplace=True)

# if answer column is present, rename it to expected
if 'answer' in df.columns:
    df.rename(columns={"answer": "expected"}, inplace=True)

# if source type column is present, rename it to source_type
if 'source type' in df.columns:
    df.rename(columns={"source type": "source_type"}, inplace=True)

# if answer type column is present, rename it to answer_type
if 'answer type' in df.columns:
    df.rename(columns={"answer type": "answer_type"}, inplace=True)

df.head()

Unnamed: 0,input,reference,expected
0,What is the estimated total addressable market...,"Pages 9, 31, 40-42",The total revenue for the Healthcare and Socia...
1,What has been the market's average growth rate...,Page 78,"Annual revenue change percentages: 2019: 3.9%,..."
2,Which specific product categories or geographi...,"Pages 21, 25, 26, 31","Emerging sub-segments include telemedicine, di..."
3,"How are new technologies (e.g., AI, automation...","Pages 21, 25, 32-33",AI is being applied across healthcare faciliti...
4,How have changing macroeconomic indicators (e....,"Pages 20, 21, 26, 36, 67, 74",Rising inflation has increased costs for devic...


In [6]:
# select the columns "input" ,"expected", "page", "file", "source type", "answer type" from the dataframe
# processed_df = df[["input", "expected", "page", "file", "source_type", "answer_type", "explanation"]]

processed_df = df
# modify the input column to add the prefix row number from the index to the input
processed_df["input"] = processed_df.apply(lambda x: f"{x.name}> {x.input}", axis=1)
# convert the labelled dataframe to list of dictionaries
processed_data = processed_df.to_dict(orient="records")
processed_data[:2]

[{'input': '0> What is the estimated total addressable market (TAM) for this sector in 2024, broken down by region and sub-sector?',
  'reference': 'Pages 9, 31, 40-42',
  'expected': 'The total revenue for the Healthcare and Social Assistance sector in the US is $3.6 trillion in 2024. Sub-sectors: Hospitals - 42%, Ambulatory healthcare services - 41%, Nursing and residential care facilities - 9%, Social assistance - 8%.'},
 {'input': "1> What has been the market's average growth rate in each of the past 5 years? Is growth stable, accelerating, or decelerating?",
  'reference': 'Page 78',
  'expected': 'Annual revenue change percentages: 2019: 3.9%, 2020: 0.1%, 2021: 4.9%, 2022: -3.9%, 2023: 2.4%. The average growth rate = 1.48%. CAGR = 0.7%. Growth is volatile, likely due to pandemic effects.'}]

In [7]:
# operate on the labelled data and put all metadata columns (other than input, expected) in the metadata dictionary inside labelled data

def reformat_metadata(df_dict):
    columns = df.columns
    metadata_columns = [col for col in columns if col not in ["input", "expected"]]
    for data in df_dict:
        data["metadata"] = {col: data[col] for col in metadata_columns}
        for col in metadata_columns:
            del data[col]
    return df_dict
    

In [8]:
processed_data = reformat_metadata(processed_data)
processed_data[:2]

[{'input': '0> What is the estimated total addressable market (TAM) for this sector in 2024, broken down by region and sub-sector?',
  'expected': 'The total revenue for the Healthcare and Social Assistance sector in the US is $3.6 trillion in 2024. Sub-sectors: Hospitals - 42%, Ambulatory healthcare services - 41%, Nursing and residential care facilities - 9%, Social assistance - 8%.',
  'metadata': {'reference': 'Pages 9, 31, 40-42'}},
 {'input': "1> What has been the market's average growth rate in each of the past 5 years? Is growth stable, accelerating, or decelerating?",
  'expected': 'Annual revenue change percentages: 2019: 3.9%, 2020: 0.1%, 2021: 4.9%, 2022: -3.9%, 2023: 2.4%. The average growth rate = 1.48%. CAGR = 0.7%. Growth is volatile, likely due to pandemic effects.',
  'metadata': {'reference': 'Page 78'}}]

In [9]:
import braintrust
dataset_name = f"{DOC_ID}/{COMPARISON_FILE}"
dataset = braintrust.init_dataset(project="RagMetrics", name=dataset_name, description=f"PDF: {PDF_LOCATION}, QA Model: {qa_model}, QA File: {COMPARISON_FILE}")

for d in processed_data:
    dataset.insert(input=d["input"], expected=d["expected"], metadata=d["metadata"])
 
print(dataset.summarize())

  from .autonotebook import tqdm as notebook_tqdm



Total records: 17 (17 new or updated records)
See results for all datasets in RagMetrics at https://www.braintrust.dev/app/Omega/p/RagMetrics
See results for ibis-healthcare-social-assistance/./datasets/ibis-healthcare-social-assistance/major_questions.csv at https://www.braintrust.dev/app/Omega/p/RagMetrics/datasets/ibis-healthcare-social-assistance%2F.%2Fdatasets%2Fibis-healthcare-social-assistance%2Fmajor_questions.csv
