# **Test Data Extraction Accuracy**

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed — presumably so that edits to the data_pipeline
# sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### **1. Imports and Config**

In [19]:
import os

from data_pipeline.file_converter import *
from data_pipeline import *

In [None]:
from dotenv import load_dotenv

# Load the plain config first, then the secrets, both relative to the notebook's
# working directory. load_dotenv returns False when it did not set anything
# (for example when the file does not exist), so report that right here instead
# of letting a later os.getenv(...) quietly return None.
for _env_file in ("../config/config.env", "../config/secrets.env"):
    if not load_dotenv(_env_file):
        print(f"Warning: no environment variables were loaded from {_env_file}")

In [4]:
# Title under which the results of this LLM extraction run are recorded
# (passed to process_results in the evaluation section).
title_for_llm_experiment = "Zero shot example"

# Adds a unique id to the html tags extracted from the pdfs; the same flag is
# later handed to the save step as replace_ids.
add_ids_to_tags = True

# Directory holding the test PDFs. Joined with os.path.join instead of
# hard-coded backslashes so the notebook also works on Linux/macOS; on Windows
# the resulting string is identical to the previous literal.
test_dir = os.path.join("..", "data", "test_pdfs")

### **2. Load and Convert Data**

In [None]:
# Collect the file paths of the documents found under test_dir. The result is
# reused by every later stage (conversion, batch creation, saving, evaluation).
filepaths = get_documents_filepaths(test_dir)
# Bare expression as the last line so the notebook displays the collected paths.
filepaths

In [None]:
# Convert every document twice: once to xhtml (optionally with ids added to
# the tags) and once to plain text. overwrite=False is passed on both calls,
# presumably so that already converted files are left as they are.
convert_files(
    filepaths,
    output_type="xhtml",
    overwrite=False,
    add_ids_to_tags=add_ids_to_tags,
)
convert_files(
    filepaths,
    output_type="text",
    overwrite=False,
)

### **3. Extract Agenda Data With LLM**

In [7]:
def _read_text_from_env(var_name):
    """Return the UTF-8 text of the file whose path is stored in env var `var_name`.

    Raises:
        RuntimeError: if the environment variable is unset or empty. Without
            this check, open() would be called with None and fail with an
            error that does not mention which variable is missing.
    """
    path = os.getenv(var_name)
    if not path:
        raise RuntimeError(
            f"Environment variable {var_name!r} is not set; "
            "make sure the load_dotenv cell ran and the env files define it."
        )
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


# import agenda json schema (kept as raw text, not parsed here)
json_schema = _read_text_from_env("AGENDA_JSON_SCHEMA_PATH")

# import agenda prompt
prompt = _read_text_from_env("AGENDA_EXTRACTION_PROMPT_PATH")

In [None]:
# Build the batch file for the agenda extraction from the documents, the prompt
# and the JSON schema. The target path comes from the BATCH_FILE_PATH
# environment variable and overwrite_batch_file=True is passed.
create_batch_file(
    filepaths,
    prompt,
    json_schema,
    batch_file_path=os.getenv("BATCH_FILE_PATH"),
    overwrite_batch_file=True,
)

In [None]:
# Submit the batch file located at BATCH_FILE_PATH (the same path the batch
# file was written to above) and keep the returned id for the status check.
# NOTE(review): os.getenv yields None if BATCH_FILE_PATH is unset — confirm the
# config was loaded before running this cell.
batch_id = submit_batch_job(os.getenv("BATCH_FILE_PATH"))

In [None]:
# Query the status of the submitted batch. The result is used as the id of the
# output file and is treated as "no output available" when it is falsy.
# NOTE(review): presumably this cell has to be re-run until the batch has
# finished — confirm against check_batch_status.
output_file_id = check_batch_status(batch_id)

In [12]:
# Start from a clean slate: without this reset, a result left over from an
# earlier run of this cell would still be bound to output_jsonl when no output
# file id is available, and the save step would store that stale data.
output_jsonl = None

if output_file_id:
    # Download the batch output that belongs to the finished job.
    output_jsonl = retrieve_batch_output(output_file_id)
else:
    print("No output file id found, please check the batch status")

In [14]:
# Save the retrieved batch output for the processed documents. replace_ids
# follows add_ids_to_tags, i.e. the flag that was used when converting to xhtml.
save_agenda_llm_batch_results(
    output_jsonl,
    filepaths,
    replace_ids=add_ids_to_tags,
)

### **4. Evaluations**

In [None]:
# Results file handed to process_results; the run is labelled with
# title_for_llm_experiment.
results_path = "../data/temp/batch_results.json"

average_results = process_results(
    filepaths,
    prompt,
    json_schema,
    title_for_llm_experiment,
    results_path,
)
print(average_results)


In [None]:
# Same results file as in the evaluation cell; it is defined again here, so
# this cell does not depend on that cell having been run.
results_path = "../data/temp/batch_results.json"
visualize_results(results_path)
