In [10]:
import pandas as pd
import json
import os
import sys

sys.path.append("../")

from utils.parse_results import get_acc_df

In [17]:
INPUT_PATH = "../result/"
OUTPUT_PATH = "../score/"

entries = os.scandir(OUTPUT_PATH)
subdirs = [entry.path for entry in entries if entry.is_dir()]
acc_df_list = []
model_names = [subdir.split(OUTPUT_PATH)[1] for subdir in subdirs]
model_names

['gpt-4o-2024-05-13',
 'pt-research-llama-3-70b-instruct-generic-oai-compatible-model',
 'with-bfcl-parsing-temp0_7-pt-research-dbrx-instruct',
 'gpt-3.5-turbo-0125',
 'gpt-4-0125-preview-FC',
 'with-bfcl-parsing-pt-research-dbrx-instruct',
 'agent-flan-llama3-8b-lr2e-7-2ep-generic-oai-compatible-model',
 'gpt-3.5-turbo-0125-FC',
 'gpt-4-0125-preview',
 'databricks-dbrx-instruct-generic-oai-compatible-model-FC',
 'gpt-4-turbo-2024-04-09',
 'pt-research-llama-3-8b-instruct-generic-oai-compatible-model',
 'noparse-pt-research-dbrx-instruct-generic-oai-compatible-model']

In [13]:
model_name = 'databricks-dbrx-instruct-generic-oai-compatible-model-FC'

In [14]:
def get_results_df(models=['databricks-dbrx-instruct-old-run']):
    error_results_df_list = []
    full_results_df_list = []
    acc_dict = {'model': [], 'filename': [], 'accuracy': [], 'correct_count': [], 'total_count': []}
    for model in models:
        # model = 'gpt-3.5-turbo-0125'
        results_dir = f'../score/{model}'
        json_files = [f'{results_dir}/{f}' for f in os.listdir(results_dir) if f.endswith('.json')]
        for filename in json_files:
            with open(filename, 'r') as f:
                try:
                    data = [json.loads(line) for line in f.readlines()]
                    # skip the accuracy line
                    df = pd.DataFrame(data[1:])
                    df['filename'] = filename.split('/')[-1]
                    error_results_df_list.append(df)
                    # parse out accuracy_info
                    acc_info = data[0]
                    acc_dict['filename'].append(filename.split('/')[-1])
                    acc_info['model'] = model
                    for key in acc_info.keys():
                        acc_dict[key].append(acc_info[key])
                except Exception as e:
                    print(f'Error reading {filename}: {e}')
                    
    # now read full results
    for model in models:
        results_dir = f'../result/{model}'
        json_files = [f'{results_dir}/{f}' for f in os.listdir(results_dir) if f.endswith('.json')]
        for filename in json_files:
            with open(filename, 'r') as f:
                try:
                    data = [json.loads(line) for line in f.readlines()]
                    df = pd.DataFrame(data)
                    df['filename'] = filename.split('/')[-1]
                    df['model_name'] = model
                    full_results_df_list.append(df)
                except Exception as e:
                    print(f'Error reading {filename}: {e}')

    acc_df = pd.DataFrame(acc_dict)
    acc_df['metric'] = acc_df['filename'].apply(lambda x: x.split('/')[-1].split('.')[0])
    error_result_df = pd.concat(error_results_df_list)
    full_result_df = pd.concat(full_results_df_list)

    for model in acc_df['model'].unique():
        acc = acc_df[acc_df['model'] == model].correct_count.sum() / acc_df[acc_df['model'] == model].total_count.sum()
        print(f'Model: {model} : Acc = {100.0*acc}%')

    return acc_df, error_result_df, full_result_df

In [55]:
# compare DBRX and GPT-4 to see how we can get up there

acc_df, error_result_df, full_result_df = get_results_df(models=[
            'databricks-dbrx-instruct-generic-oai-compatible-model-FC',
            'gpt-4-0125-preview-FC',
                                                                 ])

full_result_df['test_category'] = full_result_df['filename'].apply(lambda x: x.split("gorilla_openfunctions_v1_test_")[1].split("_result.json")[0])

Model: databricks-dbrx-instruct-generic-oai-compatible-model-FC : Acc = 51.61290322580645%
Model: gpt-4-0125-preview-FC : Acc = 87.25806451612902%


In [21]:
error_result_df.head()

Unnamed: 0,id,model_name,test_category,valid,error,error_type,model_result,decoded_result,filename,prompt,model_result_raw,model_result_decoded,possible_answer
0,5,generic-oai-compatible-model-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,"[{'find_roots': '{""a"": 0, ""b"": 1, ""c"": 2}'}]","[{'find_roots': {'a': 0, 'b': 1, 'c': 2}}]",relevance_score.json,,,,
1,8,generic-oai-compatible-model-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,"[{'math_integral_calculator': '{""function"": ""3...",[{'math_integral_calculator': {'function': '3*...,relevance_score.json,,,,
2,12,generic-oai-compatible-model-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,"[{'get_closest_prime': '{""number"": 30, ""skip"":...","[{'get_closest_prime': {'number': 30, 'skip': ...",relevance_score.json,,,,
3,15,generic-oai-compatible-model-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,"[{'calculate_maximum_height': '{""gravity"": 9.8...","[{'calculate_maximum_height': {'gravity': 9.8,...",relevance_score.json,,,,
4,17,generic-oai-compatible-model-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,"[{'calculate_projectile_range': '{""angle"": 45,...","[{'calculate_projectile_range': {'angle': 45, ...",relevance_score.json,,,,


In [58]:
print(error_result_df.test_category.unique())
print(full_result_df.test_category.unique())

['relevance' 'multiple_function' 'parallel_multiple_function'
 'parallel_function' 'simple']
['parallel_function' 'parallel_multiple_function' 'relevance'
 'multiple_function' 'simple']


In [59]:
def print_error_by_id(idx_list, dbrx_errors_df, dbrx_results_df, gpt4_errors_df, gpt4_results_df, category="simple"):
    if not isinstance(idx_list, list):
        idx_list = [idx_list]
    for idx in idx_list:
        print("----------------------------------------------------------------------------")
        dbrx_error = dbrx_errors_df[dbrx_errors_df['id'] == idx]
        gpt4_result = gpt4_results_df[gpt4_results_df['idx'] == idx-1]
        dbrx_result = dbrx_results_df[dbrx_results_df['idx'] == idx-1]
        # need to filter by category because idx is not unique
        dbrx_error = dbrx_error[dbrx_error['test_category'] == category]
        dbrx_result = dbrx_result[dbrx_result['test_category'] == category]
        gpt4_result = gpt4_result[gpt4_result['test_category'] == category]
        print(f"Idx: {dbrx_error.id.values[0]}")
        print(f"Filename: {dbrx_error.filename.values[0]}")
        # print(f"Prompt: {json.dumps(dbrx_error.prompt.item(), indent=2)}")
        print(f"Question: {json.dumps(dbrx_error.prompt.values[0]['question'], indent=2)}")
        print(f"Dbrx (error_df) result_raw: {json.dumps(dbrx_error.model_result_raw.values[0][0], indent=2)}")
        print(f"Dbrx error: {json.dumps(dbrx_error.error.values[0], indent=2)}")

        print(f"GPT4 result_raw: {json.dumps(gpt4_result.result.values[0][0], indent=2)}")
        print("\n----------------------------------------------------------------------------\n")
        # print(f"Dbrx result_raw: {json.dumps(dbrx_result.result.item()[0], indent=2)}")

In [60]:
full_result_df.model_name.unique()

array(['databricks-dbrx-instruct-generic-oai-compatible-model-FC',
       'gpt-4-0125-preview-FC'], dtype=object)

In [61]:
gpt4_results_df = full_result_df[(full_result_df['model_name'] == 'gpt-4-0125-preview-FC')]
gpt4_errors_df = error_result_df[(error_result_df['model_name'] == 'gpt-4-0125-preview-FC')]

dbrx_results_df = full_result_df[(full_result_df['model_name'] == 'databricks-dbrx-instruct-generic-oai-compatible-model-FC')]
dbrx_errors_df = error_result_df[(error_result_df['model_name'] == 'generic-oai-compatible-model-FC')]

# probably wanna do this by category
def get_dbrx_only_error_ids(dbrx_errors_df, gpt4_errors_df, category=None):
    if category:
        df1 = dbrx_errors_df[dbrx_errors_df['test_category'] == category]
        df2 = gpt4_errors_df[gpt4_errors_df['test_category'] == category]
    else:
        df1 = dbrx_errors_df
        df2 = gpt4_errors_df
    dbrx_only_error_ids = list(set(df1.id.unique()) - set(df2.id.unique()))
    return dbrx_only_error_ids

In [62]:
dbrx_errors_df.test_category.unique()

array(['relevance', 'multiple_function', 'parallel_multiple_function',
       'parallel_function', 'simple'], dtype=object)

In [63]:
dbrx_only_simple_error_ids = get_dbrx_only_error_ids(dbrx_errors_df,
                                                     gpt4_errors_df,
                                                     category='simple')

In [64]:
print_error_by_id(dbrx_only_simple_error_ids[0],
                  dbrx_errors_df,
                  dbrx_results_df,
                  gpt4_errors_df,
                  gpt4_results_df,
                  category='simple')

----------------------------------------------------------------------------
Idx: 131
Filename: simple_score.json
Question: "What's the NPV (Net Present Value) of a series of cash flows: [-50000, 10000, 15000, 20000, 25000, 30000] discounted at 8% annually?"
Dbrx (error_df) result_raw: {
  "finance_calculator_npv": "{\"cash_flows\": [10000, 15000, 20000, 25000, 30000], \"discount_rate\": 0.08, \"years\": [1, 2, 3, 4, 5] }"
}
Dbrx error: [
  "Invalid value for parameter 'cash_flows': [10000, 15000, 20000, 25000, 30000]. Expected one of [[-50000, 10000, 15000, 20000, 25000, 30000]]."
]
GPT4 result_raw: {
  "finance_calculator_npv": "{\"cash_flows\":[-50000,10000,15000,20000,25000,30000],\"discount_rate\":0.08}"
}

----------------------------------------------------------------------------



In [65]:
print_error_by_id(dbrx_only_simple_error_ids[1],
                  dbrx_errors_df,
                  dbrx_results_df,
                  gpt4_errors_df,
                  gpt4_results_df,
                  category='simple')

----------------------------------------------------------------------------
Idx: 260
Filename: simple_score.json
Question: "Calculate the total quantity of paint needed to cover a wall of 30 feet by 12 feet using a specific brand that covers 400 square feet per gallon."
Dbrx (error_df) result_raw: {
  "calculate_paint_needed": "{\"coverage_rate\": 400, \"height\": 30, \"length\": 12}"
}
Dbrx error: [
  "Invalid value for parameter 'height': 30. Expected one of [12]."
]
GPT4 result_raw: {
  "calculate_paint_needed": "{\"coverage_rate\":400,\"length\":30,\"height\":12}"
}

----------------------------------------------------------------------------



In [66]:
print_error_by_id(dbrx_only_simple_error_ids[2],
                  dbrx_errors_df,
                  dbrx_results_df,
                  gpt4_errors_df,
                  gpt4_results_df,
                  category='simple')

----------------------------------------------------------------------------
Idx: 4
Filename: simple_score.json
Question: "Find the roots of a quadratic equation with coefficients a=1, b=-3, c=2."
Dbrx (error_df) result_raw: {
  "algebra_quadratic_roots": "{\"a\": 1, \"b\": 3, \"c\": 2}"
}
Dbrx error: [
  "Invalid value for parameter 'b': 3. Expected one of [-3]."
]
GPT4 result_raw: {
  "algebra_quadratic_roots": "{\"a\":1,\"b\":-3,\"c\":2}"
}

----------------------------------------------------------------------------



In [67]:
print_error_by_id(dbrx_only_simple_error_ids[3],
                  dbrx_errors_df,
                  dbrx_results_df,
                  gpt4_errors_df,
                  gpt4_results_df,
                  category='simple')

----------------------------------------------------------------------------
Idx: 133
Filename: simple_score.json
Question: "Calculate the company's return on equity given its net income of $2,000,000, shareholder's equity of $10,000,000, and dividends paid of $200,000."
Dbrx (error_df) result_raw: {
  "calculate_return_on_equity": "{\"net_income\": 2000000, \"shareholder_equity\": 10000000}"
}
Dbrx error: [
  "Optional parameter 'dividends_paid' not provided and not marked as optional."
]
GPT4 result_raw: {
  "calculate_return_on_equity": "{\"net_income\":2000000,\"shareholder_equity\":10000000,\"dividends_paid\":200000}"
}

----------------------------------------------------------------------------



In [None]:
# TODO: write a function to rerun the example