In [1]:
import pandas as pd
import json
import os
import numpy as np

In [2]:
# first load prompt results
# {"idx": 3, "result": "[calculate_displacement(initial_velocity=15, acceleration=9.8, time=10)]", "input_token_count": 473, "output_token_count": 19, "latency": 0.47064781188964844}

error_results_df_list = []
full_results_df_list = []
acc_dict = {'model': [], 'filename': [], 'accuracy': [], 'correct_count': [], 'total_count': []}
for model in ['gpt-3.5-turbo-0125', 'gpt-3.5-turbo-0125-FC']:
    # model = 'gpt-3.5-turbo-0125'
    results_dir = f'score/{model}'
    json_files = [f'{results_dir}/{f}' for f in os.listdir(results_dir) if f.endswith('.json')]
    for filename in json_files:
        with open(filename, 'r') as f:
            try:
                data = [json.loads(line) for line in f.readlines()]
                # skip the accuracy line
                df = pd.DataFrame(data[1:])
                df['filename'] = filename.split('/')[-1]
                error_results_df_list.append(df)
                # parse out accuracy_info
                acc_info = data[0]
                acc_dict['filename'].append(filename.split('/')[-1])
                acc_info['model'] = model
                for key in acc_info.keys():
                    acc_dict[key].append(acc_info[key])
            except Exception as e:
                print(f'Error reading {filename}: {e}')

# now read full results
for model in ['gpt-3.5-turbo-0125', 'gpt-3.5-turbo-0125-FC']:
    results_dir = f'result/{model}'
    json_files = [f'{results_dir}/{f}' for f in os.listdir(results_dir) if f.endswith('.json')]
    for filename in json_files:
        with open(filename, 'r') as f:
            try:
                data = [json.loads(line) for line in f.readlines()]
                df = pd.DataFrame(data)
                df['filename'] = filename.split('/')[-1]
                df['model_name'] = model
                full_results_df_list.append(df)
            except Exception as e:
                print(f'Error reading {filename}: {e}')

acc_df = pd.DataFrame(acc_dict)
acc_df['metric'] = acc_df['filename'].apply(lambda x: x.split('/')[-1].split('.')[0])
error_result_df = pd.concat(error_results_df_list)
full_result_df = pd.concat(full_results_df_list)

Error reading score/gpt-3.5-turbo-0125-FC/executable_multiple_function_score_pallavi_annotated.json: Expecting value: line 1 column 1 (char 0)


In [3]:
acc_df[['model', 'filename', 'accuracy']].sort_values(by=['filename', 'model'])

Unnamed: 0,model,filename,accuracy
3,gpt-3.5-turbo-0125,executable_multiple_function_score.json,0.72
17,gpt-3.5-turbo-0125-FC,executable_multiple_function_score.json,0.68
10,gpt-3.5-turbo-0125,executable_parallel_function_score.json,0.62
23,gpt-3.5-turbo-0125-FC,executable_parallel_function_score.json,0.72
5,gpt-3.5-turbo-0125,executable_parallel_multiple_function_score.json,0.45
22,gpt-3.5-turbo-0125-FC,executable_parallel_multiple_function_score.json,0.4
11,gpt-3.5-turbo-0125,executable_simple_score.json,0.78
18,gpt-3.5-turbo-0125-FC,executable_simple_score.json,0.75
1,gpt-3.5-turbo-0125,java_score.json,0.51
19,gpt-3.5-turbo-0125-FC,java_score.json,0.53


In [4]:
for model in acc_df['model'].unique():
    acc = acc_df[acc_df['model'] == model].correct_count.sum() / acc_df[acc_df['model'] == model].total_count.sum()
    print(f'Model: {model} : Acc = {100.0*acc}%')

Model: gpt-3.5-turbo-0125 : Acc = 66.47058823529412%
Model: gpt-3.5-turbo-0125-FC : Acc = 55.76470588235294%


In [10]:
# most of the accuracies are comparable, the biggest difference makers are:
# 1. relevance_score.json: 60% (prompt) vs 0 (FC)
# 2. simple_score.json: 77% (prompt) vs 55% (FC)

In [11]:
error_result_df.head()

Unnamed: 0,id,model_name,test_category,valid,error,error_type,prompt,model_result_raw,possible_answer,model_result_decoded,filename,model_result,decoded_result
0,7,gpt-3.5-turbo-0125,simple,False,[Invalid syntax. Failed to decode AST. ],ast_decoder:decoder_failed,{'question': 'What are the roots of the quadra...,"[{'solve_quadratic': {'a': 2, 'b': 5, 'c': 3}}]","{'solve_quadratic': {'a': [2], 'b': [5], 'c': ...",,simple_score.json,,
1,14,gpt-3.5-turbo-0125,simple,False,[Nested type checking failed for parameter 'in...,type_error:nested,{'question': 'Calculate the area under the cur...,"[calculate_area_under_curve(function='x**2', i...",{'calculate_area_under_curve': {'function': ['...,[{'calculate_area_under_curve': {'function': '...,simple_score.json,,
2,15,gpt-3.5-turbo-0125,simple,False,[Invalid value for parameter 'function': '3*x*...,value_error:string,{'question': 'Calculate the derivative of the ...,[calculate_derivative(function='3*x**2 + 2*x -...,{'calculate_derivative': {'function': ['3x^2 +...,[{'calculate_derivative': {'function': '3*x**2...,simple_score.json,,
3,16,gpt-3.5-turbo-0125,simple,False,[Invalid syntax. Failed to decode AST. ],ast_decoder:decoder_failed,{'question': 'Calculate the area under the cur...,"[{'name': 'integrate', 'parameters': {'functio...","{'integrate': {'function': ['x^3', 'x**3'], 's...",,simple_score.json,,
4,17,gpt-3.5-turbo-0125,simple,False,[Invalid value for parameter 'function': '2*x*...,value_error:string,{'question': 'Calculate the derivative of the ...,"[calculus.derivative(function='2*x**2', value=...","{'calculus.derivative': {'function': ['2*x^2',...",[{'calculus.derivative': {'function': '2*x**2'...,simple_score.json,,


In [5]:
gpt3_5_fc_relevance_errors = error_result_df[(error_result_df['filename'] == 'relevance_score.json') & (error_result_df['model_name'] == 'gpt-3.5-turbo-0125-FC')]
gpt3_5_fc_relevance_results = full_result_df[(full_result_df['filename'] == 'gorilla_openfunctions_v1_test_relevance_result.json') & (full_result_df['model_name'] == 'gpt-3.5-turbo-0125-FC')]
gpt3_5_prompt_relevance_errors = error_result_df[(error_result_df['filename'] == 'relevance_score.json') & (error_result_df['model_name'] == 'gpt-3.5-turbo-0125')]
gpt3_5_prompt_relevance_results = full_result_df[(full_result_df['filename'] == 'gorilla_openfunctions_v1_test_relevance_result.json') & (full_result_df['model_name'] == 'gpt-3.5-turbo-0125')]

In [6]:
gpt3_5_fc_relevance_errors.shape

(234, 13)

In [7]:
gpt3_5_prompt_relevance_errors.shape

(96, 13)

In [8]:
gpt3_5_fc_relevance_results.shape

(240, 7)

# takeaway - gpt3.5 FC gets relevance results ALMOST ALWAYS WRONG! 234/240 are wrong.

In [9]:
gpt3_5_fc_relevance_errors.head()

Unnamed: 0,id,model_name,test_category,valid,error,error_type,prompt,model_result_raw,model_result_decoded,possible_answer,filename,model_result,decoded_result
0,1,gpt-3.5-turbo-0125-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,,,,,relevance_score.json,"[{'determine_body_mass_index': '{""weight"": 10,...","[{'determine_body_mass_index': {'weight': 10, ..."
1,2,gpt-3.5-turbo-0125-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,,,,,relevance_score.json,"[{'math_sum': '{""numbers"": [1, 2, 3]}'}, {'mat...","[{'math_sum': {'numbers': [1, 2, 3]}}, {'math_..."
2,3,gpt-3.5-turbo-0125-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,,,,,relevance_score.json,"[{'solve_quadratic_equation': '{""a"": 3, ""b"": -...","[{'solve_quadratic_equation': {'a': 3, 'b': -2..."
3,4,gpt-3.5-turbo-0125-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,,,,,relevance_score.json,"[{'find_critical_points': '{""function"":""3x + 2...",[{'find_critical_points': {'function': '3x + 2...
4,5,gpt-3.5-turbo-0125-FC,relevance,False,[Valid syntax. Successfully decode AST when it...,relevance_error:decoder_success,,,,,relevance_score.json,"[{'find_roots': '{""a"": 0, ""b"": 1, ""c"": 0}'}, {...","[{'find_roots': {'a': 0, 'b': 1, 'c': 0}}, {'f..."


In [10]:
gpt3_5_fc_relevance_results.head()

Unnamed: 0,idx,result,input_token_count,output_token_count,latency,filename,model_name
0,0,"[{'determine_body_mass_index': '{""weight"": 10,...",101,36,0.799414,gorilla_openfunctions_v1_test_relevance_result...,gpt-3.5-turbo-0125-FC
1,1,"[{'math_sum': '{""numbers"": [1, 2, 3]}'}, {'mat...",112,52,1.101193,gorilla_openfunctions_v1_test_relevance_result...,gpt-3.5-turbo-0125-FC
2,2,"[{'solve_quadratic_equation': '{""a"": 3, ""b"": -...",113,65,1.267069,gorilla_openfunctions_v1_test_relevance_result...,gpt-3.5-turbo-0125-FC
3,3,"[{'find_critical_points': '{""function"":""3x + 2...",133,23,0.650546,gorilla_openfunctions_v1_test_relevance_result...,gpt-3.5-turbo-0125-FC
4,4,"[{'find_roots': '{""a"": 0, ""b"": 1, ""c"": 0}'}, {...",109,61,1.080612,gorilla_openfunctions_v1_test_relevance_result...,gpt-3.5-turbo-0125-FC


In [17]:
# JESUS CHRIST BFCL! There's a bug in your code.
# the ID in the error results is not the same as the idx in the results
# the score.json idx is off by 1 from the idx in the results


# let's see if we can compare FC vs non-FC for these errors
def compare_fc_vs_prompt(fc_errors_df, fc_df, prompt_errors_df, prompt_df, idx=None, verbose=False):
    if idx is None:
        fc_only_errors = set(fc_errors_df.id.values) - set(prompt_errors_df.id.values)
        idx = np.random.choice(list(fc_only_errors))
    print(f'Looking at idx: {idx} (WHICH IS SECRETLY) {idx - 1} in the results.json')
    print(f"Error: {fc_errors_df.model_name.unique()[0]}", fc_errors_df[fc_errors_df['id'] == idx]['error'].item())
    print("FC Model: ", fc_df[fc_df['idx'] == (idx-1)]['result'].item())

    if fc_errors_df[fc_errors_df['id'] == idx].test_category.item() == 'simple':
        # check if fc model has multiple function calls which repeat
        fc_response = fc_df[fc_df['idx'] == (idx-1)]['result'].item()
        print(f"Num FC responses: {len(fc_response)}")
        if len(fc_response) > 1:
            same_bool = [response == fc_response[0] for response in fc_response]
            if sum(same_bool) == len(same_bool):
                print(f"!!! FC model repeated the same function call {len(fc_response)} times. !!!")
            else:
                print(f"FC model had multiple different function calls. Weird.")
        

    print("Prompt Model: ", prompt_df[prompt_df['idx'] == (idx-1)]['result'].item())
    if idx in prompt_errors_df.id.values:
        print("Prompt model also made an error. This is not a clear FC error.")
        print("Prompt Error: ", prompt_errors_df[prompt_errors_df['id'] == idx]['error'].item())
    else:
        print("Prompt model got it right.")

    if fc_errors_df[fc_errors_df['id'] == idx]['test_category'].item() == 'relevance':
        with open("data/gorilla_openfunctions_v1_test_relevance.json", 'r') as f:
            data = [json.loads(line) for line in f.readlines()]
            question_df = pd.DataFrame(data)
    elif fc_errors_df[fc_errors_df['id'] == idx].test_category.item() == 'simple':
        with open("data/gorilla_openfunctions_v1_test_simple.json", 'r') as f:
            data = [json.loads(line) for line in f.readlines()]
            question_df = pd.DataFrame(data)
    else:
        raise ValueError(f"Not implemented for test categories {fc_errors_df[fc_errors_df['id'] == idx].test_category.item()}.")
    
    print("Question: ", question_df.iloc[idx-1]['question'])
    if verbose:
        print(json.dumps(question_df.iloc[idx-1]['function'], indent=2))

In [19]:
compare_fc_vs_prompt(gpt3_5_fc_relevance_errors,
                     gpt3_5_fc_relevance_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_relevance_results,
                     idx=121,
                     verbose=True)

Looking at idx: 121 (WHICH IS SECRETLY) 120 in the results.json
Error: gpt-3.5-turbo-0125-FC ['Valid syntax. Successfully decode AST when it should not.']
FC Model:  [{'caffeine_effect': '{"caffeine_content":95,"drinking_frequency":"daily"}'}]
Prompt Model:  NO tools call.
Prompt model got it right.
Question:  What's the neurological impact of sports on human brain?
{
  "name": "caffeine_effect",
  "description": "Provide potential neurological impact of caffeine, mainly from coffee, on human brain.",
  "parameters": {
    "type": "dict",
    "properties": {
      "caffeine_content": {
        "type": "float",
        "description": "The amount of caffeine contained in coffee in milligrams."
      },
      "drinking_frequency": {
        "type": "string",
        "description": "How often the individual drinks coffee in a day."
      },
      "drinking_duration": {
        "type": "integer",
        "description": "For how long the individual has been drinking coffee. Default: 100"
   

In [88]:
# with open("data/gorilla_openfunctions_v1_test_relevance.json", 'r') as f:
#     data = [json.loads(line) for line in f.readlines()]
#     question_df = pd.DataFrame(data)

In [20]:
compare_fc_vs_prompt(gpt3_5_fc_relevance_errors,
                     gpt3_5_fc_relevance_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_relevance_results,
                     idx=113,
                     verbose=True)

Looking at idx: 113 (WHICH IS SECRETLY) 112 in the results.json
Error: gpt-3.5-turbo-0125-FC ['Valid syntax. Successfully decode AST when it should not.']
FC Model:  [{'geocode_address': '{"address":"New York, USA"}'}]
Prompt Model:  [geocode_address(address='New York')]
Prompt model also made an error. This is not a clear FC error.
Prompt Error:  ['Valid syntax. Successfully decode AST when it should not.']
Question:  What's the current traffic condition in New York?
{
  "name": "geocode_address",
  "description": "Transforms a description of a location (like a pair of coordinates, an address, or a name of a place) to a location on the Earth's surface.",
  "parameters": {
    "type": "dict",
    "properties": {
      "address": {
        "type": "string",
        "description": "The address that needs to be geocoded."
      },
      "locale": {
        "type": "string",
        "description": "Preferred locale for the returned address information. (Optional) Default: None"
      }
   

In [21]:
compare_fc_vs_prompt(gpt3_5_fc_relevance_errors,
                     gpt3_5_fc_relevance_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_relevance_results,
                     idx=135,
                     verbose=True
                     )

Looking at idx: 135 (WHICH IS SECRETLY) 134 in the results.json
Error: gpt-3.5-turbo-0125-FC ['Valid syntax. Successfully decode AST when it should not.']
FC Model:  [{'calculate_battle_outcome': '{"battle_name": "World Cup 2022 Final", "strategy_type": "football"}'}, {'calculate_battle_outcome': '{"battle_name": "World Cup 2022 Final", "strategy_type": "football"}'}]
Prompt Model:  [This question does not relate to the available function. No function applies.]
Prompt model got it right.
Question:  Who won the World Cup 2022?
{
  "name": "calculate_battle_outcome",
  "description": "Predicts the outcome of a historical battle based on the strategies, army size and other influencing factors.",
  "parameters": {
    "type": "dict",
    "properties": {
      "battle_name": {
        "type": "string",
        "description": "The name of the historical battle."
      },
      "strategy_type": {
        "type": "string",
        "description": "The strategy employed in the battle."
      },


In [66]:
with open("data/gorilla_openfunctions_v1_test_relevance.json", 'r') as f:
    data = [json.loads(line) for line in f.readlines()]
    question_df = pd.DataFrame(data)

question_df.iloc[134].function

{'name': 'calculate_battle_outcome',
 'description': 'Predicts the outcome of a historical battle based on the strategies, army size and other influencing factors.',
 'parameters': {'type': 'dict',
  'properties': {'battle_name': {'type': 'string',
    'description': 'The name of the historical battle.'},
   'strategy_type': {'type': 'string',
    'description': 'The strategy employed in the battle.'},
   'weather_condition': {'type': 'string',
    'description': 'Weather condition during the battle.',
    'default': 'snowing'}},
  'required': ['battle_name', 'strategy_type']}}

In [67]:
# ^^honestly not bad. Pretty innovative way to use an irrelevant function to make it look relevant. Still wrong, but I'm impressed.

# Takeaway: Prompt model understands when the passed functions are irrelevant and says no. But the FC model almost always returns a function call even though it makes no sense and gets Rekt. I suspect this is an issue with the way BFCL is calling the FC model. But will verify after implementing for DBRX

In [23]:
# now compare for simple
gpt3_5_fc_simple_errors = error_result_df[(error_result_df['filename'] == 'simple_score.json') & (error_result_df['model_name'] == 'gpt-3.5-turbo-0125-FC')]
gpt3_5_fc_simple_results = full_result_df[(full_result_df['filename'] == 'gorilla_openfunctions_v1_test_simple_result.json') & (full_result_df['model_name'] == 'gpt-3.5-turbo-0125-FC')]
gpt3_5_prompt_simple_errors = error_result_df[(error_result_df['filename'] == 'simple_score.json') & (error_result_df['model_name'] == 'gpt-3.5-turbo-0125')]
gpt3_5_prompt_simple_results = full_result_df[(full_result_df['filename'] == 'gorilla_openfunctions_v1_test_simple_result.json') & (full_result_df['model_name'] == 'gpt-3.5-turbo-0125')]

In [24]:
compare_fc_vs_prompt(gpt3_5_fc_simple_errors,
                     gpt3_5_fc_simple_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_simple_results,
                     idx=335,
                     verbose=True)

Looking at idx: 335 (WHICH IS SECRETLY) 334 in the results.json
Error: gpt-3.5-turbo-0125-FC ['Wrong number of functions.']
FC Model:  [{'blackjack_check_winner': '{"player_cards": ["A", "10"], "dealer_cards": ["10", "9"], "ace_value": 1}'}, {'blackjack_check_winner': '{"player_cards": ["A", "10"], "dealer_cards": ["10", "9"], "ace_value": 11}'}]
Num FC responses: 2
FC model had multiple different function calls. Weird.
Prompt Model:  [blackjack.check_winner(player_cards=['A', '10'], dealer_cards=['10', '9'], ace_value=1)]
Prompt model got it right.
Question:  Check who is the winner in a game of blackjack given player having A and 10, dealer having 10 and 9. The Ace is considered 1.
{
  "name": "blackjack.check_winner",
  "description": "Checks and determines the winner in a game of blackjack.",
  "parameters": {
    "type": "dict",
    "properties": {
      "player_cards": {
        "type": "array",
        "items": {
          "type": "string"
        },
        "description": "Card

In [33]:
compare_fc_vs_prompt(gpt3_5_fc_simple_errors,
                     gpt3_5_fc_simple_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_simple_results,
                     idx=37)

Looking at idx: 37 (WHICH IS SECRETLY) 36 in the results.json
Error: gpt-3.5-turbo-0125-FC ["Invalid value for parameter 'unit': 'mi'. Expected one of ['km', '']. Case insensitive."]
FC Model:  [{'get_shortest_driving_distance': '{"origin":"New York City","destination":"Washington D.C.","unit":"mi"}'}]
Num FC responses: 1
Prompt Model:  [get_shortest_driving_distance(origin='New York City', destination='Washington D.C.')]
Prompt model got it right.
Question:  Find the shortest driving distance between New York City and Washington D.C.


In [25]:
compare_fc_vs_prompt(gpt3_5_fc_simple_errors,
                     gpt3_5_fc_simple_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_simple_results,
                     idx=382,
                     verbose=True)

Looking at idx: 382 (WHICH IS SECRETLY) 381 in the results.json
Error: gpt-3.5-turbo-0125-FC ['Wrong number of functions.']
FC Model:  [{'hilton_hotel_check_availability': '{"location": "Paris", "check_in_date": "2023-04-04", "check_out_date": "2023-04-08", "no_of_adults": 2}'}, {'hilton_hotel_check_availability': '{"location": "Paris", "check_in_date": "2023-04-04", "check_out_date": "2023-04-08", "no_of_adults": 2, "hotel_chain": "Hilton Garden Inn"}'}]
Num FC responses: 2
FC model had multiple different function calls. Weird.
Prompt Model:  [{'name': 'hilton_hotel.check_availability', 'parameters': {'location': 'Paris', 'check_in_date': '2023-04-04', 'check_out_date': '2023-04-08', 'no_of_adults': 2}}]
Prompt model got it right.
Question:  Check if any Hilton Hotel is available for two adults in Paris from April 4th to April 8th?
{
  "name": "hilton_hotel.check_availability",
  "description": "Check hotel availability for a specific location and time frame.",
  "parameters": {
    "

In [26]:
compare_fc_vs_prompt(gpt3_5_fc_simple_errors,
                     gpt3_5_fc_simple_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_simple_results,
                     idx=381,
                     verbose=True)

Looking at idx: 381 (WHICH IS SECRETLY) 380 in the results.json
Error: gpt-3.5-turbo-0125-FC ['Wrong number of functions.']
FC Model:  [{'hotel_booking': '{"location": "Manhattan, New York", "room_type": "single", "duration": 3, "start_date": "2023-03-10", "preferences": ["pet_friendly"]}'}, {'hotel_booking': '{"location": "Manhattan, New York", "room_type": "single", "duration": 3, "start_date": "2023-03-10", "preferences": ["pet_friendly"]}'}]
Num FC responses: 2
!!! FC model repeated the same function call 2 times. !!!
Prompt Model:  [hotel_booking(location='Manhattan, New York', room_type='single', duration=3, start_date='March 10th, 2023', preferences=['pet_friendly'])]
Prompt model got it right.
Question:  Book a single room at a pet friendly hotel near Manhattan, New York for 3 nights starting from March 10th, 2023.
{
  "name": "hotel_booking",
  "description": "Books a hotel room given the location, room type, stay duration and any additional preferences.",
  "parameters": {
  

In [27]:
compare_fc_vs_prompt(gpt3_5_fc_simple_errors,
                     gpt3_5_fc_simple_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_simple_results,
                     idx=383,
                     verbose=True)

Looking at idx: 383 (WHICH IS SECRETLY) 382 in the results.json
Error: gpt-3.5-turbo-0125-FC ['Wrong number of functions.']
FC Model:  [{'book_hotel': '{"hotel_name": "Hilton Hotel", "location": "Chicago", "room_type": "single", "start_date": "10th December 2022", "nights": 2}'}, {'book_hotel': '{"hotel_name": "Hilton Hotel", "location": "Chicago", "room_type": "single", "start_date": "10th December 2022", "nights": 2}'}]
Num FC responses: 2
!!! FC model repeated the same function call 2 times. !!!
Prompt Model:  [book_hotel(hotel_name='Hilton Hotel', location='Chicago', room_type='single', start_date='10th December 2022', nights=2)]
Prompt model got it right.
Question:  Book a single room for two nights at the Hilton Hotel in Chicago, starting from 10th December 2022.
{
  "name": "book_hotel",
  "description": "Book a room of specified type for a particular number of nights at a specific hotel, starting from a specified date.",
  "parameters": {
    "type": "dict",
    "properties": {

In [2]:
fun_calls = [{'book_hotel': '{"hotel_name": "Hilton Hotel", "location": "Chicago", "room_type": "single", "start_date": "10th December 2022", "nights": 2}'}, {'book_hotel': '{"hotel_name": "Hilton Hotel", "location": "Chicago", "room_type": "single", "start_date": "10th December 2022", "nights": 2}'}]
fun_calls[0] == fun_calls[1]

True

In [42]:
compare_fc_vs_prompt(gpt3_5_fc_simple_errors,
                     gpt3_5_fc_simple_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_simple_results,
                     idx=286)

Looking at idx: 286 (WHICH IS SECRETLY) 285 in the results.json
Error: gpt-3.5-turbo-0125-FC ['Wrong number of functions.']
FC Model:  [{'find_concert': '{"location": "Chicago, IL", "price": 100, "genre": "Rock"}'}, {'find_concert': '{"location": "Chicago, IL", "price": 100, "genre": "Pop"}'}, {'find_concert': '{"location": "Chicago, IL", "price": 100, "genre": "Country"}'}]
Num FC responses: 3
FC model had multiple different function calls. Weird.
Prompt Model:  [find_concert(location='Chicago, IL', price=100, genre='Rock')]
Prompt model got it right.
Question:  Find me a Rock concert in Chicago with ticket availability under $100.


In [43]:
compare_fc_vs_prompt(gpt3_5_fc_simple_errors,
                     gpt3_5_fc_simple_results,
                     gpt3_5_prompt_relevance_errors,
                     gpt3_5_prompt_simple_results,
                     idx=None)

Looking at idx: 128 (WHICH IS SECRETLY) 127 in the results.json
Error: gpt-3.5-turbo-0125-FC ['Wrong number of functions.']
FC Model:  [{'calculate_NPV': '{"cash_flows": [200, 300, 400, 500], "discount_rate": 0.1, "initial_investment": 2000}'}, {'calculate_NPV': '{"cash_flows": [-2000, 200, 300, 400, 500], "discount_rate": 0.1}'}]
Num FC responses: 2
FC model had multiple different function calls. Weird.
Prompt Model:  [calculate_NPV(cash_flows=[200,300,400,500], discount_rate=0.1, initial_investment=2000)]
Prompt model got it right.
Question:  Find the Net Present Value (NPV) of an investment, given cash_flows=[200,300,400,500], a discount rate of 10%, and an initial investment of $2000.


In [45]:
gpt3_5_fc_simple_errors.error.describe()

count                              179
unique                              25
top       [Wrong number of functions.]
freq                               155
Name: error, dtype: object

In [51]:
def check_if_funcall_repeats(fc_errors_df, fc_df, prompt_errors_df, prompt_df, idx=None):
    if idx is None:
        fc_only_errors = set(fc_errors_df.id.values) - set(prompt_errors_df.id.values)
        idx = np.random.choice(list(fc_only_errors))
    # print(f'Looking at idx: {idx} (WHICH IS SECRETLY) {idx - 1} in the results.json')
    # print(f"Error: {fc_errors_df.model_name.unique()[0]}", fc_errors_df[fc_errors_df['id'] == idx]['error'].item())
    # print("FC Model: ", fc_df[fc_df['idx'] == (idx-1)]['result'].item())

    if fc_errors_df[fc_errors_df['id'] == idx].test_category.item() == 'simple':
        # check if fc model has multiple function calls which repeat
        fc_response = fc_df[fc_df['idx'] == (idx-1)]['result'].item()
        # print(f"Num FC responses: {len(fc_response)}")
        if len(fc_response) > 1:
            same_bool = [response == fc_response[0] for response in fc_response]
            if sum(same_bool) == len(same_bool):
                # print(f"!!! FC model repeated the same function call {len(fc_response)} times. !!!")
                return True
            else:
                # print(f"FC model had multiple different function calls. Weird.")
                return False
        
num_fc_repeats = 0
for idx in gpt3_5_fc_simple_errors.id.values:
    if check_if_funcall_repeats(gpt3_5_fc_simple_errors,
                                gpt3_5_fc_simple_results,
                                gpt3_5_prompt_simple_errors,
                                gpt3_5_prompt_simple_results,
                                idx=idx):
        num_fc_repeats += 1
print(f"The FC model repeats function calls {num_fc_repeats}/{gpt3_5_fc_simple_errors.shape[0]} times.")

The FC model repeats function calls 42/179 times.


# so it looks like 155/179 errors are due to the FC model trying multiple function calls when it should have just invoked the one function, once. Sometimes (42 times) it repeats the same function call, but quite often (137 times) it just makes multiple function calls. This is not just a parsing issue, it could be a prompting issue/incorrect way to use the openAI api. Not sure.

### Try to conver it to the exact "params" which the model sees

In [29]:
from model_handler.handler import BaseHandler
from model_handler.model_style import ModelStyle
from model_handler.utils import (
    convert_to_tool,
    convert_to_function_call,
    augment_prompt_by_languge,
    language_specific_pre_processing,
    ast_parse,
)
from model_handler.constant import (
    GORILLA_TO_OPENAPI,
    GORILLA_TO_PYTHON,
    USER_PROMPT_FOR_CHAT_MODEL,
    SYSTEM_PROMPT_FOR_CHAT_MODEL,
)
from openai import OpenAI
import os, time, json



In [34]:
# non-FC model:

def compare_fc_vs_prompt(fc_errors_df, fc_df, prompt_errors_df, prompt_df, idx=None, verbose=False):
    if idx is None:
        fc_only_errors = set(fc_errors_df.id.values) - set(prompt_errors_df.id.values)
        idx = np.random.choice(list(fc_only_errors))
    print(f'Looking at idx: {idx} (WHICH IS SECRETLY) {idx - 1} in the results.json')
    print(f"Error: {fc_errors_df.model_name.unique()[0]}", fc_errors_df[fc_errors_df['id'] == idx]['error'].item())
    print("FC Model: ", fc_df[fc_df['idx'] == (idx-1)]['result'].item())

    if fc_errors_df[fc_errors_df['id'] == idx].test_category.item() == 'simple':
        # check if fc model has multiple function calls which repeat
        fc_response = fc_df[fc_df['idx'] == (idx-1)]['result'].item()
        print(f"Num FC responses: {len(fc_response)}")
        if len(fc_response) > 1:
            same_bool = [response == fc_response[0] for response in fc_response]
            if sum(same_bool) == len(same_bool):
                print(f"!!! FC model repeated the same function call {len(fc_response)} times. !!!")
            else:
                print(f"FC model had multiple different function calls. Weird.")
        

    print("Prompt Model: ", prompt_df[prompt_df['idx'] == (idx-1)]['result'].item())
    if idx in prompt_errors_df.id.values:
        print("Prompt model also made an error. This is not a clear FC error.")
        print("Prompt Error: ", prompt_errors_df[prompt_errors_df['id'] == idx]['error'].item())
    else:
        print("Prompt model got it right.")

    if fc_errors_df[fc_errors_df['id'] == idx]['test_category'].item() == 'relevance':
        with open("data/gorilla_openfunctions_v1_test_relevance.json", 'r') as f:
            data = [json.loads(line) for line in f.readlines()]
            question_df = pd.DataFrame(data)
    elif fc_errors_df[fc_errors_df['id'] == idx].test_category.item() == 'simple':
        with open("data/gorilla_openfunctions_v1_test_simple.json", 'r') as f:
            data = [json.loads(line) for line in f.readlines()]
            question_df = pd.DataFrame(data)
    else:
        raise ValueError(f"Not implemented for test categories {fc_errors_df[fc_errors_df['id'] == idx].test_category.item()}.")
    
    print("Question: ", question_df.iloc[idx-1]['question'])
    if verbose:
        print(json.dumps(question_df.iloc[idx-1]['function'], indent=2))

# canonical example:
idx=383
fc_errors_row = gpt3_5_fc_simple_errors[gpt3_5_fc_simple_errors['id'] == idx]
fc_results_row = gpt3_5_fc_simple_results[gpt3_5_fc_simple_results['idx'] == (idx-1)]
prompt_errors_row = gpt3_5_prompt_relevance_errors[gpt3_5_prompt_relevance_errors['id'] == idx]
prompt_results_row = gpt3_5_prompt_simple_results[gpt3_5_prompt_simple_results['idx'] == (idx-1)]

with open("data/gorilla_openfunctions_v1_test_simple.json", 'r') as f:
    data = [json.loads(line) for line in f.readlines()]
    question_df = pd.DataFrame(data)

question_row = question_df.iloc[idx-1]

In [53]:
print(f"Question: {question_row['question']}")
print(f"Function: {json.dumps(question_row['function'], indent=2)}")
print(f"FC Model Response: {fc_results_row['result'].item()}")
print(f"Prompt Model Response: {prompt_results_row['result'].item()}")

Question: Book a single room for two nights at the Hilton Hotel in Chicago, starting from 10th December 2022.
Function: {
  "name": "book_hotel",
  "description": "Book a room of specified type for a particular number of nights at a specific hotel, starting from a specified date.",
  "parameters": {
    "type": "dict",
    "properties": {
      "hotel_name": {
        "type": "string",
        "description": "The name of the hotel."
      },
      "location": {
        "type": "string",
        "description": "The city in which the hotel is located."
      },
      "room_type": {
        "type": "string",
        "description": "The type of room to be booked."
      },
      "start_date": {
        "type": "string",
        "description": "The start date for the booking."
      },
      "nights": {
        "type": "integer",
        "description": "The number of nights for which the booking is to be made."
      }
    },
    "required": [
      "hotel_name",
      "location",
      "ro

In [55]:
fc_errors_row

Unnamed: 0,id,model_name,test_category,valid,error,error_type,prompt,model_result_raw,model_result_decoded,possible_answer,filename,model_result,decoded_result
172,383,gpt-3.5-turbo-0125-FC,simple,False,[Wrong number of functions.],simple_function_checker:wrong_count,{'question': 'Book a single room for two night...,"[{'book_hotel': '{""hotel_name"": ""Hilton Hotel""...","[{'book_hotel': {'hotel_name': 'Hilton Hotel',...","{'book_hotel': {'hotel_name': ['Hilton Hotel',...",simple_score.json,,


In [58]:
test_category = "simple"
prompt = fc_errors_row.prompt.item()['question']
functions = [question_row['function']]
prompt = augment_prompt_by_languge(prompt,test_category)
functions = language_specific_pre_processing(functions,test_category,False)
message = [
    {
        "role": "system",
        "content": SYSTEM_PROMPT_FOR_CHAT_MODEL,
    },
    {
        "role": "user",
        "content": "Questions:"
        + USER_PROMPT_FOR_CHAT_MODEL.format(
            user_prompt=prompt, functions=str(functions)
        ),
    },
]

In [66]:
for msg in message:
    for key in msg:
        print(f"{key}: {msg[key]}")

role: system
content: "
    You are an expert in composing functions. You are given a question and a set of possible functions. 
    Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
    If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
    also point it out. You should only return the function call in tools call sections.
    
role: user
content: Questions:
    Questions:Book a single room for two nights at the Hilton Hotel in Chicago, starting from 10th December 2022.
 Note that the provided function is in Python.
Here is a list of functions in JSON format that you can invoke:
[{'name': 'book_hotel', 'description': 'Book a room of specified type for a particular number of nights at a specific hotel, starting from a specified date.', 'parameters': {'type': 'dict', 'properties': {'hotel_name': {'type': 'string', 'description': 'The name of the hotel.'}, 'location': 

In [75]:
prompt = fc_errors_row.prompt.item()['question']
prompt = augment_prompt_by_languge(prompt, test_category)
functions = language_specific_pre_processing(functions, test_category, True)
if type(functions) is not list:
    functions = [functions]
message = [{"role": "user", "content": "Questions:" + prompt}]
oai_tool = convert_to_tool(
    functions, GORILLA_TO_OPENAPI, ModelStyle.OpenAI, test_category, True
)

In [76]:
for msg in message:
    for key in msg:
        print(f"{key}: {msg[key]}")

role: user
content: Questions:Book a single room for two nights at the Hilton Hotel in Chicago, starting from 10th December 2022.
 Note that the provided function is in Python.


In [77]:
oai_tool

[{'type': 'function',
  'function': {'name': 'book_hotel',
   'description': 'Book a room of specified type for a particular number of nights at a specific hotel, starting from a specified date.',
   'parameters': {'type': 'object',
    'properties': {'hotel_name': {'type': 'string',
      'description': 'The name of the hotel.'},
     'location': {'type': 'string',
      'description': 'The city in which the hotel is located.'},
     'room_type': {'type': 'string',
      'description': 'The type of room to be booked.'},
     'start_date': {'type': 'string',
      'description': 'The start date for the booking.'},
     'nights': {'type': 'integer',
      'description': 'The number of nights for which the booking is to be made.'}},
    'required': ['hotel_name',
     'location',
     'room_type',
     'start_date',
     'nights']}}}]