In [1]:
import json
import os
import pandas as pd

def _read_traj_metrics(traj_path):
    """Count the records and the highest ``step_num`` in a ``traj.jsonl`` file.

    Raises:
        FileNotFoundError: if ``traj_path`` does not exist.
        json.JSONDecodeError / KeyError: if a non-blank line is not valid JSON
            or lacks a ``step_num`` entry.
    """
    total_actions = 0
    total_steps = 0
    with open(traj_path, "r") as f:
        for line in f:
            # A blank (e.g. trailing) line carries no record; skipping it avoids
            # misreporting the whole run as missing because of a JSON error.
            if not line.strip():
                continue
            record = json.loads(line)
            total_steps = max(total_steps, record['step_num'])
            total_actions += 1
    return {'total_actions': total_actions, 'total_steps': total_steps}


def summarize_results(run_dir, prompt_tokens_price_per_million=None, completion_tokens_price_per_million=None):
    """Aggregate per-task metrics found under ``run_dir`` into a DataFrame.

    Expected layout: ``run_dir/<task_id>/<run_id>/performance_metrics.json`` and,
    optionally, ``run_dir/<task_id>/<run_id>/traj.jsonl``. Only the first run
    (in sorted order) of each task contributes to the returned frame.

    Side effects: prints summary statistics to stdout and always (re)writes
    ``sys_failed_evaluator.log`` in the current working directory.

    Args:
        run_dir: directory that contains one sub-directory per task.
        prompt_tokens_price_per_million: price per one million prompt tokens;
            the cost is only computed when both prices are given.
        completion_tokens_price_per_million: price per one million completion
            tokens.

    Returns:
        pandas.DataFrame with one row per task and the columns ``task_id``,
        ``score``, ``is_success``, ``time``, ``total_actions``, ``total_steps``,
        ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``.

    Raises:
        AssertionError: if any task has no run directory or any run could not
            be loaded (the message lists the affected runs and the reason), or
            if any task reports a score of -1.
    """
    task_ids = sorted(
        entry for entry in os.listdir(run_dir)
        if os.path.isdir(os.path.join(run_dir, entry))
    )

    results = {task_id: [] for task_id in task_ids}
    traj_metrics = {task_id: [] for task_id in task_ids}
    missing_runs = {}
    for task_id in task_ids:
        task_dir = os.path.join(run_dir, task_id)
        run_ids = sorted(
            entry for entry in os.listdir(task_dir)
            if os.path.isdir(os.path.join(task_dir, entry))
        )
        if not run_ids:
            # Without this, the task would slip through the missing-run check
            # and fail later with an opaque IndexError on its first run.
            missing_runs.setdefault(task_id, []).append("<no run directories>")
            continue

        for run_id in run_ids:
            run_path = os.path.join(task_dir, run_id)
            try:
                with open(os.path.join(run_path, "performance_metrics.json"), "r") as f:
                    performance_metrics = json.load(f)
                # process efficiency metrics
                try:
                    run_traj = _read_traj_metrics(os.path.join(run_path, "traj.jsonl"))
                except FileNotFoundError:
                    # No trajectory file is counted as a single action/step.
                    run_traj = {'total_actions': 1, 'total_steps': 1}
                    print(f"{task_id}/{run_id}: Agent terminates with the initial observation.")
            except Exception as e:
                # Keep the reason so the failure below is actionable.
                missing_runs.setdefault(task_id, []).append(f"{run_id} ({type(e).__name__}: {e})")
                continue
            # Append both only after both loaded, so the two lists stay aligned.
            results[task_id].append(performance_metrics)
            traj_metrics[task_id].append(run_traj)

    if missing_runs:
        details = "; ".join(
            f"{task_id}: {', '.join(runs)}" for task_id, runs in missing_runs.items()
        )
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(f"Missing runs found: {details}")

    columns = [
        'task_id',
        'score',
        'is_success',
        'time',
        'total_actions',
        'total_steps',
        'prompt_tokens',
        'completion_tokens',
        'total_tokens',
    ]
    records = []
    text = ""
    for task_id in task_ids:
        first_run = results[task_id][0]
        first_traj = traj_metrics[task_id][0]
        evaluation_result = first_run['evaluation_result']
        score = evaluation_result['Score']
        if score == -1:
            # -1 marks an evaluator failure; the key spelling matches the metrics files.
            msg = f'{task_id} failed since score is -1: {evaluation_result["System Failiures"]}'
            text += msg + '\n' + '='*100 + '\n'
            print(msg)
            print('-'*100)
        usage = first_run['usage']
        records.append({
            'task_id': task_id,
            'score': score,
            'is_success': float(score) == 1.0,
            'time': first_run['time (min)'],
            'total_actions': first_traj['total_actions'],
            'total_steps': first_traj['total_steps'],
            'prompt_tokens': usage['prompt_tokens'],
            'completion_tokens': usage['completion_tokens'],
            'total_tokens': usage['total_tokens'],
        })

    # The log is written even when empty, so a stale log never survives a clean run.
    with open('sys_failed_evaluator.log', 'w') as f:
        f.write(text)
    if text != "":
        raise AssertionError("system eval failure log is not empty; cannot proceed")

    df = pd.DataFrame(records, columns=columns)
    score = df.score.mean()
    success_rate = df.is_success.mean() * 100
    steps = df.total_steps.mean()
    total_actions = df.total_actions.mean()
    total_tokens = df.total_tokens.mean()
    prompt_tokens = df.prompt_tokens.mean()
    completion_tokens = df.completion_tokens.mean()
    if prompt_tokens_price_per_million is not None and completion_tokens_price_per_million is not None:
        cost_per_task = (
            prompt_tokens * prompt_tokens_price_per_million
            + completion_tokens * completion_tokens_price_per_million
        ) / 1000000
    else:
        cost_per_task = 'not available'
    time = df.time.mean()
    print(f'milestone score: {score}')
    print(f'success rate: {success_rate}')
    print(f'time: {time}')
    print(f'steps: {steps}')
    print(f'total_tokens(k): {total_tokens / 1000}')
    print(f'total_actions: {total_actions}')
    print(f'cost_per_per_task: {cost_per_task}')
    return df

# zero-shot

In [2]:
# Holds the summary DataFrames that later cells append to.
df_list = list()

In [5]:
# Summarize the run stored under run_dir and append the result to df_list.
run_dir = "../outputs/mobileagentv3_zero_shot/trajectory"
# The printed name is the directory component just above the last one.
model_name = run_dir.rsplit('/', 2)[-2]
print(model_name)
# Tag every row with a fixed model label before storing the frame.
df = summarize_results(run_dir).assign(model_name='mobileagentv3-7B')
df_list.append(df)

mobileagentv3_zero_shot
sales_005_005 failed since score is -1: ['Traceback (most recent call last):', '  File "/fsx/home/yutong/OSS/SCUBA/scuba/phases/evaluation/master_evaluator.py", line 112, in evaluate_instance', '    milestones = evaluation_method(data, **ground_truth)', '                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^', '  File "/fsx/home/yutong/OSS/SCUBA/scuba/phases/evaluation/milestone_evaluator_sales.py", line 330, in evaluate_template_update_opportunity_stage_and_activity', "    activity_description_correct = activity_exists and self.__fuzzy_match(activity_records[0]['Subject'], params.activity_description)", '                                                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^', '  File "/fsx/home/yutong/OSS/SCUBA/scuba/phases/evaluation/milestone_evaluator_sales.py", line 313, in __fuzzy_match', '    v1, v2 = self.__sentence_vector(string1), self.__sentence_vector(string2)', '             ^^^^^^^^^

AssertionError: system eval failure log is not empty; cannot proceed