In [None]:
import os.path as osp
import os
import pandas as pd
from utils.utils import get_timetsamp_with_random
from utils.map_utils import get_game_info_with_G_eval
from utils.eval_llama_rwkv_utils import get_csv,eval_llama_rwkv_batch,get_llama_rwkv_valid_batch
from utils.eval_gpt_utils import eval_gpt_batch,get_gpt_valid_batch

In [None]:
def get_all_task_id(rst_dir,map_dir,model_name,task_name,eval_difficulty):
    """Collect the valid-task batch of every game found in ``rst_dir``.

    Each entry of ``rst_dir`` (apart from a few repository/housekeeping names)
    is treated as a game: its info is loaded from ``map_dir`` and handed to
    ``get_llama_rwkv_valid_batch`` when ``model_name`` starts with ``llama`` or
    ``rwkv``, otherwise to ``get_gpt_valid_batch``.

    Args:
        rst_dir: directory whose entries are named after games.
        map_dir: directory passed to ``get_game_info_with_G_eval``.
        model_name: model identifier; its prefix selects the collector.
        task_name: forwarded to the collector as ``task_type``.
        eval_difficulty: forwarded to the collector unchanged.

    Returns:
        dict mapping each game name to whatever the collector returned for it.
    """
    ignored_entries = ('outputs_diff', '.gitattributes', '.gitignore', '.git')
    is_llama_or_rwkv = model_name.startswith(('llama', 'rwkv'))

    valid_by_game = {}
    for game_name in os.listdir(rst_dir):
        if game_name in ignored_entries:
            continue
        print(f'handling {game_name}')

        # Only G_eval, all2all and all_pairs are forwarded; the rest is unused here.
        (G_eval, _G, _actions, _locations,
         all2all, all_pairs, _walkthrough) = get_game_info_with_G_eval(map_dir, game_name)

        if is_llama_or_rwkv:
            collect_valid = get_llama_rwkv_valid_batch
        else:
            collect_valid = get_gpt_valid_batch

        valid_by_game[game_name] = collect_valid(
            game_name, rst_dir, G_eval, all2all, all_pairs,
            task_type=task_name,
            model_name=model_name,
            eval_difficulty=eval_difficulty,
        )
        print('-----------------------')

    return valid_by_game

def _evaluate_games(rst_dir, map_dir, model_name, task_name, eval_difficulty, eval_dict):
    """Run the model-appropriate batch evaluator on every game entry of ``rst_dir``."""
    ignored_entries = ('outputs_diff', '.gitattributes', '.gitignore', '.git')
    is_llama_or_rwkv = model_name.startswith(('llama', 'rwkv'))

    eval_rst = {}
    for game_name in os.listdir(rst_dir):
        if game_name in ignored_entries:
            continue
        print(f'handling {game_name}')

        # Only G_eval, all2all and all_pairs are forwarded; the rest is unused here.
        (G_eval, _G, _actions, _locations,
         all2all, all_pairs, _walkthrough) = get_game_info_with_G_eval(map_dir, game_name)

        evaluate = eval_llama_rwkv_batch if is_llama_or_rwkv else eval_gpt_batch
        # With an eval_dict every evaluated game must have an entry (KeyError otherwise).
        eval_set = eval_dict[game_name] if eval_dict is not None else None

        eval_rst[game_name] = evaluate(
            game_name, rst_dir, G_eval, all2all, all_pairs,
            task_type=task_name,
            model_name=model_name,
            eval_difficulty=eval_difficulty,
            eval_set=eval_set,
        )
        print('-----------------------')

    return eval_rst


def _append_summary_rows(df):
    """Return ``df`` extended with an ``average`` row and a ``weighted_average`` row."""
    # numeric_only=True keeps the string name column out of the mean. pandas >= 2.0
    # no longer drops such columns silently and raises TypeError without it.
    average = df.mean(numeric_only=True).values

    # The first column must hold the row labels 'average' / 'weighted_average'.
    df[df.columns[0]] = df[df.columns[0]].astype(object)
    df.loc[len(df)] = ['average'] + list(average)

    # Replace all NA/NaN collected so far with the literal string 'NA'.
    df = df.fillna('NA')

    # (numerator, denominator) positions inside ``average``, i.e. among the numeric
    # columns in file order. The column meanings are defined by get_csv and are not
    # visible here -- TODO confirm the layout whenever get_csv changes.
    ratio_positions = [(6, 8), (7, 8), (12, 14), (9, 11), (13, 14), (10, 11)]
    weighted_average = [average[num] / average[den] for num, den in ratio_positions]
    # Pad with zeros so the row has one value per numeric column.
    weighted_average += [0.0] * (len(average) - len(weighted_average))

    df.loc[len(df)] = ['weighted_average'] + weighted_average
    return df


def eval_all(rst_dir,map_dir,save_path,model_name,task_name,eval_difficulty,eval_dict=None):
    """Evaluate every game under ``rst_dir`` and write a sorted summary CSV.

    Each game is scored with ``eval_llama_rwkv_batch`` (model names starting with
    ``llama``/``rwkv``) or ``eval_gpt_batch`` (all other names). The per-game
    results are written through ``get_csv`` to an intermediate
    ``result_<eval_difficulty>_<timestamp>.csv``, re-read, sorted by ``name``,
    extended with an ``average`` and a ``weighted_average`` row and saved as
    ``sorted_result_<eval_difficulty>_<timestamp>.csv`` in ``save_path``. The
    intermediate file is removed afterwards.

    Args:
        rst_dir: directory whose entries are named after games.
        map_dir: directory passed to ``get_game_info_with_G_eval``.
        save_path: output directory; created if missing.
        model_name: model identifier; its prefix selects the evaluator.
        task_name: forwarded to the evaluator as ``task_type``.
        eval_difficulty: forwarded to the evaluator and embedded in file names.
        eval_dict: optional mapping ``game_name -> eval_set``. When given, the
            entry for each game is forwarded as ``eval_set``; otherwise
            ``eval_set=None`` is passed.

    Returns:
        None. The result is the CSV written to ``save_path``.
    """
    eval_rst = _evaluate_games(rst_dir, map_dir, model_name, task_name,
                               eval_difficulty, eval_dict)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_path, exist_ok=True)

    timestamp = get_timetsamp_with_random()
    target_name = osp.join(save_path, f'result_{eval_difficulty}_{timestamp}.csv')
    get_csv(eval_rst, target_name)

    df = pd.read_csv(target_name).sort_values('name')
    df = _append_summary_rows(df)
    df.to_csv(osp.join(save_path, f'sorted_result_{eval_difficulty}_{timestamp}.csv'), index=False)

    # The unsorted CSV was only an intermediate artefact.
    if osp.isfile(target_name):
        os.remove(target_name)
        print(f'The file "{target_name}" has been deleted.')
    else:
        print(f'The file "{target_name}" does not exist.')
        
    


In [None]:
# Evaluation difficulty: passed as eval_difficulty to the helpers and embedded in output paths.
difficulty = 'strict'

In [None]:
# Notebook inputs: result directory of the first model family, map directory,
# and the tasks / models to process.
rst_dir = '/share/data/mei-work/kangrui/github/mango/kangrui/data/llama2_13b_base_results_0816'
map_dir = '/share/data/mei-work/kangrui/github/mango/data/'
task_names = ['stepnav', 'pathgen']
model_names = ['llama', 'llama_anno']

In [None]:
# rst[model_name][task_name] holds the per-game dict returned by get_all_task_id.
rst = {}
for model_name in model_names:
    rst.setdefault(model_name, {})
    for task_name in task_names:
        rst[model_name][task_name] = get_all_task_id(
            rst_dir, map_dir, model_name, task_name, difficulty
        )

In [None]:
# Same collection for the GPT models; adds to the existing ``rst`` dict.
# NOTE(review): this directory ends in "-clean-new-new", while excel_intersection_eval
# evaluates GPT models from "...-clean-new" -- confirm which one is intended.
model_names = ['gpt-4', 'gpt-3.5-turbo']
rst_dir = '/share/data/mei-work/kangrui/github/mango/kangrui/data/gpt-games-results-clean-new-new'
for model_name in model_names:
    rst.setdefault(model_name, {})
    for task_name in task_names:
        rst[model_name][task_name] = get_all_task_id(
            rst_dir, map_dir, model_name, task_name, difficulty
        )

In [None]:
# Same collection for the RWKV models; adds to the existing ``rst`` dict.
model_names = ['rwkv', 'rwkv_anno']
rst_dir = '/share/data/mei-work/kangrui/github/mango/kangrui/data/rwkv_results_0709'
for model_name in model_names:
    rst.setdefault(model_name, {})
    for task_name in task_names:
        rst[model_name][task_name] = get_all_task_id(
            rst_dir, map_dir, model_name, task_name, difficulty
        )

In [None]:
from itertools import combinations

In [None]:
def excel_intersection_eval(model1,model2,eval_difficulty):
    """Evaluate ``model1`` on every task using the task ids collected for ``model2``.

    Reads the notebook globals ``task_names``, ``map_dir``, ``save_dir`` and
    ``rst``. For each task, ``eval_all`` is called with ``rst[model2][task_name]``
    as its ``eval_dict`` and writes into ``<save_dir>/<model1>/<task_name>``.

    Args:
        model1: model to evaluate; its prefix selects the result directory.
        model2: model whose ``rst`` entries are passed as ``eval_dict``.
        eval_difficulty: forwarded to ``eval_all``.
    """
    # (prefix, result directory) pairs; first match wins, anything else is
    # evaluated from the GPT directory.
    # NOTE(review): the GPT directory here ends in "-clean-new", while the cell that
    # fills ``rst`` for the GPT models reads "...-clean-new-new" -- confirm which is intended.
    dirs_by_prefix = (
        ('llama', '/share/data/mei-work/kangrui/github/mango/kangrui/data/llama2_13b_base_results_0816'),
        ('rwkv', '/share/data/mei-work/kangrui/github/mango/kangrui/data/rwkv_results_0709'),
    )
    rst_dir = '/share/data/mei-work/kangrui/github/mango/kangrui/data/gpt-games-results-clean-new'
    for prefix, candidate_dir in dirs_by_prefix:
        if model1.startswith(prefix):
            rst_dir = candidate_dir
            break

    for task_name in task_names:
        save_path = osp.join(save_dir, model1, task_name)
        eval_all(rst_dir, map_dir, save_path, model1, task_name, eval_difficulty, rst[model2][task_name])

In [None]:
# Evaluate every unordered pair of models in both directions. ``save_dir`` is a
# global read by excel_intersection_eval, so it must be set before each call.
model_names = ['llama', 'llama_anno', 'gpt-3.5-turbo', 'gpt-4', 'rwkv', 'rwkv_anno']
for pair in combinations(model_names, 2):
    save_dir = f'/share/data/mei-work/kangrui/github/mango/kangrui/eval_results/results_{pair[0]}_vs_{pair[1]}_0806_{difficulty}/'
    excel_intersection_eval(*pair, difficulty)
    excel_intersection_eval(*pair[::-1], difficulty)

In [None]:
# Sanity check: list the model pairs that the pairwise evaluation iterates over.
for pair in combinations(model_names, 2):
    print(pair)