In [1]:
import os
import os.path as osp
import networkx as nx
import json
import matplotlib.pyplot as plt
import hashlib
import shutil
import subprocess


from utils.map_utils import get_game_info
from utils.eval_utils import *
from utils.clean_utils import *

In [2]:
# notebook input
map_dir='/share/data/mei-work/kangrui/github/mango/data/maps/'
result_dir='/share/data/mei-work/kangrui/github/mango/kangrui/data/gpt-games-results'
save_path='/share/data/mei-work/kangrui/github/mango/kangrui/data/gpt-games-results-clean'

In [3]:
# merge results in gpt-games-results and gpt-games-results/outputs_diff

if not osp.exists(save_path):
    path1 = result_dir
    path2 = osp.join(result_dir,'outputs_diff')
    path3 = save_path



    # Merge directories
    merge_dirs(path1, path3)
    merge_dirs(path2, path3)
    for idx,game_name in enumerate(os.listdir(save_path)):
        if game_name in ['outputs_diff','.gitattributes','.gitignore','.git']:
            name_to_delete=osp.join(save_path,game_name)
            try:
                # Use -rf to recursively force deletion
                subprocess.run(["rm", "-rf", name_to_delete], check=True)
                print(f"{name_to_delete} has been deleted.")
            except subprocess.CalledProcessError as e:
                print(f"Failed to delete {name_to_delete}. Error: {str(e)}")

In [4]:
# specific functions for data cleaning
def get_collection_and_drop_rf(game_name,map_dir,final_dir_path):
    assert game_name in final_dir_path
    G,all2all,all_pairs,anno2code,code2anno=get_game_info(map_dir,game_name)
    path_id_set=get_path_id_set(all2all)

   
    drop_list=[]
    collection={}
    for idx,file_name in enumerate(os.listdir(final_dir_path)):

        final_file_path=osp.join(final_dir_path,file_name)
        try:
            path_gt=extract_path_gt_from_file(final_file_path)
        except Exception as e:
            print(final_file_path,e)
        path_id=compute_hash_for_path(path_gt)

        if path_id in path_id_set:
            if path_id in collection.keys():
                collection[path_id].append(final_file_path)
            else:
                collection[path_id]=[final_file_path]
        else:
            drop_list.append(final_file_path)
    
    collection_dup={}
    for k,v in collection.items():
        if len(v)>1:
            collection_dup[k]=v
    return drop_list,collection_dup

def handle_dup_drop_rf(game_name,map_dir,collection_dup):
    G,all2all,all_pairs,anno2code,code2anno=get_game_info(map_dir,game_name)
#     print([len(v) for k,v in collection_dup.items()])
    for k,v in collection_dup.items():
        flag=True
        for idx,file in enumerate(v):
            flag_easy,flag_harsh,flag_hard=eval_rf(file,G,all2all,all_pairs,anno2code,code2anno)
            
            if flag_easy==1 or flag_harsh==1:
                collection_dup[k].pop(idx)
                flag=False
                print("right answer found")
                break
        if flag:
            collection_dup[k].pop()
#     print([len(v) for k,v in collection_dup.items()])
    for k,v in collection_dup.items():
        for file_name in v:
            os.remove(file_name)

In [5]:
# clean data for pathgen
import os
for idx,game_name in enumerate(os.listdir(save_path)):
    print(f'checking {game_name} ...')
    G,all2all,all_pairs,anno2code,code2anno=get_game_info(map_dir,game_name)
    
    # get path id set in a game
    path_id_set=get_path_id_set(all2all)
    
    # iterate pathgen in result-clean
    file_dir=osp.join(save_path,game_name,'results')
    
#     print(file_dir)
    # iterate pathgen for gpt3 and gpt4
    for idx,dir_path in enumerate(os.listdir(file_dir)):
        if not dir_path.startswith("pathgen"):
            continue
            
        final_dir_path=osp.join(file_dir,dir_path)
        
        drop_list,collection_dup=get_collection_and_drop_rf(game_name,map_dir,final_dir_path)
        if len(collection_dup.keys())>0:
            handle_dup_drop_rf(game_name,map_dir,collection_dup)
#         os.sys.exit()
        for file_name in drop_list:
            os.remove(file_name)

checking afflicted ...


FileNotFoundError: [Errno 2] No such file or directory: '/share/data/mei-work/kangrui/github/mango/data/maps/afflicted/afflicted.all2all.json'

In [None]:
def eval_df(file,G,all2all,all_pairs,anno2code,code2anno):
    #evaluate destination finding given one question
    with open(file,'r') as f:
        result_json = json.load(f)
    
    if not 'path' in result_json.keys():
        print(file,'path not existed, skip eval')
        return -1,-1
    
    cutoff=get_cutoff(result_json)
    
    if (not isinstance(result_json['path'][-1],dict)):
        print(file,'path format error')
        return -1,-1
    
    try:
        src_node,dst_node,path=deal_anno(result_json,anno2code)
    except Exception as e:
        print(file,e)
        os.sys.exit()
        
    pair=find_node_pair(src_node,dst_node,all_pairs)
    
    if (not isinstance(path[-1],dict)) or ('node' not in path[-1].keys()):
        print(file,'path format error')
        return -1,-1
    
    if pair is None:
        print(file,'pair is none, skip eval')
        return -1,-1
        
    
    if pair['num_paths']<1:
        print(file,'pair unreachable, skip eval')
        return -1,-1
    
    if min(pair['path_min_cutoffs'])>cutoff:
        print(file,f"path_min_cutoffs {min(pair['path_min_cutoffs'])} > {cutoff} exceeded, skip eval")
        return -1,-1
    
    # evaluation begins
    flag_eval=dst_node==path[-1]['node']
    flag_hard=False
    for edge in result_json['path_gt']:
        if edge["seen"]==False:
            flag_hard=True
            break
    
    return flag_eval,flag_hard

def get_collection_and_drop_df(game_name,map_dir,final_dir_path):
    assert game_name in final_dir_path
    G,all2all,all_pairs,anno2code,code2anno=get_game_info(map_dir,game_name)
    path_id_set=get_path_id_set(all2all)

   
    drop_list=[]
    collection={}
    for idx,file_name in enumerate(os.listdir(final_dir_path)):

        final_file_path=osp.join(final_dir_path,file_name)
        try:
            path_gt=extract_path_gt_from_file(final_file_path)
        except Exception as e:
            print(final_file_path,e)
        path_id=compute_hash_for_path(path_gt)

        if path_id in path_id_set:
            if path_id in collection.keys():
                collection[path_id].append(final_file_path)
            else:
                collection[path_id]=[final_file_path]
        else:
            drop_list.append(final_file_path)
    
    collection_dup={}
    for k,v in collection.items():
        if len(v)>1:
            collection_dup[k]=v
    return drop_list,collection_dup

def handle_dup_drop_df(game_name,map_dir,collection_dup):
    G,all2all,all_pairs,anno2code,code2anno=get_game_info(map_dir,game_name)
#     print([len(v) for k,v in collection_dup.items()])
    for k,v in collection_dup.items():
        flag=True
        for idx,file in enumerate(v):
            flag_eval,flag_hard=eval_df(file,G,all2all,all_pairs,anno2code,code2anno)
            
            if flag_eval==1 or flag_hard==1:
                collection_dup[k].pop(idx)
                flag=False
                print("right answer found")
                break
        if flag:
            collection_dup[k].pop()
#     print([len(v) for k,v in collection_dup.items()])
    for k,v in collection_dup.items():
        for file_name in v:
            os.remove(file_name)

In [None]:
# clean data for stepnav
import os
for idx,game_name in enumerate(os.listdir(save_path)):
    print(f'checking {game_name} ...')
    G,all2all,all_pairs,anno2code,code2anno=get_game_info(map_dir,game_name)
    
    # get path id set in a game
    path_id_set=get_path_id_set(all2all)
    
    # iterate pathgen in result-clean
    file_dir=osp.join(save_path,game_name,'results')
    
#     print(file_dir)
    # iterate pathgen for gpt3 and gpt4
    for idx,dir_path in enumerate(os.listdir(file_dir)):
        if not dir_path.startswith("stepnav"):
            continue
            
        final_dir_path=osp.join(file_dir,dir_path)
        
        drop_list,collection_dup=get_collection_and_drop_df(game_name,map_dir,final_dir_path)
        
        if len(collection_dup.keys())>0:
            handle_dup_drop_df(game_name,map_dir,collection_dup)
#         os.sys.exit()
        for file_name in drop_list:
            os.remove(file_name)