In [77]:
import os
import os.path as osp
import networkx as nx
import json
import matplotlib.pyplot as plt
from utils.map_utils import get_game_info_with_G_eval
from utils.map_utils_old import find_all_paths
from utils.clean_utils import compute_hash_for_path
import tiktoken
import openai
import time
import traceback
import pandas as pd

In [78]:
def get_cut_off_and_walkthrough_text(walkthrough:str,token_size_limit=3600,model_name='gpt-4',max_cut_off=70):
    """Pick the last walkthrough step that fits inside a token budget.

    The walkthrough is tokenized with the tiktoken encoding for ``model_name``
    and cut to at most ``token_size_limit`` tokens. The step number is then
    read from the second-to-last ``'NUM: '`` marker of that (possibly
    truncated) text and capped at ``max_cut_off``.

    Note that the second-to-last marker is used even when nothing was
    truncated (presumably because the last visible step may be cut mid-way
    -- TODO confirm that dropping the final step is intended in that case).

    Args:
        walkthrough: Full walkthrough text containing ``'NUM: <int>'`` markers,
            each followed by a newline.
        token_size_limit: Maximum number of tokens inspected for markers.
        model_name: Model name passed to ``tiktoken.encoding_for_model``.
        max_cut_off: Upper bound applied to the returned step number
            (defaults to 70, the value that used to be hard-coded).

    Returns:
        A ``(cut_off_number, token_count)`` tuple. ``token_count`` is the
        number of tokens in the part of ``walkthrough`` that precedes the
        first occurrence of ``'NUM: <cut_off_number + 1>'``. Despite the
        function name, the text itself is not returned.

    Raises:
        ValueError: If the token-limited text holds fewer than two
            ``'NUM: '`` markers, or the selected marker is not followed by
            an integer.
    """
    encoder = tiktoken.encoding_for_model(model_name)
    tokens = encoder.encode(walkthrough)

    # Slicing is a no-op when the walkthrough already fits the budget, so no
    # separate "short enough" branch is required.
    visible_text = encoder.decode(tokens[:token_size_limit])

    segments = visible_text.split('NUM: ')
    if len(segments) < 3:
        # With fewer than two markers, segments[-2] would be the text *before*
        # the first marker (or not exist at all), which is not a step number.
        raise ValueError(
            "walkthrough needs at least two 'NUM: ' markers within the first "
            "{} tokens, found {}".format(token_size_limit, len(segments) - 1)
        )

    cut_off_number = min(int(segments[-2].split('\n')[0]), max_cut_off)

    walkthrough_text = walkthrough.split('NUM: {}'.format(cut_off_number + 1))[0]

    return cut_off_number,len(encoder.encode(walkthrough_text))

def normalized_edit_distance(s1, s2):
    """Return a word-level similarity score between two strings.

    Both strings are lower-cased and split on whitespace; the Levenshtein
    distance is then computed over the resulting word sequences and
    normalised by the longer sequence's length.

    Despite the name, the returned value is a *similarity*:
    ``1 - distance / max(len(words1), len(words2))``, so 1.0 means the word
    sequences are identical and 0.0 means every position differs.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float in ``[0, 1]``. Two strings without any words compare as
        identical (1.0) instead of raising ``ZeroDivisionError``.
    """
    words_a = s1.lower().split()
    words_b = s2.lower().split()

    longest = max(len(words_a), len(words_b))
    if longest == 0:
        # Nothing to compare on either side: treat as identical.
        return 1.0

    # Only the previous DP row is needed to compute the next one.
    previous = list(range(len(words_b) + 1))
    for i, word_a in enumerate(words_a, start=1):
        current = [i]
        for j, word_b in enumerate(words_b, start=1):
            if word_a == word_b:
                current.append(previous[j - 1])
            else:
                current.append(1 + min(
                    previous[j],      # deletion
                    current[j - 1],   # insertion
                    previous[j - 1],  # substitution
                ))
        previous = current

    return 1 - previous[-1] / longest

def get_edegs(G):
    """Return every edge of ``G``, as yielded by ``G.edges(data=True)``, in a list."""
    return list(G.edges(data=True))


In [79]:
def num_of_locations(G,cut_off_number):
    """Count the distinct endpoints of edges whose 'step_min_cutoff' is <= ``cut_off_number``."""
    visited = set()
    for source, target, attrs in get_edegs(G):
        if attrs['step_min_cutoff'] <= cut_off_number:
            visited.update((source, target))
    return len(visited)

def num_of_exp_edges(G,cut_off_number):
    """Count edges within the cutoff whose 'seen_in_forward' attribute is truthy."""
    return sum(
        1
        for _, _, attrs in get_edegs(G)
        if attrs['step_min_cutoff'] <= cut_off_number and attrs['seen_in_forward']
    )

def num_of_imp_edges(G,cut_off_number):
    """Count edges within the cutoff whose 'seen_in_forward' attribute is falsy."""
    return sum(
        1
        for _, _, attrs in get_edegs(G)
        if attrs['step_min_cutoff'] <= cut_off_number and not attrs['seen_in_forward']
    )

def _locations_within_cutoff(G, cut_off_number):
    """List the distinct endpoints of edges whose 'step_min_cutoff' is <= ``cut_off_number``."""
    found = set()
    for source, target, attrs in G.edges(data=True):
        if attrs['step_min_cutoff'] <= cut_off_number:
            found.update((source, target))
    return list(found)


def _total_closest_peer_similarity(locations):
    """Sum, over all locations, the highest name similarity to any *other* location."""
    total = 0
    for i, name in enumerate(locations):
        peer_scores = [
            normalized_edit_distance(name, other)
            for j, other in enumerate(locations)
            if j != i  # Don't compare an item with itself
        ]
        # A location without any peer cannot be confused with anything; it
        # contributes 0 (previously it contributed -inf to the sum).
        if peer_scores:
            total += max(peer_scores)
    return total


def ratio_of_conf_locations(G,all2all,cut_off_number):
    """Average, per location, of the best name similarity to another location.

    Locations are the endpoints of edges of ``G`` whose 'step_min_cutoff' is
    <= ``cut_off_number``. Name similarity is ``normalized_edit_distance``.

    Args:
        G: Graph exposing ``edges(data=True)``.
        all2all: Unused; kept so the signature matches the other metrics.
        cut_off_number: Highest step number taken into account.

    Returns:
        The summed best-peer similarity divided by the number of locations,
        or 0 when there is no location within the cutoff.
    """
    locations = _locations_within_cutoff(G, cut_off_number)
    if not locations:
        return 0
    return _total_closest_peer_similarity(locations) / len(locations)


def num_of_conf_locations(G,all2all,cut_off_number):
    """Sum, over all locations, of the best name similarity to another location.

    Same computation as ``ratio_of_conf_locations`` without the division by
    the number of locations.

    Args:
        G: Graph exposing ``edges(data=True)``.
        all2all: Unused; kept so the signature matches the other metrics.
        cut_off_number: Highest step number taken into account.

    Returns:
        The summed best-peer similarity (0 when there are fewer than two
        locations within the cutoff).
    """
    return _total_closest_peer_similarity(_locations_within_cutoff(G, cut_off_number))
    
    

def average_length_of_all2all(all2all,cut_off_number):
    """Mean 'step_count' of the paths whose 'path_min_cutoff' is <= ``cut_off_number`` (0 if none)."""
    step_counts = [
        path['step_count']
        for path in all2all
        if path['path_min_cutoff'] <= cut_off_number
    ]
    if not step_counts:
        return 0
    return sum(step_counts) / len(step_counts)

def average_length_of_all2all_simple(all2all,cut_off_number):
    """Mean 'step_count' of in-cutoff paths whose 'all_steps_seen_in_forward' is truthy (0 if none)."""
    step_counts = [
        path['step_count']
        for path in all2all
        if path['path_min_cutoff'] <= cut_off_number and path['all_steps_seen_in_forward']
    ]
    if not step_counts:
        return 0
    return sum(step_counts) / len(step_counts)

def average_length_of_all2all_hard(all2all,cut_off_number):
    """Mean 'step_count' of in-cutoff paths whose 'all_steps_seen_in_forward' is falsy (0 if none)."""
    step_counts = [
        path['step_count']
        for path in all2all
        if path['path_min_cutoff'] <= cut_off_number and not path['all_steps_seen_in_forward']
    ]
    if not step_counts:
        return 0
    return sum(step_counts) / len(step_counts)
        
def average_num_of_imp_edge(all2all,cut_off_number):
    """Average number of not-'seen_in_forward' edges per hard path.

    A hard path is one whose 'path_min_cutoff' is <= ``cut_off_number`` and
    whose 'all_steps_seen_in_forward' is falsy. Returns 0 when there is none.
    """
    hard_paths = [
        path
        for path in all2all
        if path['path_min_cutoff'] <= cut_off_number and not path['all_steps_seen_in_forward']
    ]
    if not hard_paths:
        return 0

    unseen_edges = sum(
        1
        for path in hard_paths
        for edge in path["path_details"]
        if not edge["seen_in_forward"]
    )
    return unseen_edges / len(hard_paths)

def num_of_special_moves(G,cut_off_number):
    """Count the distinct in-cutoff edge actions that are not one of the ten plain directions."""
    plain_directions = {
        "up", "down",
        "north", "south", "east", "west",
        "northeast", "northwest", "southeast", "southwest",
    }

    special_actions = {
        attrs['action']
        for _, _, attrs in get_edegs(G)
        if attrs['step_min_cutoff'] <= cut_off_number
        and attrs['action'] not in plain_directions
    }
    return len(special_actions)

def num_of_tokens_per_edge(G,cut_off_number,token_num):
    """Return ``token_num`` divided by the number of explicit edges within the cutoff.

    Args:
        G: Graph handed to ``num_of_exp_edges``.
        cut_off_number: Highest step number taken into account.
        token_num: Token count to spread over the explicit edges.

    Returns:
        ``token_num / num_of_exp_edges(G, cut_off_number)``, or 0 when there
        is no explicit edge (same convention as the other averages in this
        notebook, instead of raising ``ZeroDivisionError``).
    """
    exp_edges = num_of_exp_edges(G, cut_off_number)
    return token_num/exp_edges if exp_edges>0 else 0
    

In [80]:
def analyze_map(map_dir,game_name):
    """Compute the map statistics of one game.

    Loads the game through ``get_game_info_with_G_eval``, derives the step
    cutoff and token count from its walkthrough, and evaluates every metric
    at that cutoff.

    Returns:
        Dict mapping metric name to value. 'ratio_of_conf_locations' and
        'avg_len_of_all2all' are intentionally not part of the result.
    """
    _G_eval, G, _actions, _locations, all2all, _all_pairs, walkthrough = \
        get_game_info_with_G_eval(map_dir, game_name)
    cut_off_number, token_num = get_cut_off_and_walkthrough_text(walkthrough)

    result = {}
    result['num_of_locations'] = num_of_locations(G, cut_off_number)
    result['num_of_exp_edges'] = num_of_exp_edges(G, cut_off_number)
    result['num_of_imp_edges'] = num_of_imp_edges(G, cut_off_number)
    # disabled: result['ratio_of_conf_locations'] = ratio_of_conf_locations(G, all2all, cut_off_number)
    result['num_of_conf_locations'] = num_of_conf_locations(G, all2all, cut_off_number)
    # disabled: result['avg_len_of_all2all'] = average_length_of_all2all(all2all, cut_off_number)
    result['avg_len_easy'] = average_length_of_all2all_simple(all2all, cut_off_number)
    result['avg_len_hard'] = average_length_of_all2all_hard(all2all, cut_off_number)
    result['ave_num_of_imp_in_hard'] = average_num_of_imp_edge(all2all, cut_off_number)
    result['num_of_special_moves'] = num_of_special_moves(G, cut_off_number)
    result['avg_len_scene'] = num_of_tokens_per_edge(G, cut_off_number, token_num)
    return result

def get_data(model_name,difficulty,map_dir,result_dir):
    """Collect per-game success rates and map statistics for one model.

    Walks ``result_dir`` looking for directories whose path ends with
    ``model_name``. In each of their sub-directories (tasks), every file whose
    name contains ``difficulty`` is read as CSV: files under 'pathgen' feed
    the 'rf_*' columns, files under any other task feed the 'df_*' columns.
    When several files match, the last one read wins. Each CSV path read is
    printed.

    The last two rows of each CSV are dropped before use (presumably summary
    rows -- TODO confirm against the result writer).

    Args:
        model_name: Directory-name suffix identifying the model.
        difficulty: Substring that a result file name must contain.
        map_dir: Data directory forwarded to ``analyze_map``.
        result_dir: Root directory of the evaluation results.

    Returns:
        Dict mapping game name to a dict holding 'df_easy_success_rate',
        'df_hard_success_rate', 'rf_easy_success_rate',
        'rf_hard_success_rate' and every metric returned by ``analyze_map``.

    Raises:
        FileNotFoundError: If no 'pathgen' CSV or no other-task CSV matching
            ``difficulty`` was found for ``model_name``.
        KeyError: If the 'pathgen' CSV names a game absent from the other CSV.
    """
    rf_df = None
    df_df = None
    for root, dirs, _files in os.walk(result_dir):
        if not root.endswith(model_name):
            continue
        for task in dirs:
            task_dir = osp.join(root, task)
            for file in os.listdir(task_dir):
                if difficulty not in file:
                    continue
                csv_path = os.path.join(task_dir, file)
                if task=='pathgen':
                    rf_df = pd.read_csv(csv_path)
                else:
                    df_df = pd.read_csv(csv_path)
                print(csv_path)

    # Fail with an actionable message instead of an UnboundLocalError below.
    missing = [
        label
        for label, frame in (("'pathgen'", rf_df), ("non-'pathgen'", df_df))
        if frame is None
    ]
    if missing:
        raise FileNotFoundError(
            "no {} result CSV containing {!r} found for model {!r} under {!r}".format(
                " / ".join(missing), difficulty, model_name, result_dir
            )
        )

    df_dict=df_df[:-2].to_dict()
    rf_dict=rf_df[:-2].to_dict()

    rst={}
    for k,v in df_dict['name'].items():
        rst[v]={
            'df_easy_success_rate':df_dict['easy_success_rate'][k],
            'df_hard_success_rate':df_dict['hard_success_rate'][k]
        }

    for k,v in rf_dict['name'].items():
        rst[v]['rf_easy_success_rate']=rf_dict['easy_success_rate'][k]
        rst[v]['rf_hard_success_rate']=rf_dict['hard_success_rate'][k]

    for key in rst.keys():
        rst[key].update(analyze_map(map_dir,key))

    return rst

In [81]:
# Notebook inputs
difficulty = 'loose'
map_dir = "/share/data/mei-work/kangrui/github/mango/data/"
result_dir = "/share/data/mei-work/kangrui/github/mango/kangrui/eval_results/results_gpt_overall_0709"

# Per-model results, filled in by get_data for each model below.
final_result = {'gpt-4': {}, 'gpt-3.5-turbo': {}}

for model_name in list(final_result):
    final_result[model_name] = get_data(model_name, difficulty, map_dir, result_dir)

    

/share/data/mei-work/kangrui/github/mango/kangrui/eval_results/results_gpt_overall_0709/gpt-4/pathgen/sorted_result_loose_2023-07-10-02-35-11_760.csv
/share/data/mei-work/kangrui/github/mango/kangrui/eval_results/results_gpt_overall_0709/gpt-4/stepnav/sorted_result_loose_2023-07-10-02-34-11_399.csv
/share/data/mei-work/kangrui/github/mango/kangrui/eval_results/results_gpt_overall_0709/gpt-3.5-turbo/pathgen/sorted_result_loose_2023-07-10-02-38-26_138.csv
/share/data/mei-work/kangrui/github/mango/kangrui/eval_results/results_gpt_overall_0709/gpt-3.5-turbo/stepnav/sorted_result_loose_2023-07-10-02-36-53_691.csv


In [82]:
import numpy as np
import statsmodels.api as sm
from scipy import stats

def calculate_beta_pvalue(x, y):
    """Regress ``y`` on ``x`` (plus an intercept) with OLS.

    Returns:
        ``(beta, p_value)``: the entry at index 1 of the fitted parameters and
        of their p-values, i.e. the slope of ``x`` when the intercept column
        comes first.
    """
    predictor = np.array(x)
    response = np.array(y)

    # Prepend the intercept column, then fit ordinary least squares.
    design = sm.add_constant(predictor)
    fit = sm.OLS(response, design).fit()

    return fit.params[1], fit.pvalues[1]


In [83]:
# Flatten each model's {game: record} mapping into a list of records.
final_result_list = {'gpt-4': [], 'gpt-3.5-turbo': []}
for model_name in list(final_result_list):
    rst_list = list(final_result[model_name].values())
    final_result_list[model_name] = rst_list

In [84]:
# Inspect the first record collected for gpt-4 (success rates plus map statistics).
final_result_list['gpt-4'][0]

{'df_easy_success_rate': 1.0,
 'df_hard_success_rate': 1.0,
 'rf_easy_success_rate': 1.0,
 'rf_hard_success_rate': 1.0,
 'num_of_locations': 5,
 'num_of_exp_edges': 5,
 'num_of_imp_edges': 2,
 'num_of_conf_locations': 0.0,
 'avg_len_easy': 1.9090909090909092,
 'avg_len_hard': 1.8,
 'ave_num_of_imp_in_hard': 1.4,
 'num_of_special_moves': 1,
 'avg_len_scene': 258.2}

In [85]:
# calculate cov
# calculate for RF_HARD
    
    
# df = pd.DataFrame(rst_list)
# df_RF_HARD = df.drop(columns=['rf_hard_success_rate','df_easy_success_rate',
#                               'df_hard_success_rate','rf_easy_success_rate',])
# df_RF_HARD = df_RF_HARD.dropna()
# df_RF_HARD_standardized = (df_RF_HARD - df_RF_HARD.mean()) / df_RF_HARD.std()
# cov_matrix = df_RF_HARD_standardized.cov()
# print(cov_matrix)


In [86]:
# multi-linear

# import statsmodels.api as sm
# from scipy import stats
# y=df['rf_hard_success_rate']
# X=df_RF_HARD_standardized
# X = sm.add_constant(X)

# # Fit ordinary least squares regression
# model = sm.OLS(y, X)
# results = model.fit()

# # Get slope (beta) and p-value
# # beta = results.params[1]
# # p_value = results.pvalues[1]

# print(results.pvalues)
# print(results.params)

In [87]:
#PCA

# from sklearn.decomposition import PCA

# pca = PCA()
# pca.fit(df_RF_HARD_standardized)

# # Get the eigenvalues (explained variance)
# eigenvalues = pca.explained_variance_
# print("\nEigenvalues (Explained Variance): ")
# print(eigenvalues)

In [88]:
# eigenvectors = pca.components_
# feature_names = df_RF_HARD_standardized.columns

# print("Eigenvectors (Principal Components):")
# for i, eigenvector in enumerate(eigenvectors):
#     print(f"\nPrincipal Component {i+1}:")
#     for feature_weight, feature_name in zip(eigenvector, feature_names):
#         print(f"{feature_name}: {feature_weight}")

In [89]:
#calculate average

# keys = rst_list[0].keys()
# total = {key: 0 for key in keys}

# # loop over the list to sum up the values for each key
# for dictionary in rst_list:
#     for key in keys:
#         total[key] += dictionary[key]

# # calculate the average for each key
# average = {key: total_value / len(rst_list) for key, total_value in total.items()}

# average

In [90]:
# Spot-check the map statistics computed for a single game ('lostpig').
analyze_map(map_dir,'lostpig')

{'num_of_locations': 6,
 'num_of_exp_edges': 6,
 'num_of_imp_edges': 3,
 'num_of_conf_locations': 1.5,
 'avg_len_easy': 2.0,
 'avg_len_hard': 1.8888888888888888,
 'ave_num_of_imp_in_hard': 1.5555555555555556,
 'num_of_special_moves': 0,
 'avg_len_scene': 579.3333333333334}

In [91]:
def process_data(data_list):
    """Regress every success rate on every map statistic, one pair at a time.

    For each (y, x) pair, records where either value is NaN/None are dropped,
    ``x`` is standardised (zero mean, unit population standard deviation) and
    ``calculate_beta_pvalue`` fits ``y ~ const + x``.

    Args:
        data_list: List of per-game dicts holding the four ``*_success_rate``
            targets plus any number of numeric predictors. The predictor
            names are taken from the first record.

    Returns:
        ``{y_var: {x_var: {'beta': ..., 'p-value': ...}}}``. Both entries are
        NaN when the predictor has no usable value or no variance, because it
        cannot be standardised. An empty ``data_list`` yields an empty inner
        dict for every target.
    """
    y_vars = ['df_easy_success_rate', 'df_hard_success_rate', 'rf_easy_success_rate', 'rf_hard_success_rate']
    if not data_list:
        return {y_var: {} for y_var in y_vars}

    x_vars = [key for key in data_list[0].keys() if key not in y_vars]

    # Extract every column once; dtype=float turns None into NaN so that the
    # NaN mask below also covers missing values.
    columns = {
        name: np.array([d[name] for d in data_list], dtype=float)
        for name in y_vars + x_vars
    }

    results = {}

    for y_var in y_vars:
        results[y_var] = {}
        y_all = columns[y_var]
        for x_var in x_vars:
            x_all = columns[x_var]

            # Keep only the records where both x and y are available.
            keep = ~(np.isnan(x_all) | np.isnan(y_all))
            x_values = x_all[keep]
            y_values = y_all[keep]

            sigma = np.std(x_values) if x_values.size else 0.0
            if not sigma > 0:
                # Constant (or empty) predictor: dividing by sigma would give
                # NaN/inf inputs to the regression, so report NaN explicitly.
                results[y_var][x_var] = {'beta': float('nan'), 'p-value': float('nan')}
                continue

            x_values = (x_values - np.mean(x_values)) / sigma

            beta, p_value = calculate_beta_pvalue(x_values, y_values)
            results[y_var][x_var] = {'beta': beta, 'p-value': p_value}

    return results

In [92]:
# Single-predictor regressions for each model.
reg_result = {
    model: process_data(final_result_list[model])
    for model in ('gpt-4', 'gpt-3.5-turbo')
}


In [104]:
# Show the regression result of every predictor for gpt-4 on 'rf_easy_success_rate'.
for predictor, estimate in reg_result['gpt-4']['rf_easy_success_rate'].items():
    print(predictor, estimate)

num_of_locations {'beta': -0.12016535494827207, 'p-value': 0.00011598685869870894}
num_of_exp_edges {'beta': -0.09559628329754738, 'p-value': 0.0028792092769139546}
num_of_imp_edges {'beta': -0.08822720385054866, 'p-value': 0.006298238360291655}
num_of_conf_locations {'beta': -0.08827252318194721, 'p-value': 0.006269385869155417}
avg_len_easy {'beta': -0.1314709071766833, 'p-value': 1.808583062434654e-05}
avg_len_hard {'beta': -0.0739060025739398, 'p-value': 0.023666100699843016}
ave_num_of_imp_in_hard {'beta': -0.061741559755436166, 'p-value': 0.06074136012874523}
num_of_special_moves {'beta': -0.0993091348756164, 'p-value': 0.0018857829834572162}
avg_len_scene {'beta': 0.0690154145299896, 'p-value': 0.03522679377845401}


In [94]:
# Keep only the regressions against the two hard success rates, per model.
hard = {
    model: {
        target: reg_result[model][target]
        for target in ('df_hard_success_rate', 'rf_hard_success_rate')
    }
    for model in ('gpt-4', 'gpt-3.5-turbo')
}

In [95]:
# Dump the hard-success-rate regression results of both models.
print(hard)

{'gpt-4': {'df_hard_success_rate': {'num_of_locations': {'beta': -0.08341181513601865, 'p-value': 0.006811663409619938}, 'num_of_exp_edges': {'beta': -0.0800395365099226, 'p-value': 0.009713516143594897}, 'num_of_imp_edges': {'beta': -0.11763945823293881, 'p-value': 5.348549384740426e-05}, 'num_of_conf_locations': {'beta': -0.0680822117973976, 'p-value': 0.029892682274400055}, 'avg_len_easy': {'beta': -0.06845454206293565, 'p-value': 0.028949562718192934}, 'avg_len_hard': {'beta': -0.09408821810801356, 'p-value': 0.0019540757561536887}, 'ave_num_of_imp_in_hard': {'beta': -0.08076441561731834, 'p-value': 0.009013545231994258}, 'num_of_special_moves': {'beta': 0.01349843491951645, 'p-value': 0.6760845527924433}, 'avg_len_scene': {'beta': 0.08384271728695326, 'p-value': 0.006501256593029141}}, 'rf_hard_success_rate': {'num_of_locations': {'beta': -0.11454375361064521, 'p-value': 0.006459108102809318}, 'num_of_exp_edges': {'beta': -0.10711085775998438, 'p-value': 0.011374659742625207}, 'nu

In [87]:
# Print the 'num_of_conf_locations' entry of every top-level key in final_rst.
# NOTE(review): `final_rst` is not defined in any cell visible in this notebook;
# unless it is created elsewhere, this cell raises NameError on a fresh kernel.
# TODO: confirm where it came from (or switch to `reg_result[<model>]`).
for k, v in final_rst.items():
    for kk,vv in v.items():
        if kk=='num_of_conf_locations':
            print(k,kk,vv)

df_easy_success_rate num_of_conf_locations {'beta': -0.018053814619060516, 'p-value': 0.017034688367336917}
df_hard_success_rate num_of_conf_locations {'beta': -0.01324795427531114, 'p-value': 0.12503585724246571}
rf_easy_success_rate num_of_conf_locations {'beta': -0.021445333987439336, 'p-value': 0.006012165237880988}
rf_hard_success_rate num_of_conf_locations {'beta': -0.016661551285832715, 'p-value': 0.042511421500743614}


In [88]:
# for k, v in final_rst.items():
#      for kk,vv in v.items():
#         if kk=='num_of_conf_locations':
#             print(k,kk,vv)