In [1]:
import os
import pandas as pd
import numpy as np
import traceback
import json
import math
from functools import partial
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, kendalltau
from collections import Counter
sns.set()

In [2]:
#!tree results

In [3]:
ROOT = "./_answers/results_gemini_0shot"
HUMAN_RESULTS = "./_human_annotations/ImagenHub_human_eval_results"

In [4]:
#!python count_entries.py results_gemini_0shot

In [5]:
def sigfig(number, sigfigs=4, digit_mode=True):
    """
    Convert a number to its significant figure representation.
    
    Args:
        number (float/list): Number or list of numbers to convert.
        sigfigs (int, optional): Number of significant figures to keep. Defaults to 4.
        digit_mode (bool, optional): If set to True, will use the digit mode for formatting. Defaults to True.
        
    Returns:
        float/list: Number(s) in their significant figure representation.
    """
    if digit_mode:
        string_mode = '{:#.{sigfigs}f}'
    else:
        string_mode = '{:#.{sigfigs}g}'
    if isinstance(number, list):
        new_numbers = []
        for num in number:
            new_num = string_mode.format(num, sigfigs=sigfigs)
            new_numbers.append(float(new_num))
        return new_numbers
    else:
        return float(string_mode.format(number, sigfigs=sigfigs))

def map_to_nearest_higher(number, target_numbers=[0.0, 0.17, 0.33, 0.5, 0.67, 0.83, 1.0], not_mapping=True):
    """
    Maps the given number to the nearest higher number in the target_numbers list.
    If the number is higher than the highest in the list, return the highest number.

    :param number: A float number between 0.0 and 1.0.
    :param target_numbers: A list of numbers to which the given number should be mapped.
    :return: The nearest higher number in the target_numbers list to the given number.
    """
    if not_mapping:
        if number > 1.0:
            return 1.0
        if number < 0.0:
            return 0.0
        return number
    
    # Sort the target numbers just in case they are not sorted
    target_numbers = sorted(target_numbers)

    # Find the nearest higher number
    for target in target_numbers:
        if target >= number:
            return target
    return target_numbers[-1]  # Return the maximum if no higher number is found

In [6]:
def process_json_data(json_data):
    """
    Processes the provided JSON data (in dictionary form) to retain only the 'score' values for each entry.

    :param json_data: A dictionary representing the JSON data.
    :return: A dictionary with only the 'score' values.
    """
    processed_data = {key: value['score'] for key, value in json_data.items()}

    return processed_data

def read_json_files(root_dir=ROOT):
    """
    Reads all JSON files in the specified directory structure and returns their contents.

    :param root_dir: The root directory to start the search from.
    :return: A dictionary with file paths as keys and their content as values.
    """
    json_contents = {}
    
    # Walk through the directory
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for file in filenames:
            if file.endswith('.json'):
                # Construct full file path
                file_path = os.path.join(dirpath, file)
                
                # Read the JSON file
                with open(file_path, 'r') as json_file:
                    data = json.load(json_file)
                    data = process_json_data(data)
                    json_contents[file_path] = data

    return json_contents


def read_json(result_folder, task, modelname, filename):
    # Construct the full path to the JSON file
    file_path = os.path.join(result_folder, task, modelname, filename)
    
    try:
        # Read the JSON file
        with open(file_path, 'r') as file:
            data = json.load(file)
    except:
        return None
    
    return data

def grab_gpt4v_processed(task, modelname, filename, result_folder=ROOT, dumb_way=False):
    if dumb_way:
        out = read_json_files()
        return out[f'{result_folder}/{task}/{modelname}/{filename}.json']
    else:
        data = read_json(result_folder, task, modelname, f"{filename}.json")
        if data is not None:
            data = process_json_data(data)
        return data
    
    

In [7]:
def iterate_all_results(root=ROOT):
    task_list = os.listdir(root) 
    for task in task_list:
        print("===============>", task)
        model_list = os.listdir(os.path.join(root,task))
        for model in model_list:
            print("======>", model, ": ")
            SC_results = grab_gpt4v_processed(task, model, "SC")
            PQ_results = grab_gpt4v_processed(task, model, "PQ")
            
            SC_results, PQ_results = align_dicts(SC_results, PQ_results)
            
            
            SC_list = parse_scores_to_list(SC_results)
            print("Length: ",len(SC_list))
            SC_mean, SC_std  = get_statistics(SC_list)
            print("SC | mean: ", sigfig(SC_mean), " std: ", sigfig(SC_std))

            PQ_list = parse_scores_to_list(PQ_results)
            PQ_mean, PQ_std = get_statistics(PQ_list)
            print("PQ | mean: ", sigfig(PQ_mean), " std: ", sigfig(PQ_std))
            
            O_list = get_O_from_two_lists(SC_list, PQ_list)
            O_mean, O_std = get_statistics(O_list, False)
            print("O | mean: ", sigfig(O_mean), " std: ", sigfig(O_std))

def align_dicts(dict1, dict2):
    """
    Filter out both dict1 and dict2 to hold the common keys in the same order.
    """
    # Find the common keys
    common_keys = set(dict1.keys()) & set(dict2.keys())

    # Filter and order dict1 and dict2 using the common keys
    dict1_aligned = {key: dict1[key] for key in sorted(common_keys)}
    dict2_aligned = {key: dict2[key] for key in sorted(common_keys)}

    return dict1_aligned, dict2_aligned

def parse_scores_to_list(data_dict):
    score_list = []
    for scores in data_dict.values():
        score_list.append(scores)
        #print(scores)

    return score_list

def preprocess(_list):
    temp_list = []
    for scores in _list:
        if isinstance(scores, (int, float)):
            temp_list.append(map_to_nearest_higher(scores/10.0))
        else:
            scores = [int(score) for score in scores]
            temp_list.append(map_to_nearest_higher(min(scores)/10.0))
    return temp_list
    #return [min(scores)/10.0 for scores in _list]
    
def get_O_from_two_dicts_gpt4v(SC_dict, PQ_dict, normalize=True):
    return_dict = {}
    for key in SC_dict.keys():
        if isinstance(SC_dict[key], (int, float)):
            SC = map_to_nearest_higher(SC_dict[key]/10.0) if normalize else min(SC_dict[key])
        else:
            SC_dict[key] = [int(score) for score in SC_dict[key]]
            SC = map_to_nearest_higher(min(SC_dict[key])/10.0) if normalize else min(SC_dict[key])

        if isinstance(PQ_dict[key], (int, float)):
            PQ = map_to_nearest_higher(PQ_dict[key]/10.0) if normalize else min(PQ_dict[key])
        else:
            PQ_dict[key] = [int(score) for score in PQ_dict[key]]
            PQ = map_to_nearest_higher(min(PQ_dict[key])/10.0) if normalize else min(PQ_dict[key])
        return_dict[key] = math.sqrt(SC * PQ)
    return return_dict

def get_O_from_two_lists(SC_list, PQ_list, preprocess_fn=preprocess):
    """
    Compute the geometric mean of two lists on an instance level.

    Parameters:
    list1 (list): The first list of numbers.
    list2 (list): The second list of numbers.

    Returns:
    list: A list containing the geometric means of corresponding elements from list1 and list2.
    """
    # Check if both lists are of the same length
    if len(SC_list) != len(PQ_list):
        raise ValueError("Both lists must be of the same length")
    
    if(preprocess_fn):
        SC_list = preprocess(SC_list)
        PQ_list = preprocess(PQ_list)
    
    # Calculate the geometric mean for each pair of elements
    return [math.sqrt(x * y) for x, y in zip(SC_list, PQ_list)]


def get_statistics(score_list, preprocess=True):
    new_score_list = []
    if preprocess:
        for scores in score_list:
            if isinstance(scores, (int, float)):
                score = scores
            else:
                scores = [int(val) for val in scores]
                score = min(scores)
            #score = map_values(score)
            new_score_list.append(map_to_nearest_higher(score/10.0))
        score_list = new_score_list
    mean = np.mean(score_list)
    std = np.std(score_list)
    return mean, std


In [8]:
def analyze_scores(root=ROOT, per_model=False):
    task_list = os.listdir(root) 
    for task in task_list:
        print("===============>", task)
        model_list = os.listdir(os.path.join(root,task))
        task_counter_SC = Counter()
        task_counter_PQ = Counter()
        task_counter_O = Counter()
        
        for model in model_list:
            print("======>", model, ": ")
            SC_results = grab_gpt4v_processed(task, model, "SC")
            PQ_results = grab_gpt4v_processed(task, model, "PQ")
            
            SC_results, PQ_results = align_dicts(SC_results, PQ_results)
            
            SC_list = parse_scores_to_list(SC_results)
            PQ_list = parse_scores_to_list(PQ_results)
            O_list = get_O_from_two_lists(SC_list, PQ_list)
            print("SC list | ", sort_counter_keys(Counter(to_one_score_list(SC_list))))
            print("PQ list | ", sort_counter_keys(Counter(to_one_score_list(PQ_list))))
            print("O list | ", sort_counter_keys(correct_keys_and_sum(Counter(O_list))))
            if per_model:
                plot_counter_bar_chart(sort_counter_keys(Counter(to_one_score_list(SC_list))), "red", title=f"{task} | {model}(SC)")
                plot_counter_bar_chart(sort_counter_keys(Counter(to_one_score_list(PQ_list))), "blue", title=f"{task} | {model}(PQ)")
                plot_counter_bar_chart(correct_keys_and_sum(sort_counter_keys(correct_keys_and_sum(Counter(O_list)))), "purple", title=f"{task} | {model}(O)")
            task_counter_SC += Counter(sort_counter_keys(sort_counter_keys(Counter(to_one_score_list(SC_list)))))
            task_counter_PQ += Counter(sort_counter_keys(sort_counter_keys(Counter(to_one_score_list(PQ_list)))))
            task_counter_O += Counter(sort_counter_keys(sort_counter_keys(Counter(O_list))))
        
        if not per_model:
            plot_counter_bar_chart(task_counter_SC, "red", title=f"{task} (SC)")
            plot_counter_bar_chart(task_counter_PQ, "blue", title=f"{task} (PQ)")
            plot_counter_bar_chart(correct_keys_and_sum(task_counter_O), "purple", title=f"{task} (O), nearest 0.1")

def analyze_scores_SC(root=ROOT, per_model=False):
    task_list = os.listdir(root) 
    for task in task_list:
        print("===============>", task)
        model_list = os.listdir(os.path.join(root,task))
        task_counter_SC = Counter()
        
        for model in model_list:
            print("======>", model, ": ")
            SC_results = grab_gpt4v_processed(task, model, "SC")
            if SC_results is None:
                print(f"{task} | {model} not found")
                continue
            SC_list = parse_scores_to_list(SC_results)
            print("SC list | ", sort_counter_keys(Counter(to_one_score_list(SC_list))))
            if per_model:
                plot_counter_bar_chart(sort_counter_keys(Counter(to_one_score_list(SC_list))), "red", title=f"{task} | {model}(SC)")
            task_counter_SC += Counter(sort_counter_keys(sort_counter_keys(Counter(to_one_score_list(SC_list)))))
        
        if not per_model:
            plot_counter_bar_chart(task_counter_SC, "red", title=f"{task} (SC)")
            
def sort_counter_keys(counter):
    # Sorting the counter by its keys and creating a sorted dictionary
    sorted_dict = {k: counter[k] for k in sorted(counter)}

    return sorted_dict

def to_one_score_list(scores_list):
    temp_list = []
    for scores in scores_list:
        if isinstance(scores, (int, float)):
            temp_list.append(map_to_nearest_higher(scores/10.0))
        else:
            int_scores = []
            # Iterate through each element in the list
            for item in scores:
                # Convert the item to an integer and append it to the int_list
                int_scores.append(int(item))

            temp_list.append(map_to_nearest_higher(min(int_scores)/10.0))
    return temp_list

def correct_keys_and_sum(counter):
    """
    Adjusts the keys of a Counter object to the nearest 0.1 value and sums up the counts.

    Parameters:
    counter (Counter): The Counter object to be adjusted.

    Returns:
    Counter: The adjusted Counter object.
    """
    new_counter = Counter()
    for key, value in counter.items():
        # Rounding the key to the nearest 0.1
        corrected_key = round(key * 10) / 10
        # Adding the count to the corrected key in the new counter
        new_counter[corrected_key] += value
    return new_counter

def plot_counter_bar_chart(counter, color="blue", bar_width=0.09, title="", alpha=1.0):
    """
    Plots a bar chart based on a Counter object.

    Parameters:
    counter (Counter): The Counter object to be plotted.
    """
    # Extracting keys and values from the Counter object
    keys = list(counter.keys())
    values = list(counter.values())

    # Plotting the bar chart
    plt.figure(figsize=(5, 3))
    plt.bar(keys, values, color=color, width=bar_width, alpha=alpha)
    plt.xlabel('Score')
    plt.ylabel('Counts')
    plt.xticks(np.arange(0, 1.1, 0.1))
    plt.title(title)
    plt.show()

In [9]:
#analyze_scores_SC()

In [10]:
iterate_all_results()

Length:  102
SC | mean:  0.1176  std:  0.1712
PQ | mean:  0.6206  std:  0.1301
O | mean:  0.1851  std:  0.1888
Length:  102
SC | mean:  0.1392  std:  0.1926
PQ | mean:  0.6147  std:  0.1562
O | mean:  0.2211  std:  0.1931
Length:  102
SC | mean:  0.2167  std:  0.262
PQ | mean:  0.6814  std:  0.144
O | mean:  0.2977  std:  0.2495
Length:  179
SC | mean:  0.4084  std:  0.4562
PQ | mean:  0.505  std:  0.1304
O | mean:  0.307  std:  0.3334
Length:  179
SC | mean:  0.3067  std:  0.4059
PQ | mean:  0.4626  std:  0.1585
O | mean:  0.2259  std:  0.2814
Length:  179
SC | mean:  0.3704  std:  0.4537
PQ | mean:  0.5313  std:  0.1269
O | mean:  0.2819  std:  0.3345
Length:  179
SC | mean:  0.1615  std:  0.321
PQ | mean:  0.3642  std:  0.1504
O | mean:  0.1272  std:  0.2243
Length:  179
SC | mean:  0.6263  std:  0.4325
PQ | mean:  0.4855  std:  0.1375
O | mean:  0.4544  std:  0.3143
Length:  179
SC | mean:  0.0771  std:  0.2472
PQ | mean:  0.4933  std:  0.1385
O | mean:  0.0571  std:  0.1722
Length

In [11]:
#analyze_scores()

In [12]:
def grab_dataframes(root_dir = '.'):
    """
    Grab dataframes from the subdirectories of a given root directory.

    Args:
        root_dir (str): The root directory to start searching for .tsv files. Defaults to the current directory.

    Returns:
        dict: A dictionary where keys are subdirectory names and values are lists of TSV dataframes.
    """
    # Initialize an empty dictionary to store dataframes
    dataframes_dict = {}

    # List all subdirectories in the root directory
    subdirectories = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]

    # Iterate through each subdirectory
    for subdir in subdirectories:
        subdir_path = os.path.join(root_dir, subdir)

        # Forget about ipynb things
        if subdir == ".ipynb_checkpoints":
            continue
        # Initialize a list for dataframes in this directory
        dir_dataframes = []

        # Walk through the current subdirectory
        for root, _, files in os.walk(subdir_path):
            for file in files:
                # Check if the file has a .tsv extension
                if file.endswith('.tsv'):
                    # Create the full path to the TSV file
                    tsv_path = os.path.join(root, file)

                    # Read the TSV file into a pandas dataframe
                    df = pd.read_csv(tsv_path, sep='\t')

                    # Append the dataframe to the list for this directory
                    dir_dataframes.append(df)

        # Add the list of dataframes to the dictionary with the subdirectory name as the key
        dataframes_dict[subdir] = dir_dataframes

    # Now you have a dictionary where keys are subdirectory names and values are lists of TSV dataframes
    return dataframes_dict

In [13]:
def extract_values_based_on_keys(target_dict, key_reference_dict):
    """
    Extracts values from the target dictionary based on the keys of the reference dictionary.

    Parameters:
    key_reference_dict (dict): The dictionary whose keys are used as a reference.
    target_dict (dict): The dictionary from which values are extracted.

    Returns:
    dict: A dictionary with keys from the reference dictionary and corresponding values from the target dictionary.
    """
    result = {key: target_dict[key] for key in key_reference_dict.keys() if key in target_dict}
    assert list(result.keys()) == list(key_reference_dict.keys())
    return result


In [14]:
def convert_to_list_of_lists(input_list):
    """
    Convert a mixed list of string representations of lists and actual lists into a list of lists.
    
    Args:
    input_list (list): A list containing a mix of string representations of lists and actual lists.

    Returns:
    list: A list of lists.
    """
    output_list = []

    for item in input_list:
        if isinstance(item, str):
            # Convert string representation of a list to an actual list
            try:
                converted_item = eval(item)
                if isinstance(converted_item, list):
                    output_list.append(converted_item)
                else:
                    raise ValueError(f"Item '{item}' is not a valid list representation.")
            except:
                raise ValueError(f"Unable to convert item '{item}' to a list.")
        elif isinstance(item, list):
            # Directly append if it's already a list
            output_list.append(item)
        else:
            raise ValueError(f"Invalid item type: {type(item)}. Expected a string or a list.")

    return output_list

In [15]:
def extract_human_score_raw(list_of_string_lists, cell = 0, overall_mode=False):
    """
    Converts a list of string representations of lists into actual lists, 
    and then returns the first item from each of these lists.

    Parameters:
    list_of_string_lists (list of str): A list containing string representations of lists.

    Returns:
    list: A list of first items from each converted list in the input list.
    """
    result = []
    list_of_list = convert_to_list_of_lists(list_of_string_lists)
    for value_list in list_of_list:
        if not overall_mode:
            result.append(value_list[cell])
        else:
            result.append(math.sqrt(value_list[0] * value_list[1]))
    return result

def combine_gpt4v_human_scores(gpt4v_dict, human_dict, mode="SC"):
    gpt4v_l = []
    human_l = []
    o_human_l = []
    
    if mode == "O":
        for key in gpt4v_dict:
            human_score_raw = human_dict[key]
            #print(human_score_raw)
            o_human_score = extract_human_score_raw(human_score_raw, overall_mode=True)
            o_human_score = np.mean(o_human_score)
            o_human_l.append(o_human_score)
        return o_human_l
    
    for key in gpt4v_dict:
        if (mode == "SC"):
            cell = 0
        elif (mode == "PQ"):
            cell = 1
        gpt4v_score_raw = gpt4v_dict[key]
        if isinstance(gpt4v_score_raw, (int, float)):
            gpt4v_score = map_to_nearest_higher(gpt4v_score_raw/10.0)
        else:
            gpt4v_score_raw = [int(score) for score in gpt4v_score_raw]
            gpt4v_score = map_to_nearest_higher(min(gpt4v_score_raw)/10.0)
        human_score_raw = human_dict[key]
        #print(human_score_raw)
        human_score = extract_human_score_raw(human_score_raw, cell)
        human_score = np.mean(human_score)
        gpt4v_l.append(gpt4v_score)
        human_l.append(human_score)
    return gpt4v_l, human_l

def dict_to_value_list(dictionary):
    """
    Convert a dictionary into a list of its values.

    Args:
    dictionary (dict): A dictionary from which values are to be extracted.

    Returns:
    list: A list containing the values of the dictionary.
    """
    return list(dictionary.values())

In [16]:
def average_correlation(z_scores):
    """
    Averages a list of Fisher Z-transformed correlation scores and converts it back to a correlation coefficient.

    :param z_scores: A list of Fisher Z-transformed correlation scores.
    :return: The averaged correlation coefficient.
    """
    # Calculate the average Z score
    z_avg = sum(z_scores) / len(z_scores)

    # Convert the average Z score back to a correlation coefficient
    r_avg = (math.exp(2 * z_avg) - 1) / (math.exp(2 * z_avg) + 1)
    return r_avg

In [17]:
def iterate_all_results_corr(root=ROOT, use_spearmanr=1):
    task_list = os.listdir(root) 
    all_task_SC_correlation = []
    all_task_PQ_correlation = []
    all_task_O_correlation = []
    for task in task_list:
        print("===============>", task)
        model_list = os.listdir(os.path.join(root,task))
        human_data = grab_dataframes(HUMAN_RESULTS)[task]
        
        human_result = {model: {} for model in model_list}
        for df in human_data:
            for index, row in df.iterrows():
                uid = row['uid']

                for model in human_result.keys():
                    if uid not in human_result[model]:
                        human_result[model][uid] = []

                    human_result[model][uid].append(row[model])
        
        task_SC_correlation = []
        task_PQ_correlation = []
        task_O_correlation = []
        for model in model_list:
            print("======>", model, ": ")
            SC_results = grab_gpt4v_processed(task, model, "SC")
            PQ_results = grab_gpt4v_processed(task, model, "PQ")
            
            SC_results, PQ_results = align_dicts(SC_results, PQ_results)
            O_results = get_O_from_two_dicts_gpt4v(SC_results, PQ_results)
            
            SC_results_human = extract_values_based_on_keys(human_result[model], SC_results)
            SC_gpt4v, SC_human = combine_gpt4v_human_scores(SC_results, SC_results_human, mode="SC")
            

            PQ_results_human = extract_values_based_on_keys(human_result[model], PQ_results)
            PQ_gpt4v, PQ_human = combine_gpt4v_human_scores(PQ_results, PQ_results_human, mode="PQ")

            O_results_human = PQ_results_human # Basically same source
            O_human = combine_gpt4v_human_scores(O_results, O_results_human, mode="O")
            O_gpt4v = dict_to_value_list(O_results)
            
            if use_spearmanr==2: 
                SC_rho, _ = kendalltau(SC_gpt4v, SC_human)
                print("SC|", sigfig(SC_rho))
                PQ_rho, _ = kendalltau(PQ_gpt4v, PQ_human)
                print("PQ|", sigfig(PQ_rho))
                O_rho, _ = kendalltau(O_gpt4v, O_human)
                print("O|", sigfig(O_rho))
            elif use_spearmanr==1:
                SC_rho, _ = spearmanr(SC_gpt4v, SC_human)
                print("SC|", sigfig(SC_rho))
                PQ_rho, _ = spearmanr(PQ_gpt4v, PQ_human)
                print("PQ|", sigfig(PQ_rho))
                O_rho, _ = spearmanr(O_gpt4v, O_human)
                print("O|", sigfig(O_rho))
            elif use_spearmanr==0:
                SC_rho = np.corrcoef(SC_gpt4v, SC_human)[0, 1]
                print("SC|", sigfig(SC_rho))
                PQ_rho = np.corrcoef(PQ_gpt4v, PQ_human)[0, 1]
                print("PQ|", sigfig(PQ_rho))
                O_rho = np.corrcoef(O_gpt4v, O_human)[0, 1]
                print("O|", sigfig(O_rho))
            task_SC_correlation.append(SC_rho)
            task_PQ_correlation.append(PQ_rho)
            task_O_correlation.append(O_rho)
        print(f"++++++++++++++> {task} Avg")
        task_SC_correlation_avg = average_correlation(task_SC_correlation)
        task_PQ_correlation_avg = average_correlation(task_PQ_correlation)
        task_O_correlation_avg = average_correlation(task_O_correlation)
        print(task,"|SC|",sigfig(task_SC_correlation_avg))
        print(task,"|PQ|",sigfig(task_PQ_correlation_avg))
        print(task,"|O|",sigfig(task_O_correlation_avg))
        all_task_SC_correlation.append(task_SC_correlation_avg)
        all_task_PQ_correlation.append(task_PQ_correlation_avg)
        all_task_O_correlation.append(task_O_correlation_avg)

    final_SC_corr_avg = average_correlation(all_task_SC_correlation)
    final_PQ_corr_avg = average_correlation(all_task_PQ_correlation)
    final_O_corr_avg = average_correlation(all_task_O_correlation)
    print("")
    print("VVVVVVVVVVVVVVVVVV> FINAL Correlation Across 7 Tasks")
    print("|SC|",sigfig(final_SC_corr_avg))
    print("|PQ|",sigfig(final_PQ_corr_avg))
    print("|O|",sigfig(final_O_corr_avg))
        

In [18]:
#iterate_all_results_corr(use_spearmanr=0)

In [19]:
iterate_all_results_corr(use_spearmanr=1)



SC| 0.3538
PQ| -0.0053
O| 0.3134
SC| 0.2145
PQ| 0.2928
O| 0.1707
SC| 0.5476
PQ| 0.3046
O| 0.5491
++++++++++++++> ImagenHub_Multi-Concept_IC Avg
ImagenHub_Multi-Concept_IC |SC| 0.3557
ImagenHub_Multi-Concept_IC |PQ| 0.1948
ImagenHub_Multi-Concept_IC |O| 0.3314
SC| 0.3249
PQ| 0.3611
O| 0.3268
SC| 0.4387
PQ| 0.5941
O| 0.4009
SC| 0.1091
PQ| 0.4078
O| 0.1077
SC| 0.1719
PQ| 0.5632
O| 0.0947
SC| 0.4689
PQ| 0.4927
O| 0.4727
SC| 0.1368
PQ| 0.4136
O| 0.1356
SC| 0.197
PQ| 0.4698
O| 0.2188
SC| 0.4856
PQ| 0.3684
O| 0.4822
++++++++++++++> ImagenHub_Text-Guided_IE Avg
ImagenHub_Text-Guided_IE |SC| 0.2836
ImagenHub_Text-Guided_IE |PQ| 0.4291
ImagenHub_Text-Guided_IE |O| 0.2728
SC| 0.4059
PQ| 0.3143
O| 0.3277
SC| 0.2695
PQ| 0.3846
O| 0.2819
++++++++++++++> ImagenHub_Control-Guided_IG Avg
ImagenHub_Control-Guided_IG |SC| 0.3254
ImagenHub_Control-Guided_IG |PQ| 0.3359
ImagenHub_Control-Guided_IG |O| 0.2957
SC| 0.4417
PQ| 0.3575
O| 0.3965
SC| 0.2487
PQ| 0.224
O| 0.1154
SC| 0.5325
PQ| 0.3008
O| 0.4443
SC| 

In [20]:
#iterate_all_results_corr(use_spearmanr=2)

In [21]:
def compute_spearman_correlation_by_item(data, target_user, use_spearmanr=1):
    """
    Computes the Spearman rank correlation of the target user's ratings for each item (A and B) 
    with the average ratings of the other two users for the same items.

    :param data: A dictionary with keys as image names and values as a list of rating pairs [A, B].
    :param target_user: Index (0, 1, or 2) of the target user whose ratings are to be compared.
    :return: A dictionary containing Spearman rank correlation coefficients and p-values for each item.
    """
    target_ratings_SC = []
    target_ratings_PQ = []
    target_ratings_O = []
    avg_other_ratings_SC = []
    avg_other_ratings_PQ = []
    avg_other_ratings_O = []

    for ratings in data.values():
        # Extract the ratings of the target user and the other two users
        target_rating = ratings[target_user]
        other_ratings = [ratings[i] for i in range(3) if i != target_user]

        # Convert string ratings to list of floats

        O_ratings = [list(map(float, r.strip('[]').split(','))) for r in ratings]
        O_ratings = [math.sqrt(r[0] * r[1]) for r in O_ratings]

        target_rating = list(map(float, target_rating.strip('[]').split(',')))
        other_ratings = [list(map(float, r.strip('[]').split(','))) for r in other_ratings]

        O_target_rating = O_ratings[target_user]
        O_other_ratings = [O_ratings[i] for i in range(3) if i != target_user]

        # Calculate the average of the other two users' ratings
        avg_rating = np.mean(other_ratings, axis=0)

        O_avg_rating = np.mean(O_other_ratings, axis=0)

        # Append the ratings for each item for correlation calculation
        target_ratings_SC.append(target_rating[0])
        target_ratings_PQ.append(target_rating[1])
        target_ratings_O.append(O_target_rating)
        avg_other_ratings_SC.append(avg_rating[0])
        avg_other_ratings_PQ.append(avg_rating[1])
        avg_other_ratings_O.append(O_avg_rating)
        
    if use_spearmanr==2:
        correlation_SC, p_value_SC = kendalltau(target_ratings_SC, avg_other_ratings_SC)
        correlation_PQ, p_value_PQ = kendalltau(target_ratings_PQ, avg_other_ratings_PQ)
        correlation_O, p_value_O = kendalltau(target_ratings_O, avg_other_ratings_O)
        return {
            'SC': {'Correlation': correlation_SC, 'P-Value': p_value_SC},
            'PQ': {'Correlation': correlation_PQ, 'P-Value': p_value_PQ},
            'O': {'Correlation': correlation_O, 'P-Value': p_value_O},
        }
    elif use_spearmanr==1:
        correlation_SC, p_value_SC = spearmanr(target_ratings_SC, avg_other_ratings_SC)
        correlation_PQ, p_value_PQ = spearmanr(target_ratings_PQ, avg_other_ratings_PQ)
        correlation_O, p_value_O = spearmanr(target_ratings_O, avg_other_ratings_O)
        return {
            'SC': {'Correlation': correlation_SC, 'P-Value': p_value_SC},
            'PQ': {'Correlation': correlation_PQ, 'P-Value': p_value_PQ},
            'O': {'Correlation': correlation_O, 'P-Value': p_value_O},
        }
    elif use_spearmanr==0:
        correlation_SC = np.corrcoef(target_ratings_SC, avg_other_ratings_SC)[0, 1]
        correlation_PQ = np.corrcoef(target_ratings_PQ, avg_other_ratings_PQ)[0, 1]
        correlation_O = np.corrcoef(target_ratings_O, avg_other_ratings_O)[0, 1]
        return {
            'SC': {'Correlation': correlation_SC},
            'PQ': {'Correlation': correlation_PQ},
            'O': {'Correlation': correlation_O},
        }

        

def iterate_all_results_human2human(root=ROOT, target_user_index=0, num_raters=3, use_spearmanr=1):
    task_list = os.listdir(root) 
    for task in task_list:
        print("===============>", task)
        model_list = os.listdir(os.path.join(root,task))
        human_data = grab_dataframes(HUMAN_RESULTS)[task]
        
        human_result = {model: {} for model in model_list}
        for df in human_data:
            for index, row in df.iterrows():
                uid = row['uid']

                for model in human_result.keys():
                    if uid not in human_result[model]:
                        human_result[model][uid] = []

                    human_result[model][uid].append(row[model])
        for model in model_list:
            print("======>", model, ": ")
            data = human_result[model]
            corr_list_SC = []
            corr_list_PQ = []
            corr_list_O = []
            for i in range(num_raters):
                human_corr_dict = compute_spearman_correlation_by_item(data, i, use_spearmanr=use_spearmanr)
                corr_list_SC.append(human_corr_dict['SC']['Correlation'])
                corr_list_PQ.append(human_corr_dict['PQ']['Correlation'])
                corr_list_O.append(human_corr_dict['O']['Correlation'])
            print(f"Human2Human for SC:", sigfig(average_correlation(corr_list_SC)))
            print(f"Human2Human for PQ:", sigfig(average_correlation(corr_list_PQ)))
            print(f"Human2Human for O:", sigfig(average_correlation(corr_list_O)))
            

In [25]:
#iterate_all_results_human2human(use_spearmanr=0) #pearson

In [26]:
iterate_all_results_human2human(use_spearmanr=1) #spearman

Human2Human for SC: 0.699
Human2Human for PQ: 0.5803
Human2Human for O: 0.698
Human2Human for SC: 0.6209
Human2Human for PQ: 0.6423
Human2Human for O: 0.6222
Human2Human for SC: 0.7256
Human2Human for PQ: 0.4838
Human2Human for O: 0.7217
Human2Human for SC: 0.588
Human2Human for PQ: 0.5028
Human2Human for O: 0.5811
Human2Human for SC: 0.5482
Human2Human for PQ: 0.5887
Human2Human for O: 0.5891
Human2Human for SC: 0.2657
Human2Human for PQ: 0.4705
Human2Human for O: 0.1991
Human2Human for SC: 0.3286
Human2Human for PQ: 0.6214
Human2Human for O: 0.4265
Human2Human for SC: 0.6219
Human2Human for PQ: 0.519
Human2Human for O: 0.6289
Human2Human for SC: 0.2675
Human2Human for PQ: 0.5757
Human2Human for O: 0.1524
Human2Human for SC: 0.3311
Human2Human for PQ: 0.577
Human2Human for O: 0.3327
Human2Human for SC: 0.66
Human2Human for PQ: 0.5955
Human2Human for O: 0.6561
Human2Human for SC: 0.606
Human2Human for PQ: 0.6062
Human2Human for O: 0.5954
Human2Human for SC: 0.6144
Human2Human for PQ: 0

In [27]:
#iterate_all_results_human2human(use_spearmanr=2) #kendalltau