# Analysis of Results


## Setup

In [4]:
pip install scikit_posthocs

Collecting scikit_posthocs
  Downloading scikit_posthocs-0.11.4-py3-none-any.whl.metadata (5.8 kB)
Downloading scikit_posthocs-0.11.4-py3-none-any.whl (33 kB)
Installing collected packages: scikit_posthocs
Successfully installed scikit_posthocs-0.11.4


In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp
import os
import itertools
pd.options.display.float_format = '{:.11}'.format
pd.set_option('display.max_columns', None)

In [6]:
# Function to perform statistical tests
def statistics(groups, verbose=False):
    num_groups = len(groups)
    results = {}

    if num_groups >= 2:
        # Conduct Friedman test
        friedman_stat, friedman_p_value = friedmanchisquare(*groups)
        results['Friedman p-value'] = friedman_p_value
        results['Friedman statistic'] = friedman_stat
        if verbose:
            print(f"Friedman Test: Statistic={friedman_stat}, p-value={friedman_p_value}")

        # Conduct Nemenyi post-hoc test if Friedman test is significant
        if friedman_p_value < 0.05:
            data = np.array(groups).T
            nemenyi_results = sp.posthoc_nemenyi_friedman(data)
            results['Nemenyi results'] = nemenyi_results
            if verbose:
                print("Nemenyi Post-Hoc Test Results:")
                print(nemenyi_results)
        else:
            if verbose:
                print("Friedman test is not significant; Nemenyi post-hoc test not performed.")
    else:
        raise ValueError("At least two groups are required for comparison.")

    return results

def calculate_means(df):
    # Dictionary to store the mean of each combination
    means_dict = {}

    # Get all column names
    columns = df.columns

    # Iterate over all possible combinations of the columns
    for r in range(1, len(columns) + 1):
        for combo in itertools.combinations(columns, r):
            combo_name = ''.join(combo)
            means_dict[combo_name] = df[list(combo)].mean(axis=1)

    # Function to interpret r effect sizes
def interpret_effect_size_r(r):
    if r is None:
        return "N/A"
    if r < 0.3:
        return "small"
    elif r < 0.7:
        return "medium"
    else:
        return "large"

# Function to interpret Kendall's W
def interpret_kendall_w(w):
    if w < 0.3:
        return "small"
    elif w < 0.7:
        return "medium"
    else:
        return "large"

In [7]:
def get_stereotype(responses_path, calc_mean=False, prompts=[1,2,3,4,5,6,7,8,9,10,13,16,17,18,19,20]):
    df = pd.read_csv(responses_path, index_col=['prompt_id'])
    if 'pronoun_sequence_id' in df.columns:
      df.drop(columns=['pronoun_sequence_id'], inplace=True)
    df = df.loc[:, (df.columns.str.contains('neutral'))]
    df = df[df.index.isin(prompts)]
    if calc_mean:
      df = pd.DataFrame(df.mean(axis=1).values, index=df.index, columns=['Calibration'])
    return df

In [8]:
PATH = ''

# Gender

In [9]:
data_dict = {
    # support for finer drill-down
    "All" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]
}

In [10]:
import pandas as pd
import numpy as np
from scipy.stats import friedmanchisquare, kendalltau, norm
import scikit_posthocs as sp

# Define the paths to the CSV files
paths = [
    [PATH + '/Gender/gpt-4o-mini-2024-07-18 gender 2024-07-30 - cos_similarity.csv', PATH + '/Gender/Calibration/gpt-4o-mini-2024-07-18 gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/mistral gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/mistral gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/claude-3 gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/claude-3 gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/gemini gender 19-07-2024 - cos_similarity.csv', PATH + '/Gender/Calibration/gemini gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/llama-2 gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/llama-2 gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/llama-3 gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/llama-3 gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/gemma gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/gemma gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/yi gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/yi gender calibration - cos_similarity.csv']]

# Initialize the main results DataFrame
results_df = pd.DataFrame()

for key in data_dict.keys():
    print(key)
    # Initialize the results DataFrame for this key
    # Loop through each CSV file and calculate statistics
    for path in paths:
        # Extract LLM name from the path
        llm_name = path[0].split('/')[-1].split(' ')[0]

        # Get indices for prompts
        indicies = data_dict.get(key)  # replace with actual function to get indices

        # Read the CSV files
        prompts = get_stereotype(path[0], prompts=indicies)
        neutral = get_stereotype(path[1], calc_mean=True, prompts=indicies)

        # Merge prompts and neutral data
        prompts = pd.merge(prompts, neutral, on='prompt_id', how='inner')

        if len(prompts.columns) >= 2:
            try:
                len(prompts['cos_similarity: male vs neutral'])
                male = 'cos_similarity: male vs neutral'
                female = 'cos_similarity: female vs neutral'
            except:
                male = 'cos_similarity: neutral vs male'
                female = 'cos_similarity: neutral vs female'

            # Prepare data for male vs female vs neutral(calibration) comparisons
            groups = [prompts[male], prompts[female], prompts['Calibration']]

            # Perform statistical tests
            stat_results = statistics(groups, verbose=False)
            friedman_p = stat_results['Friedman p-value']  # float(stat_results['friedman'].split(', ')[1].split('=')[1])
            nemenyi_results = stat_results.get('Nemenyi results')  # stat_results.get('nemenyi')

            # Calculate Kendall's W for effect size
            # Extract Friedman chi-square from the results
            friedman_chi2 = stat_results['Friedman statistic']  # float(stat_results['friedman'].split(', ')[0].split('=')[1])
            k = len(groups)  # Number of groups/conditions
            n = len(groups[0])  # Number of subjects/blocks

            # Calculate Kendall's W
            kendall_w = friedman_chi2 / (k * (n - 1))
            effect_size_interpretation = interpret_kendall_w(kendall_w)

            # Calculate effect size for each pairwise comparison
            # Initialize effect size variables
            r_male_female = None
            r_male_neutral = None
            r_female_neutral = None
            effect_male_female = "N/A"
            effect_male_neutral = "N/A"
            effect_female_neutral = "N/A"

            if friedman_p <= 0.05:  # Only if the Friedman test is significant
                # For male vs female comparison
                if isinstance(nemenyi_results[0][1], float):
                    z_male_female = norm.ppf(1 - float(nemenyi_results[0][1])/2)
                    r_male_female = abs(z_male_female) / np.sqrt(2 * n)
                    effect_male_female = interpret_effect_size_r(r_male_female)

                # For male vs neutral comparison
                if isinstance(nemenyi_results[0][2], float):
                    z_male_neutral = norm.ppf(1 - float(nemenyi_results[0][2])/2)
                    r_male_neutral = abs(z_male_neutral) / np.sqrt(2 * n)
                    effect_male_neutral = interpret_effect_size_r(r_male_neutral)

                # For female vs neutral comparison
                if isinstance(nemenyi_results[1][2], float):
                    z_female_neutral = norm.ppf(1 - float(nemenyi_results[1][2])/2)
                    r_female_neutral = abs(z_female_neutral) / np.sqrt(2 * n)
                    effect_female_neutral = interpret_effect_size_r(r_female_neutral)

            if friedman_p > 0.05:
                p_male_female = '*'
                p_male_neutral = '*'
                p_neutral_female = '*'
            else:
                p_male_female = f'{nemenyi_results[0][1]:.2f}'
                p_male_neutral = f'{nemenyi_results[0][2]:.2f}'
                p_neutral_female = f'{nemenyi_results[1][2]:.2f}'

            print(f'LLM: {llm_name}\nKendall w: {kendall_w}\nNemenyi: {nemenyi_results}')
            male_mean = prompts[male].mean()
            female_mean = prompts[female].mean()
            neutral_mean = prompts['Calibration'].mean()

            male_female_mean = prompts[[male, female]].mean().mean()
            male_neutral_mean = prompts[[male, 'Calibration']].mean().mean()
            female_neutral_mean = prompts[[female, 'Calibration']].mean().mean()
            male_female_neutral_mean = prompts[[male, female ,'Calibration']].mean().mean()

            male_mean = float(f'{male_mean:.2f}')
            female_mean = float(f'{female_mean:.2f}')
            neutral_mean = float(f'{neutral_mean:.2f}')

            male_female_mean = float(f'{male_female_mean:.2f}')
            male_neutral_mean = float(f'{male_neutral_mean:.2f}')
            female_neutral_mean = float(f'{female_neutral_mean:.2f}')
            male_female_neutral_mean = float(f'{male_female_neutral_mean:.2f}')
            print(len(prompts))

            # Append results to the results DataFrame
            new_row = pd.DataFrame({
                'LLM': [llm_name],
                'Stereotype': key,
                'P Value Male-Neutral': [p_male_neutral],
                'P Value Female-Neutral': [p_neutral_female],
                'P Value Male-Female': [p_male_female],
                'Male_mean': [male_mean],
                'Female_mean':[female_mean],
                'Neutral_mean':[neutral_mean],
                'Male_Female_mean':[male_female_mean],
                'Male_Neutral_mean':[male_neutral_mean],
                'Female_Neutral_mean':[female_neutral_mean],
                'Male_Female_Neutral_mean':[male_female_neutral_mean],
                'Kendall_W': [round(kendall_w, 3) if kendall_w is not None else None],
                'Effect_Size_Overall': [effect_size_interpretation],
                'r_Male_Female': [round(r_male_female, 3) if r_male_female is not None else None],
                'Effect_Male_Female': [effect_male_female],
                'r_Male_Neutral': [round(r_male_neutral, 3) if r_male_neutral is not None else None],
                'Effect_Male_Neutral': [effect_male_neutral],
                'r_Female_Neutral': [round(r_female_neutral, 3) if r_female_neutral is not None else None],
                'Effect_Female_Neutral': [effect_female_neutral]
            })
            results_df = pd.concat([results_df, new_row], ignore_index=True)

# Display the main results DataFrame
print(f'*************{key}*************')
display(results_df)

All
LLM: gpt-4o-mini-2024-07-18
Kendall w: 0.009167494032001568
Nemenyi:                  0                1                2
0              1.0    0.39978230849 1.4148184146e-07
1    0.39978230849              1.0 4.3361092494e-11
2 1.4148184146e-07 4.3361092494e-11              1.0
1870
LLM: mistral
Kendall w: 0.009532011647015367
Nemenyi:                  0                1                2
0              1.0    0.10561256927 1.1964297943e-06
1    0.10561256927              1.0 3.8341552155e-12
2 1.1964297943e-06 3.8341552155e-12              1.0
1870
LLM: claude-3
Kendall w: 0.013722794551812483
Nemenyi:                  0                1                2
0              1.0   0.034564759565 4.5656360914e-09
1   0.034564759565              1.0 1.1102230246e-16
2 4.5656360914e-09 1.1102230246e-16              1.0
1870
LLM: gemini
Kendall w: 0.01798086224382502
Nemenyi:               0                1                2
0           1.0    0.79929500865              0.0
1 0.79929500865

  results_df = pd.concat([results_df, new_row], ignore_index=True)


Unnamed: 0,LLM,Stereotype,P Value Male-Neutral,P Value Female-Neutral,P Value Male-Female,Male_mean,Female_mean,Neutral_mean,Male_Female_mean,Male_Neutral_mean,Female_Neutral_mean,Male_Female_Neutral_mean,Kendall_W,Effect_Size_Overall,r_Male_Female,Effect_Male_Female,r_Male_Neutral,Effect_Male_Neutral,r_Female_Neutral,Effect_Female_Neutral
0,gpt-4o-mini-2024-07-18,All,0.00,0.00,0.40,0.87,0.87,0.89,0.87,0.88,0.88,0.87,0.009,small,0.014,small,0.086,small,0.108,small
1,mistral,All,0.00,0.00,0.11,0.82,0.82,0.84,0.82,0.83,0.83,0.83,0.01,small,0.026,small,0.079,small,0.114,small
2,claude-3,All,0.00,0.00,0.03,0.84,0.83,0.86,0.84,0.85,0.85,0.84,0.014,small,0.035,small,0.096,small,inf,large
3,gemini,All,0.00,0.00,0.80,0.74,0.75,0.78,0.75,0.76,0.77,0.76,0.018,small,0.004,small,inf,large,0.133,small
4,llama-2,All,0.00,0.00,0.95,0.83,0.83,0.87,0.83,0.85,0.85,0.84,0.049,small,0.001,small,inf,large,inf,large
5,llama-3,All,0.00,0.00,0.23,0.81,0.8,0.84,0.81,0.82,0.82,0.82,0.029,small,0.02,small,inf,large,inf,large
6,gemma,All,0.00,0.00,0.92,0.79,0.79,0.82,0.79,0.8,0.8,0.8,0.023,small,0.002,small,inf,large,inf,large
7,yi,All,*,*,*,0.75,0.75,0.76,0.75,0.76,0.75,0.75,0.001,small,,,,,,


# Ageism

In [11]:
data_dict = {
    "All":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
}

In [12]:
import pandas as pd
import numpy as np
from scipy.stats import norm

# Define the paths to the CSV files
paths = [
    [PATH + '/Ageism/gpt-4o-mini-2024-07-18 ageism 2024-07-31 - cos_similarity.csv',PATH + '/Ageism/Calibration/gpt-4o-mini-2024-07-18 ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/claude-3 ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/claude-3 ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/gemini ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/gemini ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/gemma ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/gemma ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/llama-2 ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/llama-2 ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/llama-3 ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/llama-3 ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/mistral ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/mistral ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/yi ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/yi ageism calibration - cos_similarity.csv']
]

# Initialize the main results DataFrame
results_df = pd.DataFrame()

for key in data_dict.keys():
    print(key)
    # Initialize the results DataFrame for this key
    # Loop through each CSV file and calculate statistics
    for path in paths:
        # Extract LLM name from the path
        llm_name = path[0].split('/')[-1].split(' ')[0]
        print(llm_name, f'*************{key}*************')
        # Get indices for prompts
        indicies = data_dict.get(key)  # replace with actual function to get indices

        # Read the CSV files
        prompts = get_stereotype(path[0], prompts=indicies)
        neutral = get_stereotype(path[1], calc_mean=True, prompts=indicies)
        prompts = pd.merge(prompts, neutral, on='prompt_id', how='inner')
        if llm_name == 'gpt-4o-mini-2024-07-18':
          prompts.columns = ['cos_similarity: young vs neutral', 'cos_similarity: adult vs neutral',
       'cos_similarity: senior vs neutral', 'Calibration']
        # Prepare data for senior vs adult vs young comparisons
        groups = [prompts['cos_similarity: senior vs neutral'],
                  prompts['cos_similarity: adult vs neutral'],
                  prompts['cos_similarity: young vs neutral'],
                  prompts['Calibration']]

        # Perform statistical tests
        stat_results = statistics(groups, verbose=True)

        # Extract Friedman p-value and statistic - extract from the string if needed
        if isinstance(stat_results.get('friedman'), str):
            # Similar to the gender code, extract from string
            friedman_chi2 = float(stat_results['friedman'].split(', ')[0].split('=')[1])
            friedman_p = float(stat_results['friedman'].split(', ')[1].split('=')[1])
        else:
            # If it's already provided as separate values in the dictionary
            friedman_chi2 = stat_results.get('Friedman statistic')
            friedman_p = stat_results.get('Friedman p-value')

        nemenyi_results = stat_results.get('nemenyi', stat_results.get('Nemenyi results'))

        # Calculate Kendall's W for effect size
        k = len(groups)  # Number of groups/conditions
        n = len(groups[0])  # Number of subjects/blocks
        kendall_w = friedman_chi2 / (k * (n - 1))
        effect_size_interpretation = interpret_kendall_w(kendall_w)

        # Initialize effect size variables for all pairwise comparisons
        r_senior_adult = np.nan
        r_senior_young = np.nan
        r_young_adult = np.nan
        r_neutral_senior = np.nan
        r_neutral_adult = np.nan
        r_neutral_young = np.nan

        effect_senior_adult = "N/A"
        effect_senior_young = "N/A"
        effect_young_adult = "N/A"
        effect_neutral_senior = "N/A"
        effect_neutral_adult = "N/A"
        effect_neutral_young = "N/A"

        if friedman_p > 0.05:
            p_senior_adult = np.nan
            p_senior_young = np.nan
            p_young_adult = np.nan
            p_neutral_senior = np.nan
            p_neutral_adult = np.nan
            p_neutral_young = np.nan
        else:
            p_senior_adult = float(f'{nemenyi_results[0][1]:.2f}')
            p_senior_young = float(f'{nemenyi_results[0][2]:.2f}')
            p_young_adult = float(f'{nemenyi_results[1][2]:.2f}')
            p_neutral_senior = float(f'{nemenyi_results[0][3]:.2f}')
            p_neutral_adult = float(f'{nemenyi_results[1][3]:.2f}')
            p_neutral_young = float(f'{nemenyi_results[2][3]:.2f}')

            # Calculate effect sizes for each comparison when Friedman is significant
            # Senior vs Adult
            if not np.isnan(p_senior_adult):
                z_senior_adult = norm.ppf(1 - p_senior_adult/2)
                r_senior_adult = abs(z_senior_adult) / np.sqrt(2 * n)
                effect_senior_adult = interpret_effect_size_r(r_senior_adult)

            # Senior vs Young
            if not np.isnan(p_senior_young):
                z_senior_young = norm.ppf(1 - p_senior_young/2)
                r_senior_young = abs(z_senior_young) / np.sqrt(2 * n)
                effect_senior_young = interpret_effect_size_r(r_senior_young)

            # Young vs Adult
            if not np.isnan(p_young_adult):
                z_young_adult = norm.ppf(1 - p_young_adult/2)
                r_young_adult = abs(z_young_adult) / np.sqrt(2 * n)
                effect_young_adult = interpret_effect_size_r(r_young_adult)

            # Neutral vs Senior
            if not np.isnan(p_neutral_senior):
                z_neutral_senior = norm.ppf(1 - p_neutral_senior/2)
                r_neutral_senior = abs(z_neutral_senior) / np.sqrt(2 * n)
                effect_neutral_senior = interpret_effect_size_r(r_neutral_senior)

            # Neutral vs Adult
            if not np.isnan(p_neutral_adult):
                z_neutral_adult = norm.ppf(1 - p_neutral_adult/2)
                r_neutral_adult = abs(z_neutral_adult) / np.sqrt(2 * n)
                effect_neutral_adult = interpret_effect_size_r(r_neutral_adult)

            # Neutral vs Young
            if not np.isnan(p_neutral_young):
                z_neutral_young = norm.ppf(1 - p_neutral_young/2)
                r_neutral_young = abs(z_neutral_young) / np.sqrt(2 * n)
                effect_neutral_young = interpret_effect_size_r(r_neutral_young)

        senior_mean = prompts['cos_similarity: senior vs neutral'].mean()
        adult_mean = prompts['cos_similarity: adult vs neutral'].mean()
        young_mean = prompts['cos_similarity: young vs neutral'].mean()
        neutral_mean = prompts['Calibration'].mean()

        senior_adult_mean = prompts[['cos_similarity: senior vs neutral', 'cos_similarity: adult vs neutral']].mean().mean()
        senior_young_mean = prompts[['cos_similarity: senior vs neutral', 'cos_similarity: young vs neutral']].mean().mean()
        adult_young_mean = prompts[['cos_similarity: young vs neutral', 'cos_similarity: adult vs neutral']].mean().mean()
        senior_adult_young_mean = prompts[['cos_similarity: senior vs neutral', 'cos_similarity: adult vs neutral' ,'cos_similarity: young vs neutral']].mean().mean()
        senior_neutral_mean = prompts[['cos_similarity: senior vs neutral', 'Calibration']].mean().mean()
        adult_neutral_mean = prompts[['cos_similarity: adult vs neutral', 'Calibration']].mean().mean()
        young_neutral_mean = prompts[['cos_similarity: young vs neutral', 'Calibration']].mean().mean()
        senior_adult_neutral_mean = prompts[['cos_similarity: senior vs neutral', 'cos_similarity: adult vs neutral' ,'Calibration']].mean().mean()
        senior_young_neutral_mean = prompts[['cos_similarity: senior vs neutral', 'cos_similarity: young vs neutral' ,'Calibration']].mean().mean()
        adult_young_neutral_mean = prompts[['cos_similarity: young vs neutral', 'cos_similarity: adult vs neutral' ,'Calibration']].mean().mean()
        senior_adult_young_neutral_mean = prompts[['cos_similarity: senior vs neutral', 'cos_similarity: adult vs neutral' ,'cos_similarity: young vs neutral' ,'Calibration']].mean().mean()

        senior_mean = float(f'{senior_mean:.2f}')
        adult_mean = float(f'{adult_mean:.2f}')
        young_mean = float(f'{young_mean:.2f}')

        senior_adult_mean = float(f'{senior_adult_mean:.2f}')
        senior_young_mean = float(f'{senior_young_mean:.2f}')
        adult_young_mean = float(f'{adult_young_mean:.2f}')
        senior_adult_young_mean = float(f'{senior_adult_young_mean:.2f}')

        # Append results to the results DataFrame
        new_row = pd.DataFrame({
            'LLM': [llm_name],
            'Stereotype': key,
            'Senior-Neutral': [p_neutral_senior],
            'Adult-Neutral': [p_neutral_adult],
            'Young-Neutral': [p_neutral_young],
            'Senior-Adult': [p_senior_adult],
            'Senior-Young': [p_senior_young],
            'Adult-Young': [p_young_adult],
            'Senior-Adult Mean': [senior_adult_mean],
            'Senior-Young Mean': [senior_young_mean],
            'Adult-Young Mean': [adult_young_mean],
            'Senior-Adult-Young Mean': [senior_adult_young_mean],
            'Senior Mean': [senior_mean],
            'Adult Mean': [adult_mean],
            'Young Mean': [young_mean],
            'Neutral Mean': [neutral_mean],
            'Neutral-Senior': senior_neutral_mean,
            'Neutral-Adult': adult_neutral_mean,
            'Neutral-Young': young_neutral_mean,
            'Senior-Adult-Neutral Mean': [senior_adult_neutral_mean],
            'Senior-Young-Neutral Mean': [senior_young_neutral_mean],
            'Adult-Young-Neutral Mean': [adult_young_neutral_mean],
            'Senior-Adult-Young-Neutral Mean': [senior_adult_young_neutral_mean],
            'Kendall_W': [kendall_w if not np.isnan(kendall_w) else None],
            'Effect_Size_Overall': [effect_size_interpretation],
            'Effect_Senior_Adult': [effect_senior_adult],
            'Effect_Senior_Young': [effect_senior_young],
            'Effect_Young_Adult': [effect_young_adult],
            'Effect_Neutral_Senior': [effect_neutral_senior],
            'Effect_Neutral_Adult': [effect_neutral_adult],
            'Effect_Neutral_Young': [effect_neutral_young]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)

results_df

All
gpt-4o-mini-2024-07-18 *************All*************
Friedman Test: Statistic=169.80000000000018, p-value=1.4055388471632531e-36
Nemenyi Post-Hoc Test Results:
                 0                1                2                3
0              1.0              0.0 2.8536069582e-06              0.0
1              0.0              1.0 0.00062182902606  0.0027631723368
2 2.8536069582e-06 0.00062182902606              1.0 1.1143308498e-12
3              0.0  0.0027631723368 1.1143308498e-12              1.0
claude-3 *************All*************
Friedman Test: Statistic=220.03200000000015, p-value=1.9760674287332437e-47
Nemenyi Post-Hoc Test Results:
                 0                1                2                3
0              1.0 3.3306690739e-16 1.4646319713e-07              0.0
1 3.3306690739e-16              1.0   0.027172635972 3.4561702389e-09
2 1.4646319713e-07   0.027172635972              1.0              0.0
3              0.0 3.4561702389e-09              0.0        

Unnamed: 0,LLM,Stereotype,Senior-Neutral,Adult-Neutral,Young-Neutral,Senior-Adult,Senior-Young,Adult-Young,Senior-Adult Mean,Senior-Young Mean,Adult-Young Mean,Senior-Adult-Young Mean,Senior Mean,Adult Mean,Young Mean,Neutral Mean,Neutral-Senior,Neutral-Adult,Neutral-Young,Senior-Adult-Neutral Mean,Senior-Young-Neutral Mean,Adult-Young-Neutral Mean,Senior-Adult-Young-Neutral Mean,Kendall_W,Effect_Size_Overall,Effect_Senior_Adult,Effect_Senior_Young,Effect_Young_Adult,Effect_Neutral_Senior,Effect_Neutral_Adult,Effect_Neutral_Young
0,gpt-4o-mini-2024-07-18,All,0.0,0.0,0.0,0.0,0.0,0.0,0.83,0.82,0.85,0.84,0.8,0.87,0.84,0.90654455811,0.85422977407,0.88647216318,0.87362448519,0.85828643879,0.84972132014,0.87121624621,0.85389093216,0.21331658291,small,large,large,large,large,large,large
1,claude-3,All,0.0,0.0,0.0,0.0,0.0,0.03,0.8,0.78,0.82,0.8,0.76,0.84,0.8,0.89426563563,0.82669495718,0.86474502142,0.84687483604,0.82953810719,0.81762465027,0.84299135976,0.8220245895,0.27642211055,small,large,large,small,large,large,large
2,gemini,All,0.0,0.0,0.0,0.0,0.0,0.54,0.68,0.67,0.7,0.68,0.65,0.7,0.69,0.77495712113,0.7127975368,0.73996963056,0.73464979022,0.71019240453,0.7066458443,0.72476057348,0.70622991822,0.13175101849,small,large,large,small,large,large,large
3,gemma,All,0.0,0.0,0.0,0.0,0.0,0.94,0.71,0.71,0.75,0.72,0.67,0.75,0.75,0.8236591127,0.74892755315,0.78661258978,0.78610824298,0.74914039105,0.74880415985,0.7739275176,0.7489946366,0.18988190955,small,large,large,small,large,large,large
4,llama-2,All,0.0,0.0,0.0,0.0,0.0,1.0,0.75,0.75,0.78,0.76,0.72,0.78,0.78,0.84750806027,0.78196501113,0.81307483845,0.8156434335,0.78085721296,0.78256960966,0.80330949455,0.78158761141,0.22220351759,small,large,large,small,large,large,large
5,llama-3,All,0.0,0.0,0.0,0.0,0.0,0.0,0.76,0.74,0.78,0.76,0.73,0.79,0.76,0.84596032396,0.78620929579,0.8200829486,0.80228812172,0.78887472161,0.77701150369,0.7995939389,0.78131002108,0.22027386935,small,large,large,large,large,large,large
6,mistral,All,0.0,0.0,0.0,0.0,0.0,0.81,0.77,0.76,0.8,0.78,0.73,0.81,0.8,0.84884819996,0.78735875346,0.82944342148,0.82208858824,0.79491871664,0.79001549448,0.81807193982,0.79502128161,0.19848994975,small,large,large,small,large,large,large
7,yi,All,0.0,0.07,0.0,0.0,0.0,0.02,0.72,0.71,0.74,0.72,0.69,0.75,0.72,0.79654593943,0.74404704564,0.773166268,0.75908440391,0.74596022929,0.7365723199,0.7559851348,0.73987588907,0.11994502276,small,large,large,small,large,small,large


# Ethnicity

In [14]:
def get_category_for_prompt_id(prompt_id):
    for category, ids in data_dict.items():
        if prompt_id in ids:
            return category
    return "Unknown"  # Return "Unknown" if the prompt_id is not found in any category

def calculate_combinations_mean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the mean of each possible combination of columns in the DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame to calculate means for.

    Returns:
    pd.DataFrame: A DataFrame with the mean values for each column combination.
                  - Index: Combination of column names (joined by ' - ').
                  - 'Mean': Mean value for the combination.
    """
    # Dictionary to store the mean of each combination
    means_dict = {}

    # Get all column names
    columns = df.columns

    # Iterate over all possible combinations of the columns
    for r in range(1, len(columns) + 1):
        for combo in itertools.combinations(columns, r):
            combo_name = ' - '.join(combo)
            means_dict[combo_name] = df[list(combo)].mean(axis=1).mean(axis=0)

    # Convert to DataFrame with a single column 'Mean'
    combinations_mean_df = pd.DataFrame.from_dict(means_dict, orient='index', columns=['Mean'])
    return combinations_mean_df

def statistics(groups: list, verbose: bool = False) -> dict:
    """
    Perform statistical tests (Friedman test and Nemenyi post-hoc test) on the provided groups.

    Parameters:
    groups (list): A list of lists, where each inner list represents a group of data points.
    verbose (bool): If True, print detailed results of the tests. Default is False.

    Returns:
    dict: A dictionary containing the results of the statistical tests.
          - 'friedman': Result of the Friedman test (statistic and p-value).
          - 'nemenyi': Result of the Nemenyi post-hoc test if applicable, otherwise None.
          - 'kendall_w': Kendall's W coefficient for overall effect size measurement.
          - 'effect_size_interpretation': Interpretation of the Kendall's W value.
          - 'pairwise_effects': Dictionary containing r effect sizes for each pair comparison.
    """
    num_groups = len(groups)
    results = {}

    # Initialize pairwise effects dictionary
    pairwise_effects = {}
    pairwise_effect_interpretations = {}

    # Perform tests only if there are at least 2 groups
    if num_groups >= 2:
        # Conduct Friedman test
        friedman_stat, friedman_p_value = friedmanchisquare(*groups)
        results['friedman'] = f"Friedman Test: Statistic={friedman_stat}, p_value={friedman_p_value}"

        # Calculate Kendall's W for overall effect size
        k = len(groups)  # Number of groups/conditions
        n = len(groups[0])  # Number of subjects/blocks
        kendall_w = friedman_stat / (k * (n - 1))
        results['kendall_w'] = kendall_w
        results['effect_size_interpretation'] = interpret_kendall_w(kendall_w)

        # Conduct Nemenyi post-hoc test if Friedman test is significant
        if friedman_p_value < 0.05:
            data = np.array(groups).T
            nemenyi_results = sp.posthoc_nemenyi_friedman(data)
            results['nemenyi'] = nemenyi_results

            # Calculate effect sizes for each pairwise comparison
            from scipy.stats import norm

            # Initialize dictionaries to store pairwise effects
            pairwise_effects = {}
            pairwise_effect_interpretations = {}

            # For each pair of groups, calculate the effect size
            for i in range(len(groups)):
                for j in range(i+1, len(groups)):
                    p_value = nemenyi_results[i][j]
                    pair_key = f"{i}_{j}"

                    if not np.isnan(p_value):
                        # Calculate r effect size
                        z_value = norm.ppf(1 - p_value/2)
                        r_effect = abs(z_value) / np.sqrt(2 * n)
                        effect_interpretation = interpret_effect_size_r(r_effect)

                        # Store results
                        pairwise_effects[pair_key] = r_effect
                        pairwise_effect_interpretations[pair_key] = effect_interpretation

                        if verbose:
                            print(f"Effect size (r) for pair {i}-{j}: {r_effect:.4f} ({effect_interpretation})")
                    else:
                        pairwise_effects[pair_key] = np.nan
                        pairwise_effect_interpretations[pair_key] = "N/A"

            # Add pairwise effects to results
            results['pairwise_effects'] = pairwise_effects
            results['pairwise_effect_interpretations'] = pairwise_effect_interpretations

            return results

        # If Friedman test is not significant, return None for Nemenyi results
        elif friedman_p_value is not None:
            if verbose:
                print("Friedman test is not significant (p >= 0.05); Nemenyi post-hoc test not performed.")
            results['nemenyi'] = None
            results['pairwise_effects'] = {f"{i}_{j}": np.nan for i in range(len(groups)) for j in range(i+1, len(groups))}
            results['pairwise_effect_interpretations'] = {f"{i}_{j}": "N/A" for i in range(len(groups)) for j in range(i+1, len(groups))}
            return results
        else:
            raise ValueError("Data shape of content does not match the requirements to perform Friedman.")
    else:
        raise ValueError("At least two groups are required for comparison.")

def homogeneous_groups(type: str, responses_path: str, calibration_path: str, stereotype_group_id: str = None) -> dict:
    """
    Analyze homogeneity within specified demographic groups (gender, ageism, or ethnicity) based on cosine similarity data.

    Parameters:
    type (str): The demographic type to analyze: 'gender', 'ageism', 'ethnicity', etc.
    responses_path (str): Path to the CSV file containing responses with cosine similarity values.
    calibration_path (str): Path to the CSV file containing calibration's cosine similarity values.
    stereotype_group_id (str, optional): If provided, filters responses to include only those with the specified stereotype group ID. Default is None.

    Returns:
    dict: A dictionary containing the results of the statistical analysis.
          - 'friedman': Result of the Friedman test (statistic and p-value).
          - 'nemenyi': Result of the Nemenyi post-hoc test if applicable, otherwise None.
          - 'means': Mean values for each combination of demographic groups.
          - 'kendall_w': Kendall's W coefficient for overall effect size measurement.
          - 'effect_size_interpretation': Interpretation of the Kendall's W value.
          - 'pairwise_effects': Dictionary containing r effect sizes for each pair comparison.
    """
    # Load responses data from the specified CSV file, setting 'prompt_id' as the index
    responses = pd.read_csv(responses_path, index_col=['prompt_id'])
    responses.drop(columns=['pronoun_sequence_id'], axis=1, inplace=True)

    # Ethnicity Analysis: Consider cosine similarity between various ethnic groups and neutral prompts
    responses = responses[['cos_similarity: neutral vs neutral american',
                            'cos_similarity: neutral vs caucasian american',
                            'cos_similarity: neutral vs asian',
                            'cos_similarity: neutral vs african american',
                            'cos_similarity: neutral vs hispanic',
                            'stereotype_group_id']]

    # Filter by stereotype group ID if provided
    if stereotype_group_id is not None:
        responses = responses[responses.stereotype_group_id.isin([stereotype_group_id])]
    responses = responses.drop(columns=['stereotype_group_id'])
    responses.columns = ['Neutral American', 'Caucasian American', 'Asian', 'African American', 'Hispanic']

    # Load calibration data and calculate the mean across neutral comparisons
    calibration = pd.read_csv(calibration_path, index_col=['prompt_id'])
    calibration = calibration.loc[:, calibration.columns.str.contains('neutral')]
    calibration = calibration.mean(axis=1)
    calibration.columns = 'calibration'
    calibration.name = 'Calibration'

    # Merge responses with the calibration data
    responses = pd.merge(left=responses, right=calibration, left_index=True, right_index=True)

    # Create a list of groups for statistical analysis (each group corresponds to a column in responses)
    groups = [responses[col] for col in responses.columns]
    group_names = responses.columns.tolist()

    # Perform statistical tests and calculate means for combinations of groups
    results = statistics(groups, verbose=False)
    results['means'] = calculate_combinations_mean(responses)
    results['group_names'] = group_names

    # Create a mapping from index pairs to named pairs for easier interpretation
    if 'pairwise_effects' in results:
        named_pairwise_effects = {}
        named_pairwise_interpretations = {}

        for pair_key, effect_value in results['pairwise_effects'].items():
            i, j = map(int, pair_key.split('_'))
            named_key = f"{group_names[i]}-{group_names[j]}"
            named_pairwise_effects[named_key] = effect_value
            named_pairwise_interpretations[named_key] = results['pairwise_effect_interpretations'][pair_key]

        results['named_pairwise_effects'] = named_pairwise_effects
        results['named_pairwise_interpretations'] = named_pairwise_interpretations

    return results

## GPT-4o

In [15]:
type = 'ethnicity'
responses_path = PATH + '/Ethnicity/gpt-4o-mini-2024-07-18 ethnicity 2024-08-13 - cos_similarity.csv'
calibration_path = PATH + '/Ethnicity/Calibration/gpt-4o-mini-2024-07-18 ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(type, responses_path, calibration_path)
results_dict

{'friedman': 'Friedman Test: Statistic=152.7311355311358, p_value=3.4999978835189137e-31',
 'kendall_w': np.float64(0.06543750451205475),
 'effect_size_interpretation': 'small',
 'nemenyi':                  0                1                2                3  \
 0              1.0    0.39346402326 7.4606987255e-14 4.7073456244e-14   
 1    0.39346402326              1.0 5.0008195696e-08 3.5234044238e-08   
 2 7.4606987255e-14 5.0008195696e-08              1.0    0.99999991278   
 3 4.7073456244e-14 3.5234044238e-08    0.99999991278              1.0   
 4 1.7763568394e-15 2.7735274077e-09    0.99745130969    0.99865681998   
 5    0.93133873396    0.93133873396 9.5682017864e-11 6.3873129008e-11   
 
                  4                5  
 0 1.7763568394e-15    0.93133873396  
 1 2.7735274077e-09    0.93133873396  
 2    0.99745130969 9.5682017864e-11  
 3    0.99865681998 6.3873129008e-11  
 4              1.0 3.4461322684e-12  
 5 3.4461322684e-12              1.0  ,
 'pairwise_effect

In [16]:
print(results_dict['means'].to_markdown())

|                                                                                           |     Mean |
|:------------------------------------------------------------------------------------------|---------:|
| Neutral American                                                                          | 0.849224 |
| Caucasian American                                                                        | 0.839691 |
| Asian                                                                                     | 0.813285 |
| African American                                                                          | 0.810178 |
| Hispanic                                                                                  | 0.819415 |
| Calibration                                                                               | 0.865802 |
| Neutral American - Caucasian American                                                     | 0.844457 |
| Neutral American - Asian                             

## Mistral

In [17]:
type = 'ethnicity'
responses_path = PATH +'/Ethnicity/mistral ethnicity 2024-08-13 - cos_similarity.csv'
calibration_path = PATH +'/Ethnicity/Calibration/mistral ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(type, responses_path, calibration_path)
results_dict

{'friedman': 'Friedman Test: Statistic=194.3282051282058, p_value=4.639552703970704e-40',
 'kendall_w': np.float64(0.0832597279898054),
 'effect_size_interpretation': 'small',
 'nemenyi':                  0                1                2               3  \
 0              1.0   0.033691511919 1.6798784586e-12             0.0   
 1   0.033691511919              1.0  0.0001308984772 1.863564858e-11   
 2 1.6798784586e-12  0.0001308984772              1.0  0.083373357892   
 3              0.0  1.863564858e-11   0.083373357892             1.0   
 4              0.0 1.9515130623e-08    0.58065044762   0.90676256091   
 5    0.99471571773    0.13941184776 8.3653084459e-11             0.0   
 
                  4                5  
 0              0.0    0.99471571773  
 1 1.9515130623e-08    0.13941184776  
 2    0.58065044762 8.3653084459e-11  
 3    0.90676256091              0.0  
 4              1.0 2.2204460493e-16  
 5 2.2204460493e-16              1.0  ,
 'pairwise_effects': {'0_1

In [18]:
print(results_dict['means'].to_markdown())

|                                                                                           |     Mean |
|:------------------------------------------------------------------------------------------|---------:|
| Neutral American                                                                          | 0.798575 |
| Caucasian American                                                                        | 0.775809 |
| Asian                                                                                     | 0.751422 |
| African American                                                                          | 0.733855 |
| Hispanic                                                                                  | 0.740743 |
| Calibration                                                                               | 0.783203 |
| Neutral American - Caucasian American                                                     | 0.787192 |
| Neutral American - Asian                             

## Llama-3

In [19]:
type = 'ethnicity'
responses_path = PATH + '/Ethnicity/llama-3 ethnicity 2024-08-13 - cos_similarity.csv'
calibration_path = PATH + '/Ethnicity/Calibration/llama-3 ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(type, responses_path, calibration_path)
results_dict

{'friedman': 'Friedman Test: Statistic=283.47939278380846, p_value=3.5597616368520354e-59',
 'kendall_w': np.float64(0.12145646648834982),
 'effect_size_interpretation': 'small',
 'nemenyi':                  0               1                2              3  \
 0              1.0 2.588151915e-12 7.5417450063e-13            0.0   
 1  2.588151915e-12             1.0    0.99998427794 0.001179633267   
 2 7.5417450063e-13   0.99998427794              1.0 0.002253479784   
 3              0.0  0.001179633267   0.002253479784            1.0   
 4              0.0    0.5103681904    0.61886453038  0.23713972865   
 5    0.30992460209             0.0              0.0            0.0   
 
               4             5  
 0           0.0 0.30992460209  
 1  0.5103681904           0.0  
 2 0.61886453038           0.0  
 3 0.23713972865           0.0  
 4           1.0           0.0  
 5           0.0           1.0  ,
 'pairwise_effects': {'0_1': np.float64(0.25058460033367275),
  '0_2': np.float

In [20]:
print(results_dict['means'].to_markdown())

|                                                                                           |     Mean |
|:------------------------------------------------------------------------------------------|---------:|
| Neutral American                                                                          | 0.739005 |
| Caucasian American                                                                        | 0.615669 |
| Asian                                                                                     | 0.652258 |
| African American                                                                          | 0.60644  |
| Hispanic                                                                                  | 0.658154 |
| Calibration                                                                               | 0.782417 |
| Neutral American - Caucasian American                                                     | 0.677337 |
| Neutral American - Asian                             

## Llama-2

In [21]:
type = 'ethnicity'
responses_path = PATH + '/Ethnicity/llama-2 ethnicity 09-09-2024 - cos_similarity.csv'
calibration_path = PATH + '/Ethnicity/Calibration/llama-2 ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(type, responses_path, calibration_path)
results_dict

{'friedman': 'Friedman Test: Statistic=919.5420211205636, p_value=1.568842922370933e-196',
 'kendall_w': np.float64(0.39397687280229804),
 'effect_size_interpretation': 'medium',
 'nemenyi':               0               1              2                3  \
 0           1.0             0.0            0.0              0.0   
 1           0.0             1.0  0.99812822749  0.0095199692095   
 2           0.0   0.99812822749            1.0   0.037738509553   
 3           0.0 0.0095199692095 0.037738509553              1.0   
 4           0.0   0.91334951999  0.69933497128 0.00016292678483   
 5 0.18001978118             0.0            0.0              0.0   
 
                  4             5  
 0              0.0 0.18001978118  
 1    0.91334951999           0.0  
 2    0.69933497128           0.0  
 3 0.00016292678483           0.0  
 4              1.0           0.0  
 5              0.0           1.0  ,
 'pairwise_effects': {'0_1': np.float64(inf),
  '0_2': np.float64(inf),
  '0_3'

In [22]:
print(results_dict['means'].to_markdown())

|                                                                                           |     Mean |
|:------------------------------------------------------------------------------------------|---------:|
| Neutral American                                                                          | 0.777801 |
| Caucasian American                                                                        | 0.522332 |
| Asian                                                                                     | 0.506005 |
| African American                                                                          | 0.480473 |
| Hispanic                                                                                  | 0.511165 |
| Calibration                                                                               | 0.816126 |
| Neutral American - Caucasian American                                                     | 0.650066 |
| Neutral American - Asian                             

## Yi

In [23]:
type = 'ethnicity'
responses_path = PATH + '/Ethnicity/yi ethnicity 09-09-2024 - cos_similarity.csv'
calibration_path = PATH + '/Ethnicity/Calibration/yi ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(type, responses_path, calibration_path)
results_dict

{'friedman': 'Friedman Test: Statistic=22.13805832410531, p_value=0.0004928825911189778',
 'kendall_w': np.float64(0.009485029273395592),
 'effect_size_interpretation': 'small',
 'nemenyi':                0             1              2              3              4  \
 0            1.0 0.67506263758 0.002253479784 0.015438160696  0.86938430528   
 1  0.67506263758           1.0  0.19920075906  0.49768910619  0.99927299583   
 2 0.002253479784 0.19920075906            1.0  0.99471571773 0.087571418224   
 3 0.015438160696 0.49768910619  0.99471571773            1.0  0.28437820823   
 4  0.86938430528 0.99927299583 0.087571418224  0.28437820823            1.0   
 5  0.98615515079 0.96339309958 0.023018475919  0.10361255062  0.99718822775   
 
                5  
 0  0.98615515079  
 1  0.96339309958  
 2 0.023018475919  
 3  0.10361255062  
 4  0.99718822775  
 5            1.0  ,
 'pairwise_effects': {'0_1': np.float64(0.01501012713186466),
  '0_2': np.float64(0.10937283910533009),
  '0

In [24]:
print(results_dict['means'].to_markdown())

|                                                                                           |     Mean |
|:------------------------------------------------------------------------------------------|---------:|
| Neutral American                                                                          | 0.696147 |
| Caucasian American                                                                        | 0.669905 |
| Asian                                                                                     | 0.653218 |
| African American                                                                          | 0.638305 |
| Hispanic                                                                                  | 0.673486 |
| Calibration                                                                               | 0.719606 |
| Neutral American - Caucasian American                                                     | 0.683026 |
| Neutral American - Asian                             

## Claude-3

In [25]:
type = 'ethnicity'
responses_path = PATH + '/Ethnicity/claude-3-opus-20240229 ethnicity 26-12-2024 - cos_similarity.csv'
calibration_path = PATH + '/Ethnicity/Calibration/claude-3-opus-20240229 ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(type, responses_path, calibration_path)
results_dict

{'friedman': 'Friedman Test: Statistic=562.1875457875449, p_value=2.9818214075911006e-119',
 'kendall_w': np.float64(0.24086869999466362),
 'effect_size_interpretation': 'small',
 'nemenyi':                0              1               2               3  \
 0            1.0            0.0             0.0             0.0   
 1            0.0            1.0   0.12752296395     0.061480576   
 2            0.0  0.12752296395             1.0 2.108789853e-06   
 3            0.0    0.061480576 2.108789853e-06             1.0   
 4            0.0 0.018605979095   0.98322531274 3.961063022e-08   
 5 0.011964905421            0.0             0.0             0.0   
 
                 4              5  
 0             0.0 0.011964905421  
 1  0.018605979095            0.0  
 2   0.98322531274            0.0  
 3 3.961063022e-08            0.0  
 4             1.0            0.0  
 5             0.0            1.0  ,
 'pairwise_effects': {'0_1': np.float64(inf),
  '0_2': np.float64(inf),
  '0_3'

In [26]:
print(results_dict['means'].to_markdown())

|                                                                                           |     Mean |
|:------------------------------------------------------------------------------------------|---------:|
| Neutral American                                                                          | 0.815935 |
| Caucasian American                                                                        | 0.656709 |
| Asian                                                                                     | 0.690119 |
| African American                                                                          | 0.624975 |
| Hispanic                                                                                  | 0.694235 |
| Calibration                                                                               | 0.857556 |
| Neutral American - Caucasian American                                                     | 0.736322 |
| Neutral American - Asian                             

## Gemini

In [27]:
type = 'ethnicity'
responses_path = PATH + '/Ethnicity/gemini-1.0-pro ethnicity 17-10-2024 - cos_similarity.csv'
calibration_path = PATH + '/Ethnicity/Calibration/gemini-1.0-pro ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(type, responses_path, calibration_path)
results_dict

{'friedman': 'Friedman Test: Statistic=56.88721860189531, p_value=5.3349900132542215e-11',
 'kendall_w': np.float64(0.02437327275145472),
 'effect_size_interpretation': 'small',
 'nemenyi':                  0                1              2                3  \
 0              1.0 3.8549431725e-05 0.002253479784 1.0101121051e-09   
 1 3.8549431725e-05              1.0  0.93935087421    0.44769316391   
 2   0.002253479784    0.93935087421            1.0   0.064757824212   
 3 1.0101121051e-09    0.44769316391 0.064757824212              1.0   
 4  0.0063570267425    0.83427925505  0.99977914749   0.029164155232   
 5    0.86103110172  0.0051656834769 0.091937344244 1.1232258617e-06   
 
                 4                5  
 0 0.0063570267425    0.86103110172  
 1   0.83427925505  0.0051656834769  
 2   0.99977914749   0.091937344244  
 3  0.029164155232 1.1232258617e-06  
 4             1.0    0.17634522615  
 5   0.17634522615              1.0  ,
 'pairwise_effects': {'0_1': np.float6

In [28]:
print(results_dict['means'].to_markdown())

|                                                                                           |     Mean |
|:------------------------------------------------------------------------------------------|---------:|
| Neutral American                                                                          | 0.739096 |
| Caucasian American                                                                        | 0.677186 |
| Asian                                                                                     | 0.69358  |
| African American                                                                          | 0.66854  |
| Hispanic                                                                                  | 0.708158 |
| Calibration                                                                               | 0.747019 |
| Neutral American - Caucasian American                                                     | 0.708141 |
| Neutral American - Asian                             

## Gemma

In [29]:
type = 'ethnicity'
responses_path = PATH + '/Ethnicity/gemma ethnicity 09-09-2024 - cos_similarity.csv'
calibration_path = PATH + '/Ethnicity/Calibration/gemma ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(type, responses_path, calibration_path)
results_dict

{'friedman': 'Friedman Test: Statistic=497.6665441176475, p_value=2.546389594265546e-105',
 'kendall_w': np.float64(0.21322474041030312),
 'effect_size_interpretation': 'small',
 'nemenyi':                  0                1                2                3  \
 0              1.0              0.0              0.0              0.0   
 1              0.0              1.0 0.00022046921707    0.25075716405   
 2              0.0 0.00022046921707              1.0 1.2228695834e-09   
 3              0.0    0.25075716405 1.2228695834e-09              1.0   
 4 6.9433898442e-07 2.2204460493e-16 0.00036628448794              0.0   
 5    0.99212861312              0.0              0.0              0.0   
 
                  4                5  
 0 6.9433898442e-07    0.99212861312  
 1 2.2204460493e-16              0.0  
 2 0.00036628448794              0.0  
 3              0.0              0.0  
 4              1.0 2.1978712628e-08  
 5 2.1978712628e-08              1.0  ,
 'pairwise_effect

In [30]:
print(results_dict['means'].to_markdown())

|                                                                                           |     Mean |
|:------------------------------------------------------------------------------------------|---------:|
| Neutral American                                                                          | 0.73096  |
| Caucasian American                                                                        | 0.48415  |
| Asian                                                                                     | 0.589422 |
| African American                                                                          | 0.456413 |
| Hispanic                                                                                  | 0.66089  |
| Calibration                                                                               | 0.767071 |
| Neutral American - Caucasian American                                                     | 0.607555 |
| Neutral American - Asian                             