# Analysis of Results


## Setup

In [None]:
pip install scikit_posthocs



In [None]:
import pandas as pd
pd.options.display.float_format = '{:.11}'.format
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp
import os
import itertools

In [None]:
# Function to perform statistical tests
def statistics(groups, verbose=False, alpha=0.05):
    """Run a Friedman test on paired samples, then a Nemenyi post-hoc test if it is significant.

    Parameters:
    groups: Sequence of samples, one per condition. They are unpacked into
            ``friedmanchisquare`` and stacked column-wise for the post-hoc test, so
            position ``i`` of every sample is treated as the same block.
    verbose (bool): If True, print the test results as they are computed.
    alpha (float): Significance level that gates the post-hoc test. Default is 0.05.

    Returns:
    dict: Always contains the same three keys:
          - 'Friedman p-value': p-value of the Friedman test.
          - 'Friedman statistic': Friedman test statistic.
          - 'Nemenyi results': pairwise p-value table whose row/column ``i`` corresponds
            to ``groups[i]``, or None when the Friedman test is not significant.

    Raises:
    ValueError: If fewer than two groups are supplied. Note that scipy's Friedman test
                itself rejects fewer than three samples with a ValueError of its own.
    """
    if len(groups) < 2:
        raise ValueError("At least two groups are required for comparison.")

    # Conduct Friedman test
    friedman_stat, friedman_p_value = friedmanchisquare(*groups)
    results = {
        'Friedman p-value': friedman_p_value,
        'Friedman statistic': friedman_stat,
        # Present even when the post-hoc test is skipped, so callers can test for None
        # instead of re-deriving the significance decision themselves.
        'Nemenyi results': None,
    }
    if verbose:
        print(f"Friedman Test: Statistic={friedman_stat}, p-value={friedman_p_value}")

    # Conduct Nemenyi post-hoc test if Friedman test is significant
    if friedman_p_value < alpha:
        # The post-hoc routine expects one column per group, hence the transpose.
        data = np.array(groups).T
        nemenyi_results = sp.posthoc_nemenyi_friedman(data)
        results['Nemenyi results'] = nemenyi_results
        if verbose:
            print("Nemenyi Post-Hoc Test Results:")
            print(nemenyi_results)
    elif verbose:
        print("Friedman test is not significant; Nemenyi post-hoc test not performed.")

    return results

def calculate_means(df):
    """Compute the row-wise mean for every non-empty combination of columns.

    Parameters:
    df (pd.DataFrame): Frame whose column labels are strings (they are concatenated
                       to build the result keys).

    Returns:
    dict: Maps each combination's key (the column names joined without a separator)
          to a Series holding the per-row mean over those columns. Because no
          separator is used, distinct combinations whose names concatenate to the
          same text overwrite one another.
    """
    # Dictionary to store the mean of each combination
    means_dict = {}

    # Get all column names
    columns = df.columns

    # Iterate over all possible combinations of the columns
    for r in range(1, len(columns) + 1):
        for combo in itertools.combinations(columns, r):
            combo_name = ''.join(combo)
            means_dict[combo_name] = df[list(combo)].mean(axis=1)

    # The dictionary used to be built and then discarded, so callers always got None.
    return means_dict

In [None]:
def get_stereotype(responses_path, calc_mean=False, prompts=[1,2,3,4,5,6,7,8,9,10,13,16,17,18,19,20]):
    """Load the 'neutral' similarity columns of a CSV for a chosen set of prompts.

    Parameters:
    responses_path: Path of a CSV file that has a 'prompt_id' column (used as the index).
    calc_mean (bool): If True, collapse the selected columns into a single
                      'Calibration' column holding their row-wise mean.
    prompts: Prompt ids to keep; rows whose index is not listed are discarded.

    Returns:
    pd.DataFrame: Indexed by 'prompt_id'. Contains either every column whose name
                  includes 'neutral', or the single 'Calibration' column.
    """
    table = pd.read_csv(responses_path, index_col=['prompt_id'])

    # The bookkeeping column is optional in the input files.
    if 'pronoun_sequence_id' in table.columns:
        table = table.drop(columns=['pronoun_sequence_id'])

    neutral_mask = table.columns.str.contains('neutral')
    table = table.loc[:, neutral_mask]
    table = table[table.index.isin(prompts)]

    if not calc_mean:
        return table

    row_means = table.mean(axis=1).values
    return pd.DataFrame(row_means, index=table.index, columns=['Calibration'])

In [None]:
# Root directory under which the Gender/, Ageism/ and Ethnicity/ result folders are looked up.
# NOTE(review): every file path below is built as PATH + '/<folder>/...', so the value '/'
# yields paths that start with '//'. Presumably a placeholder for the real data root — confirm.
PATH = '/'

# Gender

In [None]:
# Prompt ids grouped by stereotype category; the keys become the 'Stereotype' labels of the results.
data_dict = {
    "Career, Education, and Finance": [1, 2, 7, 11, 18, 32, 41],
    "Entertainment, Leisure, and Preferences": [
        3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 19,
        21, 23, 25, 28, 30, 31, 34, 42, 45, 48, 50,
    ],
    "Social Interactions and Relationships": [13, 20, 33, 36, 38, 40, 43, 46, 51, 52],
    "Personal Development and Well-being": [
        16, 17, 22, 24, 26, 27, 29, 35,
        37, 39, 44, 47, 49, 53, 54, 55,
    ],
}
# "All" spans every prompt id from 1 to 55 and is kept as the last entry.
data_dict["All"] = list(range(1, 56))

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp

# Define the paths to the CSV files: one [responses CSV, calibration CSV] pair per LLM
paths = [
    [PATH + '/Gender/Cosine Similarity/gpt-4o-mini-2024-07-18 gender 2024-07-30 - cos_similarity.csv', PATH + '/Gender/Calibration/Cosine Similarity/gpt-4o-mini-2024-07-18 gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/Cosine Similarity/mistral gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/Cosine Similarity/mistral gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/Cosine Similarity/claude-3 gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/Cosine Similarity/claude-3 gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/Cosine Similarity/gemini gender 19-07-2024 - cos_similarity.csv', PATH + '/Gender/Calibration/Cosine Similarity/gemini gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/Cosine Similarity/llama-2 gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/Cosine Similarity/llama-2 gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/Cosine Similarity/llama-3 gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/Cosine Similarity/llama-3 gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/Cosine Similarity/gemma gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/Cosine Similarity/gemma gender calibration - cos_similarity.csv'],
    [PATH + '/Gender/Cosine Similarity/yi gender 2024-05-02 - cos_similarity.csv', PATH + '/Gender/Calibration/Cosine Similarity/yi gender calibration - cos_similarity.csv']]

# One record per (stereotype category, LLM). The DataFrame is built once after the
# loops instead of being re-concatenated on every iteration.
result_rows = []

for key, indicies in data_dict.items():
    print(key)
    # Loop through each CSV pair and calculate statistics
    for path in paths:
        # Extract LLM name from the path (the file name starts with "<llm> gender ...")
        llm_name = path[0].split('/')[-1].split(' ')[0]
        print(llm_name)

        # Read the CSV files, restricted to the prompt ids of this category
        prompts = get_stereotype(path[0], prompts=indicies)
        calibration = get_stereotype(path[1], calc_mean=True, prompts=indicies)

        # Merge prompts and calibration data
        prompts = pd.merge(prompts, calibration, on='prompt_id', how='inner')

        if len(prompts.columns) < 2:
            continue

        # The files use two naming orders; detect which one explicitly rather than
        # with a bare `except`, which would also hide unrelated errors.
        if 'cos_similarity: male vs neutral' in prompts.columns:
            male = 'cos_similarity: male vs neutral'
            female = 'cos_similarity: female vs neutral'
        else:
            male = 'cos_similarity: neutral vs male'
            female = 'cos_similarity: neutral vs female'

        # Prepare data for male vs female vs calibration comparisons.
        # Positions in the Nemenyi table: 0 = male, 1 = female, 2 = calibration.
        groups = [prompts[male], prompts[female], prompts['Calibration']]

        # Perform statistical tests
        stat_results = statistics(groups, verbose=False)
        # Indexing (not .get) so that a `statistics` with a different result schema fails here.
        friedman_p = stat_results['Friedman p-value']
        nemenyi_results = stat_results.get('Nemenyi results')

        # Exact complement of the `p < 0.05` gate inside statistics(): the post-hoc table
        # is only read when it exists. The previous `> 0.05` check fell through to the
        # table lookup when p was exactly 0.05 or NaN.
        if not friedman_p < 0.05:
            p_male_female = '*'
            p_male_calibration = '*'
            p_calibration_female = '*'
        else:
            p_male_female = f'{nemenyi_results.iloc[0, 1]:.2f}'
            p_male_calibration = f'{nemenyi_results.iloc[0, 2]:.2f}'
            p_calibration_female = f'{nemenyi_results.iloc[1, 2]:.2f}'

        print(llm_name, nemenyi_results)

        # Means of single columns and of column combinations (mean of the column means)
        raw_means = {
            'male_mean': prompts[male].mean(),
            'female_mean': prompts[female].mean(),
            'calibration_mean': prompts['Calibration'].mean(),
            'male_female_mean': prompts[[male, female]].mean().mean(),
            'male_calibration_mean': prompts[[male, 'Calibration']].mean().mean(),
            'female_calibration_mean': prompts[[female, 'Calibration']].mean().mean(),
            'male_female_calibration_mean': prompts[[male, female, 'Calibration']].mean().mean(),
        }
        # Stored with two decimals, as floats
        rounded_means = {name: float(f'{value:.2f}') for name, value in raw_means.items()}

        print(len(prompts))

        result_rows.append({
            'LLM': llm_name,
            'Stereotype': key,
            'P Value Male-Calibration': p_male_calibration,
            'P Value Female-Calibration': p_calibration_female,
            'P Value Male-Female': p_male_female,
            **rounded_means,
        })

# Display the main results DataFrame. The table covers every category, so no per-key
# banner is printed here (the old one reused the leaked loop variable `key`).
results_df = pd.DataFrame(result_rows)
display(results_df)


Career, Education, and Finance
gpt-4o-mini-2024-07-18
gpt-4o-mini-2024-07-18 None
238
mistral
mistral None
238
claude-3
claude-3 None
238
gemini
gemini None
238
llama-2
llama-2                  0                1                2
0              1.0    0.30703119516 1.4727305486e-07
1    0.30703119516              1.0 1.3446244118e-11
2 1.4727305486e-07 1.3446244118e-11              1.0
238
llama-3
llama-3                 0               1               2
0             1.0   0.98163989085 0.0060457795415
1   0.98163989085             1.0 0.0032602678126
2 0.0060457795415 0.0032602678126             1.0
238
gemma
gemma None
238
yi
yi None
238
Entertainment, Leisure, and Preferences
gpt-4o-mini-2024-07-18
gpt-4o-mini-2024-07-18                 0                1                2
0             1.0    0.82299088442  1.315042597e-08
1   0.82299088442              1.0 3.0668723117e-10
2 1.315042597e-08 3.0668723117e-10              1.0
748
mistral
mistral               0             1   2
0  

Unnamed: 0,LLM,Stereotype,P Value Male-Calibration,P Value Female-Calibration,P Value Male-Female,male_mean,female_mean,calibration_mean,male_female_mean,male_calibration_mean,female_calibration_mean,male_female_calibration_mean
0,gpt-4o-mini-2024-07-18,"Career, Education, and Finance",*,*,*,0.86,0.86,0.87,0.86,0.86,0.86,0.86
1,mistral,"Career, Education, and Finance",*,*,*,0.81,0.81,0.83,0.81,0.82,0.82,0.82
2,claude-3,"Career, Education, and Finance",*,*,*,0.82,0.82,0.85,0.82,0.83,0.84,0.83
3,gemini,"Career, Education, and Finance",*,*,*,0.77,0.77,0.79,0.77,0.78,0.78,0.78
4,llama-2,"Career, Education, and Finance",0.00,0.00,0.31,0.85,0.85,0.88,0.85,0.86,0.87,0.86
5,llama-3,"Career, Education, and Finance",0.01,0.00,0.98,0.82,0.82,0.86,0.82,0.84,0.84,0.83
6,gemma,"Career, Education, and Finance",*,*,*,0.79,0.8,0.78,0.8,0.79,0.79,0.79
7,yi,"Career, Education, and Finance",*,*,*,0.79,0.78,0.83,0.78,0.81,0.8,0.8
8,gpt-4o-mini-2024-07-18,"Entertainment, Leisure, and Preferences",0.00,0.00,0.82,0.86,0.86,0.89,0.86,0.88,0.88,0.87
9,mistral,"Entertainment, Leisure, and Preferences",0.00,0.00,0.37,0.81,0.81,0.87,0.81,0.84,0.84,0.83


# Ageism

In [None]:
# Prompt ids grouped by stereotype category; the keys become the 'Stereotype' labels of the results.
data_dict = {
    "Dealing with Change": [1, 2, 7, 8, 12],
    "Cognitive and Physical Abilities": [3, 4, 6, 14, 13, 11, 15],
    "Emotional Instability": [17, 18, 19, 20, 16],
    "Dependent on Else": [5, 9, 10],
}
# "All" spans every prompt id from 1 to 20 and is kept as the last entry.
data_dict["All"] = list(range(1, 21))

In [None]:
import pandas as pd

# Define the paths to the CSV files: one [responses CSV, calibration CSV] pair per LLM
paths = [
    [PATH + '/Ageism/Cosine Similarity/gpt-4o-mini-2024-07-18 ageism 2024-07-31 - cos_similarity.csv',PATH + '/Ageism/Calibration/Cosine Similarity/gpt-4o-mini-2024-07-18 ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/Cosine Similarity/claude-3 ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/Cosine Similarity/claude-3 ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/Cosine Similarity/gemini ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/Cosine Similarity/gemini ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/Cosine Similarity/gemma ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/Cosine Similarity/gemma ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/Cosine Similarity/llama-2 ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/Cosine Similarity/llama-2 ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/Cosine Similarity/llama-3 ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/Cosine Similarity/llama-3 ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/Cosine Similarity/mistral ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/Cosine Similarity/mistral ageism calibration - cos_similarity.csv'],
    [PATH + '/Ageism/Cosine Similarity/yi ageism 2024-05-02 - cos_similarity.csv', PATH + '/Ageism/Calibration/Cosine Similarity/yi ageism calibration - cos_similarity.csv']

]

senior_col = 'cos_similarity: senior vs neutral'
adult_col = 'cos_similarity: adult vs neutral'
young_col = 'cos_similarity: young vs neutral'

# Result column -> (row, column) position in the Nemenyi table. Positions follow the order
# of `groups` below: 0 = senior, 1 = adult, 2 = young, 3 = calibration.
pair_positions = {
    'Senior-Calibration': (0, 3),
    'Adult-Calibration': (1, 3),
    'Young-Calibration': (2, 3),
    'Senior-Adult': (0, 1),
    'Senior-Young': (0, 2),
    'Adult-Young': (1, 2),
}

# One record per (stereotype category, LLM). The DataFrame is built once after the
# loops instead of being re-concatenated on every iteration.
result_rows = []

for key, indicies in data_dict.items():
    print(key)
    # Loop through each CSV pair and calculate statistics
    for path in paths:
        # Extract LLM name from the path (the file name starts with "<llm> ageism ...")
        llm_name = path[0].split('/')[-1].split(' ')[0]
        print(llm_name, f'*************{key}*************')

        # Read the CSV files, restricted to the prompt ids of this category
        prompts = get_stereotype(path[0], prompts=indicies)
        calibration = get_stereotype(path[1], calc_mean=True, prompts=indicies)
        prompts = pd.merge(prompts, calibration, on='prompt_id', how='inner')
        if llm_name == 'gpt-4o-mini-2024-07-18':
            # NOTE(review): positional rename; assumes this file's columns are ordered
            # young, adult, senior, Calibration — confirm against the CSV header.
            prompts.columns = [young_col, adult_col, senior_col, 'Calibration']

        # Prepare data for senior vs adult vs young vs calibration comparisons
        print(prompts.columns)
        groups = [prompts[senior_col],
                  prompts[adult_col],
                  prompts[young_col],
                  prompts['Calibration']]

        # Perform statistical tests
        stat_results = statistics(groups, verbose=True)
        # Indexing (not .get) so that a `statistics` with a different result schema fails here.
        friedman_p = stat_results['Friedman p-value']
        nemenyi_results = stat_results.get('Nemenyi results')

        # Exact complement of the `p < 0.05` gate inside statistics(): the post-hoc table
        # is only read when it exists. The previous `> 0.05` check fell through to the
        # table lookup when p was exactly 0.05 or NaN.
        if not friedman_p < 0.05:
            p_values = dict.fromkeys(pair_positions, np.nan)
        else:
            p_values = {
                label: float(f'{nemenyi_results.iloc[row, col]:.2f}')
                for label, (row, col) in pair_positions.items()
            }

        # Append results. Means of column combinations are the mean of the column means.
        # NOTE(review): the calibration-related means are stored unrounded while the others
        # are rounded to two decimals; kept as in the original so the exported numbers do
        # not change — confirm whether that difference is intended.
        row = {'LLM': llm_name, 'Stereotype': key}
        row.update(p_values)
        row.update({
            'Senior-Adult Mean': float(f'{prompts[[senior_col, adult_col]].mean().mean():.2f}'),
            'Senior-Young Mean': float(f'{prompts[[senior_col, young_col]].mean().mean():.2f}'),
            'Adult-Young Mean': float(f'{prompts[[young_col, adult_col]].mean().mean():.2f}'),
            'Senior-Adult-Young Mean': float(f'{prompts[[senior_col, adult_col, young_col]].mean().mean():.2f}'),
            'Senior Mean': float(f'{prompts[senior_col].mean():.2f}'),
            'Adult Mean': float(f'{prompts[adult_col].mean():.2f}'),
            'Young Mean': float(f'{prompts[young_col].mean():.2f}'),
            'Calibration Mean': prompts['Calibration'].mean(),
            'Calibration-Senior': prompts[[senior_col, 'Calibration']].mean().mean(),
            'Calibration-Adult': prompts[[adult_col, 'Calibration']].mean().mean(),
            'Calibration-Young': prompts[[young_col, 'Calibration']].mean().mean(),
            'Senior-Adult-Calibration Mean': prompts[[senior_col, adult_col, 'Calibration']].mean().mean(),
            'Senior-Young-Calibration Mean': prompts[[senior_col, young_col, 'Calibration']].mean().mean(),
            'Adult-Young-Calibration Mean': prompts[[young_col, adult_col, 'Calibration']].mean().mean(),
            'Senior-Adult-Young-Calibration Mean': prompts[[senior_col, adult_col, young_col, 'Calibration']].mean().mean(),
        })
        result_rows.append(row)

# Main results DataFrame: one row per (stereotype category, LLM)
results_df = pd.DataFrame(result_rows)


Dealing with Change
gpt-4o-mini-2024-07-18 *************Dealing with Change*************
Index(['cos_similarity: young vs neutral', 'cos_similarity: adult vs neutral',
       'cos_similarity: senior vs neutral', 'Calibration'],
      dtype='object')
Friedman Test: Statistic=31.96799999999996, p-value=5.315368242017395e-07
Nemenyi Post-Hoc Test Results:
                 0              1                2                3
0              1.0 0.017109611397     0.6990766406 8.2798947498e-07
1   0.017109611397            1.0    0.24590677617   0.092643122861
2     0.6990766406  0.24590677617              1.0 0.00016884617894
3 8.2798947498e-07 0.092643122861 0.00016884617894              1.0
claude-3 *************Dealing with Change*************
Index(['cos_similarity: young vs neutral', 'cos_similarity: adult vs neutral',
       'cos_similarity: senior vs neutral', 'Calibration'],
      dtype='object')
Friedman Test: Statistic=59.928, p-value=6.090161180949176e-13
Nemenyi Post-Hoc Test Resu

In [None]:
# Inspect the final result row: 8 models x 5 stereotype keys give 40 rows (positions 0-39)
# when every model/category pair produced a row, so position 39 is the last one.
results_df.iloc[39]

Unnamed: 0,39
LLM,yi
Stereotype,All
Senior-Calibration,0.0
Adult-Calibration,0.26
Young-Calibration,0.0
Senior-Adult,0.0
Senior-Young,0.04
Adult-Young,0.01
Senior-Adult Mean,0.73
Senior-Young Mean,0.71


In [None]:
# Reshape the results so that each LLM becomes a column and each (stereotype, measure)
# pair becomes a row. Every non-identifier column of results_df is melted, so the
# 'P Value' column carries the mean columns as well as the p-values.
pivot_df = (
    results_df
    .melt(id_vars=['LLM', 'Stereotype'], var_name='Age Group', value_name='P Value')
    .pivot_table(index=['Stereotype', 'Age Group'], columns='LLM', values='P Value')
    # Drop the 'LLM' label that pivot_table leaves on the column axis
    .rename_axis(columns=None)
    # Move 'Stereotype' and 'Age Group' back from the index to ordinary columns
    .reset_index()
)

pivot_df

Unnamed: 0,Stereotype,Age Group,claude-3,gemini,gemma,gpt-4o-mini-2024-07-18,llama-2,llama-3,mistral,yi
0,All,Adult Mean,0.83,0.74,0.76,0.88,0.78,0.81,0.82,0.77
1,All,Adult-Young,0.08,0.9,0.87,0.0,0.9,0.0,0.9,0.08
2,All,Adult-Young Mean,0.81,0.73,0.76,0.87,0.79,0.79,0.81,0.75
3,All,Senior Mean,0.76,0.69,0.69,0.83,0.72,0.73,0.74,0.71
4,All,Senior-Adult,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,All,Senior-Adult Mean,0.8,0.71,0.73,0.85,0.75,0.77,0.78,0.74
6,All,Senior-Adult-Young Mean,0.8,0.72,0.74,0.85,0.76,0.77,0.79,0.74
7,All,Senior-Young,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.02
8,All,Senior-Young Mean,0.78,0.71,0.73,0.84,0.76,0.76,0.77,0.73
9,All,Young Mean,0.8,0.73,0.76,0.86,0.79,0.78,0.81,0.74


In [None]:
# Export the pivoted ageism table. The path is relative, so the file lands in the
# current working directory; the row index is written too (to_csv's default).
pivot_df.to_csv('Ageism Bias.csv')

# Ethnicity

In [None]:
def get_category_for_prompt_id(prompt_id):
    """Return the first category of the notebook-level `data_dict` that lists `prompt_id`.

    Categories are checked in the dictionary's own order. "Unknown" is returned when
    no category contains the id.
    """
    matching_categories = (
        category for category, ids in data_dict.items() if prompt_id in ids
    )
    return next(matching_categories, "Unknown")

def calculate_combinations_mean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute one overall mean for every non-empty subset of the DataFrame's columns.

    For each subset the values are first averaged across the subset's columns row by
    row, and those row means are then averaged over all rows.

    Parameters:
    df (pd.DataFrame): Source data; column labels must be strings because they are
                       joined to form the result labels.

    Returns:
    pd.DataFrame: One row per column subset.
                  - Index: the subset's column names joined by ' - '.
                  - 'Mean': the overall mean for that subset.
    """
    column_names = df.columns

    # Subsets are enumerated by size (1, 2, ..., all columns), preserving column order.
    subsets = itertools.chain.from_iterable(
        itertools.combinations(column_names, size)
        for size in range(1, len(column_names) + 1)
    )

    mean_by_label = {
        ' - '.join(subset): df[list(subset)].mean(axis=1).mean(axis=0)
        for subset in subsets
    }

    return pd.DataFrame.from_dict(mean_by_label, orient='index', columns=['Mean'])

def statistics(groups: list, verbose: bool = False) -> dict:
    """
    Perform a Friedman test on the provided groups, followed by a Nemenyi post-hoc test
    when the Friedman test is significant (p < 0.05).

    NOTE: this redefines the `statistics` function from the Setup section and returns
    different keys ('friedman' / 'nemenyi' instead of 'Friedman p-value' /
    'Friedman statistic' / 'Nemenyi results'). Once this cell has run, the Gender and
    Ageism cells above, which read the earlier keys, no longer work until the Setup
    definition is executed again.

    Parameters:
    groups (list): A list of samples, one per group. They are unpacked into the Friedman
                   test and stacked column-wise for the post-hoc test.
    verbose (bool): If True, print a notice when the post-hoc test is skipped. Default is False.

    Returns:
    dict: A dictionary containing the results of the statistical tests.
          - 'friedman': A formatted string with the Friedman statistic and p-value.
          - 'nemenyi': The Nemenyi pairwise p-value table if the Friedman test is
            significant, otherwise None. Row/column ``i`` of the table corresponds to
            ``groups[i]``. For the ethnicity analysis in `homogeneous_groups` that is:
            0 = Neutral American, 1 = Caucasian American, 2 = Asian,
            3 = African American, 4 = Hispanic, 5 = Calibration.

    Raises:
    ValueError: If fewer than two groups are supplied.
    """
    if len(groups) < 2:
        raise ValueError("At least two groups are required for comparison.")

    # Conduct Friedman test
    friedman_stat, friedman_p_value = friedmanchisquare(*groups)
    results = {
        'friedman': f"Friedman Test: Statistic={friedman_stat}, p_value={friedman_p_value}"
    }

    # Conduct Nemenyi post-hoc test if Friedman test is significant
    if friedman_p_value < 0.05:
        # The post-hoc routine expects one column per group, hence the transpose.
        data = np.array(groups).T
        results['nemenyi'] = sp.posthoc_nemenyi_friedman(data)
        return results

    # Not significant: there is no post-hoc table. (The former trailing `else` that raised
    # for a missing p-value could never run, because comparing None with 0.05 above
    # already raises.)
    if verbose:
        print("Friedman test is not significant (p >= 0.05); Nemenyi post-hoc test not performed.")
    results['nemenyi'] = None
    return results

def homogeneous_groups(type: str, responses_path: str, calibration_path: str, stereotype_group_id: str = None) -> dict:
    """
    Compare the ethnicity cosine-similarity columns of a responses file with each other
    and with a calibration baseline.

    Parameters:
    type (str): Demographic type label. It is currently not used: the function always
                selects the ethnicity columns.
    responses_path (str): Path to the CSV file containing responses with cosine similarity values.
    calibration_path (str): Path to the CSV file containing the calibration's cosine similarity values.
    stereotype_group_id (str, optional): If provided, only responses with this stereotype
                                         group ID are analysed. Default is None (no filtering).

    Returns:
    dict: A dictionary containing the results of the statistical analysis.
          - 'friedman': Friedman test result as returned by `statistics`.
          - 'nemenyi': Nemenyi post-hoc result as returned by `statistics` (None if not performed).
          - 'means': Mean values for each combination of groups
            (see `calculate_combinations_mean`).

    Raises:
    KeyError: If the responses file lacks one of the ethnicity columns or 'stereotype_group_id'.
    """
    # Source column -> short group label, in the order used for the statistical tests
    ethnicity_columns = {
        'cos_similarity: neutral vs neutral american': 'Neutral American',
        'cos_similarity: neutral vs caucasian american': 'Caucasian American',
        'cos_similarity: neutral vs asian': 'Asian',
        'cos_similarity: neutral vs african american': 'African American',
        'cos_similarity: neutral vs hispanic': 'Hispanic',
    }

    # Load responses, setting 'prompt_id' as the index. Selecting the needed columns
    # explicitly already discards 'pronoun_sequence_id', so the file no longer has to
    # contain that column.
    responses = pd.read_csv(responses_path, index_col=['prompt_id'])
    responses = responses[list(ethnicity_columns) + ['stereotype_group_id']]

    # Filter by stereotype group ID if provided
    if stereotype_group_id is not None:
        responses = responses[responses['stereotype_group_id'].isin([stereotype_group_id])]
    responses = responses.drop(columns=['stereotype_group_id']).rename(columns=ethnicity_columns)

    # Load calibration data and average its 'neutral' columns into one named Series.
    # (The former `calibration.columns = ...` assignment had no effect on a Series.)
    calibration = pd.read_csv(calibration_path, index_col=['prompt_id'])
    calibration = (
        calibration.loc[:, calibration.columns.str.contains('neutral')]
        .mean(axis=1)
        .rename('Calibration')
    )

    # Merge responses with the calibration data on the 'prompt_id' index.
    # NOTE(review): if a prompt_id occurs several times in both inputs, this join pairs
    # every response row with every calibration row for that id — confirm that is intended.
    responses = pd.merge(left=responses, right=calibration, left_index=True, right_index=True)

    # One group per column: the five ethnicity columns followed by 'Calibration'
    groups = [responses[col] for col in responses.columns]

    # Perform statistical tests and calculate means for combinations of groups
    results = statistics(groups, verbose=False)
    results['means'] = calculate_combinations_mean(responses)

    return results


## GPT-4o-mini

In [None]:
# Ethnicity analysis for gpt-4o-mini: responses vs. calibration baseline
responses_path = PATH + '/Ethnicity/Cosine Similarity/gpt-4o-mini-2024-07-18 ethnicity 2024-08-13 - cos_similarity.csv'
calibration_path = PATH + '/Ethnicity/Calibration/Cosine Similarity/gpt-4o-mini-2024-07-18 ethnicity calibration - cos_similarity.csv'
# The label is passed as a literal: a notebook-level variable named `type` would shadow
# the builtin `type` for every cell that runs afterwards.
results_dict = homogeneous_groups('ethnicity', responses_path, calibration_path)

In [None]:
# Show the Friedman summary string and the Nemenyi pairwise table (None when not significant)
results_dict['friedman'], results_dict['nemenyi']

('Friedman Test: Statistic=152.88498168498154, p_value=3.245696104175729e-31',
              0            1     2     3     4     5
 0          1.0 0.3712605117 0.001 0.001 0.001   0.9
 1 0.3712605117          1.0 0.001 0.001 0.001   0.9
 2        0.001        0.001   1.0   0.9   0.9 0.001
 3        0.001        0.001   0.9   1.0   0.9 0.001
 4        0.001        0.001   0.9   0.9   1.0 0.001
 5          0.9          0.9 0.001 0.001 0.001   1.0)

## Mistral

In [None]:
# Ethnicity analysis for Mistral: responses vs. calibration baseline
responses_path = PATH +'/Ethnicity/Cosine Similarity/mistral ethnicity 2024-08-13 - cos_similarity.csv'
calibration_path = PATH +'/Ethnicity/Calibration/Cosine Similarity/mistral ethnicity calibration - cos_similarity.csv'
# The label is passed as a literal: a notebook-level variable named `type` would shadow
# the builtin `type` for every cell that runs afterwards.
results_dict = homogeneous_groups('ethnicity', responses_path, calibration_path)
# Friedman summary string and Nemenyi pairwise table (None when not significant)
results_dict['friedman'], results_dict['nemenyi']

('Friedman Test: Statistic=195.66886446886383, p_value=2.3976664435577985e-40',
                0              1              2              3             4  \
 0            1.0 0.033714640079          0.001          0.001         0.001   
 1 0.033714640079            1.0          0.001          0.001         0.001   
 2          0.001          0.001            1.0 0.087501692907 0.56937984488   
 3          0.001          0.001 0.087501692907            1.0           0.9   
 4          0.001          0.001  0.56937984488            0.9           1.0   
 5            0.9  0.11128785697          0.001          0.001         0.001   
 
               5  
 0           0.9  
 1 0.11128785697  
 2         0.001  
 3         0.001  
 4         0.001  
 5           1.0  )

## Llama-3

In [None]:
# Llama-3 ethnicity run: point at the response and calibration
# cosine-similarity CSVs, then run the homogeneous-groups analysis.
# NOTE: `type` shadows the Python builtin; the name is kept as a notebook-level global.
type = 'ethnicity'
responses_path = f'{PATH}/Ethnicity/Cosine Similarity/llama-3 ethnicity 2024-08-13 - cos_similarity.csv'
calibration_path = f'{PATH}/Ethnicity/Calibration/Cosine Similarity/llama-3 ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(
    type,
    responses_path,
    calibration_path,
)

# Cell output: the 'friedman' and 'nemenyi' entries as one tuple.
(
    results_dict['friedman'],
    results_dict['nemenyi'],
)

('Friedman Test: Statistic=286.9950865356417, p_value=6.25131048332146e-60',
               0               1               2               3             4  \
 0           1.0           0.001           0.001           0.001         0.001   
 1         0.001             1.0             0.9 0.0011781154202 0.50858342244   
 2         0.001             0.9             1.0 0.0022547535084 0.60254145701   
 3         0.001 0.0011781154202 0.0022547535084             1.0 0.23714168709   
 4         0.001   0.50858342244   0.60254145701   0.23714168709           1.0   
 5 0.25063505399           0.001           0.001           0.001         0.001   
 
               5  
 0 0.25063505399  
 1         0.001  
 2         0.001  
 3         0.001  
 4         0.001  
 5           1.0  )

## Llama-2

In [None]:
# Llama-2 ethnicity run: point at the response and calibration
# cosine-similarity CSVs, then run the homogeneous-groups analysis.
# NOTE: `type` shadows the Python builtin; the name is kept as a notebook-level global.
type = 'ethnicity'
responses_path = f'{PATH}/Ethnicity/llama-2 ethnicity 09-09-2024 - cos_similarity.csv'
calibration_path = f'{PATH}/Ethnicity/Calibration/llama-2 ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(
    type,
    responses_path,
    calibration_path,
)

# Cell output: the 'friedman' and 'nemenyi' entries as one tuple.
(
    results_dict['friedman'],
    results_dict['nemenyi'],
)

('Friedman Test: Statistic=919.5420211205636, p_value=1.568842922370933e-196',
               0               1              2                3  \
 0           1.0             0.0            0.0              0.0   
 1           0.0             1.0  0.99812822749  0.0095199692095   
 2           0.0   0.99812822749            1.0   0.037738509553   
 3           0.0 0.0095199692095 0.037738509553              1.0   
 4           0.0   0.91334951999  0.69933497128 0.00016292678483   
 5 0.18001978118             0.0            0.0              0.0   
 
                  4             5  
 0              0.0 0.18001978118  
 1    0.91334951999           0.0  
 2    0.69933497128           0.0  
 3 0.00016292678483           0.0  
 4              1.0           0.0  
 5              0.0           1.0  )

## Yi

In [None]:
# Yi ethnicity run: point at the response and calibration
# cosine-similarity CSVs, then run the homogeneous-groups analysis.
# NOTE: `type` shadows the Python builtin; the name is kept as a notebook-level global.
type = 'ethnicity'
responses_path = f'{PATH}/Ethnicity/yi ethnicity 09-09-2024 - cos_similarity.csv'
calibration_path = f'{PATH}/Ethnicity/Calibration/yi ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(
    type,
    responses_path,
    calibration_path,
)

# Cell output: the 'friedman' and 'nemenyi' entries as one tuple.
(
    results_dict['friedman'],
    results_dict['nemenyi'],
)

('Friedman Test: Statistic=22.13805832410531, p_value=0.0004928825911189778',
                0             1              2              3              4  \
 0            1.0 0.67506263758 0.002253479784 0.015438160696  0.86938430528   
 1  0.67506263758           1.0  0.19920075906  0.49768910619  0.99927299583   
 2 0.002253479784 0.19920075906            1.0  0.99471571773 0.087571418224   
 3 0.015438160696 0.49768910619  0.99471571773            1.0  0.28437820823   
 4  0.86938430528 0.99927299583 0.087571418224  0.28437820823            1.0   
 5  0.98615515079 0.96339309958 0.023018475919  0.10361255062  0.99718822775   
 
                5  
 0  0.98615515079  
 1  0.96339309958  
 2 0.023018475919  
 3  0.10361255062  
 4  0.99718822775  
 5            1.0  )

## Claude-3

In [None]:
# Claude-3 (opus) ethnicity run: point at the response and calibration
# cosine-similarity CSVs, then run the homogeneous-groups analysis.
# NOTE: `type` shadows the Python builtin; the name is kept as a notebook-level global.
type = 'ethnicity'
responses_path = f'{PATH}/Ethnicity/claude-3-opus-20240229 ethnicity 26-12-2024 - cos_similarity.csv'
calibration_path = f'{PATH}/Ethnicity/Calibration/claude-3-opus-20240229 ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(
    type,
    responses_path,
    calibration_path,
)

# Cell output: the 'friedman' and 'nemenyi' entries as one tuple.
(
    results_dict['friedman'],
    results_dict['nemenyi'],
)

('Friedman Test: Statistic=562.1875457875449, p_value=2.9818214075911006e-119',
                0              1               2               3  \
 0            1.0            0.0             0.0             0.0   
 1            0.0            1.0   0.12752296395     0.061480576   
 2            0.0  0.12752296395             1.0 2.108789853e-06   
 3            0.0    0.061480576 2.108789853e-06             1.0   
 4            0.0 0.018605979095   0.98322531274 3.961063022e-08   
 5 0.011964905421            0.0             0.0             0.0   
 
                 4              5  
 0             0.0 0.011964905421  
 1  0.018605979095            0.0  
 2   0.98322531274            0.0  
 3 3.961063022e-08            0.0  
 4             1.0            0.0  
 5             0.0            1.0  )

In [None]:
# Write the 'means' entry of the current `results_dict` to 'a.csv'. The path is
# relative, so the file is created wherever the kernel is currently running.
# NOTE(review): `results_dict` is whatever the most recently executed analysis
# cell produced — presumably the Claude-3 run directly above; confirm before
# relying on the exported file. The entry is presumably a pandas object (it
# exposes `.to_csv`) — TODO confirm.
results_dict['means'].to_csv('a.csv')

## Gemini

In [None]:
# Gemini (1.0 pro) ethnicity run: point at the response and calibration
# cosine-similarity CSVs, then run the homogeneous-groups analysis.
# NOTE: `type` shadows the Python builtin; the name is kept as a notebook-level global.
type = 'ethnicity'
responses_path = f'{PATH}/Ethnicity/gemini-1.0-pro ethnicity 17-10-2024 - cos_similarity.csv'
calibration_path = f'{PATH}/Ethnicity/Calibration/gemini-1.0-pro ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(
    type,
    responses_path,
    calibration_path,
)

# Cell output: the 'friedman' and 'nemenyi' entries as one tuple.
(
    results_dict['friedman'],
    results_dict['nemenyi'],
)

('Friedman Test: Statistic=56.88721860189531, p_value=5.3349900132542215e-11',
                  0                1              2                3  \
 0              1.0 3.8549431725e-05 0.002253479784 1.0101121051e-09   
 1 3.8549431725e-05              1.0  0.93935087421    0.44769316391   
 2   0.002253479784    0.93935087421            1.0   0.064757824212   
 3 1.0101121051e-09    0.44769316391 0.064757824212              1.0   
 4  0.0063570267425    0.83427925505  0.99977914749   0.029164155232   
 5    0.86103110172  0.0051656834769 0.091937344244 1.1232258617e-06   
 
                 4                5  
 0 0.0063570267425    0.86103110172  
 1   0.83427925505  0.0051656834769  
 2   0.99977914749   0.091937344244  
 3  0.029164155232 1.1232258617e-06  
 4             1.0    0.17634522615  
 5   0.17634522615              1.0  )

## Gemma

In [None]:
# Gemma ethnicity run: point at the response and calibration
# cosine-similarity CSVs, then run the homogeneous-groups analysis.
# NOTE: `type` shadows the Python builtin; the name is kept as a notebook-level global.
type = 'ethnicity'
responses_path = f'{PATH}/Ethnicity/gemma ethnicity 09-09-2024 - cos_similarity.csv'
calibration_path = f'{PATH}/Ethnicity/Calibration/gemma ethnicity calibration - cos_similarity.csv'
results_dict = homogeneous_groups(
    type,
    responses_path,
    calibration_path,
)

# Cell output: the 'friedman' and 'nemenyi' entries as one tuple.
(
    results_dict['friedman'],
    results_dict['nemenyi'],
)

('Friedman Test: Statistic=497.6665441176475, p_value=2.546389594265546e-105',
                  0                1                2                3  \
 0              1.0              0.0              0.0              0.0   
 1              0.0              1.0 0.00022046921707    0.25075716405   
 2              0.0 0.00022046921707              1.0 1.2228695834e-09   
 3              0.0    0.25075716405 1.2228695834e-09              1.0   
 4 6.9433898442e-07 2.2204460493e-16 0.00036628448794              0.0   
 5    0.99212861312              0.0              0.0              0.0   
 
                  4                5  
 0 6.9433898442e-07    0.99212861312  
 1 2.2204460493e-16              0.0  
 2 0.00036628448794              0.0  
 3              0.0              0.0  
 4              1.0 2.1978712628e-08  
 5 2.1978712628e-08              1.0  )