# Evaluation of the Results of the Validation Experiments
The following script produces the win rates for different argument categories of the validation experiments based on the raw survey results.

In [1]:
import pandas as pd 
import numpy as np
from pathlib import Path
import random

data_dir = Path.cwd().parent / 'data'

## Evaluation of Validation Experiment 1.

In [2]:
# import original argument data
df_experiment1 = pd.read_excel(data_dir / "Validation_Study1_Arguments_with_Topics.xlsx", index_col=0).iloc[:, 2:]
df_experiment1 = df_experiment1.reset_index(drop=True)

# import survey results 
df_experiment_results = pd.read_excel(data_dir / "Validation_Study1_Survey_Results.xlsx").iloc[1:,:]
df_experiment_results = df_experiment_results.reset_index(drop=True)

df_experiment_results['usedIndices'] = df_experiment_results.__js_usedIndices.str.split("|")
df_experiment_results['arg_selected1'] = (df_experiment_results.__js_comparisonTextSelected01 == df_experiment_results.__js_comparisonText01B)*1
df_experiment_results['arg_selected2'] = (df_experiment_results.__js_comparisonTextSelected02 == df_experiment_results.__js_comparisonText02B)*1
df_experiment_results['arg_selected3'] = (df_experiment_results.__js_comparisonTextSelected03 == df_experiment_results.__js_comparisonText03B)*1
df_experiment_results['arg_selected4'] = (df_experiment_results.__js_comparisonTextSelected04 == df_experiment_results.__js_comparisonText04B)*1
df_experiment_results['arg_selected5'] = (df_experiment_results.__js_comparisonTextSelected05 == df_experiment_results.__js_comparisonText05B)*1

df_experiment_results.head()

Unnamed: 0,StartDate,EndDate,UserLanguage,ConsentComprehend,ConsentConfirm,browserInfo_Browser,browserInfo_Version,browserInfo_Operating System,browserInfo_Resolution,Age,...,__js_comparisonText04A,__js_comparisonText04B,__js_comparisonText05A,__js_comparisonText05B,usedIndices,arg_selected1,arg_selected2,arg_selected3,arg_selected4,arg_selected5
0,2024-03-13 08:19:52,2024-03-13 08:29:14,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Brave,122.0.0.0,Windows,1537x719,35-44 years old,...,The meat industry emits more greenhouse gases ...,Industrial animal farming produces three times...,Meat production squanders 70% of grains and 20...,Adopting veganism is a powerful step towards a...,"[94, 1, 11, 63, 4, 90, 23, 59, 2, 54]",1,1,1,1,0
1,2024-03-13 08:19:54,2024-03-13 08:26:49,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Chrome,122.0.0.0,Windows,1536x864,35-44 years old,...,It's a general consensus that unnecessary anim...,Adopting veganism is a powerful step towards a...,Veganism combats global warming and resource w...,"By reducing our meat consumption, we're lessen...","[43, 70, 69, 41, 8, 3, 67, 54, 18, 94]",1,0,0,1,0
2,2024-03-13 08:19:57,2024-03-13 08:28:02,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Firefox,123.0,Windows,2560x1440,55-64 years old,...,Animal farming is a significant contributor to...,Switching to a plant-based diet is crucial for...,Adopt a vegan diet to combat global hunger and...,"Producing 1kg of meat requires 2.8kg of crops,...","[95, 12, 86, 29, 14, 68, 90, 30, 8, 69]",1,1,1,1,0
3,2024-03-13 08:19:57,2024-03-13 08:28:50,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Chrome,122.0.0.0,Mac OS,2240x1260,65+ years old,...,Veganism safeguards our planet by reducing car...,Dairy cows are often slaughtered after only 3-...,Shifting to veganism combats global hunger by ...,Adopt a vegan lifestyle to combat climate chan...,"[32, 92, 4, 39, 71, 18, 48, 76, 34, 5]",1,1,1,0,0
4,2024-03-13 08:19:58,2024-03-13 08:26:05,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Firefox,123.0,Windows,1920x1080,35-44 years old,...,Adopting a vegan lifestyle can significantly c...,Adopt veganism and fight climate change! This ...,Switching to a plant-based diet is crucial for...,The misuse of antibiotics in livestock breedin...,"[19, 59, 42, 47, 31, 1, 66, 25, 30, 57]",0,1,1,0,1


In [3]:
# Get tuples of experiment outcomes, each tuple is a pair of indices with the winner in position 0
tuples_experiment1 = []

for i, indices in enumerate(df_experiment_results.usedIndices):
    # Split the list into tuples of two
    tuples = [(indices[j], indices[j + 1]) for j in range(0, len(indices), 2)]

    # Check if the length of tuples matches the expected length
    if len(tuples) != 5:
        print("Error: Length of tuples does not match expected length")

    # Reverse tuples based on the corresponding 'arg_selected' values
    for j in range(5):
        if df_experiment_results[f'arg_selected{j + 1}'][i] == 1:
            tuples[j] = tuples[j][::-1]

    # Extend the main list with formatted tuples
    tuples_experiment1.extend((int(tup[0]), int(tup[1])) for tup in tuples)

len(tuples_experiment1)

# Optionally remove argument 16 which does not fully satisfy the filtering criteria
# tuples_experiment = [tup for tup in tuples_experiment if not any(i in tup for i in [16])]
# len(tuples_experiment1)

990

In [4]:
df_experiment_results_table = pd.DataFrame(None, columns = np.sort(df_experiment1.Source.unique()), index= np.sort(df_experiment1.Source.unique()))
df_experiment_results_table = df_experiment_results_table.fillna(0)

for tup in tuples_experiment1:
    df_experiment_results_table.loc[df_experiment1.Source[tup[1]], df_experiment1.Source[tup[0]]] += 1

df_experiment_results_table

Unnamed: 0,New_GPT_Argument,New_Synthetic_Argument_Combined,New_Synthetic_Argument_Stronger_Emphasis,Original_Collection
New_GPT_Argument,10,42,18,54
New_Synthetic_Argument_Combined,37,93,43,160
New_Synthetic_Argument_Stronger_Emphasis,24,54,37,73
Original_Collection,58,186,101,0


Calculate win rates per argument group including confidence intervals

In [5]:
def calculate_raw_win_rates(df_experiment, tuples_experiment):
    """
    Calculate raw win rates from the provided DataFrame and tuples of experiments.

    Parameters:
    - df_experiment: DataFrame containing experiment results with a 'Source' column.
    - tuples_experiment: List of tuples where each tuple represents a comparison.

    Returns:
    - raw_win_rates: Series containing raw win rates for each source.
    """
    # Initialize results table with zeros
    sources = np.sort(df_experiment.Source.unique())
    df_results_table = pd.DataFrame(0, columns=sources, index=sources)

    # Calculate counts
    for tup in tuples_experiment:
        df_results_table.loc[df_experiment.Source[tup[1]], df_experiment.Source[tup[0]]] += 1

    # Set diagonal values to zero
    np.fill_diagonal(df_results_table.values, 0)

    # Calculate raw win rates
    raw_win_rates = df_results_table.sum(axis=0) / (df_results_table.sum(axis=0) + df_results_table.sum(axis=1))
    
    return raw_win_rates

def CI_experiment_bootstrap(df_experiment, tuples_experiment, n=10):
    """
    Perform bootstrap sampling to calculate win rates for each source.

    Parameters:
    - df_experiment: DataFrame containing experiment results with a 'Source' column.
    - tuples_experiment: List of tuples where each tuple represents a comparison between sources.
    - n: Number of bootstrap samples to generate (default is 10).

    Returns:
    - win_rates_bs: A 2D NumPy array where each row represents the win rates from a bootstrap sample.
    """

    win_rates_bs = np.matrix(np.zeros((n, len(df_experiment.Source.unique()))))
    for k in range(n):
        df_results_table = pd.DataFrame(None, columns = np.sort(df_experiment.Source.unique()), index= np.sort(df_experiment.Source.unique()))
        df_results_table = df_results_table.fillna(0)

        bootstrap_sample = random.choices(tuples_experiment, k=len(tuples_experiment))
        
        for tup in bootstrap_sample:
            df_results_table.loc[df_experiment.Source[tup[1]], df_experiment.Source[tup[0]]] += 1

        for i in range(4): 
            df_results_table.iloc[i,i] = 0

        # get column sums of results table 
        win_rates = df_results_table.sum(axis=0) / (df_results_table.sum(axis=0) + df_results_table.sum(axis=1))
        
        win_rates_bs[k,:] = win_rates
    
    return win_rates_bs


In [6]:
# Set random seed for reproducibility
random.seed(2024)

ci = 95
# Calculate raw win rates
raw_win_rates = calculate_raw_win_rates(df_experiment1, tuples_experiment1)

# Run bootstrap experiment and get raw win rates
win_rates_bs = CI_experiment_bootstrap(df_experiment1, tuples_experiment1, n=500)

win_rates_df = pd.DataFrame(win_rates_bs)
lower = win_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper = win_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)

diff_rates_df = win_rates_df
win_synth = win_rates_df.loc[:,1]
for col in diff_rates_df.columns:
    diff_rates_df[col] = diff_rates_df[col] - win_synth
diff_rates_df
lower_diff = diff_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper_diff = diff_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)


# Create confidence intervals DataFrame
CIs = pd.DataFrame(np.vstack([lower, upper]), columns=df_experiment1.Source.unique(), index=["Lower", "Upper"])
CIs_list = [f"[{100 * CIs.iloc[0, i]:.1f}, {100 * CIs.iloc[1, i]:.1f}]" for i in range(4)]

# Create difference confidence intervals list
CIs_diff = pd.DataFrame(np.vstack([lower_diff, upper_diff]), columns=df_experiment1.Source.unique(), index=["Lower", "Upper"])
CIs_diff_list = [f"[{100 * CIs_diff.iloc[0, i]:.1f}, {100 * CIs_diff.iloc[1, i]:.1f}]" for i in range(4)]

# Create lists for raw win rates and differences
win_rates_list = [rate * 100 for rate in raw_win_rates]
win_diff_list = [win_rates_list[i] - win_rates_list[1] if i != 1 else "-" for i in range(4)]

# Create final DataFrame with confidence intervals and differences
df_CI1 = pd.DataFrame({
    "Win Rates (%)": win_rates_list,
    "95% CI": CIs_list,
    "Diff to SY": win_diff_list,
    "Diff 95% CI": CIs_diff_list
})

# Reorder rows and update index labels
df_CI1 = df_CI1.reindex([1, 3, 0, 2])
df_CI1.index = ['Argument Synthesis (SY)', 'Original (OG)', 'GPT-best (GPT)', 'Stronger Emphasis (SE)']

# Display the final DataFrame
df_CI1


Unnamed: 0,Win Rates (%),95% CI,Diff to SY,Diff 95% CI
Argument Synthesis (SY),54.022989,"[49.9, 58.3]",-,"[0.0, 0.0]"
Original (OG),45.411392,"[41.4, 49.1]",-8.611596,"[-15.7, -1.5]"
GPT-best (GPT),51.072961,"[44.4, 57.1]",-2.950027,"[-12.1, 5.5]"
Stronger Emphasis (SE),51.757188,"[46.3, 56.8]",-2.2658,"[-9.7, 5.2]"


## Evaluation of Validation Experiment 2.

In [7]:
# import original argument data
df_experiment2 = pd.read_excel(data_dir / "Validation_Study1_Arguments_with_Topics.xlsx", index_col=0).iloc[:, 2:]
df_experiment2 = df_experiment2.reset_index(drop=True)

# import survey results 
df_experiment_results = df_experiment_results.reset_index(drop=True)
df_experiment_results = pd.read_excel(data_dir / "Validation_Study2_Survey_Results.xlsx", index_col=0).iloc[1:,:]

df_experiment_results['usedIndices'] = df_experiment_results.__js_usedIndices.str.split("|")
df_experiment_results['arg_selected1'] = (df_experiment_results.__js_comparisonTextSelected01 == df_experiment_results.__js_comparisonText01B)*1
df_experiment_results['arg_selected2'] = (df_experiment_results.__js_comparisonTextSelected02 == df_experiment_results.__js_comparisonText02B)*1
df_experiment_results['arg_selected3'] = (df_experiment_results.__js_comparisonTextSelected03 == df_experiment_results.__js_comparisonText03B)*1
df_experiment_results['arg_selected4'] = (df_experiment_results.__js_comparisonTextSelected04 == df_experiment_results.__js_comparisonText04B)*1
df_experiment_results['arg_selected5'] = (df_experiment_results.__js_comparisonTextSelected05 == df_experiment_results.__js_comparisonText05B)*1

df_experiment_results.head()

Unnamed: 0_level_0,EndDate,UserLanguage,ConsentComprehend,ConsentConfirm,browserInfo_Browser,browserInfo_Version,browserInfo_Operating System,browserInfo_Resolution,Age,Gender,...,__js_comparisonText04A,__js_comparisonText04B,__js_comparisonText05A,__js_comparisonText05B,usedIndices,arg_selected1,arg_selected2,arg_selected3,arg_selected4,arg_selected5
StartDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-04-22 09:06:26,2024-04-22 09:11:16,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Chrome,123.0.0.0,Windows,1280x720,65+ years old,Female,...,Adopt veganism and fight climate change by sto...,Lower your carbon emissions effortlessly by ad...,Veganism could boost the global food supply by...,"By consuming 70% of all grain, 98% of soy, and...","[31, 24, 54, 8, 28]",1,0,0,1,1
2024-04-22 09:06:35,2024-04-22 09:13:35,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Chrome,123.0.0.0,Windows,1920x1080,35-44 years old,Male,...,"Producing 1kg of meat requires 2.8kg of crops,...",Livestock farming devours 1/3 of Earth's ferti...,Adopting veganism is not just a diet change bu...,Embracing veganism drastically cuts resource w...,"[73, 24, 57, 49, 65]",1,0,1,1,0
2024-04-22 09:06:45,2024-04-22 09:13:29,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Chrome,124.0.0.0,Mac OS,2240x1260,65+ years old,Female,...,"A vegan diet, rich in essential nutrients, has...",Countless health studies have validated that a...,"By consuming 70% of all grain, 98% of soy, and...",Shifting to a vegan diet combats global hunger...,"[23, 25, 17, 70, 19]",1,1,1,1,0
2024-04-22 09:06:46,2024-04-22 09:12:28,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Edge,124.0.0.0,Windows,2560x1440,25-34 years old,Female,...,Livestock farming emits potent greenhouse gase...,The livestock industry is a major contributor ...,Livestock farming devours 1/3 of Earth's ferti...,Animal-based diets contribute to 60% of global...,"[29, 25, 39, 22, 48]",0,0,1,0,1
2024-04-22 09:06:56,2024-04-22 09:15:53,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Firefox,124.0,Windows,2560x1440,55-64 years old,Male,...,Choosing veganism combats climate change by re...,The global farming of billions of animals for ...,Shifting to a vegan diet tackles global hunger...,Adopting a vegan lifestyle can significantly c...,"[7, 17, 16, 38, 32]",1,1,1,1,1


In [8]:
# Get tuples of experiment outcomes, each tuple is a pair of indices with winner in position 0
tuples_experiment2 = [] 
for i in range(df_experiment_results.shape[0]):
    for j in range(1,6):
        if df_experiment_results[f"arg_selected{j}"][i] == 0:
            tuples_experiment2.append(
                (np.where(df_experiment2.Argument == df_experiment_results[f"__js_comparisonText0{j}A"][i])[0][0],
                np.where(df_experiment2.Argument == df_experiment_results[f"__js_comparisonText0{j}B"][i])[0][0])
            )
        else:
            tuples_experiment2.append(
                (np.where(df_experiment2.Argument == df_experiment_results[f"__js_comparisonText0{j}B"][i])[0][0],
                np.where(df_experiment2.Argument == df_experiment_results[f"__js_comparisonText0{j}A"][i])[0][0])
            )

len(tuples_experiment2)

510

In [9]:
# Set random seed for reproducibility
random.seed(2024)

ci = 95
# Calculate raw win rates
raw_win_rates = calculate_raw_win_rates(df_experiment2, tuples_experiment2)

# Run bootstrap experiment and get raw win rates
win_rates_bs = CI_experiment_bootstrap(df_experiment2, tuples_experiment2, n=500)

win_rates_df = pd.DataFrame(win_rates_bs)
lower = win_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper = win_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)

diff_rates_df = win_rates_df
win_synth = win_rates_df.loc[:,1]
for col in diff_rates_df.columns:
    diff_rates_df[col] = diff_rates_df[col] - win_synth
diff_rates_df
lower_diff = diff_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper_diff = diff_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)


# Create confidence intervals DataFrame
CIs = pd.DataFrame(np.vstack([lower, upper]), columns=df_experiment2.Source.unique(), index=["Lower", "Upper"])
CIs_list = [f"[{100 * CIs.iloc[0, i]:.1f}, {100 * CIs.iloc[1, i]:.1f}]" for i in range(4)]

# Create difference confidence intervals list
CIs_diff = pd.DataFrame(np.vstack([lower_diff, upper_diff]), columns=df_experiment2.Source.unique(), index=["Lower", "Upper"])
CIs_diff_list = [f"[{100 * CIs_diff.iloc[0, i]:.1f}, {100 * CIs_diff.iloc[1, i]:.1f}]" for i in range(4)]

# Create lists for raw win rates and differences
win_rates_list = [rate * 100 for rate in raw_win_rates]
win_diff_list = [win_rates_list[i] - win_rates_list[1] if i != 1 else "-" for i in range(4)]

# Create final DataFrame with confidence intervals and differences
df_CI2 = pd.DataFrame({
    "Win Rates (%)": win_rates_list,
    "95% CI": CIs_list,
    "Diff to SY": win_diff_list,
    "Diff 95% CI": CIs_diff_list
})

# Reorder rows and update index labels
df_CI2 = df_CI2.reindex([1, 3, 2])
df_CI2.index = ['Argument Synthesis (SY)', 'Original (OG)', 'Stronger Emphasis (SE)']

# Display the final DataFrame
df_CI2

Unnamed: 0,Win Rates (%),95% CI,Diff to SY,Diff 95% CI
Argument Synthesis (SY),48.470588,"[44.3, 53.1]",-,"[0.0, 0.0]"
Original (OG),51.960784,"[47.4, 56.3]",3.490196,"[-5.7, 11.5]"
Stronger Emphasis (SE),45.882353,"[35.8, 56.4]",-2.588235,"[-12.9, 9.2]"


## Evaluation of Validation Experiment 3.

In [13]:
# import original argument data
df_experiment3 = pd.read_excel(data_dir / "Validation_Study3_Arguments_with_Topics.xlsx", index_col=0)
df_experiment3 = df_experiment3.reset_index(drop=True)

# import survey results 
df_experiment_results = pd.read_excel(data_dir / "Validation_Study3_Survey_Results.xlsx", index_col=0).iloc[1:,:]
df_experiment_results = df_experiment_results.reset_index(drop=True)

df_experiment_results['usedIndices'] = df_experiment_results.__js_usedIndices.str.split("|")
df_experiment_results['arg_selected1'] = (df_experiment_results.__js_comparisonTextSelected01 == df_experiment_results.__js_comparisonText01B)*1
df_experiment_results['arg_selected2'] = (df_experiment_results.__js_comparisonTextSelected02 == df_experiment_results.__js_comparisonText02B)*1
df_experiment_results['arg_selected3'] = (df_experiment_results.__js_comparisonTextSelected03 == df_experiment_results.__js_comparisonText03B)*1
df_experiment_results['arg_selected4'] = (df_experiment_results.__js_comparisonTextSelected04 == df_experiment_results.__js_comparisonText04B)*1
df_experiment_results['arg_selected5'] = (df_experiment_results.__js_comparisonTextSelected05 == df_experiment_results.__js_comparisonText05B)*1

df_experiment_results.head()

Unnamed: 0,EndDate,UserLanguage,ConsentComprehend,ConsentConfirm,browserInfo_Browser,browserInfo_Version,browserInfo_Operating System,browserInfo_Resolution,Age,Gender,...,__js_comparisonText04A,__js_comparisonText04B,__js_comparisonText05A,__js_comparisonText05B,usedIndices,arg_selected1,arg_selected2,arg_selected3,arg_selected4,arg_selected5
0,2024-05-18 12:39:19,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Chrome,124.0.0.0,Windows,1366x768,35-44 years old,Male,...,While it's important to note that not all anim...,The vast majority of animal farming requires h...,The inefficiency of animal agriculture is star...,Some argue that even if we granted the unlikel...,"[34, 48, 68, 75, 64]",1,1,0,1,0
1,2024-05-18 12:40:37,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Chrome,124.0.0.0,Windows,1920x1080,35-44 years old,Male,...,Many religions profess reverence for life and ...,Many religions profess reverence for life and ...,Given the lack of transparency in meat product...,Given the lack of transparency in meat product...,"[0, 72, 19, 10, 68]",0,0,1,0,1
2,2024-05-18 12:42:42,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Chrome,124.0.0.0,Chromium OS,1745x981,35-44 years old,Female,...,No creature should suffer for an imposed purpo...,No creature should suffer for an imposed purpo...,"Yes, soy farming poses environmental challenge...","While soy farming has its issues, it pales in ...","[32, 65, 61, 58, 1]",0,1,0,1,1
3,2024-05-18 12:43:25,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Edge,109.0.1518.70,Windows,2560x1440,35-44 years old,Male,...,"As humans in an advanced society, we have diet...","As humans living in an advanced society, we po...",It's difficult to justify torturing puppies me...,It's difficult to justify torturing puppies me...,"[56, 2, 74, 4, 79]",0,0,0,1,1
4,2024-05-18 12:43:31,EN,"My participation is voluntary.,Refusal to part...","I consent, proceed to survey.",Chrome,124.0.0.0,Windows,1366x768,55-64 years old,Male,...,Humanity doesn't just undervalue animal existe...,Humanity doesn't grant personal right to under...,"In our world, it's vital to conserve water. Wh...","In our water-scarce world, critical resources ...","[46, 25, 81, 12, 86]",0,1,0,0,1


In [16]:
# Get tuples of experiment outcomes, each tuple is a pair of indices with winner in position 0
tuples_experiment3 = [] 
for i in range(df_experiment_results.shape[0]):
    for j in range(1,6):
        if df_experiment_results[f"arg_selected{j}"][i] == 0:
            tuples_experiment3.append(
                (np.where(df_experiment3.Argument == df_experiment_results[f"__js_comparisonText0{j}A"][i])[0][0],
                np.where(df_experiment3.Argument == df_experiment_results[f"__js_comparisonText0{j}B"][i])[0][0])
            )
        else:
            tuples_experiment3.append(
                (np.where(df_experiment3.Argument == df_experiment_results[f"__js_comparisonText0{j}B"][i])[0][0],
                np.where(df_experiment3.Argument == df_experiment_results[f"__js_comparisonText0{j}A"][i])[0][0])
            )

len(tuples_experiment3)

500

In [17]:
# Set random seed for reproducibility
random.seed(2024)

ci = 95
# Calculate raw win rates
raw_win_rates = calculate_raw_win_rates(df_experiment3, tuples_experiment3)

# Run bootstrap experiment and get raw win rates
win_rates_bs = CI_experiment_bootstrap(df_experiment3, tuples_experiment3, n=500)

win_rates_df = pd.DataFrame(win_rates_bs)
lower = win_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper = win_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)


# Create confidence intervals DataFrame
CIs = pd.DataFrame(np.vstack([lower, upper]), columns=df_experiment3.Source.unique(), index=["Lower", "Upper"])
CIs_list = [f"[{100 * CIs.iloc[0, i]:.1f}, {100 * CIs.iloc[1, i]:.1f}]" for i in range(4)]


# Create lists for raw win rates and differences
win_rates_list = [rate * 100 for rate in raw_win_rates]
win_diff_list = [win_rates_list[i] - win_rates_list[1] if i != 1 else "-" for i in range(4)]

# Create final DataFrame with confidence intervals and differences
df_CI3 = pd.DataFrame({
    "Win Rates (%)": win_rates_list,
    "95% CI": CIs_list,
})

# Reorder rows and update index labels
df_CI3 = df_CI3.reindex([3, 2])
df_CI3.index = ['Synthetic Increased Topic (2) Args', 'Synthetic Decreased Topic (2) Args']

# Display the final DataFrame
df_CI3

Unnamed: 0,Win Rates (%),95% CI
Synthetic Increased Topic (2) Args,69.69697,"[65.3, 74.1]"
Synthetic Decreased Topic (2) Args,31.730769,"[22.7, 41.4]"


### Combine results

In [13]:
# bind df_CI1, df_CI2, df_CI3
print("Validation Study 1")
print(df_CI1)
print("\n")

print("Validation Study 2")
print(df_CI2)
print("\n")

print("Validation Study 3")
print(df_CI3)

Validation Study 1
                         Win Rates (%)        95% CI Diff to SY    Diff 95% CI
Argument Synthesis (SY)      54.022989  [49.9, 58.3]          -     [0.0, 0.0]
Original (OG)                45.411392  [41.4, 49.1]  -8.611596  [-15.7, -1.5]
GPT-best (GPT)               51.072961  [44.4, 57.1]  -2.950027   [-12.1, 5.5]
Stronger Emphasis (SE)       51.757188  [46.3, 56.8]    -2.2658    [-9.7, 5.2]


Validation Study 2
                         Win Rates (%)        95% CI Diff to SY   Diff 95% CI
Argument Synthesis (SY)      48.470588  [44.3, 53.1]          -    [0.0, 0.0]
Original (OG)                51.960784  [47.4, 56.3]   3.490196  [-5.7, 11.5]
Stronger Emphasis (SE)       45.882353  [35.8, 56.4]  -2.588235  [-12.9, 9.2]


Validation Study 3
                                    Win Rates (%)        95% CI
Synthetic Increased Topic (2) Args      69.696970  [65.3, 74.1]
Synthetic Decreased Topic (2) Args      31.730769  [22.7, 41.4]


## Filter arguments based on CVXPY topic inference 
Now we consider only the subset of pairwise comparisons that include arguments that satisfy the filtering when inferring topic loadings using CVXPY.

In [14]:
tuples_experiment1 = [tup for tup in tuples_experiment1 if not any(i in tup for i in [2, 3, 6, 11, 12, 13, 16, 17, 18, 20, 22, 24, 29, 31, 33, 34, 36, 41])]
print(f"{len(tuples_experiment1) = }" + " of previously 990 \n")

tuples_experiment2 = [tup for tup in tuples_experiment2 if not any(i in tup for i in [2, 3, 6, 11, 12, 13, 16, 17, 18, 20, 22, 24, 29, 31, 33, 34, 36, 41])]
print(f"{len(tuples_experiment2) = }" + " of previously 510 \n")

tuples_experiment3 = [tup for tup in tuples_experiment3 if not any(i in tup for i in [4, 5, 15, 16, 17, 19, 20, 26, 27, 30, 38, 47, 70])]
print(f"{len(tuples_experiment3) = }" + " of previously 500 \n")

len(tuples_experiment1) = 594 of previously 990 

len(tuples_experiment2) = 295 of previously 510 

len(tuples_experiment3) = 434 of previously 500 



Recalculate the results based on these subsets of the pairwise comparisons

In [15]:
# Set random seed for reproducibility
random.seed(2024)

ci = 95
# Calculate raw win rates
raw_win_rates = calculate_raw_win_rates(df_experiment1, tuples_experiment1)

# Run bootstrap experiment and get raw win rates
win_rates_bs = CI_experiment_bootstrap(df_experiment1, tuples_experiment1, n=500)

win_rates_df = pd.DataFrame(win_rates_bs)
lower = win_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper = win_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)

diff_rates_df = win_rates_df
win_synth = win_rates_df.loc[:,1]
for col in diff_rates_df.columns:
    diff_rates_df[col] = diff_rates_df[col] - win_synth
diff_rates_df
lower_diff = diff_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper_diff = diff_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)


# Create confidence intervals DataFrame
CIs = pd.DataFrame(np.vstack([lower, upper]), columns=df_experiment1.Source.unique(), index=["Lower", "Upper"])
CIs_list = [f"[{100 * CIs.iloc[0, i]:.1f}, {100 * CIs.iloc[1, i]:.1f}]" for i in range(4)]

# Create difference confidence intervals list
CIs_diff = pd.DataFrame(np.vstack([lower_diff, upper_diff]), columns=df_experiment1.Source.unique(), index=["Lower", "Upper"])
CIs_diff_list = [f"[{100 * CIs_diff.iloc[0, i]:.1f}, {100 * CIs_diff.iloc[1, i]:.1f}]" for i in range(4)]

# Create lists for raw win rates and differences
win_rates_list = [rate * 100 for rate in raw_win_rates]
win_diff_list = [win_rates_list[i] - win_rates_list[1] if i != 1 else "-" for i in range(4)]

# Create final DataFrame with confidence intervals and differences
df_CI1 = pd.DataFrame({
    "Win Rates (%)": win_rates_list,
    "95% CI": CIs_list,
    "Diff to SY": win_diff_list,
    "Diff 95% CI": CIs_diff_list
})

# Reorder rows and update index labels
df_CI1 = df_CI1.reindex([1, 3, 0, 2])
df_CI1.index = ['Argument Synthesis (SY)', 'Original (OG)', 'GPT-best (GPT)', 'Stronger Emphasis (SE)']

# Display the final DataFrame
df_CI1


Unnamed: 0,Win Rates (%),95% CI,Diff to SY,Diff 95% CI
Argument Synthesis (SY),54.477612,"[48.5, 60.2]",-,"[0.0, 0.0]"
Original (OG),45.183486,"[40.5, 50.0]",-9.294126,"[-18.3, 0.4]"
GPT-best (GPT),53.225806,"[46.4, 60.4]",-1.251805,"[-11.9, 9.1]"
Stronger Emphasis (SE),51.648352,"[44.4, 58.9]",-2.82926,"[-11.8, 7.4]"


In [16]:
# Set random seed for reproducibility
random.seed(2024)

ci = 95
# Calculate raw win rates
raw_win_rates = calculate_raw_win_rates(df_experiment2, tuples_experiment2)

# Run bootstrap experiment and get raw win rates
win_rates_bs = CI_experiment_bootstrap(df_experiment2, tuples_experiment2, n=500)

win_rates_df = pd.DataFrame(win_rates_bs)
lower = win_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper = win_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)

diff_rates_df = win_rates_df
win_synth = win_rates_df.loc[:,1]
for col in diff_rates_df.columns:
    diff_rates_df[col] = diff_rates_df[col] - win_synth
diff_rates_df
lower_diff = diff_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper_diff = diff_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)


# Create confidence intervals DataFrame
CIs = pd.DataFrame(np.vstack([lower, upper]), columns=df_experiment2.Source.unique(), index=["Lower", "Upper"])
CIs_list = [f"[{100 * CIs.iloc[0, i]:.1f}, {100 * CIs.iloc[1, i]:.1f}]" for i in range(4)]

# Create difference confidence intervals list
CIs_diff = pd.DataFrame(np.vstack([lower_diff, upper_diff]), columns=df_experiment2.Source.unique(), index=["Lower", "Upper"])
CIs_diff_list = [f"[{100 * CIs_diff.iloc[0, i]:.1f}, {100 * CIs_diff.iloc[1, i]:.1f}]" for i in range(4)]

# Create lists for raw win rates and differences
win_rates_list = [rate * 100 for rate in raw_win_rates]
win_diff_list = [win_rates_list[i] - win_rates_list[1] if i != 1 else "-" for i in range(4)]

# Create final DataFrame with confidence intervals and differences
df_CI2 = pd.DataFrame({
    "Win Rates (%)": win_rates_list,
    "95% CI": CIs_list,
    "Diff to SY": win_diff_list,
    "Diff 95% CI": CIs_diff_list
})

# Reorder rows and update index labels
df_CI2 = df_CI2.reindex([1, 3, 2])
df_CI2.index = ['Argument Synthesis (SY)', 'Original (OG)', 'Stronger Emphasis (SE)']

# Display the final DataFrame
df_CI2

Unnamed: 0,Win Rates (%),95% CI,Diff to SY,Diff 95% CI
Argument Synthesis (SY),46.666667,"[40.5, 53.3]",-,"[0.0, 0.0]"
Original (OG),54.237288,"[48.8, 59.3]",7.570621,"[-4.6, 18.8]"
Stronger Emphasis (SE),41.818182,"[29.1, 54.4]",-4.848485,"[-19.2, 8.4]"


In [17]:
# Set random seed for reproducibility
random.seed(2024)

ci = 95
# Calculate raw win rates
raw_win_rates = calculate_raw_win_rates(df_experiment3, tuples_experiment3)

# Run bootstrap experiment and get raw win rates
win_rates_bs = CI_experiment_bootstrap(df_experiment3, tuples_experiment3, n=500)

win_rates_df = pd.DataFrame(win_rates_bs)
lower = win_rates_df.quantile((100 - ci) / 200)  # Lower bound (e.g., 2.5th percentile for 95% CI)
upper = win_rates_df.quantile(1 - (100 - ci) / 200)  # Upper bound (e.g., 97.5th percentile for 95% CI)


# Create confidence intervals DataFrame
CIs = pd.DataFrame(np.vstack([lower, upper]), columns=df_experiment3.Source.unique(), index=["Lower", "Upper"])
CIs_list = [f"[{100 * CIs.iloc[0, i]:.1f}, {100 * CIs.iloc[1, i]:.1f}]" for i in range(4)]


# Create lists for raw win rates and differences
win_rates_list = [rate * 100 for rate in raw_win_rates]
win_diff_list = [win_rates_list[i] - win_rates_list[1] if i != 1 else "-" for i in range(4)]

# Create final DataFrame with confidence intervals and differences
df_CI3 = pd.DataFrame({
    "Win Rates (%)": win_rates_list,
    "95% CI": CIs_list,
})

# Reorder rows and update index labels
df_CI3 = df_CI3.reindex([3, 2])
df_CI3.index = ['Synthetic Increased Topic (2) Args', 'Synthetic Decreased Topic (2) Args']

# Display the final DataFrame
df_CI3

Unnamed: 0,Win Rates (%),95% CI
Synthetic Increased Topic (2) Args,70.0,"[64.6, 74.8]"
Synthetic Decreased Topic (2) Args,31.730769,"[22.3, 40.3]"


In [18]:
# bind df_CI1, df_CI2, df_CI3
print("Validation Study 1")
print(df_CI1)
print("\n")

print("Validation Study 2")
print(df_CI2)
print("\n")

print("Validation Study 3")
print(df_CI3)

Validation Study 1
                         Win Rates (%)        95% CI Diff to SY   Diff 95% CI
Argument Synthesis (SY)      54.477612  [48.5, 60.2]          -    [0.0, 0.0]
Original (OG)                45.183486  [40.5, 50.0]  -9.294126  [-18.3, 0.4]
GPT-best (GPT)               53.225806  [46.4, 60.4]  -1.251805  [-11.9, 9.1]
Stronger Emphasis (SE)       51.648352  [44.4, 58.9]   -2.82926  [-11.8, 7.4]


Validation Study 2
                         Win Rates (%)        95% CI Diff to SY   Diff 95% CI
Argument Synthesis (SY)      46.666667  [40.5, 53.3]          -    [0.0, 0.0]
Original (OG)                54.237288  [48.8, 59.3]   7.570621  [-4.6, 18.8]
Stronger Emphasis (SE)       41.818182  [29.1, 54.4]  -4.848485  [-19.2, 8.4]


Validation Study 3
                                    Win Rates (%)        95% CI
Synthetic Increased Topic (2) Args      70.000000  [64.6, 74.8]
Synthetic Decreased Topic (2) Args      31.730769  [22.3, 40.3]
