# Evaluation of the Results of the Validation Experiments
The following script produces the win rates for different argument categories of the validation experiments based on the raw survey results.

In [1]:
import pandas as pd 
import numpy as np
from pathlib import Path
import random

data_dir= Path("~/Dropbox (Princeton)/CF-Text/Software_Data/Data/AutoPersuadePaper_Data_Sep2024/").expanduser()

## Evaluation of Validation Experiment 1.

In [2]:
# import original argument data
df_experiment = pd.read_excel(data_dir / "Validation_Study1_Arguments.xlsx", index_col=0).iloc[:, 2:]
df_experiment = df_experiment.reset_index(drop=True)

# import survey results 
df_experiment_results = pd.read_excel(data_dir / "Validation_Study1_Survey_Results.xlsx").iloc[1:,:]
df_experiment_results = df_experiment_results.reset_index(drop=True)

df_experiment_results['usedIndices'] = df_experiment_results.__js_usedIndices.str.split("|")
df_experiment_results['arg_selected1'] = (df_experiment_results.__js_comparisonTextSelected01 == df_experiment_results.__js_comparisonText01B)*1
df_experiment_results['arg_selected2'] = (df_experiment_results.__js_comparisonTextSelected02 == df_experiment_results.__js_comparisonText02B)*1
df_experiment_results['arg_selected3'] = (df_experiment_results.__js_comparisonTextSelected03 == df_experiment_results.__js_comparisonText03B)*1
df_experiment_results['arg_selected4'] = (df_experiment_results.__js_comparisonTextSelected04 == df_experiment_results.__js_comparisonText04B)*1
df_experiment_results['arg_selected5'] = (df_experiment_results.__js_comparisonTextSelected05 == df_experiment_results.__js_comparisonText05B)*1

df_experiment_results.head()

Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,...,__js_comparisonText04A,__js_comparisonText04B,__js_comparisonText05A,__js_comparisonText05B,usedIndices,arg_selected1,arg_selected2,arg_selected3,arg_selected4,arg_selected5
0,2024-03-13 08:19:52,2024-03-13 08:29:14,IP Address,*******,100,561,True,2024-03-13 08:29:14.989000,R_1rD81Z51MN3cFPd,*******,...,The meat industry emits more greenhouse gases ...,Industrial animal farming produces three times...,Meat production squanders 70% of grains and 20...,Adopting veganism is a powerful step towards a...,"[94, 1, 11, 63, 4, 90, 23, 59, 2, 54]",1,1,1,1,0
1,2024-03-13 08:19:54,2024-03-13 08:26:49,IP Address,*******,100,414,True,2024-03-13 08:26:49.890000,R_26ypZ06I4BIADIZ,*******,...,It's a general consensus that unnecessary anim...,Adopting veganism is a powerful step towards a...,Veganism combats global warming and resource w...,"By reducing our meat consumption, we're lessen...","[43, 70, 69, 41, 8, 3, 67, 54, 18, 94]",1,0,0,1,0
2,2024-03-13 08:19:57,2024-03-13 08:28:02,IP Address,*******,100,485,True,2024-03-13 08:28:03.042000,R_2Wgv4ekigSHBfz3,*******,...,Animal farming is a significant contributor to...,Switching to a plant-based diet is crucial for...,Adopt a vegan diet to combat global hunger and...,"Producing 1kg of meat requires 2.8kg of crops,...","[95, 12, 86, 29, 14, 68, 90, 30, 8, 69]",1,1,1,1,0
3,2024-03-13 08:19:57,2024-03-13 08:28:50,IP Address,*******,100,533,True,2024-03-13 08:28:51.223000,R_1I2L1BtQKGCMLzH,*******,...,Veganism safeguards our planet by reducing car...,Dairy cows are often slaughtered after only 3-...,Shifting to veganism combats global hunger by ...,Adopt a vegan lifestyle to combat climate chan...,"[32, 92, 4, 39, 71, 18, 48, 76, 34, 5]",1,1,1,0,0
4,2024-03-13 08:19:58,2024-03-13 08:26:05,IP Address,*******,100,367,True,2024-03-13 08:26:06.713000,R_2GycBEvLYsCQA78,*******,...,Adopting a vegan lifestyle can significantly c...,Adopt veganism and fight climate change! This ...,Switching to a plant-based diet is crucial for...,The misuse of antibiotics in livestock breedin...,"[19, 59, 42, 47, 31, 1, 66, 25, 30, 57]",0,1,1,0,1


In [3]:
# Get tuples of experiment outcomes, each tuple is a pair of indices with the winner in position 0
tuples_experiment = []

for i, indices in enumerate(df_experiment_results.usedIndices):
    # Split the list into tuples of two
    tuples = [(indices[j], indices[j + 1]) for j in range(0, len(indices), 2)]

    # Check if the length of tuples matches the expected length
    if len(tuples) != 5:
        print("Error: Length of tuples does not match expected length")

    # Reverse tuples based on the corresponding 'arg_selected' values
    for j in range(5):
        if df_experiment_results[f'arg_selected{j + 1}'][i] == 1:
            tuples[j] = tuples[j][::-1]

    # Extend the main list with formatted tuples
    tuples_experiment.extend((int(tup[0]), int(tup[1])) for tup in tuples)


In [4]:
df_experiment_results_table = pd.DataFrame(None, columns = np.sort(df_experiment.Source.unique()), index= np.sort(df_experiment.Source.unique()))
df_experiment_results_table = df_experiment_results_table.fillna(0)

for tup in tuples_experiment:
    df_experiment_results_table.loc[df_experiment.Source[tup[1]], df_experiment.Source[tup[0]]] += 1

df_experiment_results_table

Unnamed: 0,New_GPT_Argument,New_Synthetic_Argument_Combined,New_Synthetic_Argument_Stronger_Emphasis,Original_Collection
New_GPT_Argument,10,42,18,54
New_Synthetic_Argument_Combined,37,93,43,160
New_Synthetic_Argument_Stronger_Emphasis,24,54,37,73
Original_Collection,58,186,101,0


Calculate win rates per argument group including confidence intervals

In [5]:
def calculate_raw_win_rates(df_experiment, tuples_experiment):
    """
    Calculate raw win rates from the provided DataFrame and tuples of experiments.

    Parameters:
    - df_experiment: DataFrame containing experiment results with a 'Source' column.
    - tuples_experiment: List of tuples where each tuple represents a comparison.

    Returns:
    - raw_win_rates: Series containing raw win rates for each source.
    """
    # Initialize results table with zeros
    sources = np.sort(df_experiment.Source.unique())
    df_results_table = pd.DataFrame(0, columns=sources, index=sources)

    # Calculate counts
    for tup in tuples_experiment:
        df_results_table.loc[df_experiment.Source[tup[1]], df_experiment.Source[tup[0]]] += 1

    # Set diagonal values to zero
    np.fill_diagonal(df_results_table.values, 0)

    # Calculate raw win rates
    raw_win_rates = df_results_table.sum(axis=0) / (df_results_table.sum(axis=0) + df_results_table.sum(axis=1))
    
    return raw_win_rates

def CI_experiment_bootstrap(df_experiment, tuples_experiment, n=10):
    """
    Perform bootstrap sampling to calculate win rates for each source.

    Parameters:
    - df_experiment: DataFrame containing experiment results with a 'Source' column.
    - tuples_experiment: List of tuples where each tuple represents a comparison between sources.
    - n: Number of bootstrap samples to generate (default is 10).

    Returns:
    - win_rates_bs: A 2D NumPy array where each row represents the win rates from a bootstrap sample.
    """
    df_results_table = pd.DataFrame(None, columns = np.sort(df_experiment.Source.unique()), index= np.sort(df_experiment.Source.unique()))
    df_results_table = df_results_table.fillna(0)

    win_rates_bs = np.matrix(np.zeros((n, 4)))
    for k in range(n):
        bootstrap_sample = random.choices(tuples_experiment, k=len(tuples_experiment))
        
        for tup in bootstrap_sample:
            df_results_table.loc[df_experiment.Source[tup[1]], df_experiment.Source[tup[0]]] += 1

        for i in range(4): 
            df_results_table.iloc[i,i] = 0

        # get column sums of results table 
        win_rates = df_results_table.sum(axis=0) / (df_results_table.sum(axis=0) + df_results_table.sum(axis=1))
        
        win_rates_bs[k,:] = win_rates
    
    return win_rates_bs


In [6]:
# Set random seed for reproducibility
random.seed(2024)

# Calculate raw win rates
raw_win_rates = calculate_raw_win_rates(df_experiment, tuples_experiment)

# Run bootstrap experiment and get raw win rates
win_rates_bs = CI_experiment_bootstrap(df_experiment, tuples_experiment, n=500)

# Calculate confidence intervals for bootstrapped results
mean_win_rates = np.mean(win_rates_bs, axis=0)
std_win_rates = np.std(win_rates_bs - mean_win_rates, axis=0)
lower = (mean_win_rates - 1.96 * std_win_rates).reshape(1, -1)
upper = (mean_win_rates + 1.96 * std_win_rates).reshape(1, -1)

# Calculate differences and their confidence intervals
diffs = [win_rates_bs[:, 1] - win_rates_bs[:, i] for i in range(4)]
diff_CIs = [(np.mean(diff) - 1.96 * np.std(diff - np.mean(diff)),
             np.mean(diff) + 1.96 * np.std(diff - np.mean(diff))) for diff in diffs]

# Create confidence intervals DataFrame
CIs = pd.DataFrame(np.vstack([lower, upper]), columns=df_experiment.Source.unique(), index=["Lower", "Upper"])
CIs_list = [f"[{100 * CIs.iloc[0, i]:.1f}, {100 * CIs.iloc[1, i]:.1f}]" for i in range(4)]

# Create difference confidence intervals list
CIs_diff_list = [f"[{100 * low:.1f}, {100 * up:.1f}]" if i != 1 else "-" for i, (low, up) in enumerate(diff_CIs)]

# Create lists for raw win rates and differences
win_rates_list = [rate * 100 for rate in raw_win_rates]
win_diff_list = [win_rates_list[1] - win_rates_list[i] if i != 1 else "-" for i in range(4)]

# Create final DataFrame with confidence intervals and differences
df_CI = pd.DataFrame({
    "Win Rates (%)": win_rates_list,
    "95% CI": CIs_list,
    "Diff to SY": win_diff_list,
    "Diff 95% CI": CIs_diff_list
})

# Reorder rows and update index labels
df_CI = df_CI.reindex([1, 3, 0, 2])
df_CI.index = ['Argument Synthesis (SY)', 'Original (OG)', 'GPT-best (GPT)', 'Stronger Emphasis (SE)']

# Display the final DataFrame
df_CI


Unnamed: 0,Win Rates (%),95% CI,Diff to SY,Diff 95% CI
Argument Synthesis (SY),54.022989,"[53.7, 54.7]",-,-
Original (OG),45.411392,"[45.0, 45.5]",8.611596,"[8.2, 9.6]"
GPT-best (GPT),51.072961,"[50.4, 51.2]",2.950027,"[2.5, 4.2]"
Stronger Emphasis (SE),51.757188,"[51.7, 52.3]",2.2658,"[1.4, 2.9]"


## Evaluation of Validation Experiment 2.

In [7]:
# import original argument data
df_experiment = pd.read_excel(data_dir / "Validation_Study1_Arguments.xlsx", index_col=0).iloc[:, 2:]
df_experiment = df_experiment.reset_index(drop=True)

# import survey results 
df_experiment_results = df_experiment_results.reset_index(drop=True)
df_experiment_results = pd.read_excel(data_dir / "Validation_Study2_Survey_Results.xlsx", index_col=0).iloc[1:,:]

df_experiment_results['usedIndices'] = df_experiment_results.__js_usedIndices.str.split("|")
df_experiment_results['arg_selected1'] = (df_experiment_results.__js_comparisonTextSelected01 == df_experiment_results.__js_comparisonText01B)*1
df_experiment_results['arg_selected2'] = (df_experiment_results.__js_comparisonTextSelected02 == df_experiment_results.__js_comparisonText02B)*1
df_experiment_results['arg_selected3'] = (df_experiment_results.__js_comparisonTextSelected03 == df_experiment_results.__js_comparisonText03B)*1
df_experiment_results['arg_selected4'] = (df_experiment_results.__js_comparisonTextSelected04 == df_experiment_results.__js_comparisonText04B)*1
df_experiment_results['arg_selected5'] = (df_experiment_results.__js_comparisonTextSelected05 == df_experiment_results.__js_comparisonText05B)*1

df_experiment_results.head()

Unnamed: 0_level_0,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,RecipientFirstName,...,__js_comparisonText04A,__js_comparisonText04B,__js_comparisonText05A,__js_comparisonText05B,usedIndices,arg_selected1,arg_selected2,arg_selected3,arg_selected4,arg_selected5
StartDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-04-22 09:06:26,2024-04-22 09:11:16,IP Address,71.193.44.237,100,290,True,2024-04-22 09:11:17.735000,R_2q1vJCGauUQUcuu,,,...,Adopt veganism and fight climate change by sto...,Lower your carbon emissions effortlessly by ad...,Veganism could boost the global food supply by...,"By consuming 70% of all grain, 98% of soy, and...","[31, 24, 54, 8, 28]",1,0,0,1,1
2024-04-22 09:06:35,2024-04-22 09:13:35,IP Address,216.245.127.154,100,420,True,2024-04-22 09:13:36.643000,R_5wisyMhdjcXlZF1,,,...,"Producing 1kg of meat requires 2.8kg of crops,...",Livestock farming devours 1/3 of Earth's ferti...,Adopting veganism is not just a diet change bu...,Embracing veganism drastically cuts resource w...,"[73, 24, 57, 49, 65]",1,0,1,1,0
2024-04-22 09:06:45,2024-04-22 09:13:29,IP Address,73.248.2.129,100,403,True,2024-04-22 09:13:30.546000,R_1llAFiJin9iKacx,,,...,"A vegan diet, rich in essential nutrients, has...",Countless health studies have validated that a...,"By consuming 70% of all grain, 98% of soy, and...",Shifting to a vegan diet combats global hunger...,"[23, 25, 17, 70, 19]",1,1,1,1,0
2024-04-22 09:06:46,2024-04-22 09:12:28,IP Address,72.46.221.238,100,341,True,2024-04-22 09:12:29.151000,R_715HokoNibnkUqU,,,...,Livestock farming emits potent greenhouse gase...,The livestock industry is a major contributor ...,Livestock farming devours 1/3 of Earth's ferti...,Animal-based diets contribute to 60% of global...,"[29, 25, 39, 22, 48]",0,0,1,0,1
2024-04-22 09:06:56,2024-04-22 09:15:53,IP Address,73.161.247.123,100,536,True,2024-04-22 09:15:53.931000,R_20OWNMgRZ1nvJ93,,,...,Choosing veganism combats climate change by re...,The global farming of billions of animals for ...,Shifting to a vegan diet tackles global hunger...,Adopting a vegan lifestyle can significantly c...,"[7, 17, 16, 38, 32]",1,1,1,1,1


In [8]:
# Get tuples of experiment outcomes, each tuple is a pair of indices with winner in position 0
tuples_experiment = [] 
for i in range(df_experiment_results.shape[0]):
    for j in range(1,6):
        if df_experiment_results[f"arg_selected{j}"][i] == 0:
            tuples_experiment.append(
                (np.where(df_experiment.synth_text == df_experiment_results[f"__js_comparisonText0{j}A"][i])[0][0],
                np.where(df_experiment.synth_text == df_experiment_results[f"__js_comparisonText0{j}B"][i])[0][0])
            )
        else:
            tuples_experiment.append(
                (np.where(df_experiment.synth_text == df_experiment_results[f"__js_comparisonText0{j}B"][i])[0][0],
                np.where(df_experiment.synth_text == df_experiment_results[f"__js_comparisonText0{j}A"][i])[0][0])
            )

In [9]:
# Set random seed for reproducibility
random.seed(2024)

# Calculate raw win rates
raw_win_rates = calculate_raw_win_rates(df_experiment, tuples_experiment)

# Run bootstrap experiment and get raw win rates
win_rates_bs = CI_experiment_bootstrap(df_experiment, tuples_experiment, n=500)

# Calculate confidence intervals for bootstrapped results
mean_win_rates = np.mean(win_rates_bs, axis=0)
std_win_rates = np.std(win_rates_bs - mean_win_rates, axis=0)
lower = (mean_win_rates - 1.96 * std_win_rates).reshape(1, -1)
upper = (mean_win_rates + 1.96 * std_win_rates).reshape(1, -1)

# Calculate differences and their confidence intervals
diffs = [win_rates_bs[:, 1] - win_rates_bs[:, i] for i in range(4)]
diff_CIs = [(np.mean(diff) - 1.96 * np.std(diff - np.mean(diff)),
             np.mean(diff) + 1.96 * np.std(diff - np.mean(diff))) for diff in diffs]

# Create confidence intervals DataFrame
CIs = pd.DataFrame(np.vstack([lower, upper]), columns=df_experiment.Source.unique(), index=["Lower", "Upper"])
CIs_list = [f"[{100 * CIs.iloc[0, i]:.1f}, {100 * CIs.iloc[1, i]:.1f}]" for i in range(4)]

# Create difference confidence intervals list
CIs_diff_list = [f"[{100 * low:.1f}, {100 * up:.1f}]" if i != 1 else "-" for i, (low, up) in enumerate(diff_CIs)]

# Create lists for raw win rates and differences
win_rates_list = [rate * 100 for rate in raw_win_rates]
win_diff_list = [win_rates_list[1] - win_rates_list[i] if i != 1 else "-" for i in range(4)]

# Create final DataFrame with confidence intervals and differences
df_CI = pd.DataFrame({
    "Win Rates (%)": win_rates_list,
    "95% CI": CIs_list,
    "Diff to SY": win_diff_list,
    "Diff 95% CI": CIs_diff_list
})

# Reorder rows and update index labels
df_CI = df_CI.reindex([1, 3, 2])
df_CI.index = ['Argument Synthesis (SY)', 'Original (OG)', 'Stronger Emphasis (SE)']

# Display the final DataFrame
df_CI

Unnamed: 0,Win Rates (%),95% CI,Diff to SY,Diff 95% CI
Argument Synthesis (SY),48.470588,"[47.9, 49.0]",-,-
Original (OG),51.960784,"[51.5, 52.5]",-3.490196,"[-4.6, -2.5]"
Stronger Emphasis (SE),45.882353,"[45.6, 46.8]",2.588235,"[1.6, 2.8]"


## Evaluation of Validation Experiment 3.

In [10]:
# import original argument data
df_experiment = pd.read_excel(data_dir / "Validation_Study3_Arguments.xlsx", index_col=0)
df_experiment.rename(columns = {'kind': 'Source'}, inplace = True)
df_experiment = df_experiment.reset_index(drop=True)

# import survey results 
df_experiment_results = pd.read_excel(data_dir / "Validation_Study3_Survey_Results.xlsx", index_col=0).iloc[1:101,:]
df_experiment_results = df_experiment_results.reset_index(drop=True)

df_experiment_results['usedIndices'] = df_experiment_results.__js_usedIndices.str.split("|")
df_experiment_results['arg_selected1'] = (df_experiment_results.__js_comparisonTextSelected01 == df_experiment_results.__js_comparisonText01B)*1
df_experiment_results['arg_selected2'] = (df_experiment_results.__js_comparisonTextSelected02 == df_experiment_results.__js_comparisonText02B)*1
df_experiment_results['arg_selected3'] = (df_experiment_results.__js_comparisonTextSelected03 == df_experiment_results.__js_comparisonText03B)*1
df_experiment_results['arg_selected4'] = (df_experiment_results.__js_comparisonTextSelected04 == df_experiment_results.__js_comparisonText04B)*1
df_experiment_results['arg_selected5'] = (df_experiment_results.__js_comparisonTextSelected05 == df_experiment_results.__js_comparisonText05B)*1

df_experiment_results.head()

Unnamed: 0,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,RecipientFirstName,...,__js_comparisonText04A,__js_comparisonText04B,__js_comparisonText05A,__js_comparisonText05B,usedIndices,arg_selected1,arg_selected2,arg_selected3,arg_selected4,arg_selected5
0,2024-05-18 12:39:19,IP Address,*******,100,258,True,2024-05-18 12:39:20.253000,R_3Us9Yd6ShAggEUh,*******,*******,...,While it's important to note that not all anim...,The vast majority of animal farming requires h...,The inefficiency of animal agriculture is star...,Some argue that even if we granted the unlikel...,"[34, 48, 68, 75, 64]",1,1,0,1,0
1,2024-05-18 12:40:37,IP Address,*******,100,289,True,2024-05-18 12:40:38.585000,R_2CbOD8Cvg1pLljr,*******,*******,...,Many religions profess reverence for life and ...,Many religions profess reverence for life and ...,Given the lack of transparency in meat product...,Given the lack of transparency in meat product...,"[0, 72, 19, 10, 68]",0,0,1,0,1
2,2024-05-18 12:42:42,IP Address,*******,100,484,True,2024-05-18 12:42:43.300000,R_5f19jzxRuUufrHq,*******,*******,...,No creature should suffer for an imposed purpo...,No creature should suffer for an imposed purpo...,"Yes, soy farming poses environmental challenge...","While soy farming has its issues, it pales in ...","[32, 65, 61, 58, 1]",0,1,0,1,1
3,2024-05-18 12:43:25,IP Address,*******,100,219,True,2024-05-18 12:43:26.743000,R_7hud78Z8OuP3QVJ,*******,*******,...,"As humans in an advanced society, we have diet...","As humans living in an advanced society, we po...",It's difficult to justify torturing puppies me...,It's difficult to justify torturing puppies me...,"[56, 2, 74, 4, 79]",0,0,0,1,1
4,2024-05-18 12:43:31,IP Address,*******,100,537,True,2024-05-18 12:43:32.643000,R_2YXPXtn3PTxjOm9,*******,*******,...,Humanity doesn't just undervalue animal existe...,Humanity doesn't grant personal right to under...,"In our world, it's vital to conserve water. Wh...","In our water-scarce world, critical resources ...","[46, 25, 81, 12, 86]",0,1,0,0,1


In [11]:
# Get tuples of experiment outcomes, each tuple is a pair of indices with winner in position 0
tuples_experiment = [] 
for i in range(df_experiment_results.shape[0]):
    for j in range(1,6):
        if df_experiment_results[f"arg_selected{j}"][i] == 0:
            tuples_experiment.append(
                (np.where(df_experiment.New_Argument == df_experiment_results[f"__js_comparisonText0{j}A"][i])[0][0],
                np.where(df_experiment.New_Argument == df_experiment_results[f"__js_comparisonText0{j}B"][i])[0][0])
            )
        else:
            tuples_experiment.append(
                (np.where(df_experiment.New_Argument == df_experiment_results[f"__js_comparisonText0{j}B"][i])[0][0],
                np.where(df_experiment.New_Argument == df_experiment_results[f"__js_comparisonText0{j}A"][i])[0][0])
            )

In [12]:
# Set random seed for reproducibility
random.seed(2024)

# Calculate raw win rates
raw_win_rates = calculate_raw_win_rates(df_experiment, tuples_experiment)

# Run bootstrap experiment and get raw win rates
win_rates_bs = CI_experiment_bootstrap(df_experiment, tuples_experiment, n =500)

# Calculate confidence intervals for bootstrapped results
mean_win_rates = np.mean(win_rates_bs, axis=0)
std_win_rates = np.std(win_rates_bs - mean_win_rates, axis=0)
lower = (mean_win_rates - 1.96 * std_win_rates).reshape(1, -1)
upper = (mean_win_rates + 1.96 * std_win_rates).reshape(1, -1)


# Create confidence intervals DataFrame
CIs = pd.DataFrame(np.vstack([lower, upper]), columns=df_experiment.Source.unique(), index=["Lower", "Upper"])
CIs_list = [f"[{100 * CIs.iloc[0, i]:.1f}, {100 * CIs.iloc[1, i]:.1f}]" for i in range(4)]

# Create lists for raw win rates and differences
win_rates_list = [rate * 100 for rate in raw_win_rates]

# Create final DataFrame with confidence intervals and differences
df_CI = pd.DataFrame({
    "Win Rates (%)": win_rates_list,
    "95% CI": CIs_list,
})

# Reorder rows and update index labels
df_CI = df_CI.reindex([3, 2])
df_CI.index = ['Synthetic Increased Topic (2) Args', 'Synthetic Decreased Topic (2) Args']

# Display the final DataFrame
df_CI

Unnamed: 0,Win Rates (%),95% CI
Synthetic Increased Topic (2) Args,69.69697,"[69.4, 69.9]"
Synthetic Decreased Topic (2) Args,31.730769,"[31.0, 32.5]"
