# Extract Survey Results and Calculate Bradley-Terry Scores
This notebook loads the raw survey responses, converts them into pairwise (winner, loser) comparisons, and computes the Bradley-Terry scores used in the later analysis.

In [30]:
import pandas as pd 
import numpy as np
from pathlib import Path
# Root folder that holds the argument workbook and the survey export read by the cells below,
# and that receives the CSV written by the last cell.
# NOTE(review): user-specific Dropbox location; it must exist locally for the notebook to run.
data_dir= Path("~/Dropbox (Princeton)/CF-Text/Software_Data/Data/Veganism").expanduser()

In [31]:
# Load the table of arguments
df = pd.read_excel(data_dir / "Veganism_Argument_Data.xlsx")

# Load the survey export, skip its first data row and renumber the remaining rows from 0
# NOTE(review): the skipped row is presumably a second header row of the export - confirm
df_results = (
    pd.read_excel(data_dir / "Survey/Veganism_survey_results_complete_clean.xlsx")
    .iloc[1:, :]
    .reset_index(drop=True)
)

# Argument indices shown to each respondent, split from a "|"-separated string into a list
df_results["usedIndices"] = df_results["__js_usedIndices"].str.split("|")

# arg_selectedK is 1 when the text chosen in comparison K equals option B's text, else 0
for k in range(1, 6):
    chosen_text = df_results[f"__js_comparisonTextSelected{k:02d}"]
    option_b_text = df_results[f"__js_comparisonText{k:02d}B"]
    df_results[f"arg_selected{k}"] = (chosen_text == option_b_text) * 1

df_results.head()

Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,...,__js_comparisonText04A,__js_comparisonText04B,__js_comparisonText05A,__js_comparisonText05B,usedIndices,arg_selected1,arg_selected2,arg_selected3,arg_selected4,arg_selected5
0,2024-02-04 06:47:18,2024-02-04 06:52:56,IP Address,*******,100,337,True,2024-02-04 06:52:57.880000,R_614r9SO7WrJO3dc,*******,...,The poultry industry is responsible for the ma...,Regardless of how localized or organic your me...,Regardless of whether it's free-range or indus...,"Every sentient being, including animals, inher...","[197, 142, 245, 826, 660, 1223, 512, 669, 45, ...",1,0,0,0,0
1,2024-02-04 06:47:15,2024-02-04 06:53:34,IP Address,*******,100,378,True,2024-02-04 06:53:35.309000,R_32bgAfie4Vw3vak,*******,...,Taking the life of an animal prematurely for m...,Every person consuming meat indirectly partake...,"Morality, including dietary ethics, must be se...",Our capacity to do something doesn't validate ...,"[1265, 1060, 72, 391, 148, 146, 672, 757, 653,...",1,0,1,0,1
2,2024-02-04 06:48:37,2024-02-04 06:54:04,IP Address,*******,100,326,True,2024-02-04 06:54:05.790000,R_3wWkRO3jWvDuyKR,*******,...,It is inconsistent to denounce the horrors of ...,Technological progress doesn't justify animal ...,"Consuming plant-based products, including thos...","By adopting veganism, you can vastly lower you...","[1090, 483, 274, 1134, 282, 239, 855, 353, 534...",0,0,1,0,0
3,2024-02-04 06:47:30,2024-02-04 06:54:50,IP Address,*******,100,440,True,2024-02-04 06:54:52.297000,R_1HobTJ9yVHLYCp4,*******,...,The act of ending an animal's life prematurely...,Factory-farmed chickens live their lives in to...,Industrial farming places monetary gains ahead...,"Factory farming embodies a flawed ethos, prior...","[132, 287, 1280, 639, 1208, 921, 1055, 506, 86...",0,0,1,1,0
4,2024-02-04 06:49:47,2024-02-04 06:55:06,IP Address,*******,100,319,True,2024-02-04 06:55:07.889000,R_3OoIefhBhQGfN8j,*******,...,The World Wildlife Fund reports that 60% of wi...,Animals should have the right to autonomy over...,Religious mandates don't always align with eth...,It is a fact that the majority of our meat sup...,"[774, 837, 894, 707, 1050, 1155, 1247, 372, 33...",0,1,0,0,1


In [32]:
# Create tuples that contain the indices of the compared arguments, first one is the one that was selected

N_COMPARISONS = 5  # pairwise comparisons recorded per respondent (arg_selected1 .. arg_selected5)
selected_columns = [f"arg_selected{k}" for k in range(1, N_COMPARISONS + 1)]

tuples_all = []
for row, (indices, b_selected) in enumerate(
    zip(df_results["usedIndices"], df_results[selected_columns].to_numpy())
):
    # Consecutive entries of usedIndices are treated as (option A, option B) of one comparison.
    # Fail loudly on a malformed row instead of continuing with misaligned pairs.
    if not isinstance(indices, list) or len(indices) != 2 * N_COMPARISONS:
        raise ValueError(
            f"Row {row}: expected {2 * N_COMPARISONS} argument indices in usedIndices, got {indices!r}"
        )

    # Pair up consecutive indices and convert them to int
    tuples = [(int(a), int(b)) for a, b in zip(indices[::2], indices[1::2])]

    # A flag of 1 means option B was selected, so reverse the pair to get (selected, not selected)
    tuples = [pair[::-1] if flag == 1 else pair for pair, flag in zip(tuples, b_selected)]

    tuples_all.extend(tuples)

# Sanity check: total number of argument slots and number of distinct arguments compared
flat_list = [entry for tup in tuples_all for entry in tup]
print(f"{len(flat_list) = }, {len(set(flat_list)) = }")

len(flat_list) = 10360, len(set(flat_list)) = 1307


In [33]:
# Split the comparisons by how many of the two compared arguments are flagged in df.Training_set:
# both flagged -> training comparison, exactly one flagged -> test comparison
tuples_train = []
tuples_test = []

for pair in tuples_all:
    n_training_args = int(df.Training_set[pair[0]] + df.Training_set[pair[1]])
    if n_training_args == 2:
        tuples_train.append(pair)
    elif n_training_args == 1:
        tuples_test.append(pair)
    elif n_training_args == 0:
        print("Error: Both entries are in test set")

# Flatten both splits to count the (unique) arguments they contain
flat_list_train = [entry for tup in tuples_train for entry in tup]
flat_list_test = [entry for tup in tuples_test for entry in tup]

print(f"{len(flat_list_train) = }, {len(set(flat_list_train)) = }")
print(f"{len(flat_list_test) = }, {len(set(flat_list_test)) = }")

# Number of arguments that occur in test comparisons but in no training comparison
print(f"{len(set(flat_list_test) - set(flat_list_train)) = }")

# Every argument of a training comparison must be flagged as a training argument
for arg in set(flat_list_train):
    if df.Training_set[arg] != 1:
        print("Error: Element in flat_list_train not in training set")

len(flat_list_train) = 4672, len(set(flat_list_train)) = 800
len(flat_list_test) = 5688, len(set(flat_list_test)) = 1288
len(set(flat_list_test) - set(flat_list_train)) = 507


# Calculate Bradley-Terry Scores
Based on the pairwise survey outcomes, we estimate a Bradley-Terry score for each argument.

In [34]:
def create_dataframe_BT(results, n_col=None):
    """
    Builds the win-count matrix used for Bradley-Terry fitting.

    Args:
        results (list of tuples): Match results, one (winner, loser) tuple
                                  per comparison.
        n_col (int, optional): If given, the arguments 0 .. n_col - 1 are
                               always part of the matrix, whether or not
                               they occur in results. Defaults to None,
                               which keeps only arguments found in results.

    Returns:
        pd.DataFrame: Square integer DataFrame whose rows and columns are
                      the sorted argument labels; entry (i, j) is the number
                      of times argument i won against argument j. The index
                      is named 'arg_i'.
    """
    # Collect every argument label that has to appear in the matrix
    labels = set(range(n_col)) if n_col is not None else set()
    for winner, loser in results:
        labels.update((winner, loser))

    # Sorted labels define the row/column order; map each label to its position
    ordered_labels = sorted(labels)
    position = {label: pos for pos, label in enumerate(ordered_labels)}

    # Count the wins: row = winner, column = loser
    size = len(ordered_labels)
    wins = np.zeros((size, size), dtype=int)
    for winner, loser in results:
        wins[position[winner], position[loser]] += 1

    win_frame = pd.DataFrame(wins, index=ordered_labels, columns=ordered_labels)
    win_frame.index.name = 'arg_i'
    return win_frame


# Calculate Bradley-Terry scores
def BT_fitting(df_W, fixed_scores=None, initial_scores=None, max_iter=10, tol=1e-6):
    """
    Fits Bradley-Terry scores to a win matrix by fixed-point iteration.

    In every sweep each free score is updated in turn, using the most recent
    values of all other scores:

        p_i <- [sum_j W[i, j] * p_j / (p_i + p_j)] / [sum_j W[j, i] / (p_i + p_j)]

    where W[i, j] is the number of wins of argument i over argument j.

    Args:
        df_W (pd.DataFrame): Square DataFrame of wins between arguments; entry
            (i, j) counts the wins of row argument i over column argument j.
            The index labels identify the arguments.
        fixed_scores (iterable, optional): Labels from ``df_W.index`` whose
            scores are held at their value in ``initial_scores``. Default is
            None (no score is fixed).
        initial_scores (array-like, optional): Starting scores, one per row of
            ``df_W`` in index order. Default is ones.
        max_iter (int, optional): Maximum number of sweeps. Default is 10.
        tol (float, optional): The iteration stops (and prints "Converged")
            once the Euclidean norm of the change between two sweeps is below
            this value. Default is 1e-6.

    Returns:
        np.ndarray: Float array of fitted scores in ``df_W.index`` order, or
        None (after printing a message) if a label in ``fixed_scores`` does
        not occur in ``df_W.index``.

    Raises:
        ValueError: If ``initial_scores`` does not hold exactly one value per
            row of ``df_W``.

    Notes:
        * Without fixed scores the model only determines score ratios, so the
          scores are rescaled to geometric mean 1 after every sweep.
        * With fixed scores the anchors already determine the scale, so no
          rescaling is applied; rescaling the free scores would move them away
          from the solution of the update equations above.
    """
    unique_args = list(df_W.index)
    n_args = len(unique_args)

    # Always work on a float copy so integer input neither truncates the
    # updates nor breaks the in-place division below
    if initial_scores is None:
        bradley_terry_scores = np.ones(n_args)
    else:
        bradley_terry_scores = np.array(initial_scores, dtype=float)
        if bradley_terry_scores.shape != (n_args,):
            raise ValueError(
                f"initial_scores must have shape ({n_args},), got {bradley_terry_scores.shape}"
            )

    # Validate fixed scores (set lookups keep this linear in the number of labels)
    known_args = set(unique_args)
    if fixed_scores is None:
        fixed = set()
    else:
        for arg in fixed_scores:
            if arg not in known_args:
                print(f"Error: {arg} not in the list of unique arguments.")
                return
        fixed = set(fixed_scores)

    # Positions of the scores that are actually updated
    free_positions = [i for i, arg_i in enumerate(unique_args) if arg_i not in fixed]

    # Plain float matrix: avoids per-row pandas indexing inside the loop
    W = df_W.to_numpy(dtype=float)

    for _ in range(max_iter):
        old_scores = bradley_terry_scores.copy()

        # Update each free score using the current values of all scores
        for i in free_positions:
            pair_sums = bradley_terry_scores[i] + bradley_terry_scores
            numerator = np.sum(W[i, :] * bradley_terry_scores / pair_sums)
            denominator = np.sum(W[:, i] / pair_sums)
            # Guard against a zero denominator (argument without recorded losses)
            bradley_terry_scores[i] = numerator / np.maximum(denominator, 1e-10)

        # Fix the free overall scale only when nothing anchors it. The geometric
        # mean is computed in log space so the product cannot overflow/underflow.
        if not fixed:
            bradley_terry_scores /= np.exp(np.mean(np.log(bradley_terry_scores)))

        # Check for convergence
        if np.linalg.norm(bradley_terry_scores - old_scores) < tol:
            print("Converged")
            break

    return bradley_terry_scores

# Demo 1: the win matrix contains only the arguments that occur in the results
sample_results = [
    (1, 4),
    (10, 3),
    (1, 10),
    (3, 4),
]
result_dataframe = create_dataframe_BT(results=sample_results)
print(result_dataframe)

# Demo 2: n_col=10 additionally includes the arguments 0..9 even if they never occur
result_dataframe_full = create_dataframe_BT(results=sample_results, n_col=10)
print(result_dataframe_full)

       1   3   4   10
arg_i                
1       0   0   1   1
3       0   0   1   0
4       0   0   0   0
10      0   1   0   0
       0   1   2   3   4   5   6   7   8   9   10
arg_i                                            
0       0   0   0   0   0   0   0   0   0   0   0
1       0   0   0   0   1   0   0   0   0   0   1
2       0   0   0   0   0   0   0   0   0   0   0
3       0   0   0   0   1   0   0   0   0   0   0
4       0   0   0   0   0   0   0   0   0   0   0
5       0   0   0   0   0   0   0   0   0   0   0
6       0   0   0   0   0   0   0   0   0   0   0
7       0   0   0   0   0   0   0   0   0   0   0
8       0   0   0   0   0   0   0   0   0   0   0
9       0   0   0   0   0   0   0   0   0   0   0
10      0   0   0   1   0   0   0   0   0   0   0


In [35]:
# Win matrix of the training comparisons
df_W = create_dataframe_BT(results=tuples_train)

# Fit the scores on the win counts after adding 0.01 to every entry of the matrix
bt_scores_train = BT_fitting(df_W.add(0.01), max_iter=100)

Converged


## Infer Bradley-Terry Scores on test set


In [36]:
# Step 1: scores fitted on the training comparisons alone
df_W_train = create_dataframe_BT(results=tuples_train)
bt_scores_train = BT_fitting(df_W_train + 0.01, max_iter=100)

# Step 2: scores fitted freely on all comparisons; n_col makes the matrix
# include arguments 0 .. df.shape[0] - 1 even if they never occur in a comparison
n_arguments = df.shape[0]
df_W_all = create_dataframe_BT(results=tuples_all, n_col=n_arguments)
df_W_all_smoothed = df_W_all + 0.01
bt_scores_all = BT_fitting(df_W_all_smoothed, max_iter=100)

# Step 3: refit on all comparisons while holding the training arguments at their
# step-1 scores, so only the remaining arguments are updated
initial_scores = np.ones(n_arguments)
initial_scores[df_W_train.index] = bt_scores_train
bt_scores_all_inferred = BT_fitting(
    df_W_all_smoothed,
    fixed_scores=list(df_W_train.index),
    initial_scores=initial_scores,
    max_iter=100,
)

Converged
Converged
Converged


In [37]:
# Attach the inferred Bradley-Terry scores to the DataFrame of all arguments
df["BT_Score"] = bt_scores_all_inferred

# Write the scored argument table to a CSV file in the data folder
output_path = data_dir / "Generated_Veganism_Arguments_with_BT_Scores.csv"
df.to_csv(output_path)