# 2020 Democratic Primary Drop Out Predictions Scoring Formulas

## Author: Oliver Gladfelter

### Date: Dec 9th, 2019
### Updated: Jan 6th, 2020

In [1]:
import itertools
import pandas as pd
import statistics

def kendall_tau_distance(order_a, order_b):
    """Count the pairs of items that the two orderings rank in opposite order.

    order_a, order_b = lists that each contain the labels 1..len(order_a).
    A label missing from either list makes ``list.index`` raise ValueError.

    Returns 0 when both orderings agree on every pair.
    """
    labels = range(1, len(order_a) + 1)

    def is_discordant(first, second):
        # The product of the two position gaps is negative exactly when the
        # orderings disagree about which of the two labels comes earlier.
        gap_a = order_a.index(first) - order_a.index(second)
        gap_b = order_b.index(first) - order_b.index(second)
        return gap_a * gap_b < 0

    return sum(
        1
        for first, second in itertools.combinations(labels, 2)
        if is_discordant(first, second)
    )

In [2]:
# load in data
data = pd.read_csv("dropout-contest-6.csv")
# These survey columns are not used by anything below, so drop them up front.
data = data.drop(['Timestamp', 'Email Address', 'Do you wish to include any thoughts or explanations for your predictions?'], axis=1)

# Columns 1-14 hold the predicted drop-out order (1 = first to drop, 14 = last candidate standing);
# the remaining five columns are participant metadata.
data.columns = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 'participant', 'twitter', 'privacyPreference', 'alias', 'showAnswers']
# Manual correction of row 45's privacy preference. Written with .loc because the chained form
# data[col][row] = value may assign into a temporary object and leave `data` unchanged.
data.loc[45, 'privacyPreference'] = "Yes"

## Some survey respondents chose one candidate twice (presumably by mistake), which also means they left one candidate off their prediction. The code below finds the rows where this occurred and fixes them automatically by replacing one instance of the duplicated candidate with the omitted candidate.

In [None]:
def fixDoubleMissingError(index, double, missing):
    """
    Repair one survey row of the global `data` frame in which candidates were picked more than once.

    index = row label of the response to repair
    double = the duplicated names, as a dict of counts or any iterable of names. ex: {'Michael Bloomberg': 2}
    missing = names absent from the response, as a list. ex: ['Michael Bennet']

    The first occurrence of each duplicated name keeps its slot. Every later occurrence is
    overwritten with the next unused name from `missing`, so several duplicated names (or a
    name picked three times) are each repaired with a different missing candidate.
    `missing` itself is not modified. If `missing` runs out, the remaining repeats are left as they are.
    """
    doubled = set(double)  # iterating a dict yields its keys, i.e. the candidate names
    replacements = iter(missing)
    alreadySeen = set()  # tracked per name, so one duplicate cannot affect the handling of another

    for column in data.columns[:-5]:  # prediction columns only; the last five columns are metadata
        name = data.at[index, column]
        if name not in doubled:
            continue
        if name not in alreadySeen:
            alreadySeen.add(name)  # first sighting keeps its predicted position
            continue

        replacement = next(replacements, None)
        if replacement is None:
            break  # no missing candidates left to substitute
        # .at writes straight into `data`; chained data[column][index] = value may hit a temporary copy
        data.at[index, column] = replacement

for row in range(len(data)):
    # Start from the full roster on every row; names are struck off below as they turn up in the response.
    allCandidates  = ['Amy Klobuchar','Andrew Yang','Bernie Sanders','Cory Booker','Deval Patrick','Elizabeth Warren','Joe Biden','John Delaney','Marianne Williamson','Michael Bennet','Michael Bloomberg','Pete Buttigieg','Tom Steyer','Tulsi Gabbard']
    candidates = [data[column][row] for column in data.columns[:-5]]

    if len(set(candidates)) == 14:
        continue  # 14 distinct picks: nothing to repair in this response

    # Tally every pick. Striking a name off the roster the first time it is seen
    # leaves allCandidates holding only the MISSING candidates once the loop ends.
    candidatesCount = {}
    for candidate in candidates:
        if candidate not in candidatesCount:
            candidatesCount[candidate] = 1
            allCandidates.remove(candidate)
        else:
            candidatesCount[candidate] += 1

    # Keep only the names that were picked more than once.
    copy = {name: count for name, count in candidatesCount.items() if count != 1}

    # Report the faulty response (participant, duplicated names, missing names), then repair the row in place.
    print(data['participant'][row])
    print(copy)
    print(allCandidates)
    fixDoubleMissingError(row, copy, allCandidates)
    print(" ")
# Now every row has each candidate listed once and only once

In [5]:
# Sanity check: after the repair pass every response should list 14 distinct candidates.
for row in range(len(data)):
    # Build a fresh list for each row. A list shared across rows accumulates names,
    # so its set would always reach 14 and an incomplete row could never be detected.
    candidates = [data[column][row] for column in data.columns[:-5]]
    if len(set(candidates)) != 14:
        print("Error") # this should never happen by this point

# Calculate 'average' scores of each candidate. Who is expected to go far, according to the aggregate?

In [6]:
# Running total, per candidate, of the drop-out position predicted for them across every response.
candidateScores = {}
for row in range(len(data)):
    for column in data.columns[:-5]:
        name = data[column][row]
        candidateScores[name] = candidateScores.get(name, 0) + int(column)

# Turn each total into a mean position by dividing by the number of participants.
candidateScores = {name: total / len(data) for name, total in candidateScores.items()}

# Order candidates from the lowest mean position (expected to drop first) to the highest.
candidateScoresSorted = dict(sorted(candidateScores.items(), key=lambda item: item[1]))

## This is the predicted drop out order, according to the aggregate.

In [7]:
candidateScoresSorted

{'Michael Bennet': 3.120879120879121,
 'Deval Patrick': 4.0989010989010985,
 'John Delaney': 4.252747252747253,
 'Marianne Williamson': 4.43956043956044,
 'Cory Booker': 5.1208791208791204,
 'Tom Steyer': 6.241758241758242,
 'Tulsi Gabbard': 6.252747252747253,
 'Amy Klobuchar': 6.384615384615385,
 'Andrew Yang': 8.23076923076923,
 'Michael Bloomberg': 8.846153846153847,
 'Pete Buttigieg': 10.527472527472527,
 'Bernie Sanders': 12.142857142857142,
 'Elizabeth Warren': 12.384615384615385,
 'Joe Biden': 12.956043956043956}

## Often, the aggregate of many predictions is more accurate than any single prediction (this is essentially the theory Nate Silver built FiveThirtyEight on), so let's include the aggregate order in the leaderboard.

In [None]:
# Build a one-row frame shaped like a survey response:
# column k holds the candidate that the aggregate ranks k-th to drop out.
aggregateRow = pd.DataFrame(
    {rank: [name] for rank, name in enumerate(candidateScoresSorted, start=1)}
)

aggregateRow['participant'] = "Wisdom of the crowd"
aggregateRow['twitter'] = ""
aggregateRow['privacyPreference'] = "fullName"
aggregateRow['alias'] = "Wisdom of the crowd"
aggregateRow['showAnswers'] = 'Yes'
aggregateRow['score'] = 0

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# adds the row and renumbers the index in the same way.
data = pd.concat([data, aggregateRow], ignore_index=True)

# Scoring Function

## edit the order of 'candidates' list in below cell before running

In [10]:
# Candidate roster. It currently follows the aggregate (crowd) drop-out order computed above.
# This should eventually be changed to reflect the actual dropout order (1st to drop --> winner).
# Alphabetical roster, kept for reference:
#candidates  = ['Amy Klobuchar','Andrew Yang','Bernie Sanders','Cory Booker','Deval Patrick','Elizabeth Warren','Joe Biden','John Delaney','Marianne Williamson','Michael Bennet','Michael Bloomberg','Pete Buttigieg','Tom Steyer','Tulsi Gabbard']
candidates = list(candidateScoresSorted)

# we need to assign each candidate a numerical value: its 1-based position in the roster above
candidateDictionary = {name: position for position, name in enumerate(candidates, start=1)}

# add a score column to df
data['score'] = 0

candidateDictionary

{'Michael Bennet': 1,
 'Deval Patrick': 2,
 'John Delaney': 3,
 'Marianne Williamson': 4,
 'Cory Booker': 5,
 'Tom Steyer': 6,
 'Tulsi Gabbard': 7,
 'Amy Klobuchar': 8,
 'Andrew Yang': 9,
 'Michael Bloomberg': 10,
 'Pete Buttigieg': 11,
 'Bernie Sanders': 12,
 'Elizabeth Warren': 13,
 'Joe Biden': 14}

In [11]:
def scoringFunction(rowIndex):
    """Return the Kendall tau distance between one participant's prediction and the drop-out order.

    rowIndex = row label of the participant in the global `data` frame

    The distance is the number of candidate pairs the participant ordered differently
    from the hard-coded order below, so 0 means a perfect prediction.
    """
    # Drop-out order (first to drop --> last standing), written as candidateDictionary ids.
    candidatesN = [4, 5, 3, 9, 1, 2, 6, 11, 8, 10, 13, 7, 12, 14]

    # Convert the respondent's string answers into the same ids. The last six columns
    # (participant metadata plus 'score') are not predictions and are skipped.
    participantN = [
        candidateDictionary[data[column][rowIndex]]
        for column in data.columns[:-6]
    ]

    return kendall_tau_distance(candidatesN, participantN)

# Use only for final scoring after all but one candidate has dropped out
for num in range(len(data)):
    # .loc assigns into `data` itself; the chained form data['score'][num] = value
    # may assign into a temporary object and leave the frame unchanged.
    data.loc[num, 'score'] = scoringFunction(num)

# How to keep score *as* candidates drop out? When there are still unknowns?

In [None]:
def partialScoringFunction(rowIndex, candidatesDroppedSoFar):
    """
    Score one participant while some candidates are still in the race.

    rowIndex = row label of the participant in the global `data` frame
    candidatesDroppedSoFar is a list of all candidates who have dropped out. Ex: ["Kamala Harris", "Beto O'Rourke"]

    Returns the Kendall tau distance between the global `candidatesN` order and the
    participant's prediction after the positions of still-running candidates have been
    overwritten with the unused ids in ascending order.
    """
    # Ids of the candidates who are already out.
    droppedIds = [candidateDictionary[name] for name in candidatesDroppedSoFar]

    # The participant's prediction as ids (the last six columns are not predictions).
    guessIds = [candidateDictionary[data[column][rowIndex]] for column in data.columns[:-6]]

    # Rebuild the prediction: a dropped candidate keeps the slot the participant gave them,
    # every other slot receives the next id (counting up from 1) that is not a dropped candidate's id.
    nextFreeId = 1
    adjustedGuess = []
    for guessedId in guessIds:
        if guessedId not in droppedIds:
            while nextFreeId in droppedIds:
                nextFreeId += 1
            guessedId = nextFreeId
            nextFreeId += 1
        adjustedGuess.append(guessedId)

    return kendall_tau_distance(candidatesN, adjustedGuess)

##  Marianne Williamson is the first to drop, followed by Cory Booker, John Delaney, Andrew Yang, Michael Bennet, Deval Patrick, etc...

In [None]:
# Print the numeric id of each candidate who has dropped out, in drop-out order.
# The trailing comments give the ids from the candidateDictionary listing shown earlier.
for droppedName in [
    'Marianne Williamson',  # Williamson = 4
    'Cory Booker',          # Booker = 5
    'John Delaney',         # Delaney = 3
    'Andrew Yang',          # Yang = 9
    'Michael Bennet',       # Bennet = 1
    'Deval Patrick',        # Patrick = 2
    'Tom Steyer',           # Steyer = 6
    'Pete Buttigieg',       # pete = 11
    'Amy Klobuchar',        # amy = 8
]:
    print(candidateDictionary[droppedName])

In [None]:
# Drop-out order as candidateDictionary ids: the candidates who have already dropped out come first,
# in the order they left, followed by the ids of the candidates still running.
candidatesN = [4, 5, 3, 9, 1, 2, 6, 11, 8, 10, 13, 7, 12, 14]

# Everyone who has dropped out so far, in drop-out order (built once, outside the loop).
droppedSoFar = ["Marianne Williamson", "Cory Booker", "John Delaney", "Andrew Yang", 'Michael Bennet', 'Deval Patrick', 'Tom Steyer', 'Pete Buttigieg', 'Amy Klobuchar', "Michael Bloomberg", "Elizabeth Warren", "Tulsi Gabbard"]

for num in range(len(data)):
    # .loc assigns into `data` itself; the chained form data['score'][num] = value
    # may assign into a temporary object and leave the frame unchanged.
    data.loc[num, 'score'] = partialScoringFunction(num, droppedSoFar)

# Scores Normalized And Converted Into Accuracy Percentages

In [15]:
def normalize(tauScore, numCandidates=14):
    """Convert a Kendall tau distance into an accuracy percentage.

    tauScore = Kendall tau distance (number of discordant pairs)
    numCandidates = how many items were ranked; defaults to the 14 candidates of this contest

    The largest possible distance between two rankings of n items is n * (n - 1) / 2
    (every pair discordant), so:

        normalized score    = tauScore / (n * (n - 1) / 2)     # between 0 and 1
        accuracy percentage = 100 - normalized score * 100

    Raises ValueError if numCandidates is below 2, because there are then no pairs to compare.
    """
    if numCandidates < 2:
        raise ValueError("numCandidates must be at least 2")

    return 100 - tauScore / (numCandidates * (numCandidates - 1) / 2) * 100

#output_data = data.filter(['participant','twitter', 'privacyPreference', 'alias', 'score']) # remove candidate-prediction columns
# output_data is the SAME object as `data`, not a copy, so the 'percentage' column
# created here is added to `data` as well.
output_data = data

# Accuracy percentage for every participant, derived from their Kendall tau score.
output_data['percentage'] = data['score'].apply(normalize)

# Prepare Data For Leaderboard And Export To CSV

In [17]:
def stripAt(value):
    """Return `value` with every leading and trailing "@" removed."""
    handle = value.strip("@")
    return handle

def firstNameLastInitial(name):
    """Shorten a full name to the first name plus the initial of the last name.

    ex: "Oliver Gladfelter" -> "Oliver G."

    Words may be separated (or surrounded) by any amount of whitespace.
    A single-word name is returned as that word, and an empty name yields "".
    When there are more than two words the initial comes from the final word.
    """
    parts = name.split()  # split() with no argument collapses runs of whitespace and never yields ""
    if len(parts) < 2:
        return " ".join(parts)  # no last name to abbreviate

    return parts[0] + " " + parts[-1][0] + "."

def _leaderboardName(preference, participant, alias):
    """Pick the display name that honours one participant's privacy preference."""
    if preference in ('Yes', 'Yes, and please include my Twitter handle as well'):
        return firstNameLastInitial(participant)
    if preference == 'No, list me as "Anonymous"':
        return 'Anonymous'
    if preference == 'fullName':
        return participant
    return alias  # any other answer: use the self-chosen alias


# for people who submitted twice, delete their first submission
output_data = output_data.drop_duplicates('participant', keep="last").reset_index(drop=True)

# Missing handles become empty strings; "@" characters are trimmed from both ends of each handle.
output_data['twitter'] = output_data['twitter'].fillna("").apply(stripAt)

# Some participants gave me their twitter handles but don't want the handles posted on the leaderboard.
# Blank the handle for everyone who did not explicitly ask for it to be shown.
# (Assigned through .loc: chained output_data['twitter'][num] = "" may write to a temporary copy.)
showsHandle = output_data['privacyPreference'] == 'Yes, and please include my Twitter handle as well'
output_data.loc[~showsHandle, 'twitter'] = ""

# Remove last names from participant names (leave the initial)
# Or replace with their self-chosen alias if that's their preference
output_data['name'] = [
    _leaderboardName(preference, participant, alias)
    for preference, participant, alias in zip(
        output_data['privacyPreference'], output_data['participant'], output_data['alias']
    )
]

# Best accuracy first; ties are ordered alphabetically by display name.
output_data = output_data.sort_values(['percentage', 'name'], ascending=[False, True])
output_data = output_data.reset_index(drop=True)

output_data.to_csv("scoreboardDataApril8.csv", index=False)

# Analysis For Random Tidbits Section

In [None]:
# this reformatted data will be necessary for answering certain questions
# Target layout: a 'participant' column plus one column per candidate holding the
# drop-out position each participant predicted for that candidate.
# NOTE(review): data.columns[:-6] is expected to cover columns 1-14 and 'participant' here — confirm
# that `data` carries 'score' and 'percentage' as its last columns when this cell runs.
reformattedData = {'participant': []}

for row in range(len(data)):
    for column in data.columns[:-6]:
        cell = data[column][row]
        if column == 'participant':
            reformattedData['participant'].append(cell)
        else:
            # cell is a candidate name and column is the position predicted for that candidate
            reformattedData.setdefault(cell, []).append(int(column))

reformattedData = pd.DataFrame(reformattedData)

### 42% of participants predict Biden will be the eventual nominee. Warren and Bernie are the next most common guesses (tied with 25% each).

In [None]:
# Percentage of rows in `data` that put each candidate in position 14 (the last candidate standing).
data[14].value_counts().div(len(data)).mul(100)

### Michael Bennet is by far the most popular guess for first-to-drop (31% of submissions). John Delaney is next, with 17% predicting he'll be out first.

In [None]:
# Percentage of rows in `data` that put each candidate in position 1 (the first to drop out).
data[1].value_counts().div(len(data)).mul(100)

### 73% of participants predict Bennet will be among the first 3 candidates to drop out. Delaney (49%), Deval Patrick (45%), and Cory Booker (38%) are all also common 'first 3' picks.

In [None]:
# Share of respondents who placed each candidate among their first three drop-outs
# (predicted position <= 3). The printed label now says "first three" to match that cut-off.
print("% of respondents who think candidate will be one of the first three to drop out")
print(" ")

for candidate in candidates:
    pickedInFirstThree = len(reformattedData[reformattedData[candidate] <= 3])
    print(candidate + ": " + str(pickedInFirstThree / len(data) * 100))

### Contest participants disagree the most over Cory Booker's chances - it's common to see him all over the place in guesses. 

In [None]:
# Sample standard deviation of the positions predicted for each candidate:
# the larger the value, the more the predictions for that candidate are spread out.
for candidate in candidates:
    spread = statistics.stdev(reformattedData[candidate])
    print(" : ".join((candidate, str(spread))))

### Want/need/predicted ____ out next

In [None]:
# Candidates compared against below (the variable name suggests they are the ones still in the race).
candidatesIn = ["Tulsi Gabbard", "Amy Klobuchar", "Michael Bloomberg", "Pete Buttigieg", "Bernie Sanders", "Elizabeth Warren"]

need__OutNext = 0
candidate = ''  # presumably a placeholder: set this to the candidate of interest before running

# Walk each prediction from the earliest position and stop at the first pick that is in candidatesIn.
# The counter goes up each time `candidate` is met before that point.
# NOTE(review): data.columns[:-14] limits the scan to the leading columns of `data`; which
# positions that covers depends on how many columns `data` has when this runs — confirm.
for row in range(len(data)):
    for column in data.columns[:-14]:
        pick = data[column][row]
        if pick in candidatesIn:
            break
        if pick == candidate:
            need__OutNext += 1

# Expressed as a percentage of all rows in `data`.
print(need__OutNext / len(data) * 100)