In [12]:
import numpy as np
import pandas as pd
import time
from scipy.stats import mode

In [13]:
# Load the CSV export of the df.karger dataframe; the data processing itself
# is done upstream in the R script.
df_karger = pd.read_csv(filepath_or_buffer="df_for_karger.csv")

In [14]:
### Primary function implementing the Karger algorithm
# Adapted from the implementation in this repository: https://github.com/jpgard/budget-optimal-task-allocation-crowdsourcing

def run_karger(G, A, m, n, max_num_iters):
    """Estimate task labels by iterative message passing on a task/worker graph.

    Messages are exchanged along the edges of ``G`` for ``max_num_iters``
    rounds, starting from worker messages drawn from N(1, 1) via NumPy's
    global RNG (call ``np.random.seed`` beforehand for reproducible output).

    Parameters
    ----------
    G : array-like, shape (m, n)
        Assignment graph. Entry (i, j) is an edge when it is nonzero and not
        NaN. ``np.matrix`` input is accepted and converted to an ndarray.
    A : array-like, shape (m, n)
        Answers; ``A[i, j]`` is read only where (i, j) is an edge. Presumably
        encoded as +1/-1 -- confirm against the caller's data.
    m : int
        Number of tasks (rows of ``G`` and ``A``).
    n : int
        Number of workers (columns of ``G`` and ``A``).
    max_num_iters : int
        Number of message-passing rounds; 0 aggregates the initial messages.

    Returns
    -------
    numpy.ndarray, shape (m,)
        ``np.sign`` of each task's aggregated score: -1.0, 0.0 (tie or task
        without edges) or 1.0.

    Raises
    ------
    ValueError
        If ``G`` or ``A`` does not have shape ``(m, n)``.
    """
    # Work on plain ndarrays: slicing an np.matrix keeps two dimensions, which
    # silently breaks index-based neighbourhood lookups.
    G = np.asarray(G, dtype=float)
    A = np.asarray(A, dtype=float)
    if G.shape != (m, n) or A.shape != (m, n):
        raise ValueError(
            "G and A must both have shape (m, n) = ({}, {}); got {} and {}".format(
                m, n, G.shape, A.shape
            )
        )

    # An entry is an edge only if it is a real, nonzero assignment with a
    # usable answer; NaN (missing) entries must not leak into the sums.
    edge_mask = (G != 0) & ~np.isnan(G) & ~np.isnan(A)
    answers = np.where(edge_mask, A, 0.0)

    # Initialise the worker messages ONCE, before iterating, so that every
    # round refines the previous one instead of restarting from random noise.
    y_k = np.where(edge_mask, np.random.normal(loc=1, scale=1, size=(m, n)), 0.0)

    for _ in range(max_num_iters):
        # Task message x[i, j]: sum of A[i, j'] * y[i, j'] over the workers j'
        # of task i other than j (row total minus the edge's own term).
        weighted_y = answers * y_k
        x_k = np.where(
            edge_mask, weighted_y.sum(axis=1, keepdims=True) - weighted_y, 0.0
        )

        # Worker message y[i, j]: sum of A[i', j] * x[i', j] over the tasks i'
        # of worker j other than i (column total minus the edge's own term).
        weighted_x = answers * x_k
        y_k = np.where(
            edge_mask, weighted_x.sum(axis=0, keepdims=True) - weighted_x, 0.0
        )

        # Messages grow geometrically with the iteration count; rescaling by a
        # positive constant avoids float overflow and leaves every sign intact.
        peak = np.abs(y_k).max() if y_k.size else 0.0
        if peak > 0:
            y_k = y_k / peak

    # Final estimate: aggregate each task's answers over its own workers only,
    # weighted by the last worker messages.
    x = (answers * y_k).sum(axis=1)
    t_hat = np.sign(x)
    return t_hat

In [15]:
# Main code that calls the function on the input data

# Number of message-passing iterations; the same for every task, so it is set
# once here instead of on every pass through the loop.
max_num_iters = 1000

# NOTE: run_karger initialises its messages from NumPy's global RNG, so results
# vary between runs unless np.random.seed(...) is called before this cell.

# Collect one single-column frame per task and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies it every time.
per_task_results = []

for task_name in df_karger["task_name"].unique():
    # Rows are questions, columns are workers, cells are the submitted answers.
    # Assumes answers are encoded as nonzero numbers (e.g. +1/-1) -- confirm
    # against the R export.
    answer_matrix = (
        df_karger[df_karger["task_name"] == task_name]
        .pivot(index="question_name", columns="name", values="value")
        # pivot leaves NaN where a worker did not answer a question; run_karger
        # takes its edges from the nonzero entries, so 0 encodes "no answer".
        .fillna(0)
    )

    questions = answer_matrix.index
    workers = answer_matrix.columns

    num_workers = len(workers)
    num_questions = len(questions)

    # Use a plain ndarray rather than np.matrix: slicing a matrix row stays
    # 2-D, which breaks the np.nonzero(G[i, :])[0] lookup inside run_karger.
    G = answer_matrix.to_numpy()

    # number of tasks (in this case, this is the number of questions)
    m = num_questions
    # number of workers
    n = num_workers
    # the answers double as the assignment graph (nonzero == answered)
    A = G

    results_for_task = run_karger(G, A, m, n, max_num_iters)

    # Map the -1 label to 0; every other value (including a 0 tie) becomes 1.
    final_export = np.where(results_for_task == -1, 0, 1)
    final_task_results = pd.DataFrame({task_name: final_export}, index=questions)

    per_task_results.append(final_task_results)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# input contained no tasks.
final_results = (
    pd.concat(per_task_results, axis=1) if per_task_results else pd.DataFrame()
)

final_results.to_csv('karger-results.csv')
    