In [1]:
import pandas as pd
import numpy as np

In [2]:
# cluster_reviews: cluster id -> reviewer number -> review of the whole cluster.
# entity_reviews:  cluster id -> entity -> reviewer number -> review of that entity.
cluster_reviews = {}
entity_reviews = {}

# One tab-separated file per reviewer, numbered 1 to 3.
for reviewer in range(1, 4):
    with open(f'review_results/reviewer_{reviewer}.tsv') as review_file:
        # Id read from the most recent entity row; 'Overall' rows are filed
        # under it (it is still None if no entity row has been seen yet).
        current_cluster = None
        for raw_line in review_file:
            columns = raw_line.rstrip().split('\t')
            # Rows with fewer than four columns have no verdict to record.
            if len(columns) < 4:
                continue

            review = {'verdict': columns[3]}
            # The fifth column is an optional free-text comment.
            if len(columns) > 4 and columns[4] != '':
                review['comment'] = columns[4]

            if columns[2] == 'Overall':
                cluster_reviews.setdefault(current_cluster, {})[reviewer] = review
                continue

            current_cluster = int(columns[0])
            per_cluster = entity_reviews.setdefault(current_cluster, {})
            per_cluster.setdefault(columns[1], {})[reviewer] = review

In [3]:
def _flatten_reviews(reviews_by_reviewer):
    """Merge one cluster's per-reviewer dicts into a single flat dict.

    Every key becomes '<field>_<reviewer>', e.g. 'verdict_1'.
    """
    return {
        f'{field}_{reviewer}': value
        for reviewer, review in reviews_by_reviewer.items()
        for field, value in review.items()
    }


# One flat record per cluster, ready to become a DataFrame row.
cluster_reviews_flat = {
    cluster_id: _flatten_reviews(reviews_by_reviewer)
    for cluster_id, reviews_by_reviewer in cluster_reviews.items()
}

In [4]:
# Rows are clusters; columns are the flattened '<field>_<reviewer>' keys.
df_clusters = pd.DataFrame.from_dict(
    data=cluster_reviews_flat,
    orient='index',
)

In [5]:
# Numeric scale for the textual verdicts: negative = wrong, positive = good.
verdict_map = {
    'definitely wrong': -2,
    'probably wrong': -1,
    'not sure': 0,
    'probably good': 1,
    'definitely good': 2,
}

# Strict lookup on purpose: a verdict that is not in the map raises KeyError.
for verdict_column in ('verdict_1', 'verdict_2', 'verdict_3'):
    df_clusters[verdict_column] = df_clusters[verdict_column].apply(
        lambda verdict: verdict_map[verdict]
    )

In [6]:
# Aggregate only the numeric verdict columns. df_clusters can also hold
# free-text 'comment_<reviewer>' columns, which cannot be averaged (recent
# pandas raises instead of silently dropping them).
# The string 'std' selects pandas' own Series.std, which is what the np.std
# callable was being translated to; passing the callable itself is deprecated.
reviewer_stats = df_clusters[['verdict_1', 'verdict_2', 'verdict_3']].agg(['mean', 'std'])
reviewer_stats

Unnamed: 0,verdict_1,verdict_2,verdict_3
mean,-1.156522,-0.852174,-0.130435
std,1.598197,1.666194,1.789451


In [7]:
def summarize_verdicts(s):
    """Summarize the numeric verdicts given to a single cluster.

    Parameters
    ----------
    s : array-like supporting elementwise comparison (e.g. a pandas Series)
        The verdicts; negative values count as "wrong", positive as "good",
        zero counts as neither.

    Returns
    -------
    tuple
        (number of wrong verdicts,
         number of good verdicts,
         at least one verdict is wrong,
         more wrong than good verdicts,
         three wrong verdicts, or two wrong and no good verdict)
    """
    wrong_votes = int((s < 0).sum())
    good_votes = int((s > 0).sum())

    any_wrong = wrong_votes > 0
    mostly_wrong = wrong_votes > good_votes
    if wrong_votes == 3:
        unanimously_wrong = True
    else:
        # Two wrong votes still count when the remaining vote is not "good".
        unanimously_wrong = wrong_votes == 2 and good_votes == 0

    return wrong_votes, good_votes, any_wrong, mostly_wrong, unanimously_wrong

# Summarize every cluster row, then turn the per-row tuples into one
# sequence per summary field.
wrong_counts, good_counts, any_wrong, majority_wrong, all_wrong = zip(
    *df_clusters[['verdict_1', 'verdict_2', 'verdict_3']].apply(summarize_verdicts, axis=1)
)

df_clusters['wrong_count'] = wrong_counts
df_clusters['good_count'] = good_counts
df_clusters['is_wrong_one'] = any_wrong
df_clusters['is_wrong_majority'] = majority_wrong
df_clusters['is_wrong_all'] = all_wrong

Export results

In [8]:
# Write the per-cluster table; the index (cluster id) gets the header 'cluster'.
df_clusters.to_csv(
    path_or_buf='results/cluster_reviews.csv',
    index_label='cluster',
)

In [9]:
# Number of clusters that at least one reviewer judged wrong.
int(df_clusters['is_wrong_one'].sum())

95

In [10]:
# Number of clusters with more "wrong" than "good" verdicts.
int(df_clusters['is_wrong_majority'].sum())

84

In [11]:
# Number of clusters judged wrong by all reviewers (or by two, with no "good" verdict).
int(df_clusters['is_wrong_all'].sum())

58

Calculate Fleiss' kappa

In [12]:
# Fleiss' kappa over the collapsed verdicts.
#
# Every cluster is rated by `raters` reviewers. A verdict of 0 ('not sure') is
# counted in neither wrong_count nor good_count, so it is treated here as a
# third category of its own. Leaving it out while still dividing by `raters`
# would make the category proportions sum to less than 1 and could push the
# per-cluster agreement below zero. When no reviewer answered 'not sure' the
# result is identical to the two-category computation.
raters = 3
samples = len(df_clusters)

wrong_votes = df_clusters.wrong_count.to_numpy()
good_votes = df_clusters.good_count.to_numpy()
unsure_votes = raters - wrong_votes - good_votes

# n_ij: how many raters put cluster i into category j (wrong, good, not sure).
rating_counts = np.column_stack([wrong_votes, good_votes, unsure_votes])
categories = rating_counts.shape[1]

# p_j: share of all ratings that fall into category j.
p_wrong, p_good, p_unsure = rating_counts.sum(axis=0) / (samples * raters)
# Agreement expected by chance.
P_e = p_wrong ** 2 + p_good ** 2 + p_unsure ** 2

# P_i: share of agreeing rater pairs for each cluster.
P = (rating_counts ** 2).sum(axis=1) - raters
P = P / (raters * (raters - 1))

kappa = (P.mean() - P_e) / (1 - P_e)
kappa

0.5012892639474916

Calculate Krippendorff's alpha

In [13]:
import krippendorff

In [14]:
verdict_columns = ['verdict_1', 'verdict_2', 'verdict_3']

# Transposed so that each row holds one reviewer and each column one cluster.
rel_matrix = np.transpose(df_clusters[verdict_columns].to_numpy())

# NOTE(review): 0 ('not sure') is not part of value_domain; presumably those
# verdicts are meant to be left out of the computation. Confirm how the
# installed krippendorff version treats out-of-domain values.
krippendorff.alpha(
    reliability_data=rel_matrix,
    level_of_measurement='ordinal',
    value_domain=(-2, -1, 1, 2),
)

0.4743675352909382