In [1]:
from irrCAC.raw import CAC
from irrCAC.benchmark import Benchmark
import pandas as pd

# Location of the raw annotation labels CSV. The path is relative ("../"), so it
# resolves against the directory the notebook kernel is started from.
ANNOTATION_CSV_PATH = "../data/annotation_data/Coherence Task March 2022_July 2021 All - both months raw labels.csv"

In [2]:
def benchmark(coefficient_dict):
    """Print benchmark-scale membership results for one agreement coefficient.

    Args:
        coefficient_dict: Result mapping produced by an irrCAC coefficient
            method. Only ``coefficient_dict["est"]["coefficient_value"]`` and
            ``coefficient_dict["est"]["se"]`` are read.

    Returns:
        None. For each of the Altman, Landis-Koch and Fleiss scales a heading,
        the value returned by the matching ``Benchmark`` method, and a blank
        line are written to standard output.
    """
    estimate = coefficient_dict["est"]
    scorer = Benchmark(coeff=estimate["coefficient_value"], se=estimate["se"])

    # Heading text paired with the Benchmark method that evaluates that scale;
    # the order here is the order of the printed report.
    scale_reports = (
        ("Altman Scale", scorer.altman),
        ("Landis Koch Scale", scorer.landis_koch),
        ("Fleiss Scale", scorer.fleiss),
    )
    for heading, interpret in scale_reports:
        print(heading)
        print(interpret())
        print()

def inter_rater_evaluation(dataframe):
    """Print AC1 and Fleiss' Kappa agreement statistics with their benchmarks.

    Args:
        dataframe: Ratings table handed unchanged to ``CAC``.

    Returns:
        None. The ``CAC`` object is printed first; then, for ``cac.gwet()`` and
        ``cac.fleiss()`` in that order, a heading, the raw result, and the
        ``benchmark`` report for that result are written to standard output.
    """
    cac = CAC(dataframe)
    print(cac)

    # NOTE(review): the second heading equates two-rater Fleiss' Kappa with
    # Cohen's Kappa; the two-rater case is usually described as Scott's pi
    # instead. Confirm the wording before relying on the label.
    statistics = (
        ("AC1 statistic", cac.gwet),
        ("Fleiss Kappa (if only 2 annotators - Cohen's Kappa)", cac.fleiss),
    )
    for heading, compute in statistics:
        print(heading)
        result = compute()
        print(result)
        benchmark(result)

    

In [3]:
# Read the annotation CSV and discard the "Model ID" and "Cluster ID" columns so
# that only the per-annotator coherence columns shown in the output below remain.
annotations_df = pd.read_csv(ANNOTATION_CSV_PATH).drop(columns=["Model ID", "Cluster ID"])
display(annotations_df)

Unnamed: 0,Ryan Coherence,Rebecca Coherence,Ethan Coherence
0,1,1,1
1,1,1,1
2,1,1,0
3,0,1,0
4,1,1,0
...,...,...,...
195,1,1,1
196,1,1,1
197,1,1,1
198,1,1,1


In [4]:
# Agreement statistics computed over every annotator column of the frame at once.
inter_rater_evaluation(annotations_df)

<irrCAC.raw.CAC Subjects: 200, Raters: 3, Categories: [0, 1], Weights: "identity">
AC1 statistic
{'est': {'coefficient_value': 0.8503, 'coefficient_name': 'AC1', 'confidence_interval': (0.79906, 0.90154), 'p_value': 0.0, 'z': 32.72321, 'se': 0.02598, 'pa': 0.88, 'pe': 0.19839}, 'weights': array([[1., 0.],
       [0., 1.]]), 'categories': [0, 1]}
Altman Scale
{'scale': [(0.8, 1.0), (0.6, 0.8), (0.4, 0.6), (0.2, 0.4), (-1.0, 0.2)], 'Altman': ['Very Good', 'Good', 'Moderate', 'Fair', 'Poor'], 'CumProb': [0.97357, 1.0, 1.0, 1.0, 1.0]}

Landis Koch Scale
{'scale': [(0.8, 1.0), (0.6, 0.8), (0.4, 0.6), (0.2, 0.4), (0.0, 0.2), (-1.0, 0.0)], 'Landis-Koch': ['Almost Perfect', 'Substantial', 'Moderate', 'Fair', 'Slight', 'Poor'], 'CumProb': [0.97357, 1.0, 1.0, 1.0, 1.0, 1.0]}

Fleiss Scale
{'scale': [(0.75, 1.0), (0.4, 0.75), (-1.0, 0.4)], 'Fleiss': ['Excellent', 'Intermediate to Good', 'Poor'], 'CumProb': [0.99994, 1.0, 1.0]}

Fleiss Kappa (if only 2 annotators - Cohen's Kappa)
{'est': {'coeffic

In [5]:
# Repeat the agreement analysis for each pair of annotators, restricting the
# frame to the two columns being compared.
rebecca = "Rebecca Coherence"
ryan = "Ryan Coherence"
ethan = "Ethan Coherence"

for a1, a2 in [
    (ryan, rebecca),
    (ryan, ethan),
    (rebecca, ethan),
]:
    print(f"{a1} / {a2}")
    inter_rater_evaluation(annotations_df[[a1, a2]])
    # Two blank lines separate consecutive pair reports.
    print("\n")
    



Ryan Coherence / Rebecca Coherence
<irrCAC.raw.CAC Subjects: 200, Raters: 2, Categories: [0, 1], Weights: "identity">
AC1 statistic
{'est': {'coefficient_value': 0.89342, 'coefficient_name': 'AC1', 'confidence_interval': (0.84191, 0.94494), 'p_value': 0.0, 'z': 34.19921, 'se': 0.02612, 'pa': 0.91, 'pe': 0.15555}, 'weights': array([[1., 0.],
       [0., 1.]]), 'categories': [0, 1]}
Altman Scale
{'scale': [(0.8, 1.0), (0.6, 0.8), (0.4, 0.6), (0.2, 0.4), (-1.0, 0.2)], 'Altman': ['Very Good', 'Good', 'Moderate', 'Fair', 'Poor'], 'CumProb': [0.99983, 1.0, 1.0, 1.0, 1.0]}

Landis Koch Scale
{'scale': [(0.8, 1.0), (0.6, 0.8), (0.4, 0.6), (0.2, 0.4), (0.0, 0.2), (-1.0, 0.0)], 'Landis-Koch': ['Almost Perfect', 'Substantial', 'Moderate', 'Fair', 'Slight', 'Poor'], 'CumProb': [0.99983, 1.0, 1.0, 1.0, 1.0, 1.0]}

Fleiss Scale
{'scale': [(0.75, 1.0), (0.4, 0.75), (-1.0, 0.4)], 'Fleiss': ['Excellent', 'Intermediate to Good', 'Poor'], 'CumProb': [1.0, 1.0, 1.0]}

Fleiss Kappa (if only 2 annotators - 