#### Annotation comparison with human annotators

In [79]:
import pandas as pd
from functools import reduce
from statsmodels.stats import inter_rater as irr

In [None]:
# load csvs
tom = pd.read_csv("tom.csv")
thomas = pd.read_csv("thomas.csv")
gaz = pd.read_csv("gareths_annotation.csv")
jobayer = pd.read_csv("Jobayer.csv")
truth = pd.read_csv("full_sample.csv")

In [None]:
# concatenate csvs and index the key
truth["gender_tom"] = tom["gender"]
truth["gender_gareth"] = gaz["gender"]
truth["gender_thomas"] = thomas["gender"]
truth["gender_jobayer"] = jobayer["gender"]
truth = truth.set_index("primary_author_name")

In [None]:
# format for consistent input between csvs
truth.columns = (
            truth.columns.str.strip()
            .str.lower()
        )
for column_name in truth.columns:
    truth[column_name] = truth[column_name].astype(str).str.strip().str.lower()

In [90]:
truth

Unnamed: 0_level_0,gender_anno,gender_tom,gender_gareth,gender_thomas,gender_jobayer
primary_author_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Jean Jacques Vanden Eynde,male,male,male,male,male
Akash Patnaik,male,male,unknown,male,male
Li‐Yuan Yu‐Lee,female,unknown,unknown,unknown,unknown
Jens E. Olesen,male,male,male,male,male
A Boyle,unknown,unknown,unknown,unknown,unknown
...,...,...,...,...,...
Maurizio Piergiovanni,male,male,male,male,male
Marcel Schilling,male,male,male,male,male
George Moussa,male,male,male,male,male
J. Mimila‐Arroyo,unknown,unknown,unknown,unknown,unknown


In [None]:
# categorize columns
human_cols = ['gender_tom', 'gender_gareth', 'gender_thomas', "gender_jobayer"]
machine_col = "gender_anno"
categories = ['male', 'female', 'unknown']

In [None]:
# calculate fleiss kappa between human annotators
df_human = truth[human_cols]
agg_data, categories_used = irr.aggregate_raters(df_human[human_cols].values)
kappa_value = irr.fleiss_kappa(agg_data)
print(f"Fleiss Kappa for 4 annotators is {kappa_value:.3f}")

Fleiss Kappa for 4 annotators is 0.788


In [108]:
# find subset of disagreements between humans and machine
truth['human_consensus'] = truth[human_cols].mode(axis=1)[0] 
truth['is_disagreement'] = (truth[machine_col] != truth['human_consensus'])
df_disagreements = truth[truth['is_disagreement']].copy()


In [110]:
df_disagreements

Unnamed: 0_level_0,gender_anno,gender_tom,gender_gareth,gender_thomas,gender_jobayer,human_consensus,is_disagreement
primary_author_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Li‐Yuan Yu‐Lee,female,unknown,unknown,unknown,unknown,unknown,True
Pentti Karioja,male,female,unknown,female,female,female,True
Nasreddine Sakhri,male,female,unknown,female,female,female,True
Sreerupa Sengupta,unknown,male,female,female,female,female,True
Gail L. Daumit,female,male,female,male,male,male,True
I. Michael Wormstone,unknown,male,male,unknown,unknown,male,True
Andrea Louise Campbell,male,female,female,female,female,female,True
Andrea Farolfi,male,female,female,female,female,female,True
Jing Chen,female,unknown,male,unknown,unknown,unknown,True
Yu‐Pin Lin,female,unknown,unknown,unknown,unknown,unknown,True


In [None]:
# calculate machine agreement with mode of human annotations
agreement_machine_mode = (
    (truth[machine_col] == truth['human_consensus']) 
).mean()

print(f"Machine Agreement with mode of annotators: {agreement_machine_mode:.4f}")

Machine Agreement with mode of annotators: 0.8600


#### Conclusions

Human annotators reached a high level of consencus with a Fleiss Kappa of 79%

Taking the consensus human result as ground truth, the pipeline had an 86% agreement rate with human annotators.

The largest source of disagreement was Chinese names, with 5 out of 14