# Offensive language annotation campaign analysis

* sklearn Cohen kappa: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html#sklearn.metrics.cohen_kappa_score
* Comparison of kappa statistics: https://towardsdatascience.com/inter-rater-agreement-kappas-69cd8b91ff75

Cohen's Kappa interpretation:
![](kappa-interpretation.png)

# Script - imports and functions

In [1]:
# IMPORTS
import json
import glob
import re
import pandas as pd
from sklearn import metrics as sk_metrics

import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_documents(project_path):
    """Read the exported project JSON and list its source documents.

    Args:
        project_path: Directory of the exported project. The export file
            name is fixed (``exportedproject18042961325071563346.json``).

    Returns:
        A list of ``(name, state)`` tuples, one per entry of the
        ``"source_documents"`` array, in file order.

    Raises:
        OSError: If the export file cannot be opened.
        KeyError: If ``"source_documents"``, ``"name"`` or ``"state"`` is
            missing from the JSON.
    """
    project_json_file = project_path + "/exportedproject18042961325071563346.json"
    # Context manager closes the handle; the export is read as UTF-8 rather
    # than with the platform default encoding.
    with open(project_json_file, encoding="utf-8") as project_file:
        project_data = json.load(project_file)

    return [
        (document["name"], document["state"])
        for document in project_data["source_documents"]
    ]

In [3]:
def select_document_names_by_type(documents, status_types):
    """Return the names of the documents whose state is in ``status_types``.

    ``documents`` holds ``(name, state)`` entries. The number of selected
    documents is printed before the list is returned.
    """
    matching = filter(lambda entry: entry[1] in status_types, documents)
    selected_names = [entry[0] for entry in matching]
    print(f"Number of documents: {len(selected_names)}")
    return selected_names

In [4]:
# Prepare annotation objects
def _normalize_annotation(annotation):
    """Normalise one raw annotation cell into a canonical label string.

    "_" becomes "None", bracketed annotation indexes are removed, "*" becomes
    "None", "None" entries are dropped from multi-valued ("|"-separated)
    cells, and the remaining values are sorted.
    """
    if annotation == "_":
        annotation = "None"
    annotation = re.sub(r"\[.*?\]", "", annotation) # remove annotation indexes
    annotation = annotation.replace("*", "None")
    annotation = annotation.replace("|None|", "|")
    annotation = annotation.replace("None|", "")
    annotation = annotation.replace("|None", "")
    # sort annotations if multiple present
    return "|".join(sorted(annotation.split("|")))


def annotations_from_tsv_file(filename, annotation_category):
    """Extract one label per sentence block from a WebAnno TSV file.

    The second line of the file should look as follows:
    ``#T_SP=webanno.custom.Offensivelanguage|Aspect|Aspect2|Aspect3|Category|Expressiveness|Figurative|Targetlevel|Targettype``
    The position of ``annotation_category`` in that header selects which
    column is read. Only the first token line of each sentence block is
    inspected.

    Args:
        filename: Path of the TSV file to read.
        annotation_category: Feature name as it appears in the header.
            Surrounding whitespace is ignored, so the legacy spelling with a
            trailing newline for the last feature is still accepted.

    Returns:
        A list with one normalised label string per sentence block. Every
        entry is "None" when the second line is not the expected header
        (document contains no annotations).

    Raises:
        ValueError: If the header is present but does not list
            ``annotation_category``.
    """
    with open(filename, encoding="utf-8") as tsv_file:
        lines = tsv_file.readlines()

    # Strip the line terminator so the last feature name compares cleanly.
    tsv_header = lines[1].rstrip("\n").split("|") if len(lines) > 1 else []
    if tsv_header and tsv_header[0] == "#T_SP=webanno.custom.Offensivelanguage":
        annotation_index = tsv_header.index(annotation_category.strip()) - 1
    else:
        # Document contains no annotations
        annotation_index = None

    # Blank lines delimit sentence blocks; the end of file closes the last one.
    boundaries = [i for i, line in enumerate(lines) if line == "\n"]
    boundaries.append(len(lines))

    # Consecutive blank lines (beginning of document) enclose nothing; skip them.
    sentences = [
        lines[start + 1:end]
        for start, end in zip(boundaries, boundaries[1:])
        if end - start > 1
    ]

    annotations = []
    for sentence in sentences:
        if annotation_index is None:
            annotations.append("None")
            continue
        # First sentence element is always text, so sentence[1] is the first
        # token line; feature columns start at tab-separated field 3.
        fields = sentence[1].rstrip("\n").split("\t")
        annotations.append(_normalize_annotation(fields[3 + annotation_index]))

    return annotations

In [5]:
# folder can be annotation or curation
def get_annotation_data_frame(document_names, project_path, folder = "annotation"):
    """Collect the per-sentence annotations of every annotator file.

    For each document, every ``*.tsv`` file found under
    ``{project_path}/{folder}/{document_name}/`` yields one row.

    Args:
        document_names: Names of the documents to load.
        project_path: Root directory of the exported project.
        folder: Sub-folder to read from; can be "annotation" or "curation".

    Returns:
        A DataFrame with the columns ``Document``, ``Annotator`` (the TSV
        file name) and one column per annotation category. Each category
        cell holds the list returned by ``annotations_from_tsv_file``.
    """
    import os  # function-scope import: `os` is not among the file's imports

    # Possible annotation categories:
    # Aspect|Aspect2|Aspect3|Category|Expressiveness|Figurative|Targetlevel|Targettype
    # The last name keeps its trailing "\n" so that it matches the raw header
    # line, whose final feature carries the line terminator.
    annotation_categories = ["Aspect","Aspect2","Aspect3","Category","Expressiveness","Figurative","Targetlevel","Targettype\n"]

    all_annotations = []
    # each row looks like [document_name, annotator_name, annotations per category...]
    for document_name in document_names:
        # Escape the directory part so document names containing glob
        # metacharacters are matched literally.
        directory = glob.escape(f"{project_path}/{folder}/{document_name}")
        # glob order is arbitrary; sorting keeps the annotators of a document
        # in the same relative order for every document.
        annotator_files = sorted(glob.glob(f"{directory}/*.tsv"))

        for annotator_file in annotator_files:
            # basename copes with whichever path separator glob produced
            annotator = os.path.basename(annotator_file)
            annotations = [document_name, annotator]
            for annotation_category in annotation_categories:
                annotations.append(annotations_from_tsv_file(annotator_file, annotation_category))
            all_annotations.append(annotations)

    columns = ['Document', 'Annotator'] + [category.strip() for category in annotation_categories]
    return pd.DataFrame(all_annotations, columns=columns)

In [6]:
def get_annotator_groups_per_document(df, verbose = True):
    """Return the distinct annotator combinations found across documents.

    The ``Annotator`` values of each ``Document`` are sorted and stored as a
    tuple; the result is the set of these tuples. With ``verbose`` set, the
    set is also printed.
    """
    annotators_by_document = df.groupby("Document")["Annotator"].apply(list)
    groups = {tuple(sorted(names)) for names in annotators_by_document}
    if verbose:
        print(f"\nGroups: \n\t{groups}\n")
    return groups

In [7]:
def get_cohen_kappa_score(df, annotation_category, verbose = True):
    """Compute Cohen's kappa between the two label sequences of each document.

    The rows of ``df`` are grouped by ``Document``; every document must have
    exactly two rows. The first row's labels are pooled into one sequence and
    the second row's labels into another, and kappa is computed over the two
    pooled sequences.

    Args:
        df: DataFrame with ``Document`` and ``Annotator`` columns and a
            column named ``annotation_category`` whose cells are lists of
            label strings.
        annotation_category: Name of the column to evaluate.
        verbose: When true, print the label set, the annotator groups, the
            number of compared sentences and the label mapping.

    Returns:
        A tuple ``(kappa, number_of_compared_sentences)``.

    Raises:
        ValueError: If a document does not have exactly two rows, or if its
            two label lists differ in length.
    """
    # get all annotation labels
    annotation_labels = set()
    for annotation_data in df[annotation_category]:
        annotation_labels.update(annotation_data)

    if verbose:
        print(f"All read different annotation labels: \n\t{annotation_labels}\n")
        # Called for its printout only; the groups are not needed below.
        get_annotator_groups_per_document(df, verbose)

    # get annotation pair lists
    annotations_one = []
    annotations_two = []
    per_document = df.groupby("Document")[annotation_category].apply(list)
    for document_name, annotations_pair in per_document.items():
        if len(annotations_pair) != 2:
            raise ValueError(
                f"Expected exactly two annotators for document '{document_name}', "
                f"found {len(annotations_pair)}!"
            )
        first, second = annotations_pair
        # Check alignment per document: pooled totals could match by
        # coincidence while individual documents are misaligned.
        if len(first) != len(second):
            raise ValueError(
                f"Document '{document_name}' has {len(first)} and {len(second)} "
                f"annotated sentences for its two annotators!"
            )
        annotations_one.extend(first)
        annotations_two.extend(second)

    if verbose:
        print(f"We are comparing {len(annotations_one)} annotated sentences.")

    # change label values (None, 0, 1, 2, 3, ...); sorting makes the ids
    # reproducible instead of depending on set iteration order
    label_mapping = {"None": -1}
    for label in sorted(annotation_labels - {"None"}):
        label_mapping[label] = len(label_mapping) - 1
    if verbose:
        print(f"Mapping table: {label_mapping}")

    # calculate IAA
    annotations_iaa_a = [label_mapping[label] for label in annotations_one]
    annotations_iaa_b = [label_mapping[label] for label in annotations_two]
    cohens = sk_metrics.cohen_kappa_score(annotations_iaa_a, annotations_iaa_b)
    return cohens, len(annotations_one)

# Analysis by type

## Inter-annotator agreement between annotators (in general)

In [8]:
# PARAMETERS
project_path = "Offensive+language+annotation+REAL_project_2022-03-09_0806"
# Possible statuses: {'CURATION_IN_PROGRESS', 'NEW', 'ANNOTATION_IN_PROGRESS', 'CURATION_FINISHED', 'ANNOTATION_FINISHED'}
status_types = ['ANNOTATION_FINISHED']

# Load the annotations of every document whose annotation is finished.
documents = get_documents(project_path)
document_names = select_document_names_by_type(documents, status_types)
df = get_annotation_data_frame(
    document_names,
    project_path,
    folder="annotation",
)

# Evaluate each annotation category over all loaded documents at once.
annotation_categories = (
    "Aspect",
    "Aspect2",
    "Aspect3",
    "Category",
    "Expressiveness",
    "Figurative",
    "Targetlevel",
    "Targettype",
)
for annotation_category in annotation_categories:
    cohens, examples_num = get_cohen_kappa_score(df, annotation_category, verbose=False)
    print(f"Cohens kappa for {annotation_category:15} between annotator a and b: {cohens:.3f}, across {examples_num} examples.")

Number of documents: 567
Cohens kappa for Aspect          between annotator a and b: 0.404, across 2712 examples.
Cohens kappa for Aspect2         between annotator a and b: 0.254, across 2712 examples.
Cohens kappa for Aspect3         between annotator a and b: 0.153, across 2712 examples.
Cohens kappa for Category        between annotator a and b: 0.377, across 2712 examples.
Cohens kappa for Expressiveness  between annotator a and b: 0.592, across 2712 examples.
Cohens kappa for Figurative      between annotator a and b: 0.209, across 2712 examples.
Cohens kappa for Targetlevel     between annotator a and b: 0.534, across 2712 examples.
Cohens kappa for Targettype      between annotator a and b: 0.569, across 2712 examples.


## Inter-annotator agreement between annotators

In [9]:
# PARAMETERS
project_path = "Offensive+language+annotation+REAL_project_2022-03-09_0806"
# Possible statuses: {'CURATION_IN_PROGRESS', 'NEW', 'ANNOTATION_IN_PROGRESS', 'CURATION_FINISHED', 'ANNOTATION_FINISHED'}
status_types = ['ANNOTATION_FINISHED']

documents = get_documents(project_path)
document_names = select_document_names_by_type(documents, status_types)

df_annotators = get_annotation_data_frame(document_names, project_path, folder = "annotation")

# get annotator pairs (groups)
groups = sorted(get_annotator_groups_per_document(df_annotators, verbose = False))

for pair in groups:
    if len(pair) != 2:
        # The message reports `pair`, the value actually being checked.
        raise Exception(f"Expected exactly two annotators per document, got: '{pair}'!")

    annotator1, annotator2 = pair

    # Documents touched by each annotator; the pair is evaluated on the overlap.
    annotator1_documents = set(df_annotators.loc[df_annotators["Annotator"] == annotator1, "Document"])
    annotator2_documents = set(df_annotators.loc[df_annotators["Annotator"] == annotator2, "Document"])
    annotator_documents = annotator1_documents.intersection(annotator2_documents)

    df = df_annotators[df_annotators['Annotator'].isin([annotator1, annotator2])]
    df = df[df["Document"].isin(annotator_documents)]
    # Pin the row order so that, within every document, annotator1's labels
    # come first and annotator2's second.
    df = df.sort_values(["Document", "Annotator"])

    print(f"\n\nAnnotators {annotator1.replace('.tsv', ''):15} and {annotator2.replace('.tsv', ''):15} annotated {len(annotator_documents):3} documents:")
    for annotation_category in ["Aspect","Aspect2","Aspect3","Category","Expressiveness","Figurative","Targetlevel","Targettype"]:
        cohens, examples_num = get_cohen_kappa_score(df, annotation_category, verbose = False)
        print(f"\t Cohens kappa for {annotation_category:15} is: {cohens:.3f}, across {examples_num} examples.")

Number of documents: 567


Annotators agnieszka_borowiak@wp.pl and ana.ostroski@gmail.com annotated   7 documents:
	 Cohens kappa for Aspect          is: 0.523, across 54 examples.
	 Cohens kappa for Aspect2         is: nan, across 54 examples.
	 Cohens kappa for Aspect3         is: nan, across 54 examples.
	 Cohens kappa for Category        is: 0.222, across 54 examples.
	 Cohens kappa for Expressiveness  is: 0.274, across 54 examples.
	 Cohens kappa for Figurative      is: 0.000, across 54 examples.
	 Cohens kappa for Targetlevel     is: 0.523, across 54 examples.
	 Cohens kappa for Targettype      is: 0.523, across 54 examples.


Annotators agnieszka_borowiak@wp.pl and ibrac@ihjj.hr   annotated  16 documents:
	 Cohens kappa for Aspect          is: 0.407, across 153 examples.
	 Cohens kappa for Aspect2         is: 0.072, across 153 examples.
	 Cohens kappa for Aspect3         is: 0.000, across 153 examples.
	 Cohens kappa for Category        is: 0.302, across 153 examples.
	 Cohens k

## Inter-annotator agreement between an annotator and curator 

In [10]:
# PARAMETERS
project_path = "Offensive+language+annotation+REAL_project_2022-03-09_0806"
# Possible statuses: {'CURATION_IN_PROGRESS', 'NEW', 'ANNOTATION_IN_PROGRESS', 'CURATION_FINISHED', 'ANNOTATION_FINISHED'}
status_types = ['CURATION_FINISHED']

documents = get_documents(project_path)
document_names = select_document_names_by_type(documents, status_types)

# Annotator rows come first and curator rows second in the merged frame.
df_annotators = get_annotation_data_frame(document_names, project_path, folder="annotation")
df_curator = get_annotation_data_frame(document_names, project_path, folder="curation")
df_merged = pd.concat([df_annotators, df_curator], ignore_index=True)

# get annotator list
annotators = sorted(set(df_annotators["Annotator"]))

# eval each annotator to a curator
for annotator in annotators:
    # Documents this annotator worked on.
    is_this_annotator = df_annotators["Annotator"] == annotator
    annotator_documents = set(df_annotators[is_this_annotator]["Document"])

    # Keep this annotator's rows and the curator's rows for those documents.
    is_annotator_or_curator = df_merged["Annotator"].isin([annotator, "CURATION_USER.tsv"])
    df = df_merged[is_annotator_or_curator]
    df = df[df["Document"].isin(annotator_documents)]

    print(f"\n\nAnnotator {annotator.replace('.tsv', ''):15} annotated {len(annotator_documents):3} documents:")
    for annotation_category in ("Aspect", "Aspect2", "Aspect3", "Category", "Expressiveness", "Figurative", "Targetlevel", "Targettype"):
        cohens, examples_num = get_cohen_kappa_score(df, annotation_category, verbose=False)
        print(f"\t Cohens kappa for {annotation_category:15} is: {cohens:.3f}, across {examples_num} examples.")


Number of documents: 113


Annotator agnieszka_borowiak@wp.pl annotated   2 documents:
	 Cohens kappa for Aspect          is: 0.118, across 15 examples.
	 Cohens kappa for Aspect2         is: nan, across 15 examples.
	 Cohens kappa for Aspect3         is: nan, across 15 examples.
	 Cohens kappa for Category        is: 0.217, across 15 examples.
	 Cohens kappa for Expressiveness  is: 0.211, across 15 examples.
	 Cohens kappa for Figurative      is: 0.217, across 15 examples.
	 Cohens kappa for Targetlevel     is: 0.132, across 15 examples.
	 Cohens kappa for Targettype      is: 0.132, across 15 examples.


Annotator ana.ostroski@gmail.com annotated  34 documents:
	 Cohens kappa for Aspect          is: 0.681, across 399 examples.
	 Cohens kappa for Aspect2         is: 0.620, across 399 examples.
	 Cohens kappa for Aspect3         is: 0.360, across 399 examples.
	 Cohens kappa for Category        is: 0.705, across 399 examples.
	 Cohens kappa for Expressiveness  is: 0.721, across 399 exam

Discussion questions:
   
* Multiple labels on the same sentence - how should they be taken into account?
* Does the curator's impression align with the results above?
* Are there documents that could be removed to get better results?
  * There is not much data available to be truly representative - only 500 documents annotated and only 113 curated (as of March 10, 2022).