In [1]:
import pandas as pd
import stanza
from googletrans import Translator
import os

Create the annotation files

In [17]:
# Build one annotation sheet per (language, term, annotator).
# For every language the articles are split into sentences with stanza, each
# sentence that contains one of the terms is collected, translated to English,
# and written to ./data/annotations/annotationsheet_<language>_<term>_a<i>.csv.
translator = Translator()

# Make sure to be very clear on how you determined the annotation terms.
# You can also use ngrams, e.g. "vegan diet" instead of "diet"
terms = ["Trump", "trump", "Donald"]
num_annotators = 2

# to_csv does not create missing folders, so make sure the target exists first.
annotation_dir = "./data/annotations"
os.makedirs(annotation_dir, exist_ok=True)

for language in ["nl", "es"]:
    # Read in the data
    article_file = "./data/" + language + "Dataset.csv"
    content = pd.read_csv(article_file)

    # Prepare the nlp pipeline
    stanza.download(language)
    nlp = stanza.Pipeline(language, processors='tokenize,pos,lemma')

    # Get all instances.
    # Here, we do a simple string comparison.
    # You might want to check for the lemma instead (depending on your research question)
    annotation_instances = {}
    # dropna(): pandas reads empty CSV cells as NaN (a float), which has no .strip()
    for article in content["text"].dropna():
        # Filter out empty articles
        if len(article.strip()) > 0:
            processed_article = nlp(article)
            for sentence in processed_article.sentences:
                for term in terms:
                    if term in sentence.text:
                        # Save all instances for each term.
                        # setdefault creates the list on first use and appends in place.
                        annotation_instances.setdefault(term, []).append(sentence.text.strip())

    for term in terms:
        # A term that never occurs in this language has no entry in the dictionary.
        instances = annotation_instances.get(term, [])
        if not instances:
            print("No instances found for term '%s' in language '%s'; no sheet written." % (term, language))
            continue

        sheet = {"Term": [], "Instance": [], "Trans": [], "Annotation": []}
        for instance in instances:
            instance = instance.replace("\n", " ")
            translation = translator.translate(instance, src=language, dest="en").text
            sheet["Term"].append(term)
            sheet["Instance"].append(instance)
            sheet["Trans"].append(translation)
            # "x" is the placeholder the annotator replaces with a label
            sheet["Annotation"].append("x")

        # Every annotator receives an identical copy of the sheet, so build it once.
        sheet_df = pd.DataFrame.from_dict(sheet)
        for i in range(1, num_annotators + 1):
            sheet_df.to_csv(annotation_dir + "/annotationsheet_" + language + "_" + term + "_a" + str(i) + ".csv")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-11-30 15:18:47 INFO: Downloading default packages for language: es (Spanish) ...
2022-11-30 15:18:48 INFO: File exists: /home/revess/stanza_resources/es/default.zip
2022-11-30 15:18:53 INFO: Finished downloading models and saved to /home/revess/stanza_resources.
2022-11-30 15:18:53 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-11-30 15:18:53 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| pos       | ancora  |
| lemma     | ancora  |

2022-11-30 15:18:53 INFO: Use device: cpu
2022-11-30 15:18:53 INFO: Loading: tokenize
2022-11-30 15:18:53 INFO: Loading: mwt
2022-11-30 15:18:53 INFO: Loading: pos
2022-11-30 15:18:53 INFO: Loading: lemma
2022-11-30 15:18:53 INFO: Done loading processors!


Calculate inter-annotator agreement and create the confusion matrices

In [64]:
import pandas as pd
import glob
import os.path
import random
from itertools import combinations
from sklearn.metrics import cohen_kappa_score, confusion_matrix
# For every language and term, load the annotation sheet of each annotator and
# report percentage agreement, Cohen's kappa and the confusion matrix for every
# pair of annotators.
terms = ["Trump","Donald"]
categories = [1, -1, 0]

# The example sheets are not annotated, so random annotations are used by default.
# Set this to False once the sheets contain real annotations.
use_random_annotations = True

# When we use the random function, we should set a fixed seed, if our results should be reproducible
random.seed(733)

for language in ["nl","es"]:
    for term in terms:
        annotations = {}

        # Read in the data.
        # sorted(): glob returns files in arbitrary order; a fixed order is needed so that
        # the seeded random annotations are assigned to the same annotator on every run.
        for sheet in sorted(glob.glob("./data/annotations/annotationsheet_" + language + "_" + term +"*.csv")):
            # splitext only strips the final extension, so extra dots in the name are harmless
            filename, extension = os.path.splitext(os.path.basename(sheet))
            # The annotator id is the last "_"-separated part of the file name.
            # Taking only that part keeps the loop variable `term` untouched and
            # also works for terms that contain an underscore.
            annotator = filename.rsplit("_", 1)[-1]

            # Read in annotations
            annotation_data = pd.read_csv(sheet, header=0)
            if use_random_annotations:
                annotations[annotator] = random.choices(categories, k=len(annotation_data))
            else:
                annotations[annotator] = annotation_data["Annotation"].tolist()

        annotators = annotations.keys()
        # Calculate agreement between pairs of annotators. The order of the pair does not matter: (a1,a2) is the same as (a2,a1)
        # With two annotators, you only have a single pair.
        # If you have three annotators, you need to compare (a1,a2) (a1,a3) (a2,a3)

        for annotator_a, annotator_b in combinations(annotators, 2):
            agreement = [anno1 == anno2 for anno1, anno2 in  zip(annotations[annotator_a], annotations[annotator_b])]
            # Label the result so the printed blocks can be told apart
            print("Language: %s, Term: %s" % (language, term))
            print(annotator_a, annotator_b)
            if not agreement:
                # An empty sheet would otherwise cause a division by zero below
                print("No annotations to compare.")
                continue
            percentage = sum(agreement)/len(agreement)
            print("Percentage Agreement: %.2f" %percentage)
            kappa = cohen_kappa_score(annotations[annotator_a], annotations[annotator_b], labels=categories)
            print("Cohen's Kappa: %.2f" %kappa)
            confusions = confusion_matrix(annotations[annotator_a], annotations[annotator_b], labels=categories)
            print(confusions)

a2 a1
Percentage Agreement: 0.31
Cohen's Kappa: -0.03
[[36 41 49]
 [44 39 32]
 [42 41 39]]
a2 a1
Percentage Agreement: 0.57
Cohen's Kappa: 0.35
[[3 0 3]
 [0 5 3]
 [1 2 4]]
a2 a1
Percentage Agreement: 0.35
Cohen's Kappa: 0.03
[[50 41 49]
 [38 55 53]
 [52 40 45]]
a1 a2
Percentage Agreement: 0.34
Cohen's Kappa: 0.01
[[12 10 11]
 [ 6  8 12]
 [13 10 12]]
