# Notebook: Create Confirmation Subsets

## Packages

In [14]:
import pandas as pd
import json
import os

## Settings

In [15]:
# Number of contiguous parts the annotated rows are split into; one CSV file is written per part.
N_PARTS = 1

In [16]:
# Path prefix of the exported CSV files; "_<part index>.csv" is appended for each part.
OUTPUT_PATH_PREFIX = "annotation_datasets/annotation_jakob"

## Code

In [17]:
# Folder with the CSV files to merge (judging by its name, presumably the first
# annotator's Label Studio exports -- TODO confirm).
input_folder = 'annotation_datasets/synth_annotation_nils_labelstudio_output'

# os.listdir() returns entries in arbitrary order. Sort the file names so that the
# concatenated row order -- and therefore which rows end up in which exported part --
# is reproducible across runs and machines.
all_files = sorted(file for file in os.listdir(input_folder) if file.endswith('.csv'))

# Stack all files into one table with a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(os.path.join(input_folder, file)) for file in all_files], ignore_index=True)

In [18]:
# Display the concatenated annotation table for a quick visual check.
df

Unnamed: 0.1,agreement,annotation_id,annotator,aspect_available_without_judgement,created_at,few_shot_condtion,id,label-explicit,label-implicit,lead_time,model,split,text,two_or_more_sentences,updated_at,Unnamed: 0
0,100.0,29386319,nils-constantin.hellwig@stud.uni-regensburg.de,,2024-01-17T12:39:00.806330Z,random,a49a6f01-1ecc-4da0-b76b-f283f518fc60,,"[{""end"":25,""text"":""waren sehr zufrieden."",""sta...",14.011,Llama70B,0,Wir waren sehr zufrieden.,,2024-01-17T12:39:00.806355Z,
1,100.0,29386342,nils-constantin.hellwig@stud.uni-regensburg.de,,2024-01-17T12:39:20.055071Z,random,d6c3769c-e06a-4b05-8ccf-e1338af83856,"[{""end"":9,""text"":""Essen"",""start"":4,""labels"":[""...",,18.298,Llama70B,0,"Das Essen war gut, aber der Service leider nicht.",,2024-01-17T12:39:20.055089Z,
2,100.0,29386363,nils-constantin.hellwig@stud.uni-regensburg.de,,2024-01-17T12:39:39.528689Z,random,3772b851-5f3f-49a4-b456-dd1e8878abf4,,"[{""end"":37,""text"":""was man bekommt"",""start"":22...",18.566,Llama70B,0,Viel zu teuer für das was man bekommt.,,2024-01-17T12:39:39.528712Z,
3,100.0,29386372,nils-constantin.hellwig@stud.uni-regensburg.de,,2024-01-17T12:39:50.874016Z,random,011243dc-960f-4c73-8d8c-3c6238011f63,,,10.431,Llama70B,0,Wir waren schon öfters hier.,,2024-01-17T12:39:50.874034Z,
4,100.0,29386385,nils-constantin.hellwig@stud.uni-regensburg.de,,2024-01-17T12:40:02.602344Z,random,63a89c9b-458a-466b-89c3-462b9f1b1833,"[{""end"":14,""text"":""Restaurant"",""start"":4,""labe...",,10.857,Llama70B,0,Das Restaurant ist sehr empfehlenswert.,,2024-01-17T12:40:02.602362Z,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,100.0,29573613,nils-constantin.hellwig@stud.uni-regensburg.de,,2024-01-21T09:37:35.854793Z,fixed,21740dd7-85c5-4c34-81a9-cf2bfb211e1a,"[{""end"":57,""text"":""Atmosphäre"",""start"":47,""lab...",,14.658,GPT-3,4,Das Restaurant hat eine eher durchschnittliche...,,2024-01-21T09:37:35.854811Z,595.0
2396,100.0,29558855,nils-constantin.hellwig@stud.uni-regensburg.de,,2024-01-20T13:00:18.530092Z,fixed,9ba1ca94-3bf6-4ca7-b7a5-aafa6a351242,"[{""end"":9,""text"":""Essen"",""start"":4,""labels"":[""...",,2.041,GPT-3,4,Das Essen war köstlich.,,2024-01-20T13:00:18.530111Z,596.0
2397,100.0,29573619,nils-constantin.hellwig@stud.uni-regensburg.de,,2024-01-21T09:37:42.318170Z,fixed,a7807724-4791-495c-b9b2-3d2feb9b61bd,"[{""end"":14,""text"":""Restaurant"",""start"":4,""labe...",,5.416,GPT-3,4,Das Restaurant hinterließ einen durchschnittli...,,2024-01-21T09:37:42.318186Z,597.0
2398,100.0,29558878,nils-constantin.hellwig@stud.uni-regensburg.de,,2024-01-20T13:01:24.704569Z,fixed,29989208-4a1b-409a-9e49-8cc3f1401f3f,"[{""end"":9,""text"":""Essen"",""start"":4,""labels"":[""...",,2.549,GPT-3,4,Das Essen war in Ordnung.,,2024-01-20T13:01:24.704594Z,598.0


In [19]:
# Add an (initially empty) column that, by its name, is meant for the second annotator's comments.
# merge_labels_implicit_to_explicit later sets this column to "" for every row.
df["second_annotator_comment"] = None
# List the distinct values of the multi-sentence flag column.
df["two_or_more_sentences"].unique()

array([nan, 'Two or more sentences'], dtype=object)

In [20]:
def merge_labels_implicit_to_explicit(dataframe):
    """Merge implicit and explicit labels into one readable ``annotation`` column.

    Every annotation is rendered as one line containing the ``str()`` of a
    3-tuple: the label split at its last ``-`` into two parts (presumably
    aspect category and sentiment polarity -- TODO confirm), followed by the
    annotated phrase. Implicit annotations are listed first and use ``'NULL'``
    as the phrase; explicit annotations follow and use their ``text`` value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``label-explicit``, ``label-implicit``,
        ``aspect_available_without_judgement`` and ``two_or_more_sentences``.
        Label cells hold a JSON-encoded list of objects with a ``labels`` list
        (and, for explicit labels, a ``text`` entry). Any non-string cell
        (e.g. NaN) is treated as "no annotations".

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the same rows and index and

        * ``annotation``: the merged label lines, or a warning text when the
          row is flagged as containing two or more sentences,
        * ``aspect_available_without_judgement``: ``"✅"`` when the flag text
          is set, otherwise ``""``,
        * ``second_annotator_comment``: ``""`` for every row.

        An empty input yields an empty frame that still has these columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    json.JSONDecodeError
        If a string label cell is not valid JSON.
    """
    implicit_suffix = "-no-phrase"

    def parse_annotations(cell):
        # Only string cells carry JSON; anything else means "nothing annotated".
        return json.loads(cell) if isinstance(cell, str) else []

    def split_label(label):
        # Split at the LAST hyphen only, so hyphens inside the first part survive.
        # Indexing with [0] / [-1] puts a hyphen-free label into both slots.
        parts = label.rsplit('-', 1)
        return parts[0], parts[-1]

    def format_annotations(explicit_cell, implicit_cell):
        explicit_annotations = parse_annotations(explicit_cell)
        implicit_annotations = parse_annotations(implicit_cell)

        entries = []
        for annotation in implicit_annotations:
            label = annotation["labels"][0]
            # Remove the marker only when it is really present; blindly cutting
            # off len(suffix) characters would corrupt labels that lack it.
            if label.endswith(implicit_suffix):
                label = label[:-len(implicit_suffix)]
            entries.append(split_label(label) + ('NULL',))

        for annotation in explicit_annotations:
            entries.append(split_label(annotation["labels"][0]) + (annotation["text"],))

        return "".join(str(entry) + "\n" for entry in entries)

    annotations = []
    for explicit_cell, implicit_cell, sentence_flag in zip(
        dataframe["label-explicit"],
        dataframe["label-implicit"],
        dataframe["two_or_more_sentences"],
    ):
        merged = format_annotations(explicit_cell, implicit_cell)
        # Special case: rows flagged as multi-sentence show a warning instead of labels.
        if sentence_flag == "Two or more sentences":
            merged = "⚠️ Mehr als 1 Satz im Text identifiziert von Annoator A"
        annotations.append(merged)

    # Special case: the flag text is misleading. Per the original author, it marks
    # examples that mention an aspect towards which the text's author expresses
    # no sentiment.
    judgement_flags = [
        "✅" if value == 'An aspect is addressed in the text without an explicit mention by the author' else ""
        for value in dataframe["aspect_available_without_judgement"]
    ]

    # Build the result with assign() instead of mutating rows inside
    # DataFrame.apply (unsupported by pandas). This also guarantees that the
    # new columns exist when the input frame is empty.
    return dataframe.assign(**{
        "annotation": annotations,
        "aspect_available_without_judgement": judgement_flags,
        "second_annotator_comment": "",
    })

In [21]:
# Columns handed to the second annotator, in export order.
export_columns = [
    "id",
    "second_annotator_comment",
    "annotation",
    "aspect_available_without_judgement",
    "text",
    "model",
    "few_shot_condtion",
    "split",
]

# Cut the table into N_PARTS contiguous chunks of equal size.
total_rows = len(df)
rows_per_part = total_rows // N_PARTS

for idx in range(N_PARTS):
    start_idx = rows_per_part * idx
    # The final chunk also takes the rows left over by the integer division.
    end_idx = total_rows if idx >= N_PARTS - 1 else start_idx + rows_per_part

    filename = f"{OUTPUT_PATH_PREFIX}_{idx}.csv"

    # Merge the labels for this chunk, then keep only the export columns.
    part_df = merge_labels_implicit_to_explicit(df.iloc[start_idx:end_idx])[export_columns]
    part_df.to_csv(filename, index=False)

    print(f"Part {idx} saved to '{filename}'.")

Part 0 saved to 'annotation_datasets/annotation_jakob_0.csv'.
