In [None]:
import json
from collections import defaultdict

# Local import: this block needs `os` to create the output directory, and the
# cell's own import lines only provide `json` and `defaultdict`.
import os

# Define valid categories and polarities (with English mapping)
valid_categories = {"essen", "preis", "ambiente", "gesamteindruck", "service"}
valid_polarities_map = {
    "negativ": "negative",
    "positiv": "positive",
    "neutral": "neutral",
    "konflikt": "conflict",
    "NULL": "NULL"
}
# English polarity values accepted by the validation pass below; derived from
# the map so the two definitions cannot drift apart.
valid_polarities = set(valid_polarities_map.values())

# One JSONL file per annotator is written into this directory. Create it up
# front so the cell also runs on a fresh checkout.
output_dir = "results_single_acsa_crowd"
os.makedirs(output_dir, exist_ok=True)

# Load JSON file
with open("../../11_annotations/crowd/acsa/crowd_acsa_raw.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# annotator id (the "completed_by" field) -> list of that annotator's entries
annotator_data = defaultdict(list)

# Process data
for item in data:
    task_data = item.get("data", {})
    task_id = task_data.get("id")
    # Fall back to the task id when the export carries no separate original id.
    original_id = task_data.get("original_id", task_id)
    text = task_data.get("text", "")

    for ann in item.get("annotations", []):
        completed_by = ann.get("completed_by")
        labels_list = []

        for result in ann.get("result", []):
            from_name = result.get("from_name")
            val = result.get("value", {})

            if from_name in {"category_polarity", "gender"}:  # allow both names seen in your data
                for lbl in val.get("choices", []):
                    # Choices are split on "-" into "<category>-<polarity>".
                    parts = lbl.split("-")
                    category = parts[0].lower()
                    polarity = parts[1].lower() if len(parts) > 1 else "NULL"
                    # Translate polarity to English. Unknown values pass through
                    # unchanged and are reported by the validation below.
                    # NOTE(review): a polarity part spelled "NULL" is lower-cased
                    # to "null" first, which the map does not contain.
                    polarity = valid_polarities_map.get(polarity, polarity)
                    labels_list.append([category, polarity])

        # Annotations without any category/polarity choice are not exported.
        if labels_list:
            annotator_data[completed_by].append({
                "id": str(task_id),
                "original_id": str(original_id),
                "text": text,
                "labels": labels_list
            })

# Validation and saving
for annotator, entries in annotator_data.items():
    filename = f"{output_dir}/{annotator}.jsonl"
    with open(filename, "w", encoding="utf-8") as f:
        for entry in entries:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")

    # Checks
    # The per-annotator length check (200 entries expected) was already
    # silenced in the original cell, so no length message is printed here.
    for entry in entries:
        for lbl in entry["labels"]:
            category, polarity = lbl
            # Check category
            if category not in valid_categories:
                print(f"Category Error in annotator {annotator}, id {entry['id']}: invalid category '{category}'")
            # Check polarity
            if polarity not in valid_polarities:
                print(f"Polarity Error in annotator {annotator}, id {entry['id']}: invalid polarity '{polarity}'")

print("Processing and validation complete.")


Processing and validation complete.


In [None]:
#!/usr/bin/env python3
import json
import os
import glob
from collections import Counter, defaultdict

# Config
# NOTE: main() globs the annotator files from INPUT_DIR and rewrites them in
# place; RESULTS_DIR is only used as the destination of fill_summary.json.
RESULTS_DIR = "results_single_acsa_crowd/original"       # directory that receives fill_summary.json
INPUT_DIR = "results_single_acsa_crowd"                  # directory scanned for annotator *.jsonl files
TRAINSET_FILE = "data/trainset_1000.jsonl"               # JSONL file whose entry order defines the splits
SPLIT_SIZE = 200                      # split size (0-199,200-399,...)
NUM_SPLITS = 5                        # 1000 / 200; not referenced by the functions in this cell

def load_trainset(path):
    """Load the trainset JSONL file and index its entries by id.

    Args:
        path: Path to a JSONL file holding one JSON object per non-blank line.

    Returns:
        A tuple ``(train, id_to_index)``. ``train`` is the list of parsed
        objects in file order. ``id_to_index`` maps ``str(obj.get("id"))`` to
        the position of that object inside ``train``; if an id occurs more
        than once, the last occurrence wins.

    Raises:
        RuntimeError: If a non-blank line is not valid JSON. The message
            carries the 1-based line number.
    """
    train = []
    id_to_index = {}
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                raise RuntimeError(f"Error parsing trainset line {line_no}: {e}") from e
            # Index by position in `train`, not by raw line number: blank lines
            # are skipped above, so the two diverge after the first blank line
            # and a raw line number would point at the wrong entry/split.
            id_to_index[str(obj.get("id"))] = len(train)
            train.append(obj)
    return train, id_to_index

def load_jsonl(path):
    """Parse a JSONL file into a list of objects.

    Blank lines are ignored. The first line that is not valid JSON aborts the
    load with a RuntimeError naming the file and the 1-based line number.
    """
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        for number, raw in enumerate(handle, start=1):
            stripped = raw.strip()
            if stripped:
                try:
                    parsed = json.loads(stripped)
                except json.JSONDecodeError as e:
                    raise RuntimeError(f"Error parsing {path} line {number}: {e}")
                records.append(parsed)
    return records

def write_jsonl(path, items):
    """Write `items` to `path` as JSONL (one JSON document per line, UTF-8).

    Non-ASCII characters are written as-is; an existing file is overwritten.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in items)

def detect_split(present_ids, id_to_index):
    """Guess which trainset split a set of ids belongs to.

    Every id found in `id_to_index` votes for the split `index // SPLIT_SIZE`.
    The split with the most votes wins; ties go to the lowest split number.

    Returns:
        (split, missing_ids), where `missing_ids` lists the ids that are not
        in `id_to_index`. If no id matched at all, the split is 0.
    """
    unknown_ids = []
    votes = Counter()
    for pid in present_ids:
        position = id_to_index.get(pid)
        if position is not None:
            votes[position // SPLIT_SIZE] += 1
        else:
            unknown_ids.append(pid)

    if len(votes) == 0:
        # No match at all -> default to split 0 and report
        return 0, unknown_ids

    # Most votes first, then the smallest split number.
    winner = min(votes, key=lambda split: (-votes[split], split))
    return winner, unknown_ids

def complete_file(file_path, trainset, id_to_index):
    """Pad one annotator file to a full split and rewrite it in trainset order.

    The split is chosen by `detect_split` from the ids present in the file.
    For every trainset entry of that split, the annotator's own entry is kept
    if present; otherwise an entry with empty labels is created from the
    trainset. Entries whose id lies outside the chosen split are dropped.
    The file at `file_path` is overwritten with the result.

    Args:
        file_path: Path of the annotator JSONL file (read, then overwritten).
        trainset: List of trainset entries, in trainset order.
        id_to_index: Mapping from str(id) to the position in `trainset`.

    Returns:
        A summary dict with the keys "file", "chosen_split", "kept_count",
        "added_count", "added_ids", "outsiders_dropped" and
        "ids_not_found_in_trainset".

    Raises:
        RuntimeError: If the chosen trainset slice does not contain exactly
            SPLIT_SIZE entries (raised before the file is overwritten).
    """
    # Load annotator file. Duplicate ids collapse to the last occurrence.
    items = load_jsonl(file_path)
    present_map = {str(item.get("id")): item for item in items}
    present_ids = list(present_map.keys())

    best_split, not_found_in_train = detect_split(present_ids, id_to_index)

    start = best_split * SPLIT_SIZE
    end = start + SPLIT_SIZE
    split_train = trainset[start:end]

    # Built once as a set: it is only used for the membership test that
    # determines the outsiders below.
    train_ids_in_split = {str(x.get("id")) for x in split_train}

    # Build final list in trainset order: either existing annotator entry or filled entry
    final_entries = []
    added_ids = []
    kept_ids = []
    for ts in split_train:
        ts_id_str = str(ts.get("id"))
        if ts_id_str in present_map:
            # Keep annotator's entry, but ensure it contains original_id and text (fill from trainset if missing)
            entry = present_map[ts_id_str]
            if "original_id" not in entry and "original_id" in ts:
                entry["original_id"] = ts.get("original_id")
            if "text" not in entry and "text" in ts:
                entry["text"] = ts.get("text")
            # An entry without a "labels" key gets an empty list so that every
            # written entry has the same set of keys.
            if "labels" not in entry:
                entry["labels"] = []
            final_entries.append(entry)
            kept_ids.append(ts_id_str)
        else:
            # Create missing entry using trainset entry + empty labels
            new_entry = {
                "id": str(ts.get("id")),
                "original_id": str(ts.get("original_id")),
                "text": ts.get("text"),
                "labels": []
            }
            final_entries.append(new_entry)
            added_ids.append(ts_id_str)

    # Any present entries outside the chosen split will be reported and dropped
    outsiders = [pid for pid in present_ids if pid not in train_ids_in_split]

    # Final safety checks
    if len(final_entries) != SPLIT_SIZE:
        raise RuntimeError(f"Internal error: final_entries length is {len(final_entries)} (expected {SPLIT_SIZE}) for file {file_path}")

    # Overwrite file
    write_jsonl(file_path, final_entries)

    summary = {
        "file": file_path,
        "chosen_split": (start, end-1),
        "kept_count": len(kept_ids),
        "added_count": len(added_ids),
        "added_ids": added_ids,
        "outsiders_dropped": outsiders,
        "ids_not_found_in_trainset": not_found_in_train
    }
    return summary

def main():
    """Complete every annotator file in INPUT_DIR and write a run summary.

    Each ``*.jsonl`` file in INPUT_DIR is passed to `complete_file`, which
    rewrites it in place. A failure on one file is printed and does not stop
    the remaining files. The collected summaries are written to
    ``fill_summary.json`` inside RESULTS_DIR.
    """
    if not os.path.exists(TRAINSET_FILE):
        print(f"Trainset file not found: {TRAINSET_FILE}")
        return

    trainset, id_to_index = load_trainset(TRAINSET_FILE)

    files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.jsonl")))
    if not files:
        print(f"No .jsonl files found in {INPUT_DIR}")
        return

    all_summaries = []
    for fpath in files:
        try:
            summary = complete_file(fpath, trainset, id_to_index)
            all_summaries.append(summary)
            print(f"Processed {os.path.basename(fpath)}: split {summary['chosen_split'][0]}-{summary['chosen_split'][1]}, "
                  f"kept {summary['kept_count']}, added {summary['added_count']}, dropped outsiders {len(summary['outsiders_dropped'])}")
            if summary["outsiders_dropped"]:
                print("  -> Dropped outsider ids (not in chosen split):", summary["outsiders_dropped"])
            if summary["ids_not_found_in_trainset"]:
                print("  -> IDs present in file but not found in trainset:", summary["ids_not_found_in_trainset"])
        except Exception as e:
            # Deliberate best-effort: report the failing file and carry on.
            print(f"Error processing {fpath}: {e}")

    # Optionally write a summary JSON
    # RESULTS_DIR is a different directory from INPUT_DIR and nothing else in
    # this cell creates it; without this the run would fail here, after all
    # annotator files have already been rewritten.
    os.makedirs(RESULTS_DIR, exist_ok=True)
    summary_path = os.path.join(RESULTS_DIR, "fill_summary.json")
    with open(summary_path, "w", encoding="utf-8") as sf:
        json.dump(all_summaries, sf, ensure_ascii=False, indent=2)

    print(f"\nDone. Summary saved to {summary_path}")

if __name__ == "__main__":
    main()


Processed 91945.jsonl: split 400-599, kept 200, added 0, dropped outsiders 0
Processed 91947.jsonl: split 800-999, kept 199, added 1, dropped outsiders 0
Processed 91948.jsonl: split 0-199, kept 198, added 2, dropped outsiders 0
Processed 91949.jsonl: split 200-399, kept 199, added 1, dropped outsiders 0
Processed 91950.jsonl: split 400-599, kept 200, added 0, dropped outsiders 0
Processed 91951.jsonl: split 600-799, kept 200, added 0, dropped outsiders 0
Processed 91952.jsonl: split 800-999, kept 200, added 0, dropped outsiders 0
Processed 91954.jsonl: split 200-399, kept 200, added 0, dropped outsiders 0
Processed 91955.jsonl: split 400-599, kept 198, added 2, dropped outsiders 0
Processed 91956.jsonl: split 600-799, kept 200, added 0, dropped outsiders 0
Processed 97366.jsonl: split 0-199, kept 189, added 11, dropped outsiders 0
Processed 97367.jsonl: split 0-199, kept 200, added 0, dropped outsiders 0
Processed 97368.jsonl: split 200-399, kept 196, added 4, dropped outsiders 0
Proc

In [None]:
import os
import json
import glob

# Folders
RESULTS_DIR = "results_single_acsa_crowd"  # where annotator jsonl files live; main() globs it for *.jsonl

# Validation sets
# Category names accepted by validate_file() (set membership, so case-sensitive).
valid_categories = {"essen", "preis", "ambiente", "gesamteindruck", "service"}
# Polarity values accepted by validate_file().
valid_polarities = {"negative", "positive", "neutral", "conflict", "NULL"}

def load_jsonl(path):
    """Read a JSONL file leniently.

    Blank lines are skipped. A line that is not valid JSON is reported on
    stdout (file and 1-based line number) and skipped; loading continues.
    Returns the list of successfully parsed objects in file order.
    """
    parsed = []
    with open(path, "r", encoding="utf-8") as handle:
        for number, raw in enumerate(handle, start=1):
            content = raw.strip()
            if content == "":
                continue
            try:
                parsed.append(json.loads(content))
            except json.JSONDecodeError as e:
                print(f"JSON error in {path}, line {number}: {e}")
    return parsed

def validate_file(path, expected_entries=200):
    """Validate one annotator JSONL file and print every problem found.

    Checks that the file holds `expected_entries` entries and that every
    non-empty label is a pair whose category is in `valid_categories` and
    whose polarity is in `valid_polarities`. Nothing is raised or returned;
    findings are reported on stdout only.

    Args:
        path: Path of the annotator file. The annotator name used in messages
            is the file name without ".jsonl".
        expected_entries: Number of entries the file should contain.
    """
    annotator = os.path.basename(path).replace(".jsonl", "")
    entries = load_jsonl(path)

    # Check file length. The message names the file and the actual count,
    # otherwise the finding cannot be traced back.
    if len(entries) != expected_entries:
        print(f"Length Error in {annotator}: {len(entries)} entries, expected {expected_entries}")

    # Check each entry
    for entry in entries:
        # .get() so that an entry without "id" is still reported rather than
        # aborting the validation with a KeyError.
        entry_id = entry.get("id")
        for lbl in entry.get("labels", []):
            if not lbl:
                continue
            if len(lbl) != 2:
                print(f"Format Error in {annotator}, id {entry_id}: label {lbl} is not tuple")
                continue
            category, polarity = lbl

            # Category check
            if category not in valid_categories:
                print(f"Category Error in {annotator}, id {entry_id}: invalid category '{category}'")

            # Polarity check
            if polarity not in valid_polarities:
                print(f"Polarity Error in {annotator}, id {entry_id}: invalid polarity '{polarity}'")


def main():
    """Run validate_file on every *.jsonl file in RESULTS_DIR, in sorted order."""
    pattern = os.path.join(RESULTS_DIR, "*.jsonl")
    jsonl_paths = sorted(glob.glob(pattern))

    if jsonl_paths:
        for jsonl_path in jsonl_paths:
            validate_file(jsonl_path)
        print("Processing and validation complete.")
    else:
        print(f"No .jsonl files found in {RESULTS_DIR}")

if __name__ == "__main__":
    main()


Processing and validation complete.


In [4]:
import os
import json
import glob
from collections import Counter, defaultdict

# --- Helper functions ---
def get_frequency_for_counts(counts, minimum):
    """Return the `minimum`-th largest value of `counts`.

    With one count per annotator, this is how many times at least `minimum`
    annotators agree on an item.

    Args:
        counts: Iterable of non-negative integers.
        minimum: 1-based rank to pick after sorting in descending order.

    Returns:
        The `minimum`-th largest count, or 0 when fewer than `minimum` counts
        are given (absent annotators contribute a count of 0).

    Raises:
        ValueError: If `minimum` is smaller than 1.
    """
    if minimum < 1:
        raise ValueError(f"minimum must be >= 1, got {minimum}")
    ranked = sorted(counts, reverse=True)
    # Previously an IndexError: fewer counts than the requested rank.
    if len(ranked) < minimum:
        return 0
    return ranked[minimum - 1]

def get_unique_keys(dict_list):
    """Return all unique keys across a list of dictionaries.

    Keys are returned in first-seen order (dictionary by dictionary), so the
    result is the same on every run. Going through a plain `set` would make
    the order of string keys depend on hash randomisation.
    """
    ordered_keys = {}
    for d in dict_list:
        for key in d.keys():
            # setdefault keeps the position of the first occurrence.
            ordered_keys.setdefault(key, None)
    return list(ordered_keys)

def merge_aspect_lists(aspect_lists, minimum_appearance=2, oid=None):
    """
    Merge several aspect lists into one by agreement.

    An aspect (a sequence of strings, e.g. [category, polarity]) is kept as
    many times as at least `minimum_appearance` of the lists contain it.
    Aspects that do not reach that agreement are dropped and counted.

    Args:
        aspect_lists: One list of aspects per annotator.
        minimum_appearance: Number of lists that must contain an aspect.
        oid: Accepted for caller compatibility; not used by the merge.

    Returns:
        (label, counter_exclude): `label` is the merged list of aspects as
        tuples of strings; `counter_exclude` is the number of distinct
        aspects that were dropped.
    """
    counter_exclude = 0
    aspect_lists_counter = []
    for aspect_list in aspect_lists:
        # Aspects are joined into one string so they can serve as dict keys.
        aspect_counter = dict(Counter(["#####".join(aspect) for aspect in aspect_list]))
        aspect_lists_counter.append(aspect_counter)

    unique_tuples = get_unique_keys(aspect_lists_counter)

    label = []
    for tuple_str in unique_tuples:
        counts = [asp.get(tuple_str, 0) for asp in aspect_lists_counter]

        # With fewer lists than `minimum_appearance`, or fewer total mentions,
        # the required agreement cannot be reached. Deciding that here also
        # keeps the helper from being asked for a rank that does not exist.
        if len(counts) < minimum_appearance or sum(counts) < minimum_appearance:
            agreed_count = 0
        else:
            agreed_count = get_frequency_for_counts(counts, minimum_appearance)

        if agreed_count == 0:
            # Covers enough total mentions that all come from too few lists
            # (e.g. a duplicate inside one list): the aspect is dropped from
            # the output, so it is counted as excluded as well.
            counter_exclude += 1
            continue

        tuple_reverse = tuple(tuple_str.split("#####"))
        label += agreed_count * [tuple_reverse]

    return label, counter_exclude


# --- Main script ---
RESULTS_DIR = "results_single_acsa_crowd/"
OUTPUT_FILE = "final/trainset_crowd_acsa.jsonl"
os.makedirs("final", exist_ok=True)


def load_jsonl(path):
    """Return the non-blank lines of a JSONL file parsed as JSON objects."""
    parsed = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                parsed.append(json.loads(line))
    return parsed


# Load all annotator files
files = sorted(glob.glob(os.path.join(RESULTS_DIR, "*.jsonl")))
print(f"Found {len(files)} annotator files.")

# Group entries by ID: id -> every annotator's entry for that id
entries_by_id = defaultdict(list)
for path in files:
    for entry in load_jsonl(path):
        entries_by_id[entry["id"]].append(entry)

# Merge by majority vote, walking the ids in numeric order
merged_entries = []
for oid in sorted(entries_by_id, key=int):
    entry_list = entries_by_id[oid]
    if len(entry_list) == 0:
        continue

    # Text and original id are taken from the first entry seen for this id.
    first_entry = entry_list[0]
    text = first_entry["text"]
    original_id = first_entry["original_id"]

    # Collect all label sets (one per annotator entry)
    label_sets = []
    for e in entry_list:
        label_sets.append(e.get("labels", []))

    # Merge with majority voting
    merged_labels, excluded = merge_aspect_lists(label_sets, minimum_appearance=2, oid=oid)

    merged_entries.append(
        dict(id=str(oid), original_id=original_id, text=text, labels=merged_labels)
    )

# Save merged dataset
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for entry in merged_entries:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Merged dataset written to {OUTPUT_FILE} with {len(merged_entries)} entries.")

Found 15 annotator files.
✅ Merged dataset written to final/trainset_crowd_acsa.jsonl with 1000 entries.


In [5]:
import json

INPUT_FILE = "final/trainset_crowd_acsa.jsonl"
OUTPUT_FILE = "final/trainset_crowd_acsa_noconflict.jsonl"

# Copy the merged dataset line by line, dropping every label whose second
# element (the polarity) is "conflict". Entries themselves are always kept.
with open(INPUT_FILE, "r", encoding="utf-8") as f_in, open(OUTPUT_FILE, "w", encoding="utf-8") as f_out:
    for line in f_in:
        if line.strip() == "":
            continue
        entry = json.loads(line)

        # keep only the labels without conflict polarity
        filtered_labels = []
        for lbl in entry.get("labels", []):
            if lbl[1] == "conflict":
                continue
            filtered_labels.append(lbl)

        entry["labels"] = filtered_labels
        f_out.write(json.dumps(entry, ensure_ascii=False))
        f_out.write("\n")

print(f"✅ File cleaned and saved to {OUTPUT_FILE}")


✅ File cleaned and saved to final/trainset_crowd_acsa_noconflict.jsonl


In [11]:
import json
from collections import defaultdict

# Local import: this block needs `os` to create the output directory, and the
# cell's own import lines only provide `json` and `defaultdict`.
import os

# Define valid categories and polarities (with English mapping)
valid_categories = {"essen", "preis", "ambiente", "gesamteindruck", "service"}
valid_polarities_map = {
    "negativ": "negative",
    "positiv": "positive",
    "neutral": "neutral",
    "konflikt": "conflict",
    "NULL": "NULL"
}
# English polarity values accepted by the validation pass below; derived from
# the map so the two definitions cannot drift apart.
valid_polarities = set(valid_polarities_map.values())

# One JSONL file per annotator is written into this directory. Create it up
# front so the cell also runs on a fresh checkout.
output_dir = "results_single_acsa_meta"
os.makedirs(output_dir, exist_ok=True)

# Load JSON file
with open("../../11_annotations/students/acsa/students_raw.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# annotator id (the "completed_by" field) -> list of that annotator's entries
annotator_data = defaultdict(list)

# Process data
for item in data:
    task_data = item.get("data", {})
    task_id = task_data.get("id")
    # Fall back to the task id when the export carries no separate original id.
    original_id = task_data.get("original_id", task_id)
    text = task_data.get("text", "")

    for ann in item.get("annotations", []):
        completed_by = ann.get("completed_by")
        labels_list = []
        meta_dict = {}
        comments_list = []

        for result in ann.get("result", []):
            from_name = result.get("from_name")
            val = result.get("value", {})

            if from_name in {"category_polarity", "gender"}:  # allow both naming variants
                # Label strings are read from both "choices" and "labels".
                for lbl in val.get("choices", []) + val.get("labels", []):
                    # Label strings are split on "-" into "<category>-<polarity>".
                    parts = lbl.split("-")
                    category = parts[0].lower()
                    polarity = parts[1].lower() if len(parts) > 1 else "NULL"
                    # Translate polarity. Unknown values pass through unchanged
                    # and are reported by the validation below.
                    polarity = valid_polarities_map.get(polarity, polarity)
                    labels_list.append({"category": category, "polarity": polarity})

            elif from_name == "meta":
                # Collect meta choices (a later "meta" result replaces an earlier one)
                choices = val.get("choices", [])
                if choices:
                    meta_dict["choices"] = choices

            elif from_name == "comment":
                # Collect free text comments
                comment_texts = val.get("text", [])
                comments_list.extend(comment_texts)

        # Annotations without any category/polarity label are not exported,
        # even if they carry meta choices or comments.
        if labels_list:
            annotator_data[completed_by].append({
                "id": str(task_id),
                "original_id": str(original_id),
                "text": text,
                "labels": labels_list,
                "meta": meta_dict,
                # Holds the comment texts collected above.
                "comment_authors": comments_list
            })

# Validation and saving
for annotator, entries in annotator_data.items():
    filename = f"{output_dir}/{annotator}.jsonl"
    with open(filename, "w", encoding="utf-8") as f:
        for entry in entries:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")

    # Checks
    # The per-annotator length check (200 entries expected) was already
    # silenced in the original cell, so no length message is printed here.
    for entry in entries:
        for lbl in entry["labels"]:
            category = lbl["category"]
            polarity = lbl["polarity"]
            # Check category
            if category not in valid_categories:
                print(f"Category Error in annotator {annotator}, id {entry['id']}: invalid category '{category}'")
            # Check polarity
            if polarity not in valid_polarities:
                print(f"Polarity Error in annotator {annotator}, id {entry['id']}: invalid polarity '{polarity}'")

print("Processing and validation complete.")


Processing and validation complete.


In [12]:
import os
import json
import pandas as pd
from collections import Counter

folder = "results_single_acsa_meta"
rows = []

# Loop through all jsonl files. os.listdir() returns names in arbitrary order,
# so sort them to get the same row order in the table and CSV on every run.
for filename in sorted(os.listdir(folder)):
    if not filename.endswith(".jsonl"):
        continue

    annotator_id = filename.replace(".jsonl", "")
    counter = Counter()
    comment_count = 0

    with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines instead of failing in json.loads.
            if not line.strip():
                continue
            entry = json.loads(line)

            # Count meta choices
            meta = entry.get("meta", {})
            counter.update(meta.get("choices", []))

            # Count comments; entries without comments are not printed
            comments = entry.get("comment_authors", [])
            if not comments:
                continue

            text = entry.get("text", "")
            labels = entry.get("labels", [])
            print(text, "\n", labels, "\n", comments, "\n\n")
            comment_count += len(comments)

    rows.append({
        "annotator": annotator_id,
        # K = missing context from the review could influence the annotation.
        "K": counter.get("K", 0),
        # S = the annotation was difficult.
        "S": counter.get("S", 0),
        "total_choices": sum(counter.values()),
        "comments": comment_count
    })

# Create dataframe (one row per annotator file)
df = pd.DataFrame(rows)
print(df)

# Save to CSV (optional)
df.to_csv("meta_counts_with_comments.csv", index=False)


Der Teller ist gut gefüllt, fritierte Sardinen, gegrillte Paprika, Datteln in Speck und Ziegenkäse im Speck, nichts Überraschendes, nichts Außergewöhnliches. 
 [{'category': 'essen', 'polarity': 'conflict'}] 
 ['Viel, vermutlich aber nicht lecker'] 


!Auch wenn die Bedienung sich dafür entschuldigt hat, war es kein schönes Erlebnis. 
 [{'category': 'service', 'polarity': 'conflict'}, {'category': 'gesamteindruck', 'polarity': 'negative'}] 
 ['War die Bedienung freundlich?'] 


Der Auslieferungsfahrer war sehr nett und hat sich entschuldigt. 
 [{'category': 'service', 'polarity': 'positive'}, {'category': 'service', 'polarity': 'negative'}, {'category': 'service', 'polarity': 'conflict'}] 
 ['Vermutlich lange Wartezeit '] 


Von dünn und knusprig, goldbraun gebraten keine Spur. 
 [{'category': 'essen', 'polarity': 'negative'}] 
 ['Gericht sollte ergänzt werden'] 


Die Qualität des Essens fand ich aber sehr durchschnittlich. 
 [{'category': 'essen', 'polarity': 'neutral'}, {'category':

TASD:
   annotator   K   S  total_choices  comments
0      10189  25   8             33        18
1      12680   0   0              0         0
2      15658   9  17             26         3
3      21123   8  12             20         0
4      86001   4   0              4         0
5      86020   3   4              7         0
6      86108   0   0              0         1
7      86407   0   0              0         0
8      86725  17  10             27         1
9      87069  14   5             19        19
10     87560   0   3              3         0
11     87576   8  16             24         1
12     88190   4   5              9         3
13     88285   8   5             13         4
14     89770   5   0              5         0