### Check the files for ACSA and TASD

In [28]:
import os
import json
from collections import Counter
import re

# Directory whose ``.jsonl`` files are listed and checked by the script
# entry point.
FOLDER_PATH = "AA_Output"   # adjust to your folder
# Selects the validator: "acsa" runs check_acsa, any other value runs
# check_tasd.
TASK = "tasd"  # "acsa" or "tasd"
# Entry counts that are accepted without a warning; a file whose number of
# entries matches none of the three triggers the count-mismatch report.
LENGTH = 200
LENGTH_2 = 1000
LENGTH_3 = 185
# --- allowed values ---
# Labels are compared against these sets with exact (case-sensitive) matching.
ALLOWED_POLARITIES = {"positive", "negative", "neutral"}
ALLOWED_CATEGORIES = {"preis", "gesamteindruck", "ambiente", "service", "essen"}

def load_jsonl(file_path):
    """Load a JSON Lines file as a list of parsed records.

    Blank or whitespace-only lines (for example an empty line at the end of
    the file) are skipped instead of being handed to the JSON parser, which
    would reject them.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def check_acsa(entry):
    """Validate the ACSA labels of one entry and return the problems found.

    Every label must be a ``(category, polarity)`` pair of lowercase strings
    contained in ``ALLOWED_CATEGORIES`` and ``ALLOWED_POLARITIES``, and the
    same pair may appear only once per entry.

    Args:
        entry: Mapping with an ``"id"`` and a ``"labels"`` list.

    Returns:
        A list of error messages, each prefixed with the entry ID; the list
        is empty when no problem was found.
    """
    errors = []
    seen_labels = set()

    for label in entry["labels"]:
        if len(label) != 2:
            errors.append(f"ID {entry['id']}: Invalid label length {label}")
            continue

        category, polarity = label

        # Non-string components (e.g. JSON null or a number) would make the
        # .lower() calls below raise; report them as a validation error so
        # one malformed label does not abort the whole check.
        if not (isinstance(category, str) and isinstance(polarity, str)):
            errors.append(
                f"ID {entry['id']}: Label components must be strings: {label}"
            )
            continue

        # check lowercase
        if category != category.lower():
            errors.append(f"ID {entry['id']}: Category not lowercase: {category}")

        if polarity != polarity.lower():
            errors.append(f"ID {entry['id']}: Polarity not lowercase: {polarity}")

        # check allowed values
        if polarity not in ALLOWED_POLARITIES:
            errors.append(f"ID {entry['id']}: Invalid polarity: {polarity}")

        if category not in ALLOWED_CATEGORIES:
            errors.append(f"ID {entry['id']}: Invalid category: {category}")

        # check duplicates (tuple form, because a list is not hashable)
        tup = (category, polarity)
        if tup in seen_labels:
            errors.append(f"ID {entry['id']}: Duplicate label {label}")
        seen_labels.add(tup)

    return errors

def check_tasd(entry):
    """Validate the TASD labels of one entry and return the problems found.

    Every label must be a ``(category, polarity, phrase)`` triple of strings.
    Category and polarity must be lowercase and contained in
    ``ALLOWED_CATEGORIES`` and ``ALLOWED_POLARITIES``. The phrase must occur
    verbatim in the entry's text unless it is the literal ``"NULL"``.

    Args:
        entry: Mapping with an ``"id"``, a ``"labels"`` list and a ``"text"``
            string.

    Returns:
        A list of error messages, each prefixed with the entry ID; the list
        is empty when no problem was found.
    """
    errors = []

    for label in entry["labels"]:
        if len(label) != 3:
            errors.append(f"ID {entry['id']}: Invalid label length {label}")
            continue

        category, polarity, phrase = label

        # Non-string components (e.g. JSON null or a number) would make the
        # .lower() calls or the substring test below raise; report them as a
        # validation error so one malformed label does not abort the check.
        if not all(isinstance(part, str) for part in (category, polarity, phrase)):
            errors.append(
                f"ID {entry['id']}: Label components must be strings: {label}"
            )
            continue

        # check lowercase
        if category != category.lower():
            errors.append(f"ID {entry['id']}: Category not lowercase: {category}")

        if polarity != polarity.lower():
            errors.append(f"ID {entry['id']}: Polarity not lowercase: {polarity}")

        # allowed values
        if polarity not in ALLOWED_POLARITIES:
            errors.append(f"ID {entry['id']}: Invalid polarity: {polarity}")

        if category not in ALLOWED_CATEGORIES:
            errors.append(f"ID {entry['id']}: Invalid category: {category}")

        # The literal "NULL" is exempt from the containment check; any other
        # phrase has to be an exact substring of the entry text.
        if phrase != "NULL" and phrase not in entry["text"]:
            errors.append(f"ID {entry['id']}: Phrase '{phrase}' not in text")

    return errors

if __name__ == "__main__":
    # Only the JSONL files directly inside FOLDER_PATH are checked, in name
    # order.
    all_files = sorted(f for f in os.listdir(FOLDER_PATH) if f.endswith(".jsonl"))

    # Any of these entry counts is accepted as a complete file.
    expected_lengths = (LENGTH, LENGTH_2, LENGTH_3)
    # TASK selects the validator once; any value other than "acsa" uses TASD.
    check_entry = check_acsa if TASK == "acsa" else check_tasd

    for file in all_files:
        total_errors = []
        print(f"Checking {file}...")
        entries = load_jsonl(os.path.join(FOLDER_PATH, file))

        if len(entries) not in expected_lengths:
            # Name every accepted count so the warning is not misleading
            # about what would have been considered correct.
            expected = ", ".join(str(count) for count in expected_lengths)
            print(
                f"  Warning: Expected one of {expected} entries, "
                f"found {len(entries)}"
            )

            # Collect and sort IDs
            ids = sorted(int(entry["id"]) for entry in entries)

            # Find gaps between consecutive IDs. Entries missing before the
            # smallest or after the largest ID cannot be detected this way.
            missing_ids = []
            for current_id, next_id in zip(ids, ids[1:]):
                missing_ids.extend(range(current_id + 1, next_id))

            if missing_ids:
                print(f"  Missing IDs: {missing_ids}")
            else:
                print("  No obvious missing IDs (but count mismatch).")

        # Validate every entry and print the collected messages per file.
        for entry in entries:
            total_errors.extend(check_entry(entry))
        for error in total_errors:
            print(error)


Checking MW_1.jsonl...
Checking MW_2.jsonl...
Checking MW_3.jsonl...
Checking MW_4.jsonl...
Checking MW_5.jsonl...
  No obvious missing IDs (but count mismatch).
Checking ND_1.jsonl...
Checking ND_2.jsonl...
Checking ND_3.jsonl...
Checking ND_4.jsonl...
Checking ND_5.jsonl...
  No obvious missing IDs (but count mismatch).
