### Update IDs

In [None]:
import json

# --- CONFIG ---
name = "MW_4_orgsd.jsonl"
INPUT_FILE = f"ground_truth_tasd/{name}"   # original file (read only)
OUTPUT_FILE = name                         # copy with shifted IDs, written to the working directory
# Offset added to every integer "id".
# NOTE(review): presumably 185 entries per batch x 3 preceding batches — confirm.
INCREMENT = 185*3

# --- SCRIPT ---
# Stream the input line by line and write one JSON object per output line.
with open(INPUT_FILE, "r", encoding="utf-8") as infile, \
     open(OUTPUT_FILE, "w", encoding="utf-8") as outfile:

    for line in infile:
        # Blank / whitespace-only lines are not valid JSON and would make
        # json.loads raise; skip them instead of aborting the whole run.
        if not line.strip():
            continue
        obj = json.loads(line)
        # Only shift genuine integer IDs. bool is a subclass of int in Python,
        # so it is excluded explicitly to avoid turning `true` into a number.
        if (
            isinstance(obj, dict)
            and isinstance(obj.get("id"), int)
            and not isinstance(obj["id"], bool)
        ):
            obj["id"] += INCREMENT
        # ensure_ascii=False keeps non-ASCII characters readable in the output.
        json.dump(obj, outfile, ensure_ascii=False)
        outfile.write("\n")

print(f"Updated IDs by +{INCREMENT} and saved to {OUTPUT_FILE}")


Updated IDs by +555 and saved to MW_4_org.jsonl


### Combine Single Annotations to Dataset

In [12]:
import os
import json

# Folder that is scanned for the *.jsonl annotation files to combine.
FOLDER_PATH = "ground_truth_tasd"
# Despite its name this is a directory: the combined datasets are written into it.
OUTPUT_FILE = f"{FOLDER_PATH}_all"

# Create the output directory up front; an already existing one is not an error.
os.makedirs(OUTPUT_FILE, exist_ok=True)

def load_jsonl(file_path):
    """Load a JSONL file as a list of parsed records.

    Each non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    rather than raising ``json.JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def save_jsonl(data, file_path):
    """Write every item of ``data`` as one JSON document per line.

    The target file is opened in write mode (existing content is replaced)
    with UTF-8 encoding; non-ASCII characters are written unescaped.
    """
    with open(file_path, "w", encoding="utf-8") as handle:
        # Lazily serialise item by item, exactly one line per record.
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

if __name__ == "__main__":
    # List all JSONL files; sorting makes batch membership and output order deterministic.
    all_files = sorted(f for f in os.listdir(FOLDER_PATH) if f.endswith(".jsonl"))
    full_paths = [os.path.join(FOLDER_PATH, f) for f in all_files]

    # Step 1: group files into batches (files whose sets of IDs are identical).
    # Grouping on the (hashable) ID set needs a single pass over the files
    # instead of comparing every pair of files.
    file_ids = {}
    batch_by_ids = {}
    for f in full_paths:
        with open(f, "r", encoding="utf-8") as jf:
            # Skip blank lines: they are not valid JSON and would abort the scan.
            ids = frozenset(json.loads(line)["id"] for line in jf if line.strip())
        file_ids[f] = ids
        batch_by_ids.setdefault(ids, []).append(f)

    # Batches appear in order of their first file; files inside a batch are
    # sorted so that position i refers to the same "column" in every batch.
    batches = [sorted(batch) for batch in batch_by_ids.values()]

    print(f"Detected {len(batches)} batches")
    for i, b in enumerate(batches):
        print(f"Batch {i+1}: {[os.path.basename(f) for f in b]}")

    # Step 2: combine files column-wise, i.e. the i-th file of every batch is
    # concatenated into combined_dataset_{i+1}.jsonl.
    if not batches:
        # Nothing to index into; previously this raised IndexError on batches[0].
        print(f"No .jsonl files found in {FOLDER_PATH}; nothing to combine")
    else:
        batch_sizes = [len(b) for b in batches]
        # Only columns present in every batch can be combined completely.
        num_datasets = min(batch_sizes)
        if max(batch_sizes) != num_datasets:
            print(
                f"Warning: batches have different sizes {batch_sizes}; "
                f"only the first {num_datasets} file(s) of each batch are combined"
            )
        for i in range(num_datasets):
            combined_data = []
            for batch in batches:
                combined_data.extend(load_jsonl(batch[i]))
            out_file = os.path.join(OUTPUT_FILE, f"combined_dataset_{i+1}.jsonl")
            save_jsonl(combined_data, out_file)
            print(f"Saved {out_file} with {len(combined_data)} entries")


Detected 5 batches
Batch 1: ['MW_1.jsonl', 'ND_1.jsonl']
Batch 2: ['MW_2.jsonl', 'ND_2.jsonl']
Batch 3: ['MW_3.jsonl', 'ND_3.jsonl']
Batch 4: ['MW_4.jsonl', 'ND_4.jsonl']
Batch 5: ['MW_5.jsonl', 'ND_5.jsonl']
Saved ground_truth_tasd_all\combined_dataset_1.jsonl with 924 entries
Saved ground_truth_tasd_all\combined_dataset_2.jsonl with 924 entries


### Remove conflict labels and literal \\n sequences from phrases

In [None]:
import os
import json
import re

# Directory scanned for the *.jsonl files to clean, and the directory the
# cleaned copies are written to (same file names).
FOLDER_PATH, OUTPUT_PATH = "AA_Input", "AA_Output"
# Make sure the output directory exists before anything is written.
os.makedirs(OUTPUT_PATH, exist_ok=True)

def load_jsonl(file_path):
    """Read a UTF-8 JSONL file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``; blank or
    whitespace-only lines are ignored instead of raising
    ``json.JSONDecodeError``.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        The parsed JSON values in file order, one per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def save_jsonl(data, file_path):
    """Serialise each element of ``data`` to its own line of ``file_path``.

    The file is (re)created with UTF-8 encoding and non-ASCII characters are
    kept as-is rather than being escaped.
    """
    with open(file_path, "w", encoding="utf-8") as sink:
        for record in data:
            # print() appends the newline that terminates the record.
            print(json.dumps(record, ensure_ascii=False), file=sink)

def normalize(s: str) -> str:
    r"""Delete literal ``\n`` escape sequences and trim the ends of ``s``.

    Only the two-character sequence backslash + ``n`` is removed; it is
    replaced by nothing, so the text on either side is joined directly.
    Real newline characters, tabs and repeated spaces *inside* the string
    are left untouched; leading and trailing whitespace is stripped.

    Args:
        s: The phrase to clean.

    Returns:
        The cleaned phrase.
    """
    # Raw string: match a backslash followed by "n", not a newline character.
    return s.replace(r"\n", "").strip()

def clean_labels(labels):
    """Drop conflict labels and clean the phrase of three-element labels.

    A label is dropped when it has at least two elements and its second
    element equals "conflict" (case-insensitive). Remaining three-element
    labels are rebuilt with their third element passed through
    ``normalize``; labels of any other length are kept unchanged.
    """
    result = []
    for entry in labels:
        is_conflict = len(entry) >= 2 and entry[1].lower() == "conflict"
        if is_conflict:
            continue
        if len(entry) != 3:
            # Not a triplet: nothing to normalize, keep the label object as is.
            result.append(entry)
            continue
        category, polarity, phrase = entry
        result.append([category, polarity, normalize(phrase)])
    return result

if __name__ == "__main__":
    # Collect the JSONL files of the input folder in alphabetical order.
    all_files = [f for f in os.listdir(FOLDER_PATH) if f.endswith(".jsonl")]
    all_files.sort()

    for file in all_files:
        file_path = os.path.join(FOLDER_PATH, file)
        entries = load_jsonl(file_path)

        # Replace each entry's labels with the cleaned version; all other
        # fields of the entry are left untouched.
        for entry in entries:
            entry.update(labels=clean_labels(entry["labels"]))

        # The cleaned copy keeps the original file name in the output folder.
        out_file = os.path.join(OUTPUT_PATH, file)
        save_jsonl(entries, out_file)
        print(f"Cleaned {file} → {out_file}")


Cleaned MW_1.jsonl → AA_Output\MW_1.jsonl
Cleaned MW_2.jsonl → AA_Output\MW_2.jsonl
Cleaned MW_3.jsonl → AA_Output\MW_3.jsonl
Cleaned MW_4.jsonl → AA_Output\MW_4.jsonl
Cleaned MW_5.jsonl → AA_Output\MW_5.jsonl
Cleaned ND_1.jsonl → AA_Output\ND_1.jsonl
Cleaned ND_2.jsonl → AA_Output\ND_2.jsonl
Cleaned ND_3.jsonl → AA_Output\ND_3.jsonl
Cleaned ND_4.jsonl → AA_Output\ND_4.jsonl
Cleaned ND_5.jsonl → AA_Output\ND_5.jsonl


### Normalize category (lowercase) and polarity (German → English)

In [10]:
import os
import json

# Folder the *.jsonl files are read from.
FOLDER_PATH = "AA_Input"
# Folder the transformed files are written to; created here if missing.
OUTPUT_PATH = "AA_Output"
os.makedirs(OUTPUT_PATH, exist_ok=True)

# German polarity label -> English polarity label (keys are lower-case).
GERMAN_TO_EN_POLARITY = dict(
    positiv="positive",
    negativ="negative",
    neutral="neutral",
)

def load_jsonl(file_path):
    """Parse a UTF-8 JSONL file into a list, one item per non-blank line.

    Lines that are empty or contain only whitespace are skipped, so stray
    blank lines do not raise ``json.JSONDecodeError``.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        The parsed JSON values in the order they appear in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def save_jsonl(data, file_path):
    """Store ``data`` as JSONL: one JSON-encoded item per line.

    Any existing file at ``file_path`` is overwritten. Output is UTF-8 and
    non-ASCII characters are not escaped.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        for record in data:
            serialized = json.dumps(record, ensure_ascii=False)
            out.write(serialized + "\n")

def transform_labels(labels):
    """Lower-case each label's category and translate its polarity.

    Labels with two or three elements are rebuilt as new lists: the first
    element is lower-cased, the second is lower-cased and mapped through
    ``GERMAN_TO_EN_POLARITY`` (falling back to the lower-cased value when it
    is not a key of the mapping), and an optional third element is carried
    over unchanged. Labels of any other length are passed through as is.
    """
    def _to_english(polarity):
        key = polarity.lower()
        return GERMAN_TO_EN_POLARITY.get(key, key)

    result = []
    for entry in labels:
        if len(entry) not in (2, 3):
            # Unexpected shape: keep the original label object untouched.
            result.append(entry)
            continue
        # `rest` is empty for pairs and holds the phrase for triplets.
        category, polarity, *rest = entry
        result.append([category.lower(), _to_english(polarity), *rest])
    return result

if __name__ == "__main__":
    # Every *.jsonl file of the input folder, processed in alphabetical order.
    all_files = sorted(
        filter(lambda fname: fname.endswith(".jsonl"), os.listdir(FOLDER_PATH))
    )

    for file in all_files:
        file_path = os.path.join(FOLDER_PATH, file)
        entries = load_jsonl(file_path)

        # Swap in the transformed labels; the rest of each entry is unchanged.
        for entry in entries:
            entry.update(labels=transform_labels(entry["labels"]))

        # Write the result under the same file name in the output folder.
        out_file = os.path.join(OUTPUT_PATH, file)
        save_jsonl(entries, out_file)
        print(f"Transformed {file} → {out_file}")


Transformed MW_1.jsonl → AA_Output\MW_1.jsonl
Transformed MW_2.jsonl → AA_Output\MW_2.jsonl
Transformed MW_3.jsonl → AA_Output\MW_3.jsonl
Transformed MW_4.jsonl → AA_Output\MW_4.jsonl
Transformed MW_5.jsonl → AA_Output\MW_5.jsonl
Transformed ND_1.jsonl → AA_Output\ND_1.jsonl
Transformed ND_2.jsonl → AA_Output\ND_2.jsonl
Transformed ND_3.jsonl → AA_Output\ND_3.jsonl
Transformed ND_4.jsonl → AA_Output\ND_4.jsonl
Transformed ND_5.jsonl → AA_Output\ND_5.jsonl
