### Transform raw output into simple JSON and JSONL files

In [1]:
import os
import json

# Input and output base directories
base_dir = 'acsa_students_redo_llm_fs'
output_dir = 'acsa_students_redo_llm_fs_transform'


def keep_selected_fields(obj):
    """Return a new dict holding only id, original_id, text and pred_label.

    Keys missing from ``obj`` are filled with ``None`` (``dict.get`` semantics).
    """
    return {
        'id': obj.get('id'),
        'original_id': obj.get('original_id'),
        'text': obj.get('text'),
        'pred_label': obj.get('pred_label')
    }


def to_jsonl_entry(cleaned_obj):
    """Build the JSONL record for one cleaned object.

    The only difference to the cleaned object is the key name:
    ``pred_label`` is stored as ``labels``; its value is passed through unchanged.
    """
    return {
        "id": cleaned_obj["id"],
        "original_id": cleaned_obj["original_id"],
        "text": cleaned_obj["text"],
        "labels": cleaned_obj.get("pred_label")
    }


# Walk through all files below base_dir (subfolders included)
for root, _, files in os.walk(base_dir):
    for file in files:
        if not file.endswith('.json'):
            continue

        file_path = os.path.join(root, file)
        print(f"Processing file: {file_path}")

        # Load JSON data; an unparsable file is reported and skipped
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Failed to load {file_path}: {e}")
                continue

        # The cleaning step iterates over objects, so anything but a list
        # (e.g. a single top-level dict) cannot be processed
        if not isinstance(data, list):
            print(f"Skipping {file_path}: expected a JSON list of objects")
            continue

        # Remove all keys except the selected ones
        cleaned_data = [keep_selected_fields(obj) for obj in data]

        # Construct output path, preserving subfolder structure
        rel_path = os.path.relpath(root, base_dir)  # relative to base_dir
        target_dir = os.path.join(output_dir, rel_path)
        os.makedirs(target_dir, exist_ok=True)

        out_file_path = os.path.join(target_dir, file)

        # Save cleaned JSON
        with open(out_file_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

        # Also save as JSONL (same base name) using the "labels" key
        out_file_path_jsonl = os.path.splitext(out_file_path)[0] + ".jsonl"
        with open(out_file_path_jsonl, 'w', encoding='utf-8') as f_jsonl:
            for obj in cleaned_data:
                f_jsonl.write(json.dumps(to_jsonl_entry(obj), ensure_ascii=False) + "\n")

# Report the directory that was actually written to
print(f"Processing complete. Cleaned files are in '{output_dir}/'")


Processing file: acsa_students_redo_llm_fs\acsa_students_gemma3_27b_10_50.json
Processing file: acsa_students_redo_llm_fs\acsa_students_gemma3_27b_15_50.json
Processing file: acsa_students_redo_llm_fs\acsa_students_gemma3_27b_20_50.json
Processing file: acsa_students_redo_llm_fs\acsa_students_gemma3_27b_25_50.json
Processing file: acsa_students_redo_llm_fs\acsa_students_gemma3_27b_5_50.json
Processing complete. Cleaned files are in 'results_clean/'


### TASD: Merge the 5 seeds with majority vote into a single file (separated by splits)

In [None]:
import os
import json
from collections import Counter

# --- Helper functions ---
def get_frequency_for_counts(counts, minimum):
    """Return the ``minimum``-th largest value in ``counts``.

    ``counts`` holds one occurrence count per list. The result is the largest
    number ``c`` such that at least ``minimum`` of the lists contain the item
    ``c`` times or more.

    If ``counts`` has fewer than ``minimum`` entries the item cannot reach the
    required number of lists, so 0 is returned instead of raising IndexError.
    """
    top = sorted(counts, reverse=True)[:minimum]
    if len(top) < minimum:
        return 0
    return top[minimum - 1]

def get_unique_keys(dict_list):
    """Return every key that occurs in any dict of ``dict_list``, without repeats.

    Keys are returned in first-seen order (dicts preserve insertion order), so
    the result is the same on every run; a plain ``set`` of strings would be
    iterated in an order that can change between interpreter sessions.
    """
    ordered_keys = {}
    for d in dict_list:
        for key in d.keys():
            ordered_keys.setdefault(key, None)
    return list(ordered_keys)

def merge_aspect_lists(aspect_lists, minimum_appearance=3, oid=None):
    """
    Merge several aspect lists into one list of aspect tuples.

    Args:
        aspect_lists: one list of aspects per run; every aspect is a sequence
            of strings. ``None`` in place of a list is treated as "no aspects".
        minimum_appearance: passed to ``get_frequency_for_counts`` to decide
            how many copies of each aspect are kept.
        oid: identifier of the sample; only used by the optional warning
            (currently commented out).

    Returns:
        (label, counter_exclude): ``label`` is the merged list of tuples;
        ``counter_exclude`` is the number of distinct aspects whose summed
        count over all lists is below ``minimum_appearance``.
    """
    counter_exclude = 0
    aspect_lists_counter = []
    for aspect_list in aspect_lists:
        # A JSON ``null`` label list arrives here as None; count it as empty
        # instead of failing on iteration.
        aspect_counter = dict(Counter("#####".join(aspect) for aspect in (aspect_list or [])))
        aspect_lists_counter.append(aspect_counter)

    unique_tuples = get_unique_keys(aspect_lists_counter)

    label = []
    for tuple_str in unique_tuples:
        # Occurrences of this aspect in each individual list
        counts = [asp.get(tuple_str, 0) for asp in aspect_lists_counter]
        total_count = sum(counts)

        if total_count < minimum_appearance:
            counter_exclude += 1
            #print(f"Warning: original_id={oid}, aspect={tuple_str.split('#####')} appeared only {total_count} times")

        count_tuple = get_frequency_for_counts(counts, minimum_appearance)
        tuple_reverse = tuple(tuple_str.split("#####"))  # undo the join used as dict key
        label += count_tuple * [tuple_reverse]

    return label, counter_exclude

# --- Paths ---
input_folder = "results_clean"
output_folder = "results_majority"
os.makedirs(output_folder, exist_ok=True)

# NOTE(review): input files are addressed as Split_{k+1}, while the report line
# and the output file name use Split_{k} -- confirm this naming is intended.
for k in range(5):
    # The five input files of this split (index i = 0..4, presumably the seed)
    splits = [
        f"tasd_gerestaurant_gemma3_27b_{i}_30_Split_{k+1}.jsonl"
        for i in range(5)
    ]

    # --- Merge data by 'original_id' ---
    # id and text come from the first file in which an original_id is seen;
    # every later occurrence only contributes its label list.
    data_by_id = {}
    for split_file in splits:
        split_path = os.path.join(input_folder, split_file)
        with open(split_path, "r", encoding="utf-8") as f:
            for line in f:
                entry = json.loads(line)
                oid = entry["original_id"]
                record = data_by_id.get(oid)
                if record is None:
                    record = {
                        "id": entry["id"],
                        "original_id": oid,
                        "text": entry["text"],
                        "labels_lists": []
                    }
                    data_by_id[oid] = record
                record["labels_lists"].append(entry.get("labels", []))

    # --- Apply majority merge ---
    merged_entries = []
    total_excluded = 0
    for oid, info in data_by_id.items():
        merged_labels, counter_exclude = merge_aspect_lists(
            info["labels_lists"],
            minimum_appearance=3,
            oid=oid,
        )
        total_excluded += counter_exclude
        merged_entries.append(dict(
            id=info["id"],
            original_id=info["original_id"],
            text=info["text"],
            labels=merged_labels,
        ))

    print(f"Total excluded aspects in Split {k}: {total_excluded}")

    # --- Save merged result ---
    output_file = os.path.join(output_folder, f"tasd_gerestaurant_gemma3_27b_merged_30_Split_{k}.jsonl")
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(merged, ensure_ascii=False) + "\n"
            for merged in merged_entries
        )

    print(f"Merged results saved to {output_file}")


Total excluded aspects in Split 0: 104
Merged results saved to results_majority\tasd_crowd_gemma3_27b_merged.jsonl
Total excluded aspects in Split 1: 104
Merged results saved to results_majority\tasd_crowd_gemma3_27b_merged.jsonl
Total excluded aspects in Split 2: 104
Merged results saved to results_majority\tasd_crowd_gemma3_27b_merged.jsonl
Total excluded aspects in Split 3: 104
Merged results saved to results_majority\tasd_crowd_gemma3_27b_merged.jsonl
Total excluded aspects in Split 4: 104
Merged results saved to results_majority\tasd_crowd_gemma3_27b_merged.jsonl


### ACSA: Remove duplicate labels within each line (`[["Essen", "Positiv"], ["Essen", "Positiv"]]` -> `[["Essen", "Positiv"]]`)

In [2]:
import os
import json

# --- Paths ---
# Both paths are relative, so they resolve against the current working directory.
# Folder that holds the annotation files to process.
input_folder = "../../11_annotations/t/"
# Folder for the processed files; created below if it does not exist yet.
output_folder = "../../11_annotations/x/"
os.makedirs(output_folder, exist_ok=True)

def clean_labels(entry, field_name):
    """Remove duplicate label tuples from ``entry[field_name]`` and report "Konflikt".

    The entry is modified in place and also returned. Labels are converted to
    tuples and de-duplicated while keeping the order of their first occurrence,
    so the output is identical on every run. A missing field is set to an empty
    list; a field holding ``None`` (JSON ``null``) is left untouched.

    A warning is printed when the second element of any label is "Konflikt".
    """
    labels = entry.get(field_name, [])
    if labels is None:
        # Nothing to de-duplicate; keep the null value as it is
        return entry

    # dict.fromkeys drops repeats but, unlike a set, keeps first-seen order
    unique_labels = list(dict.fromkeys(tuple(label) for label in labels))
    entry[field_name] = unique_labels

    # len() guard: labels with fewer than two elements have no second element to check
    if any(len(label) > 1 and label[1] == "Konflikt" for label in unique_labels):
        print(f"⚠️ Konflikt found in id={entry.get('id')} (original_id={entry.get('original_id')})")

    return entry

# De-duplicate the label fields of every .json / .jsonl file in input_folder
# and write the result under the same file name to output_folder.
for filename in os.listdir(input_folder):
    is_jsonl = filename.endswith(".jsonl")
    is_json = filename.endswith(".json")
    if not (is_jsonl or is_json):
        continue

    input_file = os.path.join(input_folder, filename)
    output_file = os.path.join(output_folder, filename)

    cleaned_entries = []

    if is_jsonl:
        # JSONL: one JSON document per line
        with open(input_file, "r", encoding="utf-8") as f:
            for line in f:
                entry = json.loads(line)
                for field in ("labels", "pred_label"):
                    if field in entry:
                        entry = clean_labels(entry, field)
                cleaned_entries.append(entry)

        with open(output_file, "w", encoding="utf-8") as f:
            for entry in cleaned_entries:
                f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    else:
        # Plain JSON: the whole file is a single document
        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        if isinstance(data, list):
            # A list of entries: clean each one (clean_labels edits in place)
            for entry in data:
                for field in ("labels", "pred_label"):
                    if field in entry:
                        entry = clean_labels(entry, field)
            cleaned_entries = data

        elif isinstance(data, dict):
            # A single entry stored as a dict
            for field in ("labels", "pred_label"):
                if field in data:
                    data = clean_labels(data, field)
            cleaned_entries = data

        # Any other top-level type leaves cleaned_entries as the empty list
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(cleaned_entries, f, ensure_ascii=False, indent=2)

    print(f"✅ Cleaned file saved: {output_file}")


✅ Cleaned file saved: ../../11_annotations/x/updatet_labels_original_acsa.jsonl


### ACSA: Merge the 5 seeds with majority vote into a single file (separated by splits)

In [5]:
import os
import json
from collections import Counter

# --- Helper functions ---
def get_frequency_for_counts(counts, minimum):
    """Return the ``minimum``-th largest value in ``counts``.

    ``counts`` holds one occurrence count per list. The result is the largest
    number ``c`` such that at least ``minimum`` of the lists contain the item
    ``c`` times or more.

    If ``counts`` has fewer than ``minimum`` entries the item cannot reach the
    required number of lists, so 0 is returned instead of raising IndexError.
    """
    top = sorted(counts, reverse=True)[:minimum]
    if len(top) < minimum:
        return 0
    return top[minimum - 1]

def get_unique_keys(dict_list):
    """Return every key that occurs in any dict of ``dict_list``, without repeats.

    Keys are returned in first-seen order (dicts preserve insertion order), so
    the result is the same on every run; a plain ``set`` of strings would be
    iterated in an order that can change between interpreter sessions.
    """
    ordered_keys = {}
    for d in dict_list:
        for key in d.keys():
            ordered_keys.setdefault(key, None)
    return list(ordered_keys)

def merge_aspect_lists(aspect_lists, minimum_appearance=3, oid=None):
    """
    Merge several lists of (category, polarity) labels into one list of tuples.

    Args:
        aspect_lists: one list of labels per run; every label is a sequence of
            strings. ``None`` in place of a list is treated as "no labels".
        minimum_appearance: passed to ``get_frequency_for_counts`` to decide
            how many copies of each label are kept.
        oid: identifier of the sample; only used by the optional warning
            (currently commented out).

    Returns:
        (label, counter_exclude): ``label`` is the merged list of tuples;
        ``counter_exclude`` is the number of distinct labels whose summed
        count over all lists is below ``minimum_appearance``.
    """
    counter_exclude = 0
    aspect_lists_counter = []
    for aspect_list in aspect_lists:
        # A JSON ``null`` label list arrives here as None; count it as empty
        # instead of failing on iteration.
        aspect_counter = dict(Counter("#####".join(aspect) for aspect in (aspect_list or [])))
        aspect_lists_counter.append(aspect_counter)

    unique_tuples = get_unique_keys(aspect_lists_counter)

    label = []
    for tuple_str in unique_tuples:
        # Occurrences of this label in each individual list
        counts = [asp.get(tuple_str, 0) for asp in aspect_lists_counter]
        total_count = sum(counts)

        if total_count < minimum_appearance:
            counter_exclude += 1
            # print(f"Warning: original_id={oid}, label={tuple_str.split('#####')} appeared only {total_count} times")

        count_tuple = get_frequency_for_counts(counts, minimum_appearance)
        tuple_reverse = tuple(tuple_str.split("#####"))  # back to (category, polarity)
        label += count_tuple * [tuple_reverse]

    return label, counter_exclude

# --- Paths ---
input_folder = "results_clean/acsa_clean"
output_folder = "results_majority"
os.makedirs(output_folder, exist_ok=True)

# NOTE(review): input files are addressed as Split_{k+1}, while the report line
# and the output file name use Split_{k} -- confirm this naming is intended.
for k in range(5):
    # The five input files of this split (index i = 0..4, presumably the seed)
    splits = [
        f"acsa_gerestaurant_gemma3_27b_{i}_30_Split_{k+1}.jsonl"
        for i in range(5)
    ]

    # --- Merge data by 'original_id' ---
    # id and text come from the first file in which an original_id is seen;
    # every later occurrence only contributes its label list.
    data_by_id = {}
    for split_file in splits:
        split_path = os.path.join(input_folder, split_file)
        with open(split_path, "r", encoding="utf-8") as f:
            for line in f:
                entry = json.loads(line)
                oid = entry["original_id"]
                record = data_by_id.get(oid)
                if record is None:
                    record = {
                        "id": entry["id"],
                        "original_id": oid,
                        "text": entry["text"],
                        "labels_lists": []
                    }
                    data_by_id[oid] = record
                record["labels_lists"].append(entry.get("labels", []))

    # --- Apply majority merge ---
    merged_entries = []
    total_excluded = 0
    for oid, info in data_by_id.items():
        merged_labels, counter_exclude = merge_aspect_lists(
            info["labels_lists"],
            minimum_appearance=3,
            oid=oid,
        )
        total_excluded += counter_exclude
        merged_entries.append(dict(
            id=info["id"],
            original_id=info["original_id"],
            text=info["text"],
            labels=merged_labels,
        ))

    print(f"Total excluded labels in Split {k}: {total_excluded}")

    # --- Save merged result ---
    output_file = os.path.join(output_folder, f"acsa_gerestaurant_gemma3_27b_merged_30_Split_{k}.jsonl")
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(merged, ensure_ascii=False) + "\n"
            for merged in merged_entries
        )

    print(f"Merged results saved to {output_file}")


Total excluded labels in Split 0: 10
Merged results saved to results_majority\acsa_gerestaurant_gemma3_27b_merged_30_Split_0.jsonl
Total excluded labels in Split 1: 15
Merged results saved to results_majority\acsa_gerestaurant_gemma3_27b_merged_30_Split_1.jsonl
Total excluded labels in Split 2: 17
Merged results saved to results_majority\acsa_gerestaurant_gemma3_27b_merged_30_Split_2.jsonl
Total excluded labels in Split 3: 13
Merged results saved to results_majority\acsa_gerestaurant_gemma3_27b_merged_30_Split_3.jsonl
Total excluded labels in Split 4: 8
Merged results saved to results_majority\acsa_gerestaurant_gemma3_27b_merged_30_Split_4.jsonl


### Combine the splits into a single file

In [None]:
import os
import json

# --- Paths ---
input_folder = "results_majority"   # Folder containing the per-split JSONL files
output_folder = "results_majority"
os.makedirs(output_folder, exist_ok=True)

output_name = "acsa_gerestaurant_gemma3_27b_merged_30.jsonl"
output_file = os.path.join(output_folder, output_name)

# Only the per-split files that belong to this output are combined
# (<output base name>_Split_<n>.jsonl). The folder also receives merged files
# of other tasks and earlier combined outputs, which must not be mixed in.
split_prefix = os.path.splitext(output_name)[0] + "_Split_"

# --- Combine the matching .jsonl files from the input folder ---
if os.path.exists(output_file):
    print(f"Output file already exists: {output_file}. Aborting to prevent overwrite.")
else:
    # Sorted so the entries are written in the same order on every run
    # (os.listdir returns names in arbitrary order).
    jsonl_files = sorted(
        f for f in os.listdir(input_folder)
        if f.startswith(split_prefix) and f.endswith(".jsonl")
    )

    combined_count = 0

    with open(output_file, "w", encoding="utf-8") as out_f:
        for file_name in jsonl_files:
            file_path = os.path.join(input_folder, file_name)
            with open(file_path, "r", encoding="utf-8") as in_f:
                for line in in_f:
                    # Each line is parsed and re-serialised; unparsable lines
                    # are reported and left out of the combined file.
                    try:
                        entry = json.loads(line.strip())
                        out_f.write(json.dumps(entry, ensure_ascii=False) + "\n")
                        combined_count += 1
                    except json.JSONDecodeError:
                        print(f"Skipping invalid JSON line in {file_name}")

    print(f"Combined {combined_count} entries from {len(jsonl_files)} files into {output_file}")


Combined 4520 entries from 5 files into results_clean\tasd_crowd.jsonl
