In [1]:
import json
import os
from pathlib import Path

def _dedupe_by_fields(items, fields):
    """Return *items* without later repeats, comparing only the given *fields*.

    The first occurrence of each field combination is kept and the original
    order is preserved. Any other keys on the items (e.g. "VA") are ignored
    for the comparison.
    """
    seen = set()
    unique = []
    for item in items:
        identity = tuple(item[field] for field in fields)
        if identity not in seen:
            seen.add(identity)
            unique.append(item)
    return unique


def remove_duplicates_from_file(filepath):
    """Remove duplicate quadruplets or triplets from a JSONL file, in place.

    Each non-blank line of the file is parsed as one JSON object. For an
    entry that has a "Quadruplet" list, items are considered duplicates when
    their (Aspect, Category, Opinion) match; otherwise, for an entry that has
    a "Triplet" list, items are duplicates when their (Aspect, Opinion)
    match. Only the first occurrence is kept. Entries with neither key are
    written back unchanged.

    The file is always rewritten (one JSON object per line, UTF-8,
    non-ASCII characters kept as-is), even when nothing was removed.

    Args:
        filepath: Path of the JSONL file to clean (str or path-like).

    Returns:
        The total number of items removed across all entries.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a quadruplet/triplet item lacks one of the compared keys.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        # Blank lines (typically a trailing one) are not JSON documents;
        # skip them instead of letting json.loads fail on them.
        data = [json.loads(line) for line in f if line.strip()]

    # (list key, fields that define identity). Order matters: an entry that
    # has "Quadruplet" is never processed as a "Triplet" entry as well.
    rules = (
        ("Quadruplet", ("Aspect", "Category", "Opinion")),
        ("Triplet", ("Aspect", "Opinion")),
    )

    total_removed = 0
    for entry in data:
        for list_key, fields in rules:
            if list_key in entry:
                original_items = entry[list_key]
                unique_items = _dedupe_by_fields(original_items, fields)
                total_removed += len(original_items) - len(unique_items)
                entry[list_key] = unique_items
                break

    # Write back to file with UTF-8 encoding and ensure_ascii=False
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n' for entry in data
        )

    return total_removed

# Driver: walk the dev-set prediction folders (one sub-folder per subtask)
# and de-duplicate every JSONL file found there, reporting per-file results.
base_dir = Path("results/results_pred_dev")

for subtask in (2, 3):
    subtask_dir = base_dir.joinpath(f"subtask_{subtask}")

    if subtask_dir.exists():
        # Banner for this subtask.
        print(f"\n{'='*60}")
        print(f"Processing Subtask {subtask}")
        print(f"{'='*60}")

        # Each file is cleaned in place; the helper returns how many items it dropped.
        for filepath in subtask_dir.glob("*.jsonl"):
            total_removed = remove_duplicates_from_file(filepath)
            if total_removed <= 0:
                print(f"✓ {filepath.name}: No duplicates found")
            else:
                print(f"✓ {filepath.name}: Removed {total_removed} duplicates")
    else:
        # Nothing to do for a subtask whose folder is missing.
        print(f"Directory {subtask_dir} does not exist, skipping...")

# Closing banner once every subtask has been handled.
print(f"\n{'='*60}")
print("Duplicate removal completed!")
print(f"{'='*60}")


Processing Subtask 2
✓ pred_eng_laptop.jsonl: No duplicates found
✓ pred_eng_restaurant.jsonl: No duplicates found
✓ pred_tat_restaurant.jsonl: No duplicates found
✓ pred_rus_restaurant.jsonl: No duplicates found
✓ pred_jpn_hotel.jsonl: No duplicates found
✓ pred_ukr_restaurant.jsonl: No duplicates found

Processing Subtask 3
✓ pred_eng_laptop.jsonl: No duplicates found
✓ pred_eng_restaurant.jsonl: No duplicates found
✓ pred_tat_restaurant.jsonl: No duplicates found
✓ pred_zho_restaurant.jsonl: Removed 1 duplicates
✓ pred_rus_restaurant.jsonl: No duplicates found
✓ pred_jpn_hotel.jsonl: No duplicates found
✓ pred_ukr_restaurant.jsonl: No duplicates found
✓ pred_zho_laptop.jsonl: No duplicates found

Duplicate removal completed!
