In [1]:
import os
import glob
import json
from collections import Counter
from tqdm.auto import tqdm
from concurrent.futures import ProcessPoolExecutor

# 1. Locations of the unit files and of the JSON report
base_dir = "/home/2022113135/datasets/aihub_a2a_unit"

# One entry per (subset, language) pair, keyed "<subset>_<language>" and
# pointing at "<base_dir>/<subset>/<language>". Insertion order is
# train_ko, train_en, valid_ko, valid_en.
paths = {
    f"{subset}_{lang}": os.path.join(base_dir, f"{subset}/{lang}")
    for subset in ("train", "valid")
    for lang in ("ko", "en")
}

# Relative file name: the report is written to the current working directory.
output_file = "unit_counts_all_splits.json"

def process_file(file_path):
    """Count the whitespace-separated units in a single text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``Counter`` mapping each unit (token) to the number of times it
        occurs in the file. An empty ``Counter`` is returned when the file
        cannot be opened, read, or decoded as UTF-8, so that one bad file
        does not abort a whole run; the failure is reported on stderr.
    """
    import sys  # local import keeps this block independent of file-level imports

    try:
        # Explicit encoding: without it the decoding depends on the platform
        # locale, while the results file is written as UTF-8.
        with open(file_path, 'r', encoding='utf-8') as f:
            # str.split() with no argument splits on any run of whitespace,
            # so newlines, tabs and repeated spaces never yield empty units.
            return Counter(f.read().split())
    except (OSError, UnicodeDecodeError) as e:
        # Best-effort: skip the file, but say so. Silently dropping it would
        # make the totals look complete when they are not.
        print(f"Warning: skipping {file_path}: {e}", file=sys.stderr)
        return Counter()

def get_counts_for_split(directory):
    """Aggregate unit frequencies over every ``*.txt`` file in a directory.

    The files are handed to a process pool in batches of 2000 so that the
    progress bar advances once per batch rather than once per file.

    Args:
        directory: Directory whose ``*.txt`` files (non-recursive) are counted.

    Returns:
        A plain ``dict`` mapping each unit to its total count across all
        files found in ``directory``.
    """
    txt_files = glob.glob(os.path.join(directory, "*.txt"))
    print(f"Processing {len(txt_files)} files in {directory}...")

    batch_size = 2000
    batch_starts = range(0, len(txt_files), batch_size)
    merged = Counter()

    # One worker process per CPU core; each worker returns a per-file Counter.
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as pool:
        for start in tqdm(batch_starts):
            batch = txt_files[start:start + batch_size]
            for file_counter in pool.map(process_file, batch):
                merged.update(file_counter)

    # Convert to a plain dict so the result is directly JSON-serialisable.
    return dict(merged)

# 2. Collect the unit counts of every configured split, keyed by split name
all_results = {
    name: get_counts_for_split(directory)
    for name, directory in paths.items()
}

# 3. Write the per-split counts to a single JSON file
with open(output_file, 'w', encoding='utf-8') as json_out:
    json.dump(all_results, json_out, indent=4, ensure_ascii=False)

saved_to = os.path.abspath(output_file)
print(f"\nSuccess! Counts saved to: {saved_to}")

# Quick sanity check: total token count and vocabulary size per split
print("\n--- Summary ---")
for name, unit_counts in all_results.items():
    print(
        f"{name:10} | Tokens: {sum(unit_counts.values()):,}"
        f" | Unique Units: {len(unit_counts)}"
    )

Processing 150660 files in /home/2022113135/datasets/aihub_a2a_unit/train/ko...


  0%|          | 0/76 [00:00<?, ?it/s]

Processing 150660 files in /home/2022113135/datasets/aihub_a2a_unit/train/en...


  0%|          | 0/76 [00:00<?, ?it/s]

Processing 9416 files in /home/2022113135/datasets/aihub_a2a_unit/valid/ko...


  0%|          | 0/5 [00:00<?, ?it/s]

Processing 9416 files in /home/2022113135/datasets/aihub_a2a_unit/valid/en...


  0%|          | 0/5 [00:00<?, ?it/s]


Success! Counts saved to: /home/2022113135/jjs/notebooks/unit_counts_all_splits.json

--- Summary ---
train_ko   | Tokens: 34,545,370 | Unique Units: 969
train_en   | Tokens: 39,805,808 | Unique Units: 909
valid_ko   | Tokens: 2,134,670 | Unique Units: 931
valid_en   | Tokens: 2,471,902 | Unique Units: 854
