# Save dictionary words and their pronunciations to JSON/Excel files

In [3]:
import json
import os
from collections import defaultdict

import pandas as pd
from datasets import load_dataset

def save_dict_to_json(entries, output_file):
    """Group pronunciations by word and write the result to a JSON file.

    Each entry's ``pro`` value is treated as a bracketed, comma-separated
    list of pronunciations. Surrounding brackets and all hyphens are removed,
    every piece is whitespace-trimmed, and blank pieces are ignored.

    The written file maps each distinct word to an object of the form::

        {"groundtruth": "<word>", "homophones": ["<pronunciation>", ...]}

    Words and pronunciations keep their first-seen order, and a pronunciation
    is listed at most once per word.

    Args:
        entries: Iterable of mappings providing ``"word"`` and ``"pro"`` keys.
            A ``pro`` of ``None`` or ``""`` contributes no pronunciations, but
            the word is still recorded.
        output_file: Path of the JSON file to write (UTF-8; overwritten if it
            already exists).
    """
    # dicts preserve insertion order, so the inner dict acts as an ordered
    # set: O(1) duplicate detection instead of scanning a list every time.
    homophones_by_word = {}

    for entry in entries:
        word = entry["word"]
        seen = homophones_by_word.setdefault(word, {})

        # Treat a missing pronunciation as empty instead of crashing on None.
        raw = entry["pro"] or ""
        for piece in raw.strip().strip("[]").replace("-", "").split(","):
            # Trim so "[a, b]" yields "b" rather than " b" (which would also
            # slip past the duplicate check).
            piece = piece.strip()
            if piece:
                seen[piece] = None

    payload = {
        word: {"groundtruth": word, "homophones": list(seen)}
        for word, seen in homophones_by_word.items()
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def save_dict_to_excel(entries, output_file):
    """Write one (word, pronunciation) row per pronunciation to an Excel file.

    Each entry's ``pro`` value is treated as a bracketed, comma-separated
    list of pronunciations. Surrounding brackets and all hyphens are removed,
    every piece is whitespace-trimmed, and blank pieces are dropped. An entry
    with no usable pronunciation still produces a single row with an empty
    ``homophone`` cell, so the word itself is not lost from the sheet.

    The sheet always has the columns ``groundtruth`` and ``homophone``, even
    when ``entries`` is empty.

    Args:
        entries: Iterable of mappings providing ``"word"`` and ``"pro"`` keys.
            A ``pro`` of ``None`` is treated like an empty string.
        output_file: Path of the ``.xlsx`` file to write (via openpyxl).
    """
    rows = []

    for entry in entries:
        word = entry["word"]

        # Treat a missing pronunciation as empty instead of crashing on None.
        raw = entry["pro"] or ""
        pieces = (
            piece.strip()
            for piece in raw.strip().strip("[]").replace("-", "").split(",")
        )
        # Drop blanks (e.g. from "[a,,b]"); fall back to one empty cell so a
        # word without any pronunciation still appears in the output.
        pronunciations = [piece for piece in pieces if piece] or [""]

        rows.extend(
            {"groundtruth": word, "homophone": pronunciation}
            for pronunciation in pronunciations
        )

    # Explicit columns keep the header row present even when there are no rows.
    df = pd.DataFrame(rows, columns=["groundtruth", "homophone"])

    df.to_excel(output_file, index=False, engine="openpyxl")
    
# Export the first batches of the Khmer dictionary, one file per batch.
number_of_files = 10
number_of_words_per_file = 100
OUTPUT_DIR = "output"

# Loading the dataset does not depend on the loop variable, so do it once
# instead of once per batch.
khmer_dictionary = load_dataset("seanghay/khmer-dictionary-44k")

# Neither open() nor DataFrame.to_excel() creates missing parent directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for i in range(number_of_files):
    # Derive the range from the batch size so changing it cannot produce
    # overlapping or skipped entries.
    begin_range = i * number_of_words_per_file
    end_range = begin_range + number_of_words_per_file
    OUTPUT_FILE_JSON = f"{OUTPUT_DIR}/word_list_{begin_range}_{end_range-1}.json"  # Using json format for better structure
    OUTPUT_FILE_XLSX = f"{OUTPUT_DIR}/word_list_{begin_range}_{end_range-1}.xlsx"

    # Collect the entries of this batch (end_range is exclusive)
    entries_to_save = khmer_dictionary["train"].select(range(begin_range, end_range))

    # Only the Excel export is active; uncomment the next line to also write JSON.
    # save_dict_to_json(entries_to_save, OUTPUT_FILE_JSON)
    save_dict_to_excel(entries_to_save, OUTPUT_FILE_XLSX)
    print(f"Saving loop: {i}")


{'word': 'ក', 'pos': 'ន.', 'pro': '[ក]', 'definition': 'តួអក្សរទី១ក្នុងវគ្គទី១នៃអក្ខរក្រមព្យញ្ជនៈខ្មែរ។\xa0តួអក្សរនេះមានឈ្មោះថា\xa0ក\xa0ហើយតាងឱ្យព្យញ្ជនៈ /ក/\xa0មានអក្ខរសូរអន្តរជាតិ\xa0[k]\xa0ជាកណ្ឋជៈ មានសំឡេងក្នុងឋានបំពង់ក ជាសិថិល អឃោសៈ។', 'example': ''}
Saving loop: 0
{'word': 'ក', 'pos': 'ន.', 'pro': '[ក]', 'definition': 'តួអក្សរទី១ក្នុងវគ្គទី១នៃអក្ខរក្រមព្យញ្ជនៈខ្មែរ។\xa0តួអក្សរនេះមានឈ្មោះថា\xa0ក\xa0ហើយតាងឱ្យព្យញ្ជនៈ /ក/\xa0មានអក្ខរសូរអន្តរជាតិ\xa0[k]\xa0ជាកណ្ឋជៈ មានសំឡេងក្នុងឋានបំពង់ក ជាសិថិល អឃោសៈ។', 'example': ''}
Saving loop: 1
{'word': 'ក', 'pos': 'ន.', 'pro': '[ក]', 'definition': 'តួអក្សរទី១ក្នុងវគ្គទី១នៃអក្ខរក្រមព្យញ្ជនៈខ្មែរ។\xa0តួអក្សរនេះមានឈ្មោះថា\xa0ក\xa0ហើយតាងឱ្យព្យញ្ជនៈ /ក/\xa0មានអក្ខរសូរអន្តរជាតិ\xa0[k]\xa0ជាកណ្ឋជៈ មានសំឡេងក្នុងឋានបំពង់ក ជាសិថិល អឃោសៈ។', 'example': ''}
Saving loop: 2
{'word': 'ក', 'pos': 'ន.', 'pro': '[ក]', 'definition': 'តួអក្សរទី១ក្នុងវគ្គទី១នៃអក្ខរក្រមព្យញ្ជនៈខ្មែរ។\xa0តួអក្សរនេះមានឈ្មោះថា\xa0ក\xa0ហើយតាងឱ្យព្យញ្ជនៈ /ក/\xa0មានអក្ខរសូរអន្តរជាតិ\xa0[k