### Combine data from different JSON files to create a homophone dictionary

In [1]:
# Import libraries and packages
import json

In [12]:
def read_json(src):
    """Load the JSON document stored at ``src`` and return the parsed object.

    The file is opened as UTF-8 text and parsed in one go.
    """
    with open(src, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return parsed

def read_jsonl(src):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed as an independent JSON value. Blank or
    whitespace-only lines (for example an empty line at the end of the file)
    are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        src: Path to the UTF-8 encoded JSONL file.

    Returns:
        A list with one parsed value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(src, 'r', encoding='utf-8') as file:
        # Parse each line separately; the strip() test drops empty lines.
        return [json.loads(line) for line in file if line.strip()]

def save_as_json(data, output_file):
    """Write ``data`` to ``output_file`` as UTF-8 JSON indented by 4 spaces.

    Non-ASCII characters are written as-is rather than ``\\u``-escaped.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        handle.write(json.dumps(data, ensure_ascii=False, indent=4))

def save_as_jsonl(data, output_file):
    """Write every item of ``data`` to ``output_file`` as one JSON value per line.

    The file is UTF-8 encoded, non-ASCII characters are kept unescaped, and
    each record is terminated by a newline.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

def combine_data(dictionary, homophone, phoneme):
    """Merge three position-aligned sources into one list of word entries.

    Element ``i`` of each argument is combined into a single output entry,
    so the three sequences must have the same length.

    Args:
        dictionary: Sequence of dicts providing the ``word``, ``pos``,
            ``definition``, ``example`` and ``pro`` keys.
        homophone: Sequence of dicts that may contain ``homophone1``,
            ``homophone2`` and ``additional_homophones`` (a list).
        phoneme: Sequence of dicts providing the ``pronunciation`` key.

    Returns:
        A list of dicts with the keys ``word``, ``pos``, ``definition``,
        ``example``, ``pro``, ``phoneme`` and ``homophones``. ``homophones``
        is a list, empty when the entry has no homophones.

    Raises:
        ValueError: If the three sources differ in length. Pairing them by
            position would otherwise silently drop or misalign entries.
        KeyError: If a required key is missing from a dictionary or phoneme
            item.
    """
    if not (len(dictionary) == len(homophone) == len(phoneme)):
        raise ValueError(
            "dictionary, homophone and phoneme must have the same length "
            f"(got {len(dictionary)}, {len(homophone)}, {len(phoneme)})"
        )

    combined_data = []
    for dict_item, homophone_item, phoneme_item in zip(
        dictionary, homophone, phoneme
    ):
        combined_item = {
            'word': dict_item['word'],
            'pos': dict_item['pos'],
            'definition': dict_item['definition'],
            'example': dict_item['example'],
            'pro': dict_item['pro'],
            'phoneme': phoneme_item['pronunciation'],
        }

        # Collect every homophone into a single list: the two fixed slots
        # first (when present), then any additional ones.
        homophones_list = [
            homophone_item[key]
            for key in ('homophone1', 'homophone2')
            if key in homophone_item
        ]
        # `or []` also covers an explicit null for additional_homophones.
        homophones_list.extend(
            homophone_item.get('additional_homophones') or []
        )

        # The list stays empty when the word has no homophones.
        combined_item['homophones'] = homophones_list
        combined_data.append(combined_item)
    return combined_data

def filter_words_with_homophones(data):
    """Return the entries of ``data`` whose ``'homophones'`` value is truthy.

    Entries are returned in their original order; the input is not modified.
    """
    kept = []
    for entry in data:
        if not entry['homophones']:
            continue  # no homophones recorded for this entry
        kept.append(entry)
    return kept

In [13]:
# Source files: two JSON documents and one JSONL file
khmer_dictionary = "data/khmer_dictionary.json"
homophone_list = "data/homophone_list.json"
word_pronunciation = "data/word_pronunciation.jsonl"

# Destination files: the full combined data and the homophones-only subset
output = "data/dictKh_with_homophones.json"
filtered_output = "data/dictKh_with_homophones_only.json"

In [9]:
# Load the three sources, merge them entry by entry, and write the result to `output`
dictionary = read_json(src=khmer_dictionary)
homophones = read_json(src=homophone_list)
phonemes = read_jsonl(src=word_pronunciation)
data = combine_data(
    dictionary=dictionary,
    homophone=homophones,
    phoneme=phonemes,
)
save_as_json(data=data, output_file=output)

In [14]:
# Keep only the words that have at least one homophone and save them to a new file.
# The combined file is read once; the loaded data is reused for filtering
# instead of being read from disk a second time.
data = read_json(output)
filtered_data = filter_words_with_homophones(data)
save_as_json(filtered_data, filtered_output)