In [None]:
import xml.etree.ElementTree as ET
from collections import defaultdict

# Read the XML file
xml_file = 'vietnet_food (th·ªß c√¥ng).xml'
tree = ET.parse(xml_file)
root = tree.getroot()

# Prefix map for the 'dc' namespace (not used by the element lookups below)
ns = {'dc': 'https://globalwordnet.github.io/schemas/dc/'}

# Lookup tables filled in while parsing
synsets = {}  # synset_id -> {lemmas: [], definition: str}
children = defaultdict(list)  # parent_synset_id -> [child_synset_ids]
parents = {}  # child_synset_id -> parent_synset_id (last hypernym seen wins)

# Parse LexicalEntry elements to collect the lemmas attached to each synset
lemma_to_synset = defaultdict(list)
for entry in root.findall('.//LexicalEntry'):
    # The lemma belongs to the entry, so resolve it once instead of per sense.
    lemma_elem = entry.find('Lemma')
    lemma = lemma_elem.get('writtenForm') if lemma_elem is not None else None
    if lemma is None:
        # Malformed entry (no <Lemma> or no writtenForm): skip it rather than
        # crash, and avoid storing None, which would break the later join.
        continue

    for sense in entry.findall('Sense'):
        synset_id = sense.get('synset')
        if synset_id is not None:
            lemma_to_synset[synset_id].append(lemma)

# Parse Synsets
for synset in root.findall('.//Synset'):
    synset_id = synset.get('id')

    # Definition text; a missing or empty <Definition> element becomes ""
    # (Element.text is None for an empty element).
    definition_elem = synset.find('Definition')
    definition = (definition_elem.text or "") if definition_elem is not None else ""

    # Lemmas collected from the lexical entries above
    lemmas = lemma_to_synset.get(synset_id, [])

    # Store the synset record
    synsets[synset_id] = {
        'lemmas': lemmas,
        'definition': definition
    }

    # Record hypernym relations only (other relation types are ignored)
    for relation in synset.findall('SynsetRelation'):
        rel_type = relation.get('relType')
        target = relation.get('target')

        if rel_type == 'hypernym' and target is not None:
            parents[synset_id] = target
            # A repeated relation must not list the same child twice,
            # otherwise the subtree would be exported twice.
            if synset_id not in children[target]:
                children[target].append(synset_id)

# Find root nodes: synsets that are not a child of any synset present in this
# file. A hypernym that points at an id missing from the file must not hide
# the synset from the export, so such synsets are treated as roots as well.
attached = {
    child_id
    for parent_id, child_ids in children.items()
    if parent_id in synsets
    for child_id in child_ids
}
root_synsets = [sid for sid in synsets if sid not in attached]

# Recursive function that exports the tree in the required format
def export_tree(synset_id, level=0, _ancestors=None):
    """Render a synset and all of its descendants as indented text lines.

    Reads the module-level ``synsets`` and ``children`` tables.

    Args:
        synset_id: Id of the synset to render; must be a key of ``synsets``.
        level: Depth of this synset; used for the tab indent and the
            ``(level)`` marker.
        _ancestors: Internal. Ids on the path from the starting synset down
            to the current one; callers should leave it unset.

    Returns:
        A list of strings, one per rendered synset, in depth-first order with
        the children of each node visited in sorted id order.

    Raises:
        KeyError: If ``synset_id`` is not present in ``synsets``.
    """
    if _ancestors is None:
        _ancestors = set()

    info = synsets[synset_id]
    # join() on an empty list already yields '', so no special case is needed.
    lemmas_str = ', '.join(info['lemmas'])
    # A missing definition is rendered as empty text, not the word "None".
    definition = info['definition'] or ''

    indent = '\t' * level
    lines = [f"{indent}({level}) {{{lemmas_str}}} [{synset_id}]: {definition}"]

    # Track the current path so a cyclic hypernym chain cannot recurse forever.
    _ancestors.add(synset_id)
    for child_id in sorted(children.get(synset_id, [])):
        if child_id in _ancestors:
            # Back-edge to a node already on this path: skip it.
            continue
        lines.extend(export_tree(child_id, level + 1, _ancestors))
    _ancestors.discard(synset_id)

    return lines

# Render the whole forest: one subtree per root, roots taken in sorted order
output_lines = []
for root_id in sorted(root_synsets):
    output_lines += export_tree(root_id)

# Write the result to disk as newline-joined UTF-8 text
output_file = 'vietnet_food_export.txt'
with open(output_file, mode='w', encoding='utf-8') as f:
    f.write('\n'.join(output_lines))

# Print a short summary, then a preview of the first 10 exported lines
print(f"‚úÖ ƒê√£ xu·∫•t {len(output_lines)} d√≤ng ra file: {output_file}")
print(f"‚úÖ T·ªïng s·ªë synsets: {len(synsets)}")
print(f"‚úÖ S·ªë root synsets: {len(root_synsets)}")
print("\nüìÑ 10 d√≤ng ƒë·∫ßu ti√™n:")
preview = output_lines[:10]
print(*preview, sep='\n')


✅ Đã xuất 590 dòng ra file: vietnet_food_export.txt
✅ Tổng số synsets: 590
✅ Số root synsets: 2

📄 10 dòng đầu tiên:
(0) {thức ăn, thức} [vietnet-food-00045016-n]: thứ, món, loại, nói chung [thường nói về đồ ăn uống]
	(1) {cao lương mĩ vị, cao lương mỹ vị, mĩ vị, mỹ vị} [vietnet-food-00005530-n]: món ăn ngon và quý [nói khái quát]
		(2) {cao lương} [vietnet-food-00005529-n]: thịt béo và gạo ngon; món ăn ngon [nói khái quát]
		(2) {hải vị} [vietnet-food-00019494-n]: Thức ăn quý, chế biến từ các sản phẩm lấy ở biển
		(2) {sơn hào} [vietnet-food-00040423-n]: thức ăn quý, chế biến bằng sản phẩm lấy ở rừng núi
	(1) {cỗ} [vietnet-food-00009756-n]: tất cả những món ăn bày thành mâm để cúng lễ và ăn uống, theo tục lệ
	(1) {cơm} [vietnet-food-00010368-n]: những thức làm thành một bữa ăn [nói tổng quát]
	(1) {cơm bữa} 