In [1]:
import wn
from collections import defaultdict
import pandas as pd

# Load lexicon
lexicon = wn.Wordnet('oewn:2024')

# L·∫•y 2 synset c·∫ßn th·ªëng k√™
synset_ids = ['oewn-00021445-n', 'oewn-07571428-n']
synsets = [lexicon.synset(sid) for sid in synset_ids]

def collect_hyponyms_recursive(synset, level=0, visited_synsets=None, all_data=None):
    """
    Thu th·∫≠p t·∫•t c·∫£ hyponyms theo ƒë·ªá quy v√† th·ªëng k√™ theo level
    visited_synsets: set ƒë·ªÉ tr√°nh tr√πng l·∫∑p synset
    all_data: dict ƒë·ªÉ l∆∞u th·ªëng k√™ theo level {level: {'synsets': set(), 'lemmas': list()}}
    """
    if visited_synsets is None:
        visited_synsets = set()
    if all_data is None:
        all_data = defaultdict(lambda: {'synsets': set(), 'lemmas': []})
    
    # Ki·ªÉm tra tr√πng l·∫∑p synset
    synset_id = synset.id
    if synset_id in visited_synsets:
        return all_data
    visited_synsets.add(synset_id)
    
    # Th√™m synset v√†o level hi·ªán t·∫°i
    all_data[level]['synsets'].add(synset_id)
    
    # Th√™m t·∫•t c·∫£ lemmas v√†o level hi·ªán t·∫°i (kh√¥ng lo·∫°i b·ªè tr√πng)
    for lemma in synset.lemmas():
        all_data[level]['lemmas'].append(lemma)
    
    # Duy·ªát c√°c hyponym
    hyponyms = synset.hyponyms()
    for hyponym in hyponyms:
        collect_hyponyms_recursive(hyponym, level + 1, visited_synsets, all_data)
    
    return all_data

# Th·ªëng k√™ cho t·ª´ng synset ri√™ng
results_per_synset = {}
for i, synset in enumerate(synsets):
    synset_id = synset_ids[i]
    visited = set()
    data = defaultdict(lambda: {'synsets': set(), 'lemmas': []})
    
    # Thu th·∫≠p d·ªØ li·ªáu
    collect_hyponyms_recursive(synset, level=0, visited_synsets=visited, all_data=data)
    
    # Chuy·ªÉn ƒë·ªïi sang dict d·ªÖ ƒë·ªçc
    stats = {}
    for level in sorted(data.keys()):
        stats[level] = {
            'synsets_count': len(data[level]['synsets']),
            'lemmas_count': len(data[level]['lemmas']),
            'synsets': sorted(data[level]['synsets']),
            'lemmas': data[level]['lemmas']
        }
    
    # L∆∞u th√¥ng tin ƒë·∫ßy ƒë·ªß c·ªßa synset g·ªëc
    examples = synset.examples()
    results_per_synset[synset_id] = {
        'stats': stats,
        'total_synsets': len(visited),
        'total_lemmas': sum(len(data[level]['lemmas']) for level in data.keys()),
        'synset_info': {
            'id': synset.id,
            'pos': synset.pos,
            'lemmas': ', '.join(lemma for lemma in synset.lemmas()),
            'definition': synset.definition(),
            'examples': ', '.join(examples) if examples else '(kh√¥ng c√≥)',
            'num_lemmas': len(list(synset.lemmas())),
            'num_direct_hyponyms': len(synset.hyponyms())
        }
    }

# Th·ªëng k√™ chung cho c·∫£ 2 synset - ƒë∆°n gi·∫£n l√† t·ªïng c·ªßa 2 sheet ri√™ng r·∫Ω
# T√¨m t·∫•t c·∫£ c√°c level c√≥ trong c·∫£ 2 synset
all_levels = set()
for result in results_per_synset.values():
    all_levels.update(result['stats'].keys())

# T√≠nh t·ªïng cho t·ª´ng level
stats_combined = {}
for level in sorted(all_levels):
    synsets_count = 0
    lemmas_count = 0
    synsets_list = []
    lemmas_list = []
    
    for result in results_per_synset.values():
        if level in result['stats']:
            synsets_count += result['stats'][level]['synsets_count']
            lemmas_count += result['stats'][level]['lemmas_count']
            synsets_list.extend(result['stats'][level]['synsets'])
            lemmas_list.extend(result['stats'][level]['lemmas'])
    
    stats_combined[level] = {
        'synsets_count': synsets_count,
        'lemmas_count': lemmas_count,
        'synsets': sorted(synsets_list),
        'lemmas': lemmas_list
    }

total_combined = {
    'total_synsets': sum(result['total_synsets'] for result in results_per_synset.values()),
    'total_lemmas': sum(result['total_lemmas'] for result in results_per_synset.values())
}

# In k·∫øt qu·∫£
print("=" * 80)
print("TH·ªêNG K√ä THEO T·ª™NG SYNSET RI√äNG R·∫º")
print("=" * 80)

for synset_id, result in results_per_synset.items():
    info = result['synset_info']
    print(f"\nüìä SYNSET: {info['lemmas']} ({synset_id})")
    print("-" * 80)
    print("TH√îNG TIN SYNSET G·ªêC:")
    print(f"  ID: {info['id']}")
    print(f"  POS: {info['pos']}")
    print(f"  Lemmas: {info['lemmas']}")
    print(f"  Definition: {info['definition']}")
    print(f"  Examples: {info['examples']}")
    print(f"  S·ªë lemmas: {info['num_lemmas']}")
    print(f"  S·ªë hyponyms tr·ª±c ti·∫øp: {info['num_direct_hyponyms']}")
    print("\nTH·ªêNG K√ä:")
    print(f"  T·ªïng s·ªë synsets (kh√¥ng tr√πng): {result['total_synsets']}")
    print(f"  T·ªïng s·ªë lemmas (c√≥ tr√πng): {result['total_lemmas']}")
    print("\nTh·ªëng k√™ theo level:")
    print(f"{'Level':<10} {'S·ªë synsets':<15} {'S·ªë lemmas':<15}")
    print("-" * 40)
    for level in sorted(result['stats'].keys()):
        stats = result['stats'][level]
        print(f"{level:<10} {stats['synsets_count']:<15} {stats['lemmas_count']:<15}")

print("\n" + "=" * 80)
print("TH·ªêNG K√ä CHUNG CHO C·∫¢ 2 SYNSET")
print("=" * 80)
print("\nTH√îNG TIN SYNSET G·ªêC:")
for i, synset in enumerate(synsets):
    synset_id = synset_ids[i]
    info = results_per_synset[synset_id]['synset_info']
    print(f"\n  üìå SYNSET G·ªêC {i+1}: {synset_id}")
    print(f"    ID: {info['id']}")
    print(f"    POS: {info['pos']}")
    print(f"    Lemmas: {info['lemmas']}")
    print(f"    Definition: {info['definition']}")
    print(f"    Examples: {info['examples']}")
    print(f"    S·ªë lemmas: {info['num_lemmas']}")
    print(f"    S·ªë hyponyms tr·ª±c ti·∫øp: {info['num_direct_hyponyms']}")

print("\nTH·ªêNG K√ä:")
print(f"  T·ªïng s·ªë synsets (kh√¥ng tr√πng): {total_combined['total_synsets']}")
print(f"  T·ªïng s·ªë lemmas (c√≥ tr√πng): {total_combined['total_lemmas']}")
print("\nTh·ªëng k√™ theo level:")
print(f"{'Level':<10} {'S·ªë synsets':<15} {'S·ªë lemmas':<15}")
print("-" * 40)
for level in sorted(stats_combined.keys()):
    stats = stats_combined[level]
    print(f"{level:<10} {stats['synsets_count']:<15} {stats['lemmas_count']:<15}")

print("\n" + "=" * 80)
print("CHI TI·∫æT THEO LEVEL (CHUNG)")
print("=" * 80)
for level in sorted(stats_combined.keys()):
    print(f"\nüìà Level {level}:")
    print(f"  - S·ªë synsets: {stats_combined[level]['synsets_count']}")
    print(f"  - S·ªë lemmas: {stats_combined[level]['lemmas_count']}")
    if level <= 2:  # Ch·ªâ hi·ªÉn th·ªã chi ti·∫øt cho level 0-2 ƒë·ªÉ kh√¥ng qu√° d√†i
        print(f"  - Synsets: {', '.join(stats_combined[level]['synsets'][:10])}{'...' if len(stats_combined[level]['synsets']) > 10 else ''}")

# Xu·∫•t ra file Excel
print("\n" + "=" * 80)
print("XU·∫§T RA FILE EXCEL")
print("=" * 80)

filename = 'statistics.xlsx'
print(f"T√™n file: {filename}")

# T·∫°o Excel writer
with pd.ExcelWriter(filename, engine='openpyxl') as writer:
    # Sheet 1: food, nutrient (oewn-00021445-n)
    synset_id_1 = synset_ids[0]
    result_1 = results_per_synset[synset_id_1]
    info_1 = result_1['synset_info']
    sheet_name_1 = f"{info_1['lemmas']} ({synset_id_1})"
    
    # T·∫°o DataFrame cho sheet 1
    data_1 = []
    for level in sorted(result_1['stats'].keys()):
        stats = result_1['stats'][level]
        data_1.append({
            'Level': level,
            'S·ªë synsets': stats['synsets_count'],
            'S·ªë lemmas': stats['lemmas_count']
        })
    df_1 = pd.DataFrame(data_1)
    df_1.to_excel(writer, sheet_name=sheet_name_1, index=False)
    
    # Sheet 2: food, solid food (oewn-07571428-n)
    synset_id_2 = synset_ids[1]
    result_2 = results_per_synset[synset_id_2]
    info_2 = result_2['synset_info']
    sheet_name_2 = f"{info_2['lemmas']} ({synset_id_2})"
    
    # T·∫°o DataFrame cho sheet 2
    data_2 = []
    for level in sorted(result_2['stats'].keys()):
        stats = result_2['stats'][level]
        data_2.append({
            'Level': level,
            'S·ªë synsets': stats['synsets_count'],
            'S·ªë lemmas': stats['lemmas_count']
        })
    df_2 = pd.DataFrame(data_2)
    df_2.to_excel(writer, sheet_name=sheet_name_2, index=False)
    
    # Sheet 3: C·∫£ 2 synset
    sheet_name_3 = "C·∫£ 2 synset"
    
    # T·∫°o DataFrame cho sheet 3
    data_3 = []
    for level in sorted(stats_combined.keys()):
        stats = stats_combined[level]
        data_3.append({
            'Level': level,
            'S·ªë synsets': stats['synsets_count'],
            'S·ªë lemmas': stats['lemmas_count']
        })
    df_3 = pd.DataFrame(data_3)
    df_3.to_excel(writer, sheet_name=sheet_name_3, index=False)

print(f"‚úÖ ƒê√£ xu·∫•t file Excel: {filename}")
print(f"  - Sheet 1: {sheet_name_1}")
print(f"  - Sheet 2: {sheet_name_2}")
print(f"  - Sheet 3: {sheet_name_3}")


TH·ªêNG K√ä THEO T·ª™NG SYNSET RI√äNG R·∫º

üìä SYNSET: food, nutrient (oewn-00021445-n)
--------------------------------------------------------------------------------
TH√îNG TIN SYNSET G·ªêC:
  ID: oewn-00021445-n
  POS: n
  Lemmas: food, nutrient
  Definition: any substance that can be metabolized by an animal to give energy and build tissue
  Examples: (kh√¥ng c√≥)
  S·ªë lemmas: 2
  S·ªë hyponyms tr·ª±c ti·∫øp: 15

TH·ªêNG K√ä:
  T·ªïng s·ªë synsets (kh√¥ng tr√πng): 1603
  T·ªïng s·ªë lemmas (c√≥ tr√πng): 2333

Th·ªëng k√™ theo level:
Level      S·ªë synsets      S·ªë lemmas      
----------------------------------------
0          1               2              
1          15              39             
2          91              143            
3          334             504            
4          473             656            
5          411             598            
6          217             311            
7          55              72             
8          6        

