## Rijmalgoritme

In [20]:
import json
import re
from collections import defaultdict, Counter
from tqdm import tqdm

def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


print("Loading data...")

# Per-verse lookup: consulted first, keyed by vers_id.
verse_id_lookup = _read_json('../data/verse_id_lookup.json')

# Per-word stress data: used as a fallback when a vers_id has no lookup entry.
stress_data = _read_json('../data/stresses_restructured.json')

# Original verse data: for every manuscript a mapping of vers_id -> last word.
vers_data = _read_json('../data/vers_laatste_woorden.json')

# Verse-id suffixes that are excluded from the analysis.
skip_suffixes = {
    'M1_01_000', 'M2_01_000', 'M3_01_000',
    'M1_75_976', 'M2_26_339', 'M3_39_508',
}


def should_skip_vers(vers_id):
    """Return True when ``vers_id`` ends with one of the ``skip_suffixes``."""
    # str.endswith accepts a tuple of candidates and tests them all at once.
    return vers_id.endswith(tuple(skip_suffixes))

def parse_vers_id(vers_id):
    """Split a vers_id such as ``A_M1_01_001`` into its components.

    The underscore-separated fields are read as manuscript, section,
    strofe number and running verse number. The position inside the
    strofe is derived from the running verse number, assuming 13 verses
    per strofe (1-based, wrapping after 13).
    """
    fields = vers_id.split('_')
    running_verse = int(fields[3])
    strofe_no = int(fields[2])

    return dict(
        manuscript=fields[0],
        section=fields[1],
        strofe=strofe_no,
        doorlopend_vers=running_verse,
        vers_in_strofe=(running_verse - 1) % 13 + 1,
    )

# ============================================================================
# RIJMFUNCTIES
# ============================================================================

def remove_consonants_before_first_vowel(input_string):
    """Rewrite 'qu' as 'kw', then drop everything before the first a/e/i/o/u.

    A string without any of those vowels becomes the empty string.
    """
    rewritten = input_string.replace('qu', 'kw')
    for index, char in enumerate(rewritten):
        if char in 'aeiou':
            return rewritten[index:]
    return ''

# A lowercase vowel followed by one or more characters from a/e/i/o/u/A.
_VOWEL_RUN = re.compile(r'([aeiou])[aeiouA]+')


def replace_vowel_followed_by_vowels(input_string):
    """Reduce every vowel run to its first vowel (e.g. 'huus' -> 'hus')."""
    return _VOWEL_RUN.sub(lambda match: match.group(1), input_string)

def replace_y_with_i(input_string):
    """Turn 'y' into 'i' where it acts as a vowel or precedes one.

    When the string holds none of a/e/i/o/u, every 'y' becomes 'i';
    otherwise only a 'y' directly followed by one of those vowels does.
    """
    has_vowel = bool(set('aeiou').intersection(input_string))
    if has_vowel:
        return re.sub(r'y(?=[aeiou])', 'i', input_string)
    return input_string.replace('y', 'i')

def replace_v_with_u(input_string):
    """Replace a leading 'v' by 'u' when the string has none of a/e/i/o/u."""
    if input_string.startswith('v') and not set('aeiou').intersection(input_string):
        return 'u' + input_string[1:]
    return input_string

def replace_u_with_v_before_consonant(input_string):
    """Replace a leading 'u' by 'v' when a lowercase consonant follows it."""
    consonants = 'bcdfghjklmnpqrstvwxyz'
    starts_with_u_consonant = (
        len(input_string) >= 2
        and input_string[0] == 'u'
        and input_string[1] in consonants
    )
    if starts_with_u_consonant:
        return 'v' + input_string[1:]
    return input_string

def replace_j_with_i(input_string):
    """Replace every 'j' by 'i' when the string has none of a/e/i/o/u."""
    has_vowel = any(char in 'aeiou' for char in input_string)
    return input_string if has_vowel else input_string.replace('j', 'i')

def replace_w_with_uu(input_string):
    """Replace a leading 'w' by 'uu' when the string has none of a/e/i/o/u."""
    has_vowel = any(char in 'aeiou' for char in input_string)
    if has_vowel or not input_string.startswith('w'):
        return input_string
    return 'uu' + input_string[1:]

# A consonant repeated one or more times, compared case-insensitively.
_REPEATED_CONSONANT = re.compile(r'([bcdfghjklmnpqrstvwxyz])\1+', flags=re.IGNORECASE)


def replace_identical_consonants(input_string):
    """Collapse a run of the same consonant into its first occurrence."""
    return _REPEATED_CONSONANT.sub(lambda match: match.group(1), input_string)

def replace_u_with_v(input_string):
    """Replace a leading 'u' by 'v' when a vowel (either case) follows it."""
    if input_string[:1] == 'u' and len(input_string) > 1 and input_string[1] in 'aeiouAEIOU':
        return 'v' + input_string[1:]
    return input_string

# 'gh' with a lowercase vowel directly before and directly after it.
_GH_BETWEEN_VOWELS = re.compile(r'(?<=[aeiou])gh(?=[aeiou])')


def replace_gh_between_vowels(text):
    """Simplify 'gh' to 'g' wherever it stands between two vowels."""
    return _GH_BETWEEN_VOWELS.sub('g', text)

def spelling_normalize(syll_str):
    """Normalise the spelling of a rhyme string so that variants compare equal.

    The steps run in a fixed order, because later rewrites rely on the
    output of earlier ones (e.g. 'ch' becomes 'X' before a bare 'c'
    becomes 'k', and 'ng' becomes 'N' before 'Nh' is simplified).
    """
    text = syll_str

    # Digraphs get a single placeholder character.
    for old, new in (('qu', 'kw'), ('ch', 'X'), ('ng', 'N')):
        text = text.replace(old, new)
    text = replace_gh_between_vowels(text)

    # i/y/j spelling variants.
    for old, new in (('eyd', 'eid'), ('ii', 'ij'), ('ij', 'ei'), ('ey', 'ei'), ('ye', 'ie')):
        text = text.replace(old, new)

    text = replace_vowel_followed_by_vowels(text)
    text = replace_identical_consonants(text)
    text = text.replace('qu', 'kw')

    # Word-final 'd'/'dt' -> 't' and 'wt' -> 'ut'.
    text = re.sub(r'dt*$', 't', text)
    text = re.sub(r'wt$', 'ut', text)

    for old, new in (('ck', 'k'), ('c', 'k'), ('ph', 'f'), ('Nh', 'N'), ('nk', 'Nk')):
        text = text.replace(old, new)

    # A final '-en' loses its 'n', so '-e' and '-en' endings compare equal.
    return text[:-1] if text.endswith('en') else text

def get_word_data(word, vers_id=None):
    """Return ``(syllabified, stresses)`` for a word, or ``(None, None)``.

    The entry stored for ``vers_id`` in ``verse_id_lookup`` wins when there
    is one. Otherwise the lower-cased, stripped word is looked up in
    ``stress_data``; if that entry is a list of forms, its first form is
    used.
    """
    normalized = word.lower().strip()

    if vers_id and vers_id in verse_id_lookup:
        # Verse-specific entry.
        hit = verse_id_lookup[vers_id]
    elif normalized in stress_data:
        hit = stress_data[normalized]
        if isinstance(hit, list):
            # Several forms recorded: fall back to the first one.
            hit = hit[0]
    else:
        return None, None

    return hit['syllabified'], hit['stresses']

def _normalize_rhyme_syllable(syllable):
    """Apply the per-syllable i/j/y/u/v/w rewrites, in their fixed order."""
    syllable = re.sub(r'^ie', 'je', syllable)
    syllable = re.sub(r'^i([aeiou])', r'j\1', syllable)
    syllable = syllable.replace('ioe', 'joe')
    for rewrite in (
        replace_y_with_i,
        replace_w_with_uu,
        replace_u_with_v,
        replace_v_with_u,
        replace_u_with_v_before_consonant,
        replace_j_with_i,
    ):
        syllable = rewrite(syllable)
    return syllable


def extract_rhymes(word, vers_id=None):
    """Return the ``(vocalic, consonantal)`` rhyme signature of a word.

    The rhyming part starts at the syllable indexed by the highest value in
    the word's stress list (the last syllable when the list is empty). That
    part is normalised, and then split into its a/e/i/o/u characters
    (vocalic) and all remaining characters (consonantal).

    Returns ``(None, None)`` when no word data is available, or when the
    rhyming part contains no syllables at all.
    """
    word = word.lower().strip()

    syllables, stresses = get_word_data(word, vers_id)
    if syllables is None:
        return None, None

    last_stress = max(stresses) if stresses else -1

    relevant_syllables = [
        _normalize_rhyme_syllable(s) for s in syllables[last_stress:]
    ]
    if not relevant_syllables:
        # Empty syllable list, or a stress index past the end: there is
        # nothing to rhyme on, so report "no data" instead of raising.
        return None, None

    # The onset of the first rhyming syllable does not take part in the rhyme.
    relevant_syllables[0] = remove_consonants_before_first_vowel(relevant_syllables[0])

    syll_str = spelling_normalize(''.join(relevant_syllables))

    vocalic = ''.join(c for c in syll_str if c in 'aeiou')
    consonantal = ''.join(c for c in syll_str if c not in 'aeiou')
    return vocalic, consonantal

# Suffix groups that always count as a full rhyme when BOTH words end in a
# member of the same group.
_FULL_RHYME_SUFFIX_GROUPS = (
    ('ede', 'eden', 'ene', 'enen'),
    ('eine', 'eyne', 'eynen', 'einen'),
    ('lantsloet', 'oet'),
    ('scepe', 'epe', 'scepen', 'epen'),
    ('antwoorde', 'oorde', 'antwoerde', 'oerde',
     'antwoorden', 'oorden', 'antwoerden', 'oerden'),
    ('heit', 'eit'),
    ('like', 'liken', 'ike', 'ijke'),
    # 'ingen' was missing here: the original tuple listed 'inghen' twice.
    ('inge', 'ingen', 'inghe', 'inghen'),
    ('sij', 'ij', 'by', 'y', 'i'),
    ('ien', 'yen', 'jen', 'aien'),
    ('iden', 'ijden', 'yden', 'ide', 'ijde', 'yde'),
    ('ijset', 'iset', 'yset'),
    ('ijn', 'in', 'yn'),
    ('uelt', 'velt', 'elt'),
    ('inc', 'ijnc', 'ync'),
    ('ijt', 'eit', 'eyt', 'yt', 'it'),
    ('iet',),
)

# Directed rules: (suffixes for the first word, suffixes for the second word).
_FULL_RHYME_DIRECTED_RULES = (
    (('coninc', 'koninc'), ('inc',)),
    (('nie', 'ie'), ('je', 'ie')),
    (('je', 'ie'), ('nie', 'ie')),
)

# '-eet' against '-heit' is always classified as a disturbed rhyme.
_EET_SUFFIXES = ('reet', 'eet')
_HEIT_SUFFIXES = ('heyt', 'heit')


def analyze_pair(w1, w2, vers_id1=None, vers_id2=None):
    """Classify the rhyme between two verse-final words.

    Returns one of:
      * 'skipped' / 'damaged' - one of the words is that placeholder
        ('skipped' is checked first);
      * 'full'      - a suffix exception applies, or both the vocalic and
                      the consonantal signatures match;
      * 'assonance' - only the vocalic signatures match;
      * 'disturbed' - the '-eet' / '-heit' exception applies, or only the
                      consonantal signatures match;
      * 'none'      - neither signature matches;
      * 'unknown'   - no rhyme signature could be derived for a word.

    ``vers_id1`` / ``vers_id2`` are passed on to ``extract_rhymes``.
    """
    if 'skipped' in (w1, w2):
        return 'skipped'
    if 'damaged' in (w1, w2):
        return 'damaged'

    # Hand-written exceptions take precedence over the computed signature.
    for suffixes in _FULL_RHYME_SUFFIX_GROUPS:
        if w1.endswith(suffixes) and w2.endswith(suffixes):
            return 'full'
    for first_suffixes, second_suffixes in _FULL_RHYME_DIRECTED_RULES:
        if w1.endswith(first_suffixes) and w2.endswith(second_suffixes):
            return 'full'

    if (w1.endswith(_EET_SUFFIXES) and w2.endswith(_HEIT_SUFFIXES)) or \
            (w1.endswith(_HEIT_SUFFIXES) and w2.endswith(_EET_SUFFIXES)):
        return 'disturbed'

    v1, c1 = extract_rhymes(w1, vers_id1)
    v2, c2 = extract_rhymes(w2, vers_id2)
    if v1 is None or v2 is None:
        return 'unknown'

    vocalic_match = v1 == v2
    consonantal_match = c1 == c2

    if vocalic_match and consonantal_match:
        return 'full'
    if vocalic_match:
        return 'assonance'
    if consonantal_match:
        return 'disturbed'
    return 'none'

def determine_rhyme_quality(rhyme_type_counts):
    """Pick the dominant rhyme type from a mapping of type -> count.

    Only 'full', 'disturbed', 'assonance' and 'none' take part; any other
    key is ignored. Ties are resolved in that order of preference. When the
    highest of the four counts is zero the result is 'none'.
    """
    preference = ('full', 'disturbed', 'assonance', 'none')
    tallies = [rhyme_type_counts.get(kind, 0) for kind in preference]
    highest = max(tallies)

    if highest == 0:
        return 'none'

    # list.index returns the first hit, i.e. the most preferred type on a tie.
    return preference[tallies.index(highest)]

def _strofe_word_record(section, strofe_num, rhyme_letter, info, group_sizes,
                        status, rhyme_quality,
                        rhyme_type_counts=None, rhyme_details=None):
    """Build one result record for a verse-final word of a rhyme group."""
    words_total, damaged_total, skipped_total = group_sizes
    return {
        'section': section,
        'strofe': strofe_num,
        'vers_pos': info['pos'],
        'vers_id': info['vers_id'],
        'word': info['word'],
        'rhyme_letter': rhyme_letter,
        'rhyme_quality': rhyme_quality,
        'status': status,
        'rhyme_type_counts': rhyme_type_counts if rhyme_type_counts is not None else {},
        'rhyme_details': rhyme_details if rhyme_details is not None else {},
        'total_in_group': words_total,
        'damaged_in_group': damaged_total,
        'skipped_in_group': skipped_total,
    }


def analyze_strofe_by_word(strofe, section, strofe_num):
    """Classify the rhyme quality of every verse-final word in one strofe.

    ``strofe`` maps a verse position to ``{'vers_id': ..., 'woord': ...}``.
    Positions 1, 2, 4, 5, 7, 8, 10, 11 form rhyme group 'A'; positions
    3, 6, 9, 12, 13 form rhyme group 'B'.

    Per group the returned list holds, in this order:
      * one record with status 'damaged' (rhyme_quality None) per damaged word;
      * one record with status 'skipped' (rhyme_quality 'none') per skipped word;
      * one record with status 'analyzed' per remaining word, provided the
        group has at least two such words. Each word is compared with every
        other word of its group; 'rhyme_type_counts' tallies the pair
        results, 'rhyme_details' lists the partner words per result, and
        'rhyme_quality' is the dominant type.

    Returns an empty list when no verse of the strofe is usable.
    """
    a_lines = [1, 2, 4, 5, 7, 8, 10, 11]
    b_lines = [3, 6, 9, 12, 13]

    verses = list(strofe.values())
    skipped_in_strofe = sum(1 for v in verses if v['woord'] == 'skipped')

    # A strofe that is missing entirely is ignored rather than reported as
    # skipped words.
    if skipped_in_strofe == len(verses) or skipped_in_strofe == 13:
        return []

    results = []

    for rhyme_letter, positions in [('A', a_lines), ('B', b_lines)]:
        # Sort the verses of this rhyme group into usable / damaged / skipped.
        groups = {'word': [], 'damaged': [], 'skipped': []}
        for pos in positions:
            if pos not in strofe:
                continue
            word = strofe[pos]['woord']
            bucket = word if word in ('damaged', 'skipped') else 'word'
            groups[bucket].append({
                'pos': pos,
                'vers_id': strofe[pos]['vers_id'],
                'word': word,
            })

        words_in_group = groups['word']
        group_sizes = (
            len(words_in_group),
            len(groups['damaged']),
            len(groups['skipped']),
        )

        for damaged_info in groups['damaged']:
            results.append(_strofe_word_record(
                section, strofe_num, rhyme_letter, damaged_info, group_sizes,
                status='damaged', rhyme_quality=None,
            ))

        # Individually skipped verses count as non-rhyming.
        if skipped_in_strofe < 13:
            for skipped_info in groups['skipped']:
                results.append(_strofe_word_record(
                    section, strofe_num, rhyme_letter, skipped_info, group_sizes,
                    status='skipped', rhyme_quality='none',
                ))

        # A single word has nothing to rhyme with.
        if len(words_in_group) < 2:
            continue

        for i, word_info in enumerate(words_in_group):
            rhyme_type_counts = Counter()
            rhyme_details = defaultdict(list)

            for j, other_info in enumerate(words_in_group):
                if i == j:
                    continue
                pair_result = analyze_pair(word_info['word'], other_info['word'],
                                           word_info['vers_id'], other_info['vers_id'])
                rhyme_type_counts[pair_result] += 1
                rhyme_details[pair_result].append(other_info['word'])

            results.append(_strofe_word_record(
                section, strofe_num, rhyme_letter, word_info, group_sizes,
                status='analyzed',
                rhyme_quality=determine_rhyme_quality(rhyme_type_counts),
                rhyme_type_counts=dict(rhyme_type_counts),
                rhyme_details=dict(rhyme_details),
            ))

    return results
# ============================================================================
# MAIN
# ============================================================================

print("\nAnalyzing rhymes per word...")
results_by_word = {}
skipped_count = 0
damaged_count = 0

for manuscript, verzen in tqdm(vers_data.items(), desc="Manuscripts"):
    # Drop the excluded verses; damaged verses stay in and are only counted.
    filtered_verzen = {
        vers_id: woord
        for vers_id, woord in verzen.items()
        if not should_skip_vers(vers_id)
    }
    skipped_count += len(verzen) - len(filtered_verzen)
    damaged_count += sum(1 for woord in filtered_verzen.values() if woord == 'damaged')

    # Group the verses by (section, strofe), keyed by position in the strofe.
    strofes = defaultdict(lambda: defaultdict(dict))
    for vers_id, woord in filtered_verzen.items():
        parsed = parse_vers_id(vers_id)
        strofe_key = (parsed['section'], parsed['strofe'])
        strofes[strofe_key][parsed['vers_in_strofe']] = {
            'vers_id': vers_id,
            'woord': woord
        }

    # Analyse the strofes in (section, strofe) order.
    manuscript_results = []
    for (section, strofe_num), strofe in sorted(strofes.items()):
        manuscript_results.extend(analyze_strofe_by_word(strofe, section, strofe_num))

    results_by_word[manuscript] = manuscript_results

# Persist the per-word results for the follow-up cells.
output_path = '../data/rhyme_analysis_by_word.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(results_by_word, f, indent=2, ensure_ascii=False)

print(f"\n Resultaten opgeslagen in '{output_path}'")

# ============================================================================
# STATISTIEKEN
# ============================================================================

def _pct(part, total):
    """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    return part / total * 100 if total else 0.0


def _with_quality(words, quality):
    """Return the records in ``words`` whose rhyme_quality equals ``quality``."""
    return [w for w in words if w['rhyme_quality'] == quality]


print("\n" + "="*80)
print("STATISTIEKEN PER WOORD (met rijmkwaliteit classificatie)")
print("="*80)

all_words = []
for pairs in results_by_word.values():
    all_words.extend(pairs)

# Split by status
analyzed_words = [w for w in all_words if w['status'] == 'analyzed']
damaged_words = [w for w in all_words if w['status'] == 'damaged']
skipped_words = [w for w in all_words if w['status'] == 'skipped']

total_words = len(all_words)
total_analyzed = len(analyzed_words)
total_damaged = len(damaged_words)
total_skipped = len(skipped_words)

# Classify by rhyme quality
quality_full = _with_quality(analyzed_words, 'full')
quality_disturbed = _with_quality(analyzed_words, 'disturbed')
quality_assonance = _with_quality(analyzed_words, 'assonance')
quality_none_analyzed = _with_quality(analyzed_words, 'none')
quality_none_total = quality_none_analyzed + skipped_words  # skipped words count as non-rhyming

# Damaged words are left out of the quality distribution.
total_countable = total_analyzed + total_skipped

print(f"\nTotaal woorden: {total_words}")
print(f"  - Geanalyseerd: {total_analyzed} ({_pct(total_analyzed, total_words):.1f}%)")
print(f"  - Damaged: {total_damaged} ({_pct(total_damaged, total_words):.1f}%)")
print(f"  - Skipped (individueel): {total_skipped} ({_pct(total_skipped, total_words):.1f}%)")

print(f"\n--- RIJMKWALITEIT VERDELING (van {total_countable} analyseerbare woorden) ---")
print(f"Full rijm (overwegend):      {len(quality_full):5d} ({_pct(len(quality_full), total_countable):5.1f}%)")
print(f"Disturbed rijm (overwegend): {len(quality_disturbed):5d} ({_pct(len(quality_disturbed), total_countable):5.1f}%)")
print(f"Assonance (overwegend):      {len(quality_assonance):5d} ({_pct(len(quality_assonance), total_countable):5.1f}%)")
print(f"Geen rijm:                   {len(quality_none_total):5d} ({_pct(len(quality_none_total), total_countable):5.1f}%)")
print(f"  waarvan geanalyseerd:      {len(quality_none_analyzed):5d}")
print(f"  waarvan skipped:           {len(skipped_words):5d}")

# Sum the pairwise rhyme-type counts over all analysed words
all_rhyme_type_counts = Counter()
for word in analyzed_words:
    all_rhyme_type_counts.update(word['rhyme_type_counts'])

print(f"\n--- RIJMTYPE STATISTIEKEN (alle paarwise vergelijkingen) ---")
total_comparisons = sum(all_rhyme_type_counts.values())
print(f"Totaal vergelijkingen: {total_comparisons}")
for rhyme_type in ['full', 'assonance', 'disturbed', 'none', 'stress_mismatch', 'unknown']:
    count = all_rhyme_type_counts.get(rhyme_type, 0)
    if count > 0:
        print(f"  {rhyme_type}: {count} ({_pct(count, total_comparisons):.1f}%)")

# Per manuscript
print("\n--- Per manuscript ---")
for manuscript, words in results_by_word.items():
    if not words:
        continue
    analyzed = [w for w in words if w['status'] == 'analyzed']
    damaged = [w for w in words if w['status'] == 'damaged']
    skipped = [w for w in words if w['status'] == 'skipped']

    # Manuscripts with only damaged words have nothing to report.
    if not (analyzed or skipped):
        continue

    qual_full = len(_with_quality(analyzed, 'full'))
    qual_dist = len(_with_quality(analyzed, 'disturbed'))
    qual_asso = len(_with_quality(analyzed, 'assonance'))
    qual_none = len(_with_quality(analyzed, 'none'))
    qual_none_total = qual_none + len(skipped)
    total_countable_ms = len(analyzed) + len(skipped)

    print(f"\n{manuscript}:")
    print(f"  Geanalyseerd: {len(analyzed)}, Damaged: {len(damaged)}, Skipped: {len(skipped)}")
    print(f"  Rijmkwaliteit ({total_countable_ms} woorden):")
    print(f"    Full: {qual_full} ({_pct(qual_full, total_countable_ms):.1f}%)")
    print(f"    Disturbed: {qual_dist} ({_pct(qual_dist, total_countable_ms):.1f}%)")
    print(f"    Assonance: {qual_asso} ({_pct(qual_asso, total_countable_ms):.1f}%)")
    print(f"    None: {qual_none_total} ({_pct(qual_none_total, total_countable_ms):.1f}%)")


# Show the first ten examples of every category
print("\n--- VOORBEELDEN PER CATEGORIE ---")

for heading, examples in (
    ("\nFull rijm woorden (eerste 10):", quality_full),
    ("\nDisturbed rijm woorden (eerste 10):", quality_disturbed),
    ("\nAssonance woorden (eerste 10):", quality_assonance),
    ("\nGeen rijm woorden (eerste 10, exclusief skipped):", quality_none_analyzed),
):
    print(heading)
    for w in examples[:10]:
        print(f"  {w['vers_id']}: '{w['word']}' - {w['rhyme_type_counts']}")

Loading data...

Analyzing rhymes per word...


Manuscripts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:05<00:00,  3.39it/s]



 Resultaten opgeslagen in '../data/rhyme_analysis_by_word.json'

STATISTIEKEN PER WOORD (met rijmkwaliteit classificatie)

Totaal woorden: 16521
  - Geanalyseerd: 15441 (93.5%)
  - Damaged: 1053 (6.4%)
  - Skipped (individueel): 27 (0.2%)

--- RIJMKWALITEIT VERDELING (van 15468 analyseerbare woorden) ---
Full rijm (overwegend):      15273 ( 98.7%)
Disturbed rijm (overwegend):    88 (  0.6%)
Assonance (overwegend):         48 (  0.3%)
Geen rijm:                      59 (  0.4%)
  waarvan geanalyseerd:         32
  waarvan skipped:              27

--- RIJMTYPE STATISTIEKEN (alle paarwise vergelijkingen) ---
Totaal vergelijkingen: 87674
  full: 86028 (98.1%)
  assonance: 508 (0.6%)
  disturbed: 790 (0.9%)
  none: 348 (0.4%)

--- Per manuscript ---

A:
  Geanalyseerd: 1758, Damaged: 6, Skipped: 4
  Rijmkwaliteit (1762 woorden):
    Full: 1746 (99.1%)
    Disturbed: 6 (0.3%)
    Assonance: 3 (0.2%)
    None: 7 (0.4%)

B:
  Geanalyseerd: 1809, Damaged: 6, Skipped: 5
  Rijmkwaliteit (1814 w

In [22]:
import json
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import PatternFill, Font, Alignment
from openpyxl.utils import get_column_letter

# Load the per-word results written by the rhyme analysis
print("Loading results...")
with open('../data/rhyme_analysis_by_word.json', 'r', encoding='utf-8') as f:
    results_by_word = json.load(f)

# Flatten to one list, tagging every record with its manuscript
all_words = []
for manuscript, words in results_by_word.items():
    for word in words:
        word['manuscript'] = manuscript
        all_words.append(word)

# ============================================================================
# FILTER: ONLY WORDS THAT ARE NEITHER FULL RHYMES NOR DAMAGED
# ============================================================================

problematic_words = [
    w for w in all_words
    if w['status'] != 'damaged'
    and not (w['status'] == 'analyzed' and w.get('rhyme_quality') == 'full')
]

print(f"Totaal woorden: {len(all_words)}")
print(f"Problematische woorden (niet-full, niet-damaged): {len(problematic_words)}")

# NOTE(review): 'rhj_letter' looks like a typo for 'rhyme_letter'; it is kept
# because the column name is part of the generated spreadsheet.
excel_columns = [
    'vers_id', 'manuscript', 'section', 'strofe', 'probleem_woord',
    'rhj_letter', 'non_rhyme_results', 'corrected_category', 'notes',
]

excel_data = []
for w in problematic_words:
    # Render the pairwise counts as e.g. "full(3), none(1)"
    counts = w.get('rhyme_type_counts', {})
    counts_str = ', '.join([f"{k}({v})" for k, v in counts.items() if v > 0])

    excel_data.append({
        'vers_id': w['vers_id'],
        'manuscript': w['manuscript'],
        'section': w['section'],
        'strofe': w['strofe'],
        'probleem_woord': w['word'],
        'rhj_letter': w['rhyme_letter'],
        'non_rhyme_results': counts_str,
        'corrected_category': '',  # left empty for manual corrections
        'notes': ''  # left empty for manual notes
    })

# Passing the columns explicitly keeps the frame sortable when there are no rows.
df = pd.DataFrame(excel_data, columns=excel_columns)

# Sort by manuscript, section, strofe, vers_id
df = df.sort_values(['manuscript', 'section', 'strofe', 'vers_id'])

print(f"\nTotaal problematische woorden in Excel: {len(df)}")

# Distribution: skipped words are reported under their status, all other
# words under their rhyme quality.
print(f"\nVerdeling:")
quality_counts = {}
for w in problematic_words:
    if w['status'] == 'skipped':
        q = 'skipped'
    else:
        q = w.get('rhyme_quality', 'unknown')
    quality_counts[q] = quality_counts.get(q, 0) + 1

for quality, count in sorted(quality_counts.items()):
    print(f"  {quality}: {count}")

# Write the raw sheet, then reopen it with openpyxl for formatting
excel_path = '../data/probleemwoorden.xlsx'
df.to_excel(excel_path, index=False, sheet_name='woorden')

wb = load_workbook(excel_path)
ws = wb['woorden']

# Header row: bold white text on a coloured background
header_fill = PatternFill(start_color='4A90E2', end_color='4A90E2', fill_type='solid')
header_font = Font(bold=True, color='FFFFFF')

for cell in ws[1]:
    cell.fill = header_fill
    cell.font = header_font
    cell.alignment = Alignment(horizontal='center', vertical='center')

# Fit every column to its longest value, capped at 80
for column in ws.columns:
    column_letter = get_column_letter(column[0].column)
    max_length = max((len(str(cell.value)) for cell in column), default=0)
    ws.column_dimensions[column_letter].width = min(max_length + 2, 80)

# The columns meant for manual input get a fixed width and a light yellow fill
edit_fill = PatternFill(start_color='FFFFCC', end_color='FFFFCC', fill_type='solid')
edit_column_widths = {'corrected_category': 20, 'notes': 50}

for idx, cell in enumerate(ws[1], 1):
    if cell.value not in edit_column_widths:
        continue
    col_letter = get_column_letter(idx)
    ws.column_dimensions[col_letter].width = edit_column_widths[cell.value]
    for row in range(2, ws.max_row + 1):
        ws[f'{col_letter}{row}'].fill = edit_fill

# Keep the header row visible while scrolling
ws.freeze_panes = 'A2'

# Without this save none of the formatting above reaches the file on disk.
wb.save(excel_path)

Loading results...
Totaal woorden: 16521
Problematische woorden (niet-full, niet-damaged): 195

Totaal problematische woorden in Excel: 195

Verdeling:
  assonance: 48
  disturbed: 88
  none: 32
  skipped: 27
