## (Bijna-)identieke rijmwoorden opsporen

In [15]:
import json
import re
from collections import defaultdict

# Laad de resultaten
with open('../data/rhyme_analysis_by_word.json', 'r', encoding='utf-8') as f:
    results_by_word = json.load(f)

def normalize_vowels(word):
    """Normaliseer klinkers zodat spellingvarianten gelijk worden"""
    w = word.lower()
    
    # Lange klinkers en tweeklanken normaliseren
    # Volgorde is belangrijk: langste eerst!
    
    # oe/ou/oo/oi → o
    w = w.replace('oey', 'oi')
    w = w.replace('oei', 'oi')
    w = w.replace('oe', 'o')
    w = w.replace('ou', 'o')
    w = w.replace('oo', 'o')
    w = w.replace('oi', 'o')
    
    # ee/ei → e
    w = w.replace('eey', 'ei')
    w = w.replace('ee', 'e')
    w = w.replace('ei', 'e')
    w = w.replace('ey', 'e')
    
    # uu/ue/eu → u
    w = w.replace('uu', 'u')
    w = w.replace('ue', 'u')
    w = w.replace('eu', 'u')
    
    # aa/ae/ai → a
    w = w.replace('aa', 'a')
    w = w.replace('ae', 'a')
    w = w.replace('ai', 'a')
    
    # ie → i
    w = w.replace('ie', 'i')
    
    # ij/y → i
    w = w.replace('ij', 'i')
    w = w.replace('y', 'i')
    
    return w

def normalize_consonants(word):
    """Normaliseer medeklinkers zodat spellingvarianten gelijk worden"""
    w = word
    
    # Langste combinaties eerst!
    
    # sch → sc (of s, afhankelijk van context)
    w = w.replace('sch', 'sc')
    
    # gh → g
    w = w.replace('gh', 'g')
    
    # ch → g (of X als je dat wilt behouden)
    w = w.replace('ch', 'g')
    
    # ph → f
    w = w.replace('ph', 'f')
    
    # th → t
    w = w.replace('th', 't')
    
    # qu → kw
    w = w.replace('qu', 'kw')
    
    # ck → k
    w = w.replace('ck', 'k')
    
    # c → k (behalve in combinaties)
    w = w.replace('c', 'k')
    
    # dt → t (aan het eind)
    w = re.sub(r'dt$', 't', w)
    
    # d → t (aan het eind)
    w = re.sub(r'd$', 't', w)
    
    # Dubbele medeklinkers → enkel
    w = re.sub(r'([bcdfghjklmnpqrstvwxz])\1+', r'\1', w)
    
    # v → f (aan het begin, optioneel)
    # w = re.sub(r'^v', 'f', w)
    
    # z → s (aan het begin, optioneel)
    # w = re.sub(r'^z', 's', w)
    
    # nk → ng + k (voor ng-klank)
    # w = w.replace('nk', 'ngk')
    
    # ng behouden als aparte klank
    # (geen verandering nodig)
    
    return w

def normalize_word(word):
    """Normaliseer zowel klinkers als medeklinkers"""
    w = word.lower()
    w = normalize_vowels(w)
    w = normalize_consonants(w)
    return w

def are_nearly_identical(w1, w2):
    """Check of woorden bijna identiek zijn (zelfde stam, spellingvarianten)"""
    w1_lower = w1.lower()
    w2_lower = w2.lower()
    
    # Exact identiek
    if w1_lower == w2_lower:
        return True, 'identical'
    
    # Normaliseer volledig
    w1_norm = normalize_word(w1_lower)
    w2_norm = normalize_word(w2_lower)
    
    # Identiek na normalisatie
    if w1_norm == w2_norm:
        # Bepaal of het een klinker- of medeklinkervariant is
        w1_vowel = normalize_vowels(w1_lower)
        w2_vowel = normalize_vowels(w2_lower)
        
        if w1_vowel == w2_vowel:
            return True, 'consonant_variant'
        else:
            w1_cons = normalize_consonants(w1_lower)
            w2_cons = normalize_consonants(w2_lower)
            if w1_cons == w2_cons:
                return True, 'vowel_variant'
            else:
                return True, 'vowel_and_consonant_variant'
    
    # Eén is prefix van de ander (genormaliseerd)
    if w1_norm.startswith(w2_norm) or w2_norm.startswith(w1_norm):
        if len(w1_norm) != len(w2_norm):
            longer = w1_norm if len(w1_norm) > len(w2_norm) else w2_norm
            shorter = w2_norm if len(w1_norm) > len(w2_norm) else w1_norm
            diff = longer[len(shorter):]
            if diff in ['e', 'en', 's', 'n', 'es', 'ens', 'er', 'ere', 'eren', 'de', 'den', 'te', 'ten']:
                return True, f'suffix_{diff}'
        return True, 'prefix'
    
    # Alleen verschil in suffix (genormaliseerd)
    for suffix in ['e', 'en', 's', 'n', 'es', 'ens', 'er', 'ere', 'eren', 'de', 'den', 'te', 'ten']:
        if w1_norm + suffix == w2_norm or w2_norm + suffix == w1_norm:
            return True, f'suffix_{suffix}'
    
    return False, None

# Zoek identieke en bijna-identieke rijmwoorden binnen dezelfde rijmgroep
identical_rhymes = []
nearly_identical_rhymes = []

# Groepeer per manuscript, sectie, strofe, en rijmletter
for manuscript, words in results_by_word.items():
    # Groepeer woorden per strofe en rijmletter
    groups = defaultdict(list)
    
    for w in words:
        if w['status'] == 'analyzed':
            key = (w['section'], w['strofe'], w['rhyme_letter'])
            groups[key].append(w)
    
    # Check elke groep op identieke/bijna-identieke woorden
    for (section, strofe, rhyme_letter), group_words in groups.items():
        # Vergelijk alle paren
        for i, w1 in enumerate(group_words):
            for j, w2 in enumerate(group_words):
                if i >= j:  # Voorkom dubbele vergelijkingen
                    continue
                
                is_similar, match_type = are_nearly_identical(w1['word'], w2['word'])
                
                if is_similar:
                    result = {
                        'manuscript': manuscript,
                        'section': section,
                        'strofe': strofe,
                        'rhyme_letter': rhyme_letter,
                        'word1': w1['word'],
                        'word2': w2['word'],
                        'word1_normalized': normalize_word(w1['word'].lower()),
                        'word2_normalized': normalize_word(w2['word'].lower()),
                        'vers1_id': w1['vers_id'],
                        'vers2_id': w2['vers_id'],
                        'vers1_pos': w1['vers_pos'],
                        'vers2_pos': w2['vers_pos'],
                        'match_type': match_type
                    }
                    
                    if match_type == 'identical':
                        identical_rhymes.append(result)
                    else:
                        nearly_identical_rhymes.append(result)

# Resultaten
print(f"\n{'='*70}")
print(f"IDENTIEKE RIJMWOORDEN: {len(identical_rhymes)}")
print('='*70)

#for item in identical_rhymes:
 #   print(f"\n{item['manuscript']} - {item['section']} strofe {item['strofe']} ({item['rhyme_letter']}-rijm):")
  #  print(f"  '{item['word1']}' op vers {item['vers1_pos']} ({item['vers1_id']})")
   # print(f"  '{item['word2']}' op vers {item['vers2_pos']} ({item['vers2_id']})")

print(f"\n{'='*70}")
print(f"BIJNA-IDENTIEKE RIJMWOORDEN: {len(nearly_identical_rhymes)}")
print('='*70)

# Groepeer per match_type voor overzichtelijkheid
by_match_type = defaultdict(list)
for item in nearly_identical_rhymes:
    by_match_type[item['match_type']].append(item)

for match_type, items in sorted(by_match_type.items(), key=lambda x: -len(x[1])):
    print(f"\n--- {match_type.upper()} ({len(items)} gevallen) ---")
    for item in items[:15]:  # Toon eerste 15 per type
        print(f"  {item['manuscript']} {item['section']} str.{item['strofe']} ({item['rhyme_letter']}): "
              f"'{item['word1']}' vs '{item['word2']}' "
              f"→ '{item['word1_normalized']}'")
    if len(items) > 15:
        print(f"  ... en {len(items) - 15} meer")

# Save resultaten
with open('identical_rhymes.json', 'w', encoding='utf-8') as f:
    json.dump(identical_rhymes, f, indent=2, ensure_ascii=False)

with open('nearly_identical_rhymes.json', 'w', encoding='utf-8') as f:
    json.dump(nearly_identical_rhymes, f, indent=2, ensure_ascii=False)

print(f"\nIdentieke rijmen opgeslagen in 'identical_rhymes.json'")
print(f"Bijna-identieke rijmen opgeslagen in 'nearly_identical_rhymes.json'")

# Extra: toon voorbeelden per type
print(f"\n{'='*70}")
print("VOORBEELDEN PER TYPE")
print('='*70)

for match_type in ['vowel_variant', 'consonant_variant', 'vowel_and_consonant_variant']:
    variants = [r for r in nearly_identical_rhymes if r['match_type'] == match_type]
    if variants:
        print(f"\n--- {match_type.upper()} ({len(variants)} gevallen) ---")
        for item in variants[:20]:
            print(f"  '{item['word1']}' vs '{item['word2']}' → '{item['word1_normalized']}'")

# Statistieken totaal
print(f"\n{'='*70}")
print("TOTAAL OVERZICHT")
print('='*70)

total_identical = len(identical_rhymes)
total_nearly = len(nearly_identical_rhymes)
total = total_identical + total_nearly

print(f"\nTotaal identieke rijmen: {total_identical}")
print(f"Totaal bijna-identieke rijmen: {total_nearly}")
print(f"Totaal problematische rijmen: {total}")

print(f"\nBreakdown bijna-identiek:")
for match_type, items in sorted(by_match_type.items(), key=lambda x: -len(x[1])):
    print(f"  {match_type}: {len(items)} ({len(items)/total_nearly*100:.1f}%)")


IDENTIEKE RIJMWOORDEN: 393

BIJNA-IDENTIEKE RIJMWOORDEN: 37

--- CONSONANT_VARIANT (16 gevallen) ---
  B M1 str.20 (A): 'woort' vs 'woirt' → 'wort'
  B M1 str.29 (B): 'ere' vs 'eere' → 'ere'
  B M2 str.8 (A): 'vairt' vs 'vaert' → 'vart'
  B M2 str.20 (B): 'woorde' vs 'woirde' → 'worde'
  B M3 str.26 (A): 'tyden' vs 'tiden' → 'tiden'
  C M1 str.29 (B): 'eere' vs 'ere' → 'ere'
  C M3 str.21 (A): 'eere' vs 'ere' → 'ere'
  D2 M2 str.20 (B): 'woorde' vs 'woerde' → 'worde'
  F M2 str.3 (A): 'leit' vs 'leyt' → 'let'
  F M2 str.20 (B): 'woirde' vs 'woorde' → 'worde'
  F M3 str.25 (A): 'eere' vs 'ere' → 'ere'
  L M3 str.34 (A): 'gaue' vs 'gaeue' → 'gau'
  O M2 str.3 (A): 'leit' vs 'leyt' → 'let'
  O M3 str.17 (A): 'croone' vs 'crone' → 'krone'
  O M3 str.38 (A): 'smake' vs 'smaeke' → 'smake'
  ... en 1 meer

--- VOWEL_VARIANT (12 gevallen) ---
  B M1 str.29 (B): 'eere' vs 'eerre' → 'ere'
  B M1 str.61 (A): 'sat' vs 'sadt' → 'sat'
  B M3 str.19 (A): 'ghelike' vs 'gelike' → 'gelike'
  D M3 str.2

## Lettergreeprijmen opsporen

In [21]:
import json
import re
from collections import defaultdict
from tqdm import tqdm

# Laad de data
print("Loading data...")
with open('../data/verse_id_lookup.json', 'r', encoding='utf-8') as f:
    verse_id_lookup = json.load(f)

with open('../data/stresses_restructured.json', 'r', encoding='utf-8') as f:
    stress_data = json.load(f)

with open('../data/rhyme_analysis_by_word.json', 'r', encoding='utf-8') as f:
    results_by_word = json.load(f)

# ============================================================================
# SPECIFIEKE GEVALLEN WAAR 'T' VERWIJDERD MAG WORDEN
# ============================================================================

T_PREFIX_ALLOWED = {
    # beswaert:tswaert en varianten
    ('beswaert', 'tswaert'),
    ('tswaert', 'beswaert'),
    ('verzwairt', 'tzwairt'),
    ('tzwairt', 'verzwairt'),
    ('beswaert', 'tzwairt'),
    ('tzwairt', 'beswaert'),
    ('verzwairt', 'tswaert'),
    ('tswaert', 'verzwairt'),
    # waren:twaren en varianten
    ('waren', 'twaren'),
    ('twaren', 'waren'),
    ('tvaren', 'gheuaren'),
    ('gheuaren', 'tvaren'),
    ('twaeren', 'waren'),
    ('waren', 'twaeren'),
    ('gheuaren', 'twaren'),
    ('twaren', 'gheuaren'),
    ('tvaren', 'waren'),
    ('waren', 'tvaren'),
    ('tvelt', 'gheuelt'),
    ('gheuelt', 'tvelt'),
    ('tfelt', 'gevelt'),
    ('gevelt', 'tfelt'),
    ('tvelt', 'gevelt'),
    ('gevelt', 'tvelt'),
    ('tfelt', 'gheuelt'),
    ('gheuelt', 'tfelt'),
}

# ============================================================================
# HANDMATIGE TOEVOEGINGEN (met specifieke manuscripten)
# ============================================================================

# Format: (section, strofe, vers1, vers2, [lijst van manuscripten waar het WEL geldt, of None voor alle])
MANUAL_ADDITIONS = [
    ('M1', 3, 31, 36, None),  # Alle manuscripten
    ('M1', 8, 96, 99, None),  # Alle manuscripten
    ('M1', 10, 119, 125, ['A', 'B', 'C', 'G', 'O', 'Y']),  # NIET D, F, L
    ('M1', 23, 291, 294, None),  # Alle manuscripten
]

# ============================================================================
# HULPFUNCTIES
# ============================================================================

def get_word_data(word, vers_id=None):
    """Haal syllabified en stresses op voor een woord."""
    word = word.lower().strip()
    
    if vers_id and vers_id in verse_id_lookup:
        entry = verse_id_lookup[vers_id]
        return entry['syllabified'], entry['stresses']
    
    if word in stress_data:
        entry = stress_data[word]
        if isinstance(entry, list):
            return entry[0]['syllabified'], entry[0]['stresses']
        else:
            return entry['syllabified'], entry['stresses']
    
    return None, None

def get_rhyming_part(word, vers_id=None):
    """Haal het rijmende deel op: alle lettergrepen vanaf de laatste klemtoon."""
    syllables, stresses = get_word_data(word, vers_id)
    
    if syllables is None:
        return None
    
    if stresses:
        last_stress = max(stresses)
    else:
        last_stress = -1
    
    rhyming_syllables = syllables[last_stress:]
    return ''.join(rhyming_syllables)

def normalize_for_comparison(s):
    """Normaliseer voor vergelijking (ZONDER 't' verwijdering)."""
    s = s.lower()
    
    s = re.sub(r'^u', 'f', s)  
    s = re.sub(r'u(?=[aeiou])', 'f', s)
    
    # Klinkernormalisatie
    s = s.replace('oey', 'oi').replace('oei', 'oi')
    s = s.replace('oe', 'o').replace('ou', 'o').replace('oo', 'o').replace('oi', 'o')
    s = s.replace('eey', 'ei')
    s = s.replace('ee', 'e').replace('ei', 'e').replace('ey', 'e')
    s = s.replace('uu', 'u').replace('ue', 'u').replace('eu', 'u')
    s = s.replace('aa', 'a').replace('ae', 'a').replace('ai', 'a')
    s = s.replace('ie', 'i').replace('ij', 'i').replace('y', 'i')
    
    # Medeklinkernormalisatie
    s = s.replace('gh', 'g').replace('ch', 'g')
    s = s.replace('ph', 'f').replace('th', 't')
    s = s.replace('ck', 'k')
    
    # v/f normalisatie
    s = s.replace('v', 'f')
    
    # u aan begin → f (want is eigenlijk v)
  
    
    # u voor klinker → f
 
    
    # s/z normalisatie
    s = s.replace('z', 's')
    
    # c voor e/i → s (maar NIET sc → ss, want sc is een aparte klank)
    # We doen c → s alleen als c NIET voorafgegaan wordt door s
    s = re.sub(r'(?<!s)c(?=[ei])', 's', s)
    
    # c in andere gevallen → k
    s = re.sub(r'(?<!s)c', 'k', s)
    
    # Eind-normalisatie
    s = re.sub(r'dt$', 't', s)
    s = re.sub(r'd$', 't', s)
    s = re.sub(r'([bcdfghjklmnpqrstvwxz])\1+', r'\1', s)
    
    # e/en normalisatie aan het eind
    s = re.sub(r'en$', 'e', s)
    
    return s

def normalize_with_t_removal(s):
    """Normaliseer MET 't' verwijdering aan het begin."""
    s = s.lower()
    
    # Verwijder 't aan het begin
    if s.startswith('t') and len(s) > 2:
        s = s[1:]
    
    # Rest van normalisatie
    return normalize_for_comparison(s)

def is_t_prefix_pair(word1, word2):
    """Check of dit een toegestaan paar is voor 't' prefix verwijdering."""
    w1 = word1.lower()
    w2 = word2.lower()
    return (w1, w2) in T_PREFIX_ALLOWED or (w2, w1) in T_PREFIX_ALLOWED

def is_lettergreeprijm(word1, word2, vers_id1=None, vers_id2=None):
    """Check of twee woorden lettergreeprijm vormen."""
    full1 = get_rhyming_part(word1, vers_id1)
    full2 = get_rhyming_part(word2, vers_id2)
    
    if full1 is None or full2 is None:
        return False, None, None, None
    
    # Standaard normalisatie (zonder 't' verwijdering)
    full1_norm = normalize_for_comparison(full1)
    full2_norm = normalize_for_comparison(full2)
    
    if full1_norm == full2_norm:
        return True, full1, full2, full1_norm
    
    # Check of dit een toegestaan 't' prefix paar is
    if is_t_prefix_pair(word1, word2):
        full1_t = normalize_with_t_removal(full1)
        full2_t = normalize_with_t_removal(full2)
        if full1_t == full2_t:
            return True, full1, full2, full1_t
    
    return False, full1, full2, None

# ============================================================================
# ZOEK LETTERGREEPRIJMEN
# ============================================================================

print("\n" + "="*60)
print("ZOEKEN NAAR LETTERGREEPRIJMEN")
print("="*60)

lettergreeprijmen = []

for manuscript, words in tqdm(results_by_word.items(), desc="Manuscripts"):
    groups = defaultdict(list)
    
    for w in words:
        if w['status'] == 'analyzed':
            key = (w['section'], w['strofe'], w['rhyme_letter'])
            groups[key].append(w)
    
    for (section, strofe, rhyme_letter), group_words in groups.items():
        for i, w1 in enumerate(group_words):
            for j, w2 in enumerate(group_words):
                if i >= j:
                    continue
                
                # Skip identieke woorden
                if w1['word'].lower() == w2['word'].lower():
                    continue
                
                is_lg_rijm, full1, full2, normalized = is_lettergreeprijm(
                    w1['word'], w2['word'], 
                    w1['vers_id'], w2['vers_id']
                )
                
                if is_lg_rijm:
                    lettergreeprijmen.append({
                        'manuscript': manuscript,
                        'section': section,
                        'strofe': strofe,
                        'rhyme_letter': rhyme_letter,
                        'word1': w1['word'],
                        'word2': w2['word'],
                        'rhyming_part1': full1,
                        'rhyming_part2': full2,
                        'normalized': normalized,
                        'vers1_id': w1['vers_id'],
                        'vers2_id': w2['vers_id'],
                        'vers1_pos': w1['vers_pos'],
                        'vers2_pos': w2['vers_pos'],
                        'type': 'lettergreeprijm'
                    })

# ============================================================================
# HANDMATIGE TOEVOEGINGEN
# ============================================================================

print("\nHandmatige toevoegingen verwerken...")
for section, strofe, vers1, vers2, allowed_manuscripts in MANUAL_ADDITIONS:
    for manuscript, words in results_by_word.items():
        # Check of dit manuscript is toegestaan
        if allowed_manuscripts is not None and manuscript not in allowed_manuscripts:
            continue
        
        word1_info = None
        word2_info = None
        
        for w in words:
            if w['section'] == section and w['strofe'] == strofe:
                vers_num = int(w['vers_id'].split('_')[-1])
                if vers_num == vers1:
                    word1_info = w
                elif vers_num == vers2:
                    word2_info = w
        
        if word1_info and word2_info:
            # Check of dit paar al bestaat
            exists = False
            for lg in lettergreeprijmen:
                if (lg['vers1_id'] == word1_info['vers_id'] and lg['vers2_id'] == word2_info['vers_id']) or \
                   (lg['vers1_id'] == word2_info['vers_id'] and lg['vers2_id'] == word1_info['vers_id']):
                    exists = True
                    break
            
            if not exists:
                full1 = get_rhyming_part(word1_info['word'], word1_info['vers_id'])
                full2 = get_rhyming_part(word2_info['word'], word2_info['vers_id'])
                
                lettergreeprijmen.append({
                    'manuscript': manuscript,
                    'section': section,
                    'strofe': strofe,
                    'rhyme_letter': word1_info['rhyme_letter'],
                    'word1': word1_info['word'],
                    'word2': word2_info['word'],
                    'rhyming_part1': full1,
                    'rhyming_part2': full2,
                    'normalized': normalize_for_comparison(full1) if full1 else '',
                    'vers1_id': word1_info['vers_id'],
                    'vers2_id': word2_info['vers_id'],
                    'vers1_pos': word1_info['vers_pos'],
                    'vers2_pos': word2_info['vers_pos'],
                    'type': 'lettergreeprijm',
                    'manual_addition': True
                })

print(f"Totaal lettergreeprijmen gevonden: {len(lettergreeprijmen)}")

# Save lettergreeprijmen
with open('alle_lettergreeprijmen.json', 'w', encoding='utf-8') as f:
    json.dump(lettergreeprijmen, f, indent=2, ensure_ascii=False)

print(f"Lettergreeprijmen opgeslagen in 'alle_lettergreeprijmen.json'")

# ============================================================================
# LAAD IDENTICAL RHYMES
# ============================================================================

with open('identical_rhymes.json', 'r', encoding='utf-8') as f:
    identical_rhymes = json.load(f)

for item in identical_rhymes:
    item['type'] = 'identical'

print(f"Totaal identical rhymes: {len(identical_rhymes)}")

# ============================================================================
# UNIEKE LOCATIES ANALYSE
# ============================================================================

def get_vers_number(vers_id):
    return int(vers_id.split('_')[-1])

def get_section(vers_id):
    return vers_id.split('_')[1]

def cluster_all_locations(lettergreep_rhymes, identical_rhymes):
    """Groepeer alle rijmparen in clusters waar verzen overlappen."""
    all_rhymes = []
    
    for item in lettergreep_rhymes:
        all_rhymes.append({
            'verses': {get_vers_number(item['vers1_id']), get_vers_number(item['vers2_id'])},
            'section': get_section(item['vers1_id']),
            'manuscript': item['manuscript'],
            'items': [item],
            'types': {'lettergreeprijm'}
        })
    
    for item in identical_rhymes:
        all_rhymes.append({
            'verses': {get_vers_number(item['vers1_id']), get_vers_number(item['vers2_id'])},
            'section': get_section(item['vers1_id']),
            'manuscript': item['manuscript'],
            'items': [item],
            'types': {'identical'}
        })
    
    # Groepeer per sectie
    by_section = defaultdict(list)
    for rhyme in all_rhymes:
        by_section[rhyme['section']].append(rhyme)
    
    # Merge overlappende clusters per sectie
    merged_clusters = {}
    for section, clusters in by_section.items():
        merged = []
        
        for cluster in clusters:
            found = False
            for existing in merged:
                if cluster['verses'] & existing['verses']:
                    existing['verses'] |= cluster['verses']
                    existing['items'].extend(cluster['items'])
                    existing['types'] |= cluster['types']
                    existing['manuscripts'].add(cluster['manuscript'])
                    found = True
                    break
            
            if not found:
                cluster['manuscripts'] = {cluster['manuscript']}
                merged.append(cluster)
        
        # Herhaal merging
        changed = True
        while changed:
            changed = False
            new_merged = []
            used = set()
            
            for i, c1 in enumerate(merged):
                if i in used:
                    continue
                
                for j, c2 in enumerate(merged):
                    if j <= i or j in used:
                        continue
                    
                    if c1['verses'] & c2['verses']:
                        c1['verses'] |= c2['verses']
                        c1['items'].extend(c2['items'])
                        c1['types'] |= c2['types']
                        c1['manuscripts'] |= c2['manuscripts']
                        used.add(j)
                        changed = True
                
                new_merged.append(c1)
                used.add(i)
            
            merged = new_merged
        
        merged_clusters[section] = merged
    
    return merged_clusters

# Cluster alles
print("\nClusteren van alle rijmen...")
all_clusters = cluster_all_locations(lettergreeprijmen, identical_rhymes)

# ============================================================================
# STATISTIEKEN
# ============================================================================

print(f"\n{'='*80}")
print("UNIEKE LOCATIES ANALYSE")
print('='*80)

total_all = 0
stats_by_section = {}

for section in ['M1', 'M2', 'M3']:
    clusters = all_clusters.get(section, [])
    
    only_lettergreep = sum(1 for c in clusters if c['types'] == {'lettergreeprijm'})
    only_identical = sum(1 for c in clusters if c['types'] == {'identical'})
    both = sum(1 for c in clusters if 'lettergreeprijm' in c['types'] and 'identical' in c['types'])
    
    total_all += len(clusters)
    
    stats_by_section[section] = {
        'total': len(clusters),
        'only_lettergreep': only_lettergreep,
        'only_identical': only_identical,
        'both': both
    }
    
    print(f"\n{section}: {len(clusters)} unieke locaties")
    print(f"  - Alleen lettergreeprijm: {only_lettergreep}")
    print(f"  - Alleen identical: {only_identical}")
    print(f"  - Beide types: {both}")

print(f"\nTOTAAL: {total_all} unieke locaties")

# ============================================================================
# DETAIL OUTPUT
# ============================================================================

print(f"\n{'='*80}")
print("DETAIL PER SECTIE")
print('='*80)

output_data = {
    'total_unique_locations': total_all,
    'by_section': {}
}

for section in ['M1', 'M2', 'M3']:
    clusters = all_clusters.get(section, [])
    clusters_sorted = sorted(clusters, key=lambda x: min(x['verses']))
    
    print(f"\n--- {section} ({len(clusters)} unieke locaties) ---")
    
    section_data = []
    for cluster in clusters_sorted:
        verses = sorted(cluster['verses'])
        types = list(cluster['types'])
        manuscripts = sorted(cluster['manuscripts'])
        
        # Verzamel info
        word_pairs = set()
        identical_words = set()
        
        for item in cluster['items']:
            if item.get('type') == 'lettergreeprijm':
                pair = tuple(sorted([item['word1'].lower(), item['word2'].lower()]))
                word_pairs.add(pair)
            else:
                word = item.get('word') or item.get('word1')
                if word:
                    identical_words.add(word.lower())
        
        verses_str = ':'.join(str(v) for v in verses)
        types_str = '+'.join(sorted(types))
        manuscripts_str = ','.join(manuscripts)
        
        info_parts = []
        if identical_words:
            info_parts.append(f"identical: {', '.join(sorted(identical_words))}")
        if word_pairs:
            pairs_str = '; '.join(f"'{p[0]}':'{p[1]}'" for p in list(word_pairs)[:3])
            info_parts.append(f"lettergreep: {pairs_str}")
        
        print(f"  {section}_{verses_str} [{types_str}]: {' | '.join(info_parts)} (in: {manuscripts_str})")
        
        section_data.append({
            'verses': verses,
            'types': types,
            'identical_words': list(identical_words),
            'lettergreep_pairs': [list(p) for p in word_pairs],
            'manuscripts': manuscripts
        })
    
    output_data['by_section'][section] = {
        'count': len(clusters),
        'statistics': stats_by_section[section],
        'locations': section_data
    }

# ============================================================================
# TELLINGEN PER MANUSCRIPT
# ============================================================================

print(f"\n{'='*80}")
print("TELLINGEN PER MANUSCRIPT")
print('='*80)

# Lettergreeprijmen per manuscript
lg_per_manuscript = defaultdict(lambda: defaultdict(int))
for item in lettergreeprijmen:
    manuscript = item['manuscript']
    section = item['section']
    lg_per_manuscript[manuscript][section] += 1
    lg_per_manuscript[manuscript]['totaal'] += 1

# Identical per manuscript
id_per_manuscript = defaultdict(lambda: defaultdict(int))
for item in identical_rhymes:
    manuscript = item['manuscript']
    section = item['section']
    id_per_manuscript[manuscript][section] += 1
    id_per_manuscript[manuscript]['totaal'] += 1

manuscripts_sorted = sorted(set(list(lg_per_manuscript.keys()) + list(id_per_manuscript.keys())))

print(f"\nLETTERGREEPRIJMEN PER MANUSCRIPT:")
print(f"{'Manuscript':<12} {'M1':>8} {'M2':>8} {'M3':>8} {'Totaal':>10}")
print("-" * 50)

totals_lg = {'M1': 0, 'M2': 0, 'M3': 0, 'totaal': 0}
for ms in manuscripts_sorted:
    m1 = lg_per_manuscript[ms].get('M1', 0)
    m2 = lg_per_manuscript[ms].get('M2', 0)
    m3 = lg_per_manuscript[ms].get('M3', 0)
    tot = lg_per_manuscript[ms]['totaal']
    print(f"{ms:<12} {m1:>8} {m2:>8} {m3:>8} {tot:>10}")
    totals_lg['M1'] += m1
    totals_lg['M2'] += m2
    totals_lg['M3'] += m3
    totals_lg['totaal'] += tot

print("-" * 50)
print(f"{'TOTAAL':<12} {totals_lg['M1']:>8} {totals_lg['M2']:>8} {totals_lg['M3']:>8} {totals_lg['totaal']:>10}")

print(f"\nIDENTICAL RHYMES PER MANUSCRIPT:")
print(f"{'Manuscript':<12} {'M1':>8} {'M2':>8} {'M3':>8} {'Totaal':>10}")
print("-" * 50)

totals_id = {'M1': 0, 'M2': 0, 'M3': 0, 'totaal': 0}
for ms in manuscripts_sorted:
    m1 = id_per_manuscript[ms].get('M1', 0)
    m2 = id_per_manuscript[ms].get('M2', 0)
    m3 = id_per_manuscript[ms].get('M3', 0)
    tot = id_per_manuscript[ms]['totaal']
    print(f"{ms:<12} {m1:>8} {m2:>8} {m3:>8} {tot:>10}")
    totals_id['M1'] += m1
    totals_id['M2'] += m2
    totals_id['M3'] += m3
    totals_id['totaal'] += tot

print("-" * 50)
print(f"{'TOTAAL':<12} {totals_id['M1']:>8} {totals_id['M2']:>8} {totals_id['M3']:>8} {totals_id['totaal']:>10}")

print(f"\nGECOMBINEERD (LETTERGREEP + IDENTICAL):")
print(f"{'Manuscript':<12} {'M1':>8} {'M2':>8} {'M3':>8} {'Totaal':>10}")
print("-" * 50)

totals_comb = {'M1': 0, 'M2': 0, 'M3': 0, 'totaal': 0}
for ms in manuscripts_sorted:
    m1 = lg_per_manuscript[ms].get('M1', 0) + id_per_manuscript[ms].get('M1', 0)
    m2 = lg_per_manuscript[ms].get('M2', 0) + id_per_manuscript[ms].get('M2', 0)
    m3 = lg_per_manuscript[ms].get('M3', 0) + id_per_manuscript[ms].get('M3', 0)
    tot = m1 + m2 + m3
    print(f"{ms:<12} {m1:>8} {m2:>8} {m3:>8} {tot:>10}")
    totals_comb['M1'] += m1
    totals_comb['M2'] += m2
    totals_comb['M3'] += m3
    totals_comb['totaal'] += tot

print("-" * 50)
print(f"{'TOTAAL':<12} {totals_comb['M1']:>8} {totals_comb['M2']:>8} {totals_comb['M3']:>8} {totals_comb['totaal']:>10}")

# Save tellingen
output_tellingen = {
    'lettergreeprijmen': {
        'per_manuscript': {ms: dict(lg_per_manuscript[ms]) for ms in manuscripts_sorted},
        'totals': totals_lg
    },
    'identical_rhymes': {
        'per_manuscript': {ms: dict(id_per_manuscript[ms]) for ms in manuscripts_sorted},
        'totals': totals_id
    },
    'combined': {
        'per_manuscript': {ms: {
            'M1': lg_per_manuscript[ms].get('M1', 0) + id_per_manuscript[ms].get('M1', 0),
            'M2': lg_per_manuscript[ms].get('M2', 0) + id_per_manuscript[ms].get('M2', 0),
            'M3': lg_per_manuscript[ms].get('M3', 0) + id_per_manuscript[ms].get('M3', 0),
            'totaal': lg_per_manuscript[ms]['totaal'] + id_per_manuscript[ms]['totaal']
        } for ms in manuscripts_sorted},
        'totals': totals_comb
    }
}

with open('rijmfouten_per_manuscript.json', 'w', encoding='utf-8') as f:
    json.dump(output_tellingen, f, indent=2, ensure_ascii=False)

print(f"\nTellingen per manuscript opgeslagen in 'rijmfouten_per_manuscript.json'")

# ============================================================================
# SAMENVATTING
# ============================================================================

print(f"\n{'='*80}")
print("SAMENVATTING")
print('='*80)

print(f"\nRuwe data:")
print(f"  Lettergreeprijmen: {len(lettergreeprijmen)} paren")
print(f"  Identical rhymes: {len(identical_rhymes)} paren")
print(f"  Totaal: {len(lettergreeprijmen) + len(identical_rhymes)} paren")

print(f"\nUnieke locaties: {total_all}")

Loading data...

ZOEKEN NAAR LETTERGREEPRIJMEN


Manuscripts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:01<00:00, 15.62it/s]



Handmatige toevoegingen verwerken...
Totaal lettergreeprijmen gevonden: 736
Lettergreeprijmen opgeslagen in 'alle_lettergreeprijmen.json'
Totaal identical rhymes: 393

Clusteren van alle rijmen...

UNIEKE LOCATIES ANALYSE

M1: 95 unieke locaties
  - Alleen lettergreeprijm: 43
  - Alleen identical: 29
  - Beide types: 23

M2: 42 unieke locaties
  - Alleen lettergreeprijm: 14
  - Alleen identical: 17
  - Beide types: 11

M3: 58 unieke locaties
  - Alleen lettergreeprijm: 23
  - Alleen identical: 20
  - Beide types: 15

TOTAAL: 195 unieke locaties

DETAIL PER SECTIE

--- M1 (95 unieke locaties) ---
  M1_2:8 [lettergreeprijm]: lettergreep: 'doerstaen':'staen' (in: B)
  M1_7:11 [lettergreeprijm]: lettergreep: 'ontfaen':'vaen'; 'beuaen':'ontfaen' (in: A,B,C,D,F,G,L,O)
  M1_14:18 [identical]: identical: dit (in: A,B,C,D,F,G,L,O,Y)
  M1_31:36 [lettergreeprijm]: lettergreep: 'negheen':'scheen'; 'negeen':'sceen'; 'gheen':'sceen' (in: A,B,C,D,F,G,L,O,Y)
  M1_41:46 [lettergreeprijm]: lettergreep: