# 1. DBBE

In [None]:
from collections import defaultdict
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import adjusted_rand_score, v_measure_score

from orthography_helpers import LSHIndex, MinHashLshClustering, MinHashProcessor, ShingleGenerator, TextPreprocessor, UnionFind

# Folder that receives every CSV/PNG written by this cell; it is created as
# soon as the cell runs (no parents are created, only this one level).
RESULTS_DIR = Path("dbbe_orthographic_results")
RESULTS_DIR.mkdir(exist_ok=True)
# Input CSV read by main().
DATA_FILE = 'paper_verses.csv'
# Optional CuPy backend. When CuPy cannot be imported, `cp` is aliased to
# NumPy so code written against `cp` still runs on the CPU.
# NOTE: this only checks that the import succeeds, not that a device is usable.
try:
    import cupy as cp
    GPU_AVAILABLE = True
    print("GPU detected - using CuPy acceleration")
except ImportError:
    cp = np
    GPU_AVAILABLE = False
    print("No GPU - using NumPy (CPU mode)")

def reconstruct_poems(df):
    """Collect, per poem, the verse-cluster ids its verses were assigned to.

    Args:
        df: DataFrame with one row per verse and the columns
            ``'idoriginal_poem'`` and ``'cluster_id'``.

    Returns:
        ``(poem_to_clusters, poem_verse_counts)``, both ``defaultdict``s keyed
        by poem id. ``poem_to_clusters`` holds the set of usable cluster ids
        seen in the poem; ``poem_verse_counts`` holds the number of rows of
        the poem. A poem whose verses carry no usable cluster id is counted
        but gets no entry in ``poem_to_clusters``.
    """
    poem_to_clusters = defaultdict(set)
    poem_verse_counts = defaultdict(int)

    # Walk the two columns directly instead of materialising a Series per
    # row with iterrows().
    for poem_id, cluster_id in zip(df['idoriginal_poem'], df['cluster_id']):
        poem_verse_counts[poem_id] += 1
        # -1 is excluded as before (presumably a "no cluster" sentinel).
        # NaN is excluded as well: it shows up when the index is mapped
        # through a cluster dict that has no entry for the verse, and since
        # NaN never equals itself each one would otherwise count as its own
        # cluster and distort the poem-level Jaccard similarity.
        if cluster_id == -1 or pd.isna(cluster_id):
            continue
        poem_to_clusters[poem_id].add(cluster_id)

    print(f"\nReconstructed {len(poem_to_clusters)} poems")
    return poem_to_clusters, poem_verse_counts

def calculate_poem_cluster_similarity(clusters_a: Set[int], clusters_b: Set[int]) -> float:
    """Return the Jaccard similarity of two sets of verse-cluster ids.

    The result is ``|A & B| / |A | B|``; it is 0.0 whenever either set is
    empty.
    """
    if clusters_a and clusters_b:
        n_union = len(clusters_a | clusters_b)
        if n_union > 0:
            return len(clusters_a & clusters_b) / n_union
    return 0.0

def cluster_poems(poem_to_clusters: Dict, similarity_threshold: float = 0.60):
    """Group poems whose verse-cluster sets are similar enough.

    Every unordered pair of poems is scored with
    ``calculate_poem_cluster_similarity``; pairs scoring at least
    ``similarity_threshold`` become edges, and the connected components of
    that graph (found with a union-find) are the poem clusters.

    Args:
        poem_to_clusters: Mapping of poem id -> set of verse-cluster ids.
        similarity_threshold: Minimum similarity for two poems to be linked.

    Returns:
        ``(poem_clusters, edges, n_clusters)`` where ``poem_clusters`` maps
        each poem id to the representative poem id of its component,
        ``edges`` is a list of ``(poem_a, poem_b, similarity)`` tuples and
        ``n_clusters`` is the number of distinct representatives.
    """
    poem_ids = list(poem_to_clusters)

    # All-pairs comparison, visiting pairs in the same (i < j) order as the
    # ids appear in the mapping.
    edges = []
    for position, first in enumerate(poem_ids):
        first_clusters = poem_to_clusters[first]
        for second in poem_ids[position + 1:]:
            score = calculate_poem_cluster_similarity(
                first_clusters, poem_to_clusters[second]
            )
            if score >= similarity_threshold:
                edges.append((first, second, score))

    # Union-find keyed by poem id: parent pointers plus ranks.
    parent = {poem_id: poem_id for poem_id in poem_ids}
    rank = {poem_id: 0 for poem_id in poem_ids}

    def find_root(node):
        # Recursive lookup with path compression.
        if parent[node] != node:
            parent[node] = find_root(parent[node])
        return parent[node]

    def merge(left, right):
        root_left, root_right = find_root(left), find_root(right)
        if root_left == root_right:
            return
        # Attach the lower-ranked root beneath the higher-ranked one.
        if rank[root_left] < rank[root_right]:
            root_left, root_right = root_right, root_left
        parent[root_right] = root_left
        if rank[root_left] == rank[root_right]:
            rank[root_left] += 1

    for first, second, _ in edges:
        merge(first, second)

    poem_clusters = {poem_id: find_root(poem_id) for poem_id in poem_ids}
    n_clusters = len(set(poem_clusters.values()))

    return poem_clusters, edges, n_clusters

def evaluate_clustering(y_true, y_pred):
    """Score a predicted labelling against the ground truth.

    Returns:
        ``(ari, v_measure)`` as computed by scikit-learn's
        ``adjusted_rand_score`` and ``v_measure_score``.
    """
    return adjusted_rand_score(y_true, y_pred), v_measure_score(y_true, y_pred)


def calculate_perfect_reconstruction_rate(df, poem_clusters):
    """Measure how many ground-truth poem groups were recovered exactly.

    A ground-truth group (all poems sharing a ``type_id``) counts as
    perfectly reconstructed when some predicted cluster contains exactly the
    same set of poem ids.

    Args:
        df: DataFrame with ``'idoriginal_poem'`` and ``'type_id'`` columns;
            a poem's type is whatever ``groupby(...).first()`` reports.
        poem_clusters: Mapping of poem id -> predicted cluster id.

    Returns:
        ``(reconstruction_rate, perfectly_reconstructed, total_gt_clusters)``;
        the rate is 0 when there are no ground-truth groups.
    """
    poem_to_type = df.groupby('idoriginal_poem')['type_id'].first().to_dict()

    gt_to_poems = defaultdict(set)
    for poem_id, gt_type in poem_to_type.items():
        gt_to_poems[gt_type].add(poem_id)

    pred_to_poems = defaultdict(set)
    for poem_id, pred_cluster in poem_clusters.items():
        pred_to_poems[pred_cluster].add(poem_id)

    # Hash every predicted member set once so each ground-truth group is
    # matched with a single lookup instead of a scan over all predictions.
    predicted_groups = {frozenset(members) for members in pred_to_poems.values()}
    perfectly_reconstructed = sum(
        frozenset(gt_poems) in predicted_groups for gt_poems in gt_to_poems.values()
    )
    total_gt_clusters = len(gt_to_poems)

    reconstruction_rate = perfectly_reconstructed / total_gt_clusters if total_gt_clusters > 0 else 0
    return reconstruction_rate, perfectly_reconstructed, total_gt_clusters


def visualize_verse_grid_search(results_df, save_path=None):
    """Save a 2x2 heatmap figure summarising the verse-level grid search.

    Each panel is a shingle-size x threshold heatmap of one column of
    ``results_df`` (``ari``, ``v_measure``, ``n_clusters``,
    ``n_similarities``).

    Args:
        results_df: One row per (shingle_size, threshold) combination.
        save_path: Output image path; defaults to
            ``RESULTS_DIR / 'verse_grid_search_results.png'``.
    """
    if save_path is None:
        save_path = RESULTS_DIR / 'verse_grid_search_results.png'

    # (metric column, annotation format, colour-bar label, panel title),
    # listed in the row-major order of the 2x2 grid.
    panels = [
        ('ari', '.4f', 'ARI', 'Adjusted Rand Index (ARI)'),
        ('v_measure', '.4f', 'V-measure', 'V-measure'),
        ('n_clusters', '.0f', 'Clusters', 'Number of Clusters'),
        ('n_similarities', '.0f', 'Similarities', 'Number of Similarities Found'),
    ]

    # Pivot everything before any figure is created.
    grids = [
        results_df.pivot(index='shingle_size', columns='threshold', values=metric)
        for metric, _, _, _ in panels
    ]

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Verse-Level Clustering Grid Search Results', fontsize=18, fontweight='bold')

    col_labels = [f"{col:.0%}" for col in grids[0].columns]

    for ax, grid, (_, number_format, bar_label, title) in zip(axes.flat, grids, panels):
        sns.heatmap(grid, annot=True, fmt=number_format, cmap='viridis', ax=ax,
                    cbar_kws={'label': bar_label}, xticklabels=col_labels)
        ax.set_xlabel('Similarity Threshold', fontweight='bold', fontsize=12)
        ax.set_ylabel('Shingle Size', fontweight='bold', fontsize=12)
        ax.set_title(title, fontweight='bold', fontsize=13)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"\nVisualization saved to: {save_path}")
    plt.close()


def visualize_poem_grid_search(results_df, save_path=None):
    """Save a 2x2 line-plot figure summarising the poem-level grid search.

    The panels plot ARI, V-measure, perfect reconstruction rate (in percent)
    and number of clusters against the similarity threshold. As before, only
    the two bottom panels overlay viridis-coloured markers scaled by value.

    Args:
        results_df: One row per threshold with the columns ``threshold``,
            ``ari``, ``v_measure``, ``perfect_reconstruction_rate`` and
            ``n_clusters``.
        save_path: Output image path; defaults to
            ``RESULTS_DIR / 'poem_grid_search_results.png'``.
    """
    if save_path is None:
        save_path = RESULTS_DIR / 'poem_grid_search_results.png'

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Poem-Level Clustering Grid Search Results', fontsize=16, fontweight='bold')

    thresholds = results_df['threshold'].values
    thresholds_pct = [f"{t:.0%}" for t in thresholds]

    def normalize(vals):
        """Min-max scale to [0, 1]; all zeros when the values have no spread."""
        vals = np.asarray(vals, dtype=float)
        if vals.size == 0:
            return vals
        lowest = np.min(vals)
        spread = np.max(vals) - lowest
        # A constant metric (e.g. a single threshold, or a rate of 0 for every
        # threshold) used to divide by zero and produce NaN colours.
        if not spread > 0:
            return np.zeros_like(vals)
        return (vals - lowest) / spread

    def draw_panel(ax, values, format_label, ylabel, title, color_markers):
        """Plot one metric against the threshold labels and annotate each point."""
        ax.plot(thresholds_pct, values, marker='o', linewidth=2, markersize=8)
        colors = plt.cm.viridis(normalize(values)) if color_markers else None
        for i, (x, y) in enumerate(zip(thresholds_pct, values)):
            if color_markers:
                ax.plot(x, y, marker='o', color=colors[i], markersize=10)
            # The categorical x axis places label i at position i.
            ax.text(i, y, format_label(y), ha='center', va='bottom', fontsize=9)
        ax.set_xlabel('Similarity Threshold', fontweight='bold')
        ax.set_ylabel(ylabel, fontweight='bold')
        ax.set_title(title)
        ax.grid(True, alpha=0.3)

    draw_panel(axes[0, 0], results_df['ari'].values,
               lambda y: f'{y:.4f}',
               'Adjusted Rand Index (ARI)', 'ARI vs Threshold', False)
    draw_panel(axes[0, 1], results_df['v_measure'].values,
               lambda y: f'{y:.4f}',
               'V-measure', 'V-measure vs Threshold', False)
    draw_panel(axes[1, 0], results_df['perfect_reconstruction_rate'].values * 100,
               lambda y: f'{y:.1f}%',
               'Perfect Reconstruction Rate (%)',
               'Perfect Reconstruction Rate vs Threshold', True)
    draw_panel(axes[1, 1], results_df['n_clusters'].values,
               lambda y: f'{y}',
               'Number of Poem Clusters', 'Number of Clusters vs Threshold', True)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"\nVisualization saved to: {save_path}")
    # Close this figure explicitly rather than whichever one is current.
    plt.close(fig)


def verse_level_grid_search(texts, df, thresholds, shingle_sizes, num_perm=128):
    """Try every (shingle size, threshold) pair for verse clustering.

    For each combination a ``MinHashLshClustering`` is run over ``texts``.
    When ``df`` has an ``'idgroup'`` column the clustering is scored against
    it with ARI and V-measure (rows lacking either label are ignored);
    otherwise both scores are 0. The combination with the highest ARI wins,
    the first one winning ties.

    Side effects: prints progress and summary tables, writes the heatmap
    figure via ``visualize_verse_grid_search`` and writes
    ``verse_grid_search_results.csv`` into ``RESULTS_DIR``.

    Args:
        texts: Verse texts handed to the clusterer.
        df: DataFrame describing the same verses; its index is mapped
            through the returned cluster dict.
        thresholds: Similarity thresholds to test.
        shingle_sizes: Shingle sizes to test.
        num_perm: Number of MinHash permutations passed to the clusterer.

    Returns:
        ``(best_clusters, best_similarities, best_threshold,
        best_shingle_size, results_df)``.

    Raises:
        ValueError: If ``thresholds`` or ``shingle_sizes`` is empty.
    """
    # An empty grid previously ran through and died on formatting None.
    if len(thresholds) == 0 or len(shingle_sizes) == 0:
        raise ValueError("thresholds and shingle_sizes must both be non-empty")

    results = []
    best_ari = -1
    best_record = None
    best_threshold = None
    best_shingle_size = None
    best_clusters = None
    best_similarities = None

    print("\n" + "="*100)
    print("VERSE-LEVEL 2D GRID SEARCH")
    print("="*100)
    print(f"\nTesting thresholds: {[f'{t:.0%}' for t in thresholds]}")
    print(f"Testing shingle sizes: {shingle_sizes}\n")

    total_combinations = len(thresholds) * len(shingle_sizes)
    print(f"Total combinations: {total_combinations}\n")

    for shingle_size in shingle_sizes:
        for threshold in thresholds:
            print(f"\nTesting shingle_size={shingle_size}, threshold={threshold:.0%}...")

            clusterer = MinHashLshClustering(
                threshold=threshold,
                shingle_size=shingle_size,
                num_perm=num_perm,
                chunk_size=1
            )

            clusters, similarities = clusterer.cluster(texts)

            if 'idgroup' in df.columns:
                temp_df = df.copy()
                temp_df['cluster_id'] = temp_df.index.map(clusters)

                # Score only verses that have both a ground-truth group and
                # a predicted cluster.
                mask = temp_df['idgroup'].notna() & temp_df['cluster_id'].notna()
                y_true = temp_df.loc[mask, 'idgroup'].tolist()
                y_pred = temp_df.loc[mask, 'cluster_id'].tolist()

                ari, v_measure = evaluate_clustering(y_true, y_pred)
                n_gt_clusters = len(set(y_true))
            else:
                ari, v_measure = 0, 0
                n_gt_clusters = 0

            n_clusters = len(set(clusters.values()))

            record = {
                'shingle_size': shingle_size,
                'threshold': threshold,
                'n_clusters': n_clusters,
                'n_similarities': len(similarities),
                'ari': ari,
                'v_measure': v_measure,
                'n_gt_clusters': n_gt_clusters
            }
            results.append(record)

            if ari > best_ari:
                best_ari = ari
                best_record = record
                best_threshold = threshold
                best_shingle_size = shingle_size
                best_clusters = clusters
                best_similarities = similarities

            print(f"  ARI: {ari:.4f}, V-measure: {v_measure:.4f}, Clusters: {n_clusters}")

    results_df = pd.DataFrame(results)

    print("\n" + "="*100)
    print("VERSE-LEVEL GRID SEARCH SUMMARY")
    print("="*100)
    print(f"\n{'Shingle':<10} {'Threshold':<12} {'Clusters':<10} {'Similarities':<15} {'ARI':<8} {'V-measure':<12}")
    print("-" * 80)

    # Print from the raw records: DataFrame.iterrows() upcasts every row to
    # float, which rendered the integer columns as "2.0" / "1234.0".
    for result in results:
        print(f"{result['shingle_size']:<10} "
              f"{result['threshold']:<12.0%} "
              f"{result['n_clusters']:<10} "
              f"{result['n_similarities']:<15} "
              f"{result['ari']:<8.4f} "
              f"{result['v_measure']:<12.4f}")

    print(f"\n{'='*100}")
    print("BEST VERSE-LEVEL PARAMETERS")
    print("="*100)
    print(f"\nBest parameters by ARI:")
    print(f"  Shingle size: {best_shingle_size}")
    print(f"  Threshold: {best_threshold:.0%}")
    # The winning record was remembered in the loop, so no float-equality
    # lookup in the DataFrame is needed and the counts stay integers.
    best_result = best_record
    print(f"  ARI: {best_result['ari']:.4f}")
    print(f"  V-measure: {best_result['v_measure']:.4f}")
    print(f"  Number of clusters: {best_result['n_clusters']}")
    print(f"  Number of similarities found: {best_result['n_similarities']}")

    visualize_verse_grid_search(results_df)

    results_csv = RESULTS_DIR / 'verse_grid_search_results.csv'
    results_df.to_csv(results_csv, index=False)
    print(f"\nVerse grid search results saved to: {results_csv}")

    return best_clusters, best_similarities, best_threshold, best_shingle_size, results_df


def poem_level_grid_search(df, poem_to_clusters, thresholds):
    """Try every similarity threshold for poem-level clustering.

    Each threshold is passed to ``cluster_poems``. When ``df`` has a
    ``'type_id'`` column the result is scored against it (ARI, V-measure and
    perfect reconstruction rate); otherwise all scores are 0. The threshold
    with the highest ARI wins, the first one winning ties.

    Side effects: prints progress and summary tables, writes the figure via
    ``visualize_poem_grid_search`` and writes
    ``poem_grid_search_results.csv`` into ``RESULTS_DIR``.

    Args:
        df: Verse-level DataFrame with an ``'idoriginal_poem'`` column and,
            optionally, ``'type_id'``.
        poem_to_clusters: Mapping of poem id -> set of verse-cluster ids.
        thresholds: Poem similarity thresholds to test.

    Returns:
        ``(best_poem_clusters, best_threshold, results_df)``.

    Raises:
        ValueError: If ``thresholds`` is empty.
    """
    # An empty grid previously ran through and died on formatting None.
    if len(thresholds) == 0:
        raise ValueError("thresholds must be non-empty")

    results = []
    best_ari = -1
    best_record = None
    best_threshold = None
    best_poem_clusters = None

    # The ground-truth lookup does not depend on the threshold, so it is
    # built once instead of once per iteration.
    has_ground_truth = 'type_id' in df.columns
    if has_ground_truth:
        poem_to_type = df.groupby('idoriginal_poem')['type_id'].first().to_dict()
    else:
        poem_to_type = {}

    print("\n" + "="*100)
    print("POEM-LEVEL GRID SEARCH")
    print("="*100)
    print(f"\nTesting thresholds: {[f'{t:.0%}' for t in thresholds]}\n")

    for threshold in thresholds:
        print(f"\nTesting threshold {threshold:.0%}...")

        poem_clusters, poem_edges, n_clusters = cluster_poems(poem_to_clusters, threshold)

        if has_ground_truth:
            y_true = []
            y_pred = []
            for poem_id, predicted_cluster in poem_clusters.items():
                if poem_id in poem_to_type:
                    y_true.append(poem_to_type[poem_id])
                    y_pred.append(predicted_cluster)

            ari, v_measure = evaluate_clustering(y_true, y_pred)
            reconstruction_rate, n_perfect, n_total_gt = calculate_perfect_reconstruction_rate(df, poem_clusters)
        else:
            ari, v_measure = 0, 0
            reconstruction_rate, n_perfect, n_total_gt = 0, 0, 0

        record = {
            'threshold': threshold,
            'n_clusters': n_clusters,
            'n_edges': len(poem_edges),
            'ari': ari,
            'v_measure': v_measure,
            'perfect_reconstruction_rate': reconstruction_rate,
            'n_perfect_clusters': n_perfect,
            'n_total_gt_clusters': n_total_gt
        }
        results.append(record)

        if ari > best_ari:
            best_ari = ari
            best_record = record
            best_threshold = threshold
            best_poem_clusters = poem_clusters

        print(f"  ARI: {ari:.4f}, V-measure: {v_measure:.4f}, Clusters: {n_clusters}")

    results_df = pd.DataFrame(results)

    print("\n" + "="*100)
    print("POEM-LEVEL GRID SEARCH SUMMARY")
    print("="*100)
    print(f"\n{'Threshold':<12} {'Clusters':<10} {'Edges':<10} {'ARI':<8} {'V-measure':<12} {'Perfect Recon.':<15}")
    print("-" * 80)

    # Print from the raw records: DataFrame.iterrows() upcasts every row to
    # float, which rendered the cluster and edge counts as "12.0".
    for result in results:
        print(f"{result['threshold']:<12.0%} "
              f"{result['n_clusters']:<10} "
              f"{result['n_edges']:<10} "
              f"{result['ari']:<8.4f} "
              f"{result['v_measure']:<12.4f} "
              f"{result['perfect_reconstruction_rate']:<15.1%}")

    print(f"\n{'='*100}")
    print("BEST POEM-LEVEL THRESHOLD")
    print("="*100)
    print(f"\nBest threshold by ARI: {best_threshold:.0%}")
    # The winning record was remembered in the loop; no float-equality
    # lookup in the DataFrame is needed.
    best_result = best_record
    print(f"  ARI: {best_result['ari']:.4f}")
    print(f"  V-measure: {best_result['v_measure']:.4f}")
    print(f"  Perfect reconstruction rate: {best_result['perfect_reconstruction_rate']:.1%}")
    print(f"    ({best_result['n_perfect_clusters']:.0f}/{best_result['n_total_gt_clusters']:.0f} GT clusters perfectly reconstructed)")

    visualize_poem_grid_search(results_df)

    results_csv = RESULTS_DIR / 'poem_grid_search_results.csv'
    results_df.to_csv(results_csv, index=False)
    print(f"\nPoem grid search results saved to: {results_csv}")

    return best_poem_clusters, best_threshold, results_df


def main():
    """Run the DBBE pipeline: verse-level grid search, then poem-level grid search.

    Reads ``DATA_FILE``, clusters the verses with the best parameters found
    by ``verse_level_grid_search``, writes the annotated verses to
    ``RESULTS_DIR``, and - when the poem and type columns are present -
    clusters whole poems with ``poem_level_grid_search`` and writes a second
    CSV.

    Raises:
        ValueError: If the dataset has neither a ``'verse'`` nor a ``'text'``
            column.
    """
    print("="*100)
    print("LOADING DATA")
    print("="*100)
    print(f"Results will be saved to: {RESULTS_DIR}")

    df = pd.read_csv(DATA_FILE)

    # 'verse' takes precedence and overwrites any existing 'text' column.
    if 'verse' in df.columns:
        df['text'] = df['verse']
    elif 'text' not in df.columns:
        raise ValueError("Dataset must have either 'verse' or 'text' column")

    # Missing verses become empty strings so every entry is a str.
    df['text'] = df['text'].fillna('').astype(str)
    print(f"\nLoaded {df.shape[0]:,} verses")

    texts = df['text'].tolist()

    # Verse-level parameter grid: 4 thresholds x 4 shingle sizes.
    verse_thresholds = [0.2, 0.3, 0.4, 0.5]
    shingle_sizes = [2, 3, 4, 5]

    best_clusters, best_similarities, best_verse_threshold, best_shingle_size, verse_results = verse_level_grid_search(
        texts, df, verse_thresholds, shingle_sizes, num_perm=128
    )

    # NOTE(review): assumes best_clusters is keyed by the DataFrame's index
    # labels (the default index from read_csv) - confirm against
    # MinHashLshClustering.cluster. Unmapped verses end up as NaN.
    df['cluster_id'] = df.index.map(best_clusters)

    # Gather, per verse, the similarity of every reported pair it takes part in.
    sim_dict = defaultdict(list)
    for doc1, doc2, sim in best_similarities:
        sim_dict[doc1].append(sim)
        sim_dict[doc2].append(sim)

    # Certainty = mean pair similarity; verses in no reported pair get 1.0.
    df['certainty'] = df.index.map(
        lambda i: np.mean(sim_dict[i]) if i in sim_dict else 1.0
    )

    # Stored for inspection in the output CSV only; clustering above ran on
    # the raw `texts`.
    preprocessor = TextPreprocessor(lowercase=True, remove_punctuation=True, remove_diacritics=True)
    df['text_preprocessed'] = df['text'].apply(preprocessor.preprocess)

    verse_output = RESULTS_DIR / "dbbe_verse_clustered_results.csv"
    df.to_csv(verse_output, index=False)
    print(f"\n{verse_output} saved with best parameters (shingle_size={best_shingle_size}, threshold={best_verse_threshold:.0%})")

    # The poem-level stage needs the poem id and the ground-truth type id.
    if 'idoriginal_poem' in df.columns and 'type_id' in df.columns:
        poem_to_clusters, poem_verse_counts = reconstruct_poems(df)

        poem_thresholds = [0.50, 0.60, 0.70, 0.8, 0.9]

        best_poem_clusters, best_poem_threshold, poem_results = poem_level_grid_search(
            df, poem_to_clusters, poem_thresholds
        )

        # Poems missing from best_poem_clusters map to NaN here.
        df['poem_cluster_id'] = df['idoriginal_poem'].map(best_poem_clusters)

        # Still one row per verse, now with the extra poem_cluster_id column.
        poem_output = RESULTS_DIR / "dbbe_poem_level_clusters.csv"
        df.to_csv(poem_output, index=False)
        print(f"\n{poem_output} saved with best threshold ({best_poem_threshold:.0%})")
    else:
        print("\n" + "="*100)
        print("SKIPPING POEM-LEVEL CLUSTERING")
        print("="*100)
        print("\nRequired columns 'idoriginal_poem' and/or 'type_id' not found")

    print("\n" + "="*100)
    print("ANALYSIS COMPLETE")
    print("="*100)
    print(f"All results saved to: {RESULTS_DIR}")


# Run the pipeline when the cell/script is executed directly.
if __name__ == "__main__":
    main()

# 2. Full dataset

In [None]:
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasketch import MinHash, MinHashLSHForest
import multiprocessing as mp
import hashlib
import time
import matplotlib.pyplot as plt
import seaborn as sns
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from collections import defaultdict
from numba import njit, cuda
import platform
import socket
from datetime import datetime
import gc
from orthography_helpers import PoemThresholdSelector, TextPreprocessor, SystemResourceAnalyzer, ResourceMonitor, TimingLogger, PerformanceReporter

# Cell-wide helper singletons from orthography_helpers.
# system_analyzer supplies worker counts, chunk sizes and hardware facts to
# the functions below.
system_analyzer = SystemResourceAnalyzer()
# resource_monitor is started further down, right after the system summary.
resource_monitor = ResourceMonitor()
# timing_logger marks pipeline stages (see grid_search_parameters).
timing_logger = TimingLogger()

def get_system_info():
    """Return a dict describing the host, its resources and the current time.

    Host facts come from ``socket`` and ``platform``; CPU, RAM and GPU facts
    are read from the module-level ``system_analyzer``; ``'timestamp'`` is
    the local time formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    # Attributes copied from system_analyzer under their own names.
    analyzer_fields = (
        'cpu_count_physical',
        'cpu_count_logical',
        'total_ram_gb',
        'available_ram_gb',
        'has_gpu',
        'gpu_count',
        'gpu_memory_gb',
    )

    snapshot = {
        'hostname': socket.gethostname(),
        'platform': platform.platform(),
        'python_version': platform.python_version(),
        'processor': platform.processor(),
    }
    for field in analyzer_fields:
        snapshot[field] = getattr(system_analyzer, field)
    snapshot['timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return snapshot

# Startup side effects: print the hardware summary, start resource
# monitoring and remember when the run began.
system_analyzer.print_summary()
resource_monitor.start()
script_start_time = time.time()

# Matches any single character that is neither a word character nor whitespace.
CLEAN_PATTERN = re.compile(r'[^\w\s]')
# Matches a run of one or more whitespace characters.
WHITESPACE_PATTERN = re.compile(r'\s+')

class UnionFind:
    """Disjoint-set forest over ``0 .. n-1`` with path compression and union by rank."""

    __slots__ = ['parent', 'rank']

    def __init__(self, n):
        # Every element starts as the root of its own one-element set.
        self.parent = list(range(n))
        self.rank = [0] * n

    def find(self, x):
        """Return the representative of ``x``'s set, flattening the path to it."""
        parent = self.parent

        # First pass: climb to the root.
        root = x
        while parent[root] != root:
            root = parent[root]

        # Second pass: point every node on the path straight at the root.
        while x != root:
            above = parent[x]
            parent[x] = root
            x = above

        return parent[root]

    def union(self, x, y):
        """Merge the sets of ``x`` and ``y``; return False if already merged."""
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return False

        rank = self.rank
        if rank[root_x] < rank[root_y]:
            self.parent[root_x] = root_y
            return True

        # root_x is at least as tall: it becomes the parent, and grows by one
        # level only when both trees had the same rank.
        self.parent[root_y] = root_x
        if rank[root_x] == rank[root_y]:
            rank[root_x] += 1
        return True

    def get_clusters(self):
        """Return an int32 array holding the representative of every element."""
        representatives = list(map(self.find, range(len(self.parent))))
        return np.array(representatives, dtype=np.int32)

def get_ngrams_vectorized(text, n=4):
    """Return the set of lower-cased character n-grams of ``text``.

    Args:
        text: The text to shingle. Falsy values (``None``, ``''``, ``0``)
            give an empty set; any other non-string is converted with
            ``str()`` first (so a float NaN is shingled as ``'nan'``).
        n: Length of each n-gram.

    Returns:
        A set of substrings of length ``n``; empty when the text is shorter
        than ``n``.
    """
    if not text:
        return set()
    # Convert before measuring: calling len() on the raw value raised
    # TypeError for truthy non-string input even though the text was turned
    # into a str on the very next line.
    text = str(text).lower()
    if len(text) < n:
        return set()
    return {text[i:i + n] for i in range(len(text) - n + 1)}

def compute_minhash_chunk(args):
    """Build one MinHash signature per text of a chunk (pool worker function).

    Args:
        args: ``(texts, start_idx, n_gram_size, num_perm, seed)``; the start
            index is accepted but not used here.

    Returns:
        A list of ``MinHash`` objects, one per text and in the same order.
        A text without n-grams yields an untouched ``MinHash``.
    """
    chunk_texts, _start_idx, n_gram_size, num_perm, seed = args
    # Seeds NumPy's global RNG in the process running this chunk.
    np.random.seed(seed)

    def build_signature(text):
        grams = get_ngrams_vectorized(text, n_gram_size)
        signature = MinHash(num_perm=num_perm, seed=seed)
        for gram in grams:
            signature.update(gram.encode('utf8'))
        return signature

    return [build_signature(text) for text in chunk_texts]

def compute_minhash_parallel(texts, n_gram_size=3, num_perm=128, n_cores=None):
    """Compute MinHash signatures for ``texts`` with a multiprocessing pool.

    Args:
        texts: Sequence of texts (sliced into chunks).
        n_gram_size: Character n-gram length used for shingling.
        num_perm: Number of MinHash permutations.
        n_cores: Pool size; when None it is taken from
            ``system_analyzer.get_optimal_workers('cpu_intensive')``.

    Returns:
        A flat list of ``MinHash`` objects in the order of ``texts``.
    """
    if n_cores is None:
        n_cores = system_analyzer.get_optimal_workers('cpu_intensive')

    chunk_size = system_analyzer.get_optimal_chunk_size(len(texts), n_cores)

    # One work item per slice; every chunk uses the same fixed seed (42).
    work_items = []
    for offset in range(0, len(texts), chunk_size):
        work_items.append((texts[offset:offset + chunk_size], offset, n_gram_size, num_perm, 42))

    print(f"  Using {n_cores} workers with chunk size {chunk_size}")

    with mp.Pool(n_cores) as pool:
        # imap keeps the chunk order, so the flattened result lines up with texts.
        progress = tqdm(pool.imap(compute_minhash_chunk, work_items),
                        total=len(work_items), desc=f"MinHash (n={n_gram_size})", leave=False)
        per_chunk = list(progress)

    minhashes = []
    for chunk_signatures in per_chunk:
        minhashes.extend(chunk_signatures)
    return minhashes

def fast_hash(data):
    """Return the first 8 bytes of the MD5 digest of ``data`` as an unsigned int."""
    leading_bytes = hashlib.md5(data).digest()[:8]
    return int.from_bytes(leading_bytes, 'big')

def find_exact_duplicates_fast(texts):
    """Group the positions of texts that are identical after normalisation.

    Each text is converted with ``str()``, stripped and lower-cased; blank
    results are ignored. Texts are bucketed by ``fast_hash`` of their UTF-8
    bytes, so two texts land in the same group when those hashes match.

    Args:
        texts: Sequence of texts (sliced into chunks for a thread pool).

    Returns:
        A list of index lists, one per bucket holding more than one text.
    """
    n_workers = system_analyzer.get_optimal_workers('io_intensive')
    chunk_size = system_analyzer.get_optimal_chunk_size(len(texts), n_workers)

    print(f"  Using {n_workers} workers for hashing")

    def hash_chunk(chunk_data):
        # Bucket one chunk, reporting positions relative to the full list.
        chunk_texts, start_idx = chunk_data
        buckets = {}
        for position, text in enumerate(chunk_texts, start=start_idx):
            normalized = str(text).strip().lower()
            if normalized:
                digest = fast_hash(normalized.encode('utf-8'))
                buckets.setdefault(digest, []).append(position)
        return buckets

    chunks = [(texts[offset:offset + chunk_size], offset)
              for offset in range(0, len(texts), chunk_size)]

    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        chunk_results = list(tqdm(executor.map(hash_chunk, chunks),
                                  total=len(chunks), desc="Hashing", leave=False))

    # Merge the per-chunk buckets in chunk order.
    merged = defaultdict(list)
    for buckets in chunk_results:
        for digest, positions in buckets.items():
            merged[digest].extend(positions)

    return [positions for positions in merged.values() if len(positions) > 1]

def stratified_sample(df, n_sample=15000):
    """Draw a sample of index labels proportional to each ``source_dataset``.

    Each dataset contributes ``int(n_sample * share)`` labels (capped at its
    own size), drawn without replacement from NumPy's global RNG. Because of
    the truncation the total can be smaller than ``n_sample``.

    Args:
        df: DataFrame with a ``'source_dataset'`` column.
        n_sample: Target overall sample size.

    Returns:
        The sampled index labels as a sorted list.
    """
    total_rows = len(df)
    source = df['source_dataset']
    picked = []

    # Datasets are visited in order of first appearance.
    for dataset in source.unique():
        members = df.index[(source == dataset).to_numpy()].tolist()
        member_count = len(members)
        quota = min(int(n_sample * (member_count / total_rows)), member_count)
        if quota <= 0:
            continue
        picked.extend(np.random.choice(members, size=quota, replace=False))

    return sorted(picked)

def compute_cluster_cohesion(minhashes, cluster_labels):
    """Average within-cluster MinHash Jaccard similarity.

    For every cluster with at least two members the mean pairwise Jaccard
    estimate is computed (on a random subset of 50 members when the cluster
    is larger); the result is the mean of those per-cluster means.

    Args:
        minhashes: Sequence of objects exposing ``jaccard(other)``, indexed
            like ``cluster_labels``.
        cluster_labels: Array of cluster labels, one per item.

    Returns:
        The mean cohesion, or 0.0 when no cluster has two or more members.
    """
    per_cluster_means = []

    for label in np.unique(cluster_labels):
        members = np.where(cluster_labels == label)[0]
        if len(members) < 2:
            continue

        # Cap the quadratic pair count for big clusters.
        if len(members) > 50:
            members = np.random.choice(members, 50, replace=False)

        pair_sims = [
            minhashes[left].jaccard(minhashes[right])
            for position, left in enumerate(members)
            for right in members[position + 1:]
        ]

        if pair_sims:
            per_cluster_means.append(np.mean(pair_sims))

    if not per_cluster_means:
        return 0.0
    return np.mean(per_cluster_means)

def compute_cluster_separation(minhashes, cluster_labels, n_samples=500):
    """Estimate the average Jaccard distance between different clusters.

    Repeats ``n_samples`` times: pick two distinct clusters at random, pick
    one random member of each, and record ``1 - jaccard``.

    Args:
        minhashes: Sequence of objects exposing ``jaccard(other)``, indexed
            like ``cluster_labels``.
        cluster_labels: Array of cluster labels, one per item.
        n_samples: Number of random cross-cluster pairs to draw.

    Returns:
        The mean sampled distance; 1.0 when there are fewer than two
        clusters and 0.0 when no pair was drawn.
    """
    cluster_ids = np.unique(cluster_labels)
    if len(cluster_ids) < 2:
        return 1.0

    def random_member(label):
        return np.random.choice(np.where(cluster_labels == label)[0])

    distances = []
    for _ in range(n_samples):
        first_label, second_label = np.random.choice(cluster_ids, 2, replace=False)
        first_member = random_member(first_label)
        second_member = random_member(second_label)
        similarity = minhashes[first_member].jaccard(minhashes[second_member])
        distances.append(1 - similarity)

    if not distances:
        return 0.0
    return np.mean(distances)

def compute_silhouette_approximation(minhashes, cluster_labels, n_samples=1000):
    """Approximate the mean silhouette score using MinHash Jaccard distances.

    At most ``n_samples`` items are scored. For each one, the intra-cluster
    distance and the distance to every other cluster are averaged over at
    most 20 randomly chosen members; items alone in their cluster are
    skipped.

    Args:
        minhashes: Sequence of objects exposing ``jaccard(other)``, indexed
            like ``cluster_labels``.
        cluster_labels: Array of cluster labels, one per item.
        n_samples: Maximum number of items to score.

    Returns:
        The mean silhouette of the scored items, or 0.0 when there are fewer
        than two clusters or nothing could be scored.
    """
    cluster_ids = np.unique(cluster_labels)
    if len(cluster_ids) < 2:
        return 0.0

    n_points = len(cluster_labels)
    if n_points > n_samples:
        probe_points = np.random.choice(n_points, n_samples, replace=False)
    else:
        probe_points = np.arange(n_points)

    def capped(members):
        # Keep at most 20 members so each average stays cheap.
        if len(members) > 20:
            return np.random.choice(members, 20, replace=False)
        return members

    def mean_distance(point, members):
        return np.mean([1 - minhashes[point].jaccard(minhashes[member])
                        for member in members])

    scores = []

    for point in probe_points:
        own_label = cluster_labels[point]
        peers = np.where(cluster_labels == own_label)[0]
        peers = peers[peers != point]

        if len(peers) == 0:
            continue

        intra = mean_distance(point, capped(peers))

        rivals = cluster_ids[cluster_ids != own_label]
        if len(rivals) == 0:
            continue

        # Distance to the closest other cluster.
        nearest = float('inf')
        for rival in rivals:
            rival_members = capped(np.where(cluster_labels == rival)[0])
            nearest = min(nearest, mean_distance(point, rival_members))

        widest = max(intra, nearest)
        scores.append((nearest - intra) / widest if widest > 0 else 0)

    if not scores:
        return 0.0
    return np.mean(scores)

def evaluate_single_config(args):
    """Cluster a sample with one (shingle size, threshold) pair and score it.

    The sample is MinHashed, indexed in an LSH Forest and clustered with a
    union-find: known exact duplicates are merged first, then every current
    cluster representative is merged with those of its 50 nearest forest
    candidates whose estimated Jaccard similarity reaches ``threshold``.

    Args:
        args: ``(shingle_size, threshold, texts, sample_indices,
            duplicate_groups)`` where ``sample_indices`` are positions into
            ``texts`` and ``duplicate_groups`` are lists of such positions.

    Returns:
        A dict of cluster statistics and quality scores, or ``None`` when
        anything fails. The failure is printed and swallowed on purpose so
        one bad configuration does not abort the whole grid search.
    """
    shingle_size, threshold, texts, sample_indices, duplicate_groups = args

    try:
        sample_texts = [texts[i] for i in sample_indices]
        minhashes_sample = compute_minhash_parallel(
            sample_texts,
            n_gram_size=shingle_size,
            num_perm=128
        )

        forest = MinHashLSHForest(num_perm=128)
        for idx, mh in enumerate(minhashes_sample):
            forest.add(str(idx), mh)
        forest.index()

        n_sample = len(sample_indices)
        uf = UnionFind(n_sample)

        # Map each original position to its slot in the sample once;
        # list.index() was a linear scan for every duplicate member.
        # setdefault keeps the first slot, matching list.index().
        position_of = {}
        for position, original_idx in enumerate(sample_indices):
            position_of.setdefault(original_idx, position)

        for group in duplicate_groups:
            sample_group = [position_of[g] for g in group if g in position_of]
            for other in sample_group[1:]:
                uf.union(sample_group[0], other)

        top_k = 50
        merges = 0
        for idx in range(n_sample):
            # Only cluster representatives issue queries.
            if uf.find(idx) != idx:
                continue
            # The forest's result order is not relied upon: every returned
            # key is examined and the query item itself is skipped, instead
            # of blindly dropping the first entry (which could be a real
            # neighbour).
            for neighbor_str in forest.query(minhashes_sample[idx], top_k):
                neighbor_idx = int(neighbor_str)
                if neighbor_idx == idx:
                    continue
                if uf.find(idx) == uf.find(neighbor_idx):
                    continue
                sim = minhashes_sample[idx].jaccard(minhashes_sample[neighbor_idx])
                if sim >= threshold:
                    if uf.union(idx, neighbor_idx):
                        merges += 1

        cluster_labels = uf.get_clusters()
        unique_clusters, cluster_sizes = np.unique(cluster_labels, return_counts=True)

        n_clusters = len(unique_clusters)
        n_multi = np.sum(cluster_sizes > 1)
        n_singleton = np.sum(cluster_sizes == 1)
        avg_size = float(cluster_sizes.mean())
        max_size = int(cluster_sizes.max())

        cohesion = compute_cluster_cohesion(minhashes_sample, cluster_labels)
        separation = compute_cluster_separation(minhashes_sample, cluster_labels)
        silhouette = compute_silhouette_approximation(minhashes_sample, cluster_labels)

        return {
            'shingle_size': shingle_size,
            'threshold': threshold,
            'n_clusters': n_clusters,
            'n_multi_clusters': n_multi,
            'n_singletons': n_singleton,
            'avg_cluster_size': avg_size,
            'max_cluster_size': max_size,
            'cohesion': cohesion,
            'separation': separation,
            'silhouette': silhouette,
            'merges': merges
        }

    except Exception as e:
        print(f"Error at shingle={shingle_size}, threshold={threshold:.2f}: {e}")
        return None

def grid_search_parameters(texts, df, duplicate_groups,
                          shingle_sizes=(2, 3, 4, 5),
                          threshold_range=(0.3, 0.9, 7),
                          n_sample=15000,
                          results_folder="full_orthographic_results",
                          max_workers=None):
    """Grid-search shingle size and similarity threshold on a verse sample.

    Every (shingle_size, threshold) pair is evaluated in a worker process via
    ``evaluate_single_config``. The returned metrics are combined into a
    ``quality_score``: an equal-weight (0.25 each) sum of min-max normalised
    silhouette, cohesion and separation plus a balance term
    ``1 - n_singletons / sample_size``.

    Args:
        texts: Verse texts, forwarded unchanged to each worker.
        df: DataFrame passed to ``stratified_sample`` to pick the sample.
        duplicate_groups: Exact-duplicate groups, forwarded to each worker.
        shingle_sizes: Iterable of shingle sizes to try.
        threshold_range: ``(start, stop, num)`` handed to ``np.linspace``.
        n_sample: Requested sample size for ``stratified_sample``.
        results_folder: Directory receiving the results CSV and the heatmap.
        max_workers: Process count; defaults to the analyzer's
            ``'cpu_intensive'`` recommendation.

    Returns:
        Tuple ``(best_shingle, best_threshold, results_df)`` where
        ``results_df`` is sorted by ``quality_score`` in descending order.

    Raises:
        RuntimeError: If no configuration produced a result.
    """
    timing_logger.start_stage("01_verse_parameter_search")

    # Materialise as a list: accepts any iterable and keeps the log format stable.
    shingle_sizes = list(shingle_sizes)

    if max_workers is None:
        max_workers = system_analyzer.get_optimal_workers('cpu_intensive')

    print("2D GRID SEARCH: SHINGLE SIZE × THRESHOLD")

    sample_indices = stratified_sample(df, n_sample)
    print(f"Sample size: {len(sample_indices):,}")

    thresholds = np.linspace(threshold_range[0], threshold_range[1], threshold_range[2])

    print("\nParameter grid:")
    print(f"  Shingle sizes: {shingle_sizes}")
    print(f"  Thresholds: {len(thresholds)} values from {thresholds[0]:.2f} to {thresholds[-1]:.2f}")
    print(f"  Total combinations: {len(shingle_sizes) * len(thresholds)}")

    # NOTE: each task tuple carries `texts`, so it is pickled once per submitted task.
    args_list = [
        (shingle_size, threshold, texts, sample_indices, duplicate_groups)
        for shingle_size in shingle_sizes
        for threshold in thresholds
    ]

    print(f"\nRunning grid search with {max_workers} workers...")
    start_time = time.time()

    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(evaluate_single_config, args) for args in args_list]

        with tqdm(total=len(futures), desc="Grid search") as pbar:
            for future in as_completed(futures):
                result = future.result()
                # Workers return None for a configuration that failed.
                if result is not None:
                    results.append(result)
                pbar.update(1)

    print(f"Grid search complete in {time.time()-start_time:.1f}s")
    print(f"  Valid results: {len(results)} / {len(args_list)}")

    if not results:
        # Without this guard the column lookups below fail with an opaque KeyError.
        timing_logger.end_stage()
        raise RuntimeError(
            "Grid search produced no valid results; every configuration failed"
        )

    results_df = pd.DataFrame(results)

    print("\nComputing quality scores...")

    def normalize(series):
        """Min-max scale to [0, 1]; a constant series maps to 0.5 everywhere."""
        min_val = series.min()
        max_val = series.max()
        if max_val - min_val < 1e-10:
            return pd.Series(0.5, index=series.index)
        return (series - min_val) / (max_val - min_val)

    silhouette_score = normalize(results_df['silhouette'])
    cohesion_score = normalize(results_df['cohesion'])
    separation_score = normalize(results_df['separation'])

    # Reward configurations that leave fewer sampled verses unclustered.
    singleton_ratio = results_df['n_singletons'] / len(sample_indices)
    balance_score = np.clip(1 - singleton_ratio, 0, 1)

    results_df['quality_score'] = (
        silhouette_score * 0.25 +
        cohesion_score * 0.25 +
        separation_score * 0.25 +
        balance_score * 0.25
    )

    results_df = results_df.sort_values('quality_score', ascending=False)

    # The function can be called on its own, so make sure the target exists.
    os.makedirs(results_folder, exist_ok=True)
    results_csv = os.path.join(results_folder, 'parameter_grid_search_results.csv')
    results_df.to_csv(results_csv, index=False)
    print(f"Results saved: {results_csv}")

    create_verse_grid_search_heatmap(results_df, results_folder)

    best_config = results_df.iloc[0]
    best_shingle = int(best_config['shingle_size'])
    best_threshold = float(best_config['threshold'])

    print("TOP 5 CONFIGURATIONS (BY QUALITY SCORE)")

    for rank, (_, row) in enumerate(results_df.head(5).iterrows(), 1):
        print(f"\n#{rank}. Shingle size: {int(row['shingle_size'])}, Threshold: {row['threshold']:.3f}")
        print(f"     Quality score: {row['quality_score']:.3f}")
        print(f"     Silhouette: {row['silhouette']:.3f}, Cohesion: {row['cohesion']:.3f}, "
              f"Separation: {row['separation']:.3f}")
        print(f"     Clusters: {int(row['n_multi_clusters']):,}, Singletons: {int(row['n_singletons']):,}, "
              f"Avg size: {row['avg_cluster_size']:.1f}")

    print("SELECTED CONFIGURATION (HIGHEST QUALITY)")
    print(f"Shingle size: {best_shingle}")
    print(f"Threshold: {best_threshold:.3f}")
    print(f"Quality score: {best_config['quality_score']:.3f}")
    print(f"  - Silhouette: {best_config['silhouette']:.3f}")
    print(f"  - Cohesion: {best_config['cohesion']:.3f}")
    print(f"  - Separation: {best_config['separation']:.3f}")
    print(f"Multi-member clusters: {int(best_config['n_multi_clusters']):,}")
    print(f"Singletons: {int(best_config['n_singletons']):,}")

    timing_logger.end_stage()
    return best_shingle, best_threshold, results_df

def create_verse_grid_search_heatmap(results_df, results_folder):
    """Plot quality score by threshold (rows) and shingle size (columns).

    The figure is written to ``verse_grid_search_heatmap.png`` inside
    ``results_folder`` and then closed.
    """
    print("\nCreating verse-level heatmap...")

    sns.set_palette("colorblind")
    figure, axis = plt.subplots(1, 1, figsize=(10, 8))

    # One cell per (threshold, shingle_size); 'first' picks the first row per cell.
    quality_grid = results_df.pivot_table(
        index='threshold',
        columns='shingle_size',
        values='quality_score',
        aggfunc='first'
    )

    colorbar_options = {'label': 'Quality Score'}
    sns.heatmap(
        quality_grid,
        annot=True,
        fmt='.3f',
        cmap='viridis',
        ax=axis,
        cbar_kws=colorbar_options,
    )

    axis_label_font = {'fontweight': 'bold', 'fontsize': 12}
    axis.set_ylabel('Threshold', **axis_label_font)
    axis.set_xlabel('Shingle Size', **axis_label_font)
    axis.set_title('Verse-Level Quality Score Heatmap', fontweight='bold', fontsize=14)

    plt.tight_layout()
    plot_path = os.path.join(results_folder, 'verse_grid_search_heatmap.png')
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"Verse heatmap saved: {plot_path}")
    plt.close()

def cluster_with_lsh_forest(minhashes, duplicate_groups, threshold, top_k=100):
    """Cluster documents by MinHash similarity using an LSH forest and union-find.

    Steps:
        1. Merge every exact-duplicate group into one cluster.
        2. Index all signatures in a ``MinHashLSHForest``.
        3. For each document that is still its own cluster root, query up to
           ``top_k`` candidates and merge those whose estimated Jaccard
           similarity is at least ``threshold``.

    Args:
        minhashes: Sequence of MinHash signatures, one per document; position
            in the sequence is the document id.
        duplicate_groups: Iterable of groups of document indices that are
            exact duplicates of each other.
        threshold: Minimum estimated Jaccard similarity required to merge.
        top_k: Number of candidates requested from the forest per query.

    Returns:
        Tuple ``(cluster_labels, metrics)`` where ``cluster_labels`` comes from
        ``UnionFind.get_clusters()`` and ``metrics`` is a dict of counts.

    Raises:
        ValueError: If ``minhashes`` is empty.
    """
    n_docs = len(minhashes)
    if n_docs == 0:
        raise ValueError("minhashes must contain at least one signature")

    uf = UnionFind(n_docs)

    # Exact duplicates need no similarity check; link each member to the first.
    exact_merges = 0
    for group in duplicate_groups:
        for member in group[1:]:
            if uf.union(group[0], member):
                exact_merges += 1

    forest = MinHashLSHForest(num_perm=len(minhashes[0].hashvalues))
    for idx, mh in enumerate(tqdm(minhashes, desc="Indexing", leave=False)):
        forest.add(str(idx), mh)
    forest.index()

    lsh_merges = 0
    verified_pairs = 0

    # The chunking only sets how often the outer progress bar advances.
    optimal_chunk = system_analyzer.get_optimal_chunk_size(
        n_docs, system_analyzer.get_optimal_workers('memory_intensive')
    )

    for start_idx in tqdm(range(0, n_docs, optimal_chunk), desc="Clustering"):
        end_idx = min(start_idx + optimal_chunk, n_docs)
        for idx in range(start_idx, end_idx):
            # Only cluster roots issue queries; merged members are reachable via their root.
            if uf.find(idx) != idx:
                continue
            # NOTE(review): the query result is treated as unordered, so the
            # document itself is skipped by id rather than by dropping the
            # first entry — confirm against the datasketch documentation.
            for neighbor_key in forest.query(minhashes[idx], top_k):
                neighbor_idx = int(neighbor_key)
                if neighbor_idx == idx:
                    continue
                if uf.find(idx) == uf.find(neighbor_idx):
                    continue
                verified_pairs += 1
                sim = minhashes[idx].jaccard(minhashes[neighbor_idx])
                if sim >= threshold and uf.union(idx, neighbor_idx):
                    lsh_merges += 1

    cluster_labels = uf.get_clusters()
    unique_clusters, cluster_sizes = np.unique(cluster_labels, return_counts=True)

    # Cast to built-in scalars so all metrics share plain Python types.
    return cluster_labels, {
        'n_clusters': len(unique_clusters),
        'n_multi_clusters': int(np.sum(cluster_sizes > 1)),
        'n_singletons': int(np.sum(cluster_sizes == 1)),
        'avg_cluster_size': float(cluster_sizes.mean()),
        'max_cluster_size': int(cluster_sizes.max()),
        'exact_merges': exact_merges,
        'lsh_merges': lsh_merges,
        'threshold': threshold,
        'verified_pairs': verified_pairs
    }

@njit
def jaccard_numba(a_arr, b_arr):
    """Return the Jaccard similarity of the distinct values in two arrays.

    Yields 0.0 when both arrays are empty.
    """
    # Duplicates inside one array count once.
    left = set(a_arr)
    right = set(b_arr)

    common = 0
    for value in left:
        if value in right:
            common += 1

    # |A ∪ B| = |A| + |B| - |A ∩ B|
    total = len(left) + len(right) - common
    if total != 0:
        return common / total
    return 0.0

@njit
def count_shared_verses(a_arr, b_arr):
    """Count the distinct values that occur in both arrays."""
    left = set(a_arr)
    right = set(b_arr)

    overlap = 0
    for value in left:
        if value not in right:
            continue
        overlap += 1

    return overlap

class PoemUnionFind:
    """Disjoint-set forest over arbitrary hashable elements.

    Uses union by rank and path compression. Elements are fixed at
    construction time; looking up an unknown element raises ``KeyError``.
    """

    __slots__ = ['parent', 'rank']

    def __init__(self, elements):
        # Every element starts as the root of its own set with rank 0.
        self.parent = {}
        self.rank = {}
        for element in elements:
            self.parent[element] = element
            self.rank[element] = 0

    def find(self, x):
        """Return the representative of the set containing ``x``."""
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Second pass: point every node on the walked path straight at the root.
        node = x
        while node != root:
            next_node = self.parent[node]
            self.parent[node] = root
            node = next_node

        return root

    def union(self, x, y):
        """Merge the sets of ``x`` and ``y``; return True if a merge happened."""
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return False

        # The higher-ranked root stays on top; ties keep x's root.
        if self.rank[root_x] >= self.rank[root_y]:
            winner, loser = root_x, root_y
        else:
            winner, loser = root_y, root_x

        self.parent[loser] = winner
        if self.rank[winner] == self.rank[loser]:
            self.rank[winner] += 1
        return True

    def get_clusters(self):
        """Return a dict mapping each representative to the set of its members."""
        groups = {}
        for member in self.parent:
            groups.setdefault(self.find(member), set()).add(member)
        return groups

def compute_similarity_batch(args):
    """Score a batch of poem pairs by the overlap of their arrays.

    Args:
        args: Tuple ``(pairs_batch, poem_to_array_dict, min_shared)``. Packed
            into one argument so the function can be mapped over batches.

    Returns:
        List of dicts with keys ``poem1``, ``poem2``, ``similarity`` and
        ``shared_verses``, one per pair sharing at least ``min_shared`` values.
    """
    pairs_batch, poem_to_array_dict, min_shared = args

    matches = []
    for first_poem, second_poem in pairs_batch:
        first_values = poem_to_array_dict[first_poem]
        second_values = poem_to_array_dict[second_poem]

        overlap = count_shared_verses(first_values, second_values)
        # Skip the Jaccard computation for pairs below the overlap floor.
        if overlap < min_shared:
            continue

        similarity = jaccard_numba(first_values, second_values)
        matches.append({
            'poem1': first_poem,
            'poem2': second_poem,
            'similarity': similarity,
            'shared_verses': overlap
        })

    return matches

def cluster_all_poems_at_threshold(df, poem_threshold, poem_to_clusters, results_folder="full_orthographic_results"):
    """Cluster poems across datasets by Jaccard overlap of their verse clusters.

    Candidate pairs are poems from two different datasets that share at least
    one verse cluster. A pair is merged (union-find) when the Jaccard
    similarity of the two poems' verse-cluster sets reaches ``poem_threshold``.

    Args:
        df: Verse-level DataFrame with ``idoriginal_poem`` and
            ``source_dataset`` columns. It is modified in place: the columns
            ``poem_cluster_id`` and ``is_cross_dataset_poem_cluster`` are added.
        poem_threshold: Minimum Jaccard similarity for merging two poems.
        poem_to_clusters: Mapping of poem id to its collection of verse
            cluster ids.
        results_folder: Directory receiving the clustered CSV and the summary.

    Returns:
        Tuple ``(df, poem_clusters, cluster_assignments, poem_summary)``.
    """
    timing_logger.start_stage("03_full_poem_clustering")

    print("CLUSTERING ALL POEMS WITH OPTIMAL THRESHOLD")

    print(f"Threshold: {poem_threshold:.3f}")

    poem_to_dataset = df.groupby('idoriginal_poem')['source_dataset'].first().to_dict()

    # Inverted index: verse cluster -> poems containing a verse of that cluster.
    cluster_to_poems = defaultdict(set)
    for poem_id, clusters in poem_to_clusters.items():
        for c in clusters:
            cluster_to_poems[c].add(poem_id)

    print("\nFinding cross-dataset candidate pairs...")
    dataset_names = df['source_dataset'].unique()

    poems_by_dataset = defaultdict(list)
    for poem_id, dataset in poem_to_dataset.items():
        poems_by_dataset[dataset].append(poem_id)

    all_pairs = set()

    n_workers = system_analyzer.get_optimal_workers('io_intensive')

    def process_dataset_pair(dataset_pair):
        """Collect poem pairs across two datasets that share a verse cluster."""
        dataset1, dataset2 = dataset_pair
        poems1 = poems_by_dataset[dataset1]
        poems2_set = set(poems_by_dataset[dataset2])

        local_pairs = set()
        for poem_id in poems1:
            clusters = poem_to_clusters.get(poem_id, [])

            candidates = set()
            for cluster_id in clusters:
                # Membership test first: avoids inserting into the defaultdict.
                if int(cluster_id) in cluster_to_poems:
                    candidates.update(cluster_to_poems[int(cluster_id)])

            # Keep only poems from the other dataset of this pair.
            candidates &= poems2_set

            for other_poem in candidates:
                # Sorted tuple so (a, b) and (b, a) collapse to one entry.
                local_pairs.add(tuple(sorted([poem_id, other_poem])))

        return local_pairs

    dataset_pairs = []
    for i, dataset1 in enumerate(dataset_names):
        for dataset2 in dataset_names[i+1:]:
            dataset_pairs.append((dataset1, dataset2))

    print(f"  Processing {len(dataset_pairs)} dataset pairs with {n_workers} workers...")

    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        pair_results = list(tqdm(executor.map(process_dataset_pair, dataset_pairs),
                                total=len(dataset_pairs), desc="Dataset pairs"))

    for pair_set in pair_results:
        all_pairs.update(pair_set)

    print(f"  Total candidate pairs: {len(all_pairs):,}")

    print("\nClustering poems...")
    poem_ids = list(poem_to_clusters.keys())
    uf = PoemUnionFind(poem_ids)

    merges = 0
    for p1, p2 in tqdm(all_pairs, desc="Processing pairs"):
        clusters1 = set(poem_to_clusters[p1])
        clusters2 = set(poem_to_clusters[p2])

        union_size = len(clusters1 | clusters2)
        if union_size > 0:
            jaccard = len(clusters1 & clusters2) / union_size
            if jaccard >= poem_threshold and uf.union(p1, p2):
                merges += 1

    print(f"  Performed {merges:,} merges")

    poem_clusters = uf.get_clusters()
    cluster_assignments = {}
    for cluster_id, poems in poem_clusters.items():
        for poem in poems:
            cluster_assignments[poem] = cluster_id

    # A poem cluster is cross-dataset when its members come from 2+ datasets.
    n_cross_dataset_clusters = 0
    cross_dataset_cluster_ids = set()
    for cluster_id, poems in poem_clusters.items():
        member_datasets = set(poem_to_dataset.get(p) for p in poems)
        if len(member_datasets) > 1:
            n_cross_dataset_clusters += 1
            cross_dataset_cluster_ids.add(cluster_id)

    # NOTE(review): the lookup key is the stringified poem id, which assumes
    # poem_to_clusters is keyed by string ids — confirm against its producer.
    df['poem_cluster_id'] = df['idoriginal_poem'].astype(str).map(cluster_assignments)
    df['is_cross_dataset_poem_cluster'] = df['poem_cluster_id'].isin(cross_dataset_cluster_ids)

    os.makedirs(results_folder, exist_ok=True)
    output_csv = os.path.join(results_folder, "poems_clustered_full.csv")
    df.to_csv(output_csv, index=False)
    print(f"\nFull results saved: {output_csv}")

    # df has one row per verse, so count distinct poems rather than flagged rows.
    n_cross_dataset_poems = int(
        df.loc[df['is_cross_dataset_poem_cluster'], 'idoriginal_poem'].nunique()
    )

    poem_summary = {
        'n_verses': len(df),
        'n_poems': len(poem_to_clusters),
        'n_datasets': len(set(poem_to_dataset.values())),
        'best_jaccard_threshold': poem_threshold,
        'n_poem_clusters': len(set(cluster_assignments.values())),
        'n_cross_dataset_clusters': n_cross_dataset_clusters,
        'n_poems_in_cross_dataset_clusters': n_cross_dataset_poems
    }

    summary_csv = os.path.join(results_folder, 'poem_clustering_full_summary.csv')
    pd.DataFrame([poem_summary]).to_csv(summary_csv, index=False)
    print(f"Summary saved: {summary_csv}")
    timing_logger.end_stage()
    return df, poem_clusters, cluster_assignments, poem_summary

def print_example_clusters(df, results_folder="full_orthographic_results"):
    """Print up to five of the largest verse clusters and poem clusters.

    Args:
        df: Verse-level DataFrame with ``cluster_id``, ``verse``,
            ``source_dataset`` and ``idoriginal_poem`` columns. The poem
            section is printed only if a ``poem_cluster_id`` column exists.
        results_folder: Not used by this function; kept so existing calls
            keep working.
    """
    print("5 EXAMPLE VERSE-LEVEL CLUSTERS (multi-member)")
    cluster_info = df[df['cluster_id'] != -1].groupby('cluster_id').agg({
        'verse': 'count',
        'source_dataset': lambda x: list(x.unique())
    }).rename(columns={'verse': 'size'})

    multi_clusters = cluster_info[cluster_info['size'] > 1].sort_values('size', ascending=False)

    for idx, (cluster_id, row) in enumerate(multi_clusters.head(5).iterrows(), 1):
        print(f"\nVerse Cluster {idx} (ID: {cluster_id})")
        print(f"  Size: {row['size']} verses")
        print(f"  Datasets: {', '.join(row['source_dataset'])}")

        cluster_verses = df[df['cluster_id'] == cluster_id]
        print("  Example verses:")
        for i, (_, verse_row) in enumerate(cluster_verses.head(3).iterrows(), 1):
            verse_text = str(verse_row['verse'])[:80]
            print(f"    {i}. [{verse_row['source_dataset']}] {verse_text}...")

    print("5 EXAMPLE POEM-LEVEL CLUSTERS (multi-member)")

    if 'poem_cluster_id' not in df.columns:
        print("  Poem clustering not yet completed")
        return

    poem_cluster_info = df[df['poem_cluster_id'].notna()].groupby('poem_cluster_id').agg({
        'idoriginal_poem': lambda x: len(set(x)),
        'source_dataset': lambda x: list(set(x))
    }).rename(columns={'idoriginal_poem': 'n_poems'})

    multi_poem_clusters = poem_cluster_info[poem_cluster_info['n_poems'] > 1].sort_values('n_poems', ascending=False)

    for idx, (cluster_id, row) in enumerate(multi_poem_clusters.head(5).iterrows(), 1):
        print(f"\nPoem Cluster {idx} (ID: {cluster_id})")
        print(f"  Size: {row['n_poems']} poems")
        print(f"  Datasets: {', '.join(row['source_dataset'])}")

        cluster_poems = df[df['poem_cluster_id'] == cluster_id]['idoriginal_poem'].unique()
        print("  Poems in cluster:")
        for i, poem_id in enumerate(cluster_poems[:5], 1):
            poem_data = df[df['idoriginal_poem'] == poem_id]
            dataset = poem_data['source_dataset'].iloc[0]
            n_verses = len(poem_data)
            print(f"    {i}. Poem {poem_id} [{dataset}] - {n_verses} verses")

            # str() first: a missing verse is a float NaN, which cannot be sliced.
            first_verse = str(poem_data.iloc[0]['verse'])[:80]
            print(f"       First verse: {first_verse}...")

        if len(cluster_poems) > 5:
            print(f"    ... and {len(cluster_poems) - 5} more poems")

def main():
    """Run verse-level clustering (or reload it) followed by poem-level clustering.

    If both ``clustered_optimized.csv`` and ``clustering_metrics.csv`` already
    exist in the results folder, the verse-level stage is skipped and its
    outputs are loaded from disk. Otherwise the verses are loaded, filtered
    and preprocessed, the parameters are grid-searched, and the full corpus is
    clustered and saved. The poem-level analysis runs in both cases.

    Returns:
        Tuple ``(verse_summary, poem_threshold, poem_summary)``.
    """
    results_folder = "full_orthographic_results"
    os.makedirs(results_folder, exist_ok=True)
    print(f"Results will be saved to: {results_folder}/\n")

    clustered_file = os.path.join(results_folder, "clustered_optimized.csv")
    metrics_file = os.path.join(results_folder, "clustering_metrics.csv")

    if os.path.exists(clustered_file) and os.path.exists(metrics_file):
        print(f"Loading from: {clustered_file}")

        timing_logger.start_stage("00_load_existing_results")

        df = pd.read_csv(clustered_file)
        metrics = pd.read_csv(metrics_file).iloc[0].to_dict()

        print(f"\nLoaded {len(df):,} verses")
        print(f"Verse clusters: {metrics['n_clusters']:,}")
        print(f"Multi-member clusters: {metrics['n_multi_clusters']:,}")
        print(f"Singletons: {metrics['n_singletons']:,}")

        verse_summary = {
            'n_verses': len(df),
            'best_shingle_size': int(metrics['best_shingle_size']),
            'best_threshold': float(metrics['best_threshold']),
            'n_clusters': int(metrics['n_clusters']),
            'n_multi_clusters': int(metrics['n_multi_clusters']),
            'n_singletons': int(metrics['n_singletons']),
            'max_cluster_size': int(metrics['max_cluster_size'])
        }

        timing_logger.end_stage()
        print("\nSkipping verse clustering - jumping to poem-level analysis")
    else:
        timing_logger.start_stage("00_data_loading")
        df = pd.read_csv("~/Downloads/concatenated.csv")
        df = df[df['source_dataset'].isin(['rhoby', 'dbbe', 'phi', 'papyri'])]
        # Drop missing and very short verses (fewer than 20 characters).
        df = df[df['verse'].fillna('').astype(str).str.len() >= 20]
        preprocessor = TextPreprocessor(lowercase=True, remove_punctuation=True, remove_diacritics=True)
        df['verse'] = df['verse'].apply(preprocessor.preprocess)
        df = df[df['verse'].str.strip().str.lower() != 'nan']
        # Reset only after the last row filter so index labels match positions in `texts`.
        df = df.reset_index(drop=True)
        texts = df['verse'].fillna('').astype(str).tolist()

        print(f"Verses: {len(texts):,}")

        timing_logger.end_stage()

        timing_logger.start_stage("01_exact_duplicates")

        duplicate_groups = find_exact_duplicates_fast(texts)
        print(f"Found {len(duplicate_groups):,} exact duplicate groups")

        timing_logger.end_stage()

        best_shingle, best_threshold, grid_results = grid_search_parameters(
            texts, df, duplicate_groups,
            shingle_sizes=[2, 3, 4, 5],
            threshold_range=(0.3, 0.85, 7),
            n_sample=15000,
            results_folder=results_folder,
            max_workers=system_analyzer.get_optimal_workers('cpu_intensive')
        )

        timing_logger.start_stage("02_minhash_computation")

        print(f"\nComputing MinHashes with optimal shingle size={best_shingle}...")
        minhashes = compute_minhash_parallel(texts, n_gram_size=best_shingle, num_perm=128)

        timing_logger.end_stage()
        timing_logger.start_stage("03_verse_clustering")

        print(f"\nClustering with threshold={best_threshold:.3f}...")
        cluster_labels, metrics = cluster_with_lsh_forest(
            minhashes, duplicate_groups, best_threshold, top_k=100
        )

        timing_logger.end_stage()

        timing_logger.start_stage("04_save_verse_results")

        df['cluster_id'] = cluster_labels
        # Same path the resume branch above checks for.
        df.to_csv(clustered_file, index=False)
        print(f"Clustered data saved: {clustered_file}")

        timing_logger.end_stage()

        verse_summary = {
            'n_verses': len(df),
            'best_shingle_size': best_shingle,
            'best_threshold': best_threshold,
            'n_clusters': metrics['n_clusters'],
            'n_multi_clusters': metrics['n_multi_clusters'],
            'n_singletons': metrics['n_singletons'],
            'max_cluster_size': metrics['max_cluster_size']
        }

        # The chosen parameters are stored with the metrics so the resume
        # branch can rebuild verse_summary from the CSV alone.
        metrics.update({
            'total_time_minutes': (time.time() - script_start_time) / 60,
            'best_shingle_size': best_shingle,
            'best_threshold': best_threshold
        })
        pd.DataFrame([metrics]).to_csv(metrics_file, index=False)
        print(f"Metrics saved: {metrics_file}")

        gc.collect()

    print("POEM-LEVEL ANALYSIS")

    selector = PoemThresholdSelector(
        sample_size=15000,
        random_seed=42,
        min_shared_verses=1
    )

    poem_threshold, poem_grid_results, poem_similarities_df, poem_to_clusters = selector.run_threshold_analysis(df)

    gc.collect()

    df_final, poem_clusters, cluster_assignments, poem_summary = cluster_all_poems_at_threshold(
        df,
        poem_threshold,
        poem_to_clusters,
        results_folder
    )

    print_example_clusters(df_final, results_folder)
    return verse_summary, poem_threshold, poem_summary

if __name__ == "__main__":
    # Stop the monitor even if the pipeline raises.
    # NOTE(review): presumably resource_monitor runs in the background — confirm.
    try:
        verse_summary, poem_threshold, poem_summary = main()
    finally:
        resource_monitor.stop()

    total_time = time.time() - script_start_time

    # Gather each report input once and pass the same values to the reporter.
    system_info = get_system_info()
    resource_stats = resource_monitor.get_stats()
    timing_summary = timing_logger.get_summary()

    reporter = PerformanceReporter(results_folder='full_orthographic_results')
    report_path = reporter.generate_report(
        system_info=system_info,
        resource_stats=resource_stats,
        timing_summary=timing_summary,
        verse_summary=verse_summary,
        poem_threshold=poem_threshold,
        total_time=total_time,
        poem_summary=poem_summary
    )

SYSTEM RESOURCE ANALYSIS
CPU Cores (Physical): 12
CPU Cores (Logical):  14
Total RAM:            30.80 GB
Available RAM:        12.58 GB
GPU Available:        No
Optimal Workers (CPU Intensive):    14
Optimal Workers (Memory Intensive): 6
Optimal Workers (I/O Intensive):    28
Results will be saved to: full_orthographic_results/



  df = pd.read_csv("~/Downloads/concatenated.csv")


Verses: 1,537,740
  Using 28 workers for hashing


                                                

Found 83,415 exact duplicate groups
2D GRID SEARCH: SHINGLE SIZE × THRESHOLD
Sample size: 14,997

Parameter grid:
  Shingle sizes: [2, 3, 4, 5]
  Thresholds: 7 values from 0.30 to 0.85
  Total combinations: 28

Running grid search with 14 workers...


Grid search:   0%|          | 0/28 [00:00<?, ?it/s]

  Using 14 workers with chunk size 133


MinHash (n=2):   0%|          | 0/113 [00:00<?, ?it/s]

  Using 14 workers with chunk size 133


MinHash (n=2):  61%|██████    | 69/113 [00:01<00:00, 60.74it/s]

  Using 14 workers with chunk size 133


MinHash (n=2):  14%|█▍        | 16/113 [00:00<00:02, 42.62it/s] 

  Using 14 workers with chunk size 133


MinHash (n=2):  23%|██▎       | 26/113 [00:00<00:01, 48.93it/s] 

  Using 14 workers with chunk size 133


                                                               

  Using 14 workers with chunk size 133


MinHash (n=2):  50%|████▉     | 56/113 [00:01<00:02, 27.64it/s]

  Using 14 workers with chunk size 133


MinHash (n=2):   2%|▏         | 2/113 [00:00<00:27,  3.98it/s]  

  Using 14 workers with chunk size 133


MinHash (n=2):  65%|██████▌   | 74/113 [00:02<00:00, 46.92it/s]

  Using 14 workers with chunk size 133


MinHash (n=3):  12%|█▏        | 13/113 [00:01<00:06, 14.97it/s] 

  Using 14 workers with chunk size 133


MinHash (n=3):  52%|█████▏    | 59/113 [00:03<00:02, 21.74it/s] 

  Using 14 workers with chunk size 133


MinHash (n=3):  68%|██████▊   | 77/113 [00:02<00:00, 41.89it/s]

  Using 14 workers with chunk size 133


MinHash (n=3):  93%|█████████▎| 105/113 [00:03<00:00, 47.51it/s]

  Using 14 workers with chunk size 133


MinHash (n=3):  38%|███▊      | 43/113 [00:02<00:02, 23.42it/s] 

  Using 14 workers with chunk size 133


Grid search:   4%|▎         | 1/28 [00:55<25:11, 55.99s/it]     

  Using 14 workers with chunk size 133


Grid search:   7%|▋         | 2/28 [01:02<11:42, 27.02s/it]    

  Using 14 workers with chunk size 133


MinHash (n=4):   1%|          | 1/113 [00:00<00:36,  3.04it/s]

  Using 14 workers with chunk size 133


Grid search:  14%|█▍        | 4/28 [01:13<05:07, 12.82s/it]     

  Using 14 workers with chunk size 133


MinHash (n=4):  26%|██▌       | 29/113 [00:01<00:02, 34.34it/s]

  Using 14 workers with chunk size 133


Grid search:  21%|██▏       | 6/28 [01:24<03:18,  9.01s/it]     

  Using 14 workers with chunk size 133
