# 1. DBBE

In [None]:
import re
import time
import unicodedata
from collections import Counter, defaultdict
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import adjusted_rand_score, v_measure_score

RESULTS_DIR = Path("dbbe_orthographic_results")
RESULTS_DIR.mkdir(exist_ok=True)
DATA_FILE = 'paper_verses.csv'

try:
    import cupy as cp
    GPU_AVAILABLE = True
    print("GPU detected - using CuPy acceleration")
except ImportError:
    cp = np
    GPU_AVAILABLE = False
    print("No GPU - using NumPy (CPU mode)")


class TextPreprocessor:
    def __init__(self, lowercase=True, remove_punctuation=True, remove_diacritics=True):
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.remove_diacritics = remove_diacritics
        if remove_punctuation:
            self.punct_pattern = re.compile(r'[^\w\s]', re.UNICODE)
            self.remove_chars_pattern = re.compile(r'[\(\)\{\}]')

    def _remove_diacritics(self, text: str) -> str:
        return ''.join(
            c for c in unicodedata.normalize('NFD', text)
            if unicodedata.category(c) != 'Mn'
        )

    def preprocess(self, text: str) -> str:
        if not isinstance(text, str):
            text = str(text) if pd.notna(text) else ''

        if self.remove_diacritics:
            text = self._remove_diacritics(text)
        if self.lowercase:
            text = text.lower()
        if self.remove_punctuation:
            text = self.remove_chars_pattern.sub('', text)
            text = self.punct_pattern.sub(' ', text)

        return ' '.join(text.split())

    def preprocess_batch(self, texts: List[str]) -> List[str]:
        return [self.preprocess(t) for t in texts]


class ShingleGenerator:
    def __init__(self, shingle_size: int = 4, use_gpu: bool = GPU_AVAILABLE):
        self.shingle_size = shingle_size
        self.use_gpu = use_gpu and GPU_AVAILABLE
        self.xp = cp if self.use_gpu else np

    def generate_shingles(self, text: str) -> np.ndarray:
        if len(text) < self.shingle_size:
            return np.array([hash(text) % (2**31)], dtype=np.int32)

        chars = self.xp.array([ord(c) for c in text], dtype=np.int32)
        n_shingles = len(text) - self.shingle_size + 1

        shingles = self.xp.zeros(n_shingles, dtype=np.int32)
        for i in range(self.shingle_size):
            shingles += chars[i:i+n_shingles] * (31 ** i)

        unique_shingles = self.xp.unique(shingles)

        if self.use_gpu:
            unique_shingles = cp.asnumpy(unique_shingles)

        return unique_shingles

    def generate_batch(self, texts: List[str]) -> List[np.ndarray]:
        return [self.generate_shingles(t) for t in texts]


class MinHashProcessor:
    def __init__(self, num_perm: int = 128, use_gpu: bool = GPU_AVAILABLE):
        self.num_perm = num_perm
        self.use_gpu = use_gpu and GPU_AVAILABLE
        self.xp = cp if self.use_gpu else np

        rng = self.xp.random.RandomState(42)
        self.hash_a = rng.randint(1, 2**31-1, num_perm, dtype=np.int64)
        self.hash_b = rng.randint(0, 2**31-1, num_perm, dtype=np.int64)
        self.prime = np.int64(2**31-1)

        if self.use_gpu:
            print(f"Using GPU for MinHash ({num_perm} permutations)")

    def compute_signature(self, shingles: np.ndarray) -> np.ndarray:
        if len(shingles) == 0:
            return np.full(self.num_perm, self.prime, dtype=np.int64)

        if self.use_gpu:
            shingles_gpu = self.xp.array(shingles, dtype=np.int64)
        else:
            shingles_gpu = shingles.astype(np.int64)

        shingles_expanded = shingles_gpu[:, self.xp.newaxis]
        hashes = (self.hash_a * shingles_expanded + self.hash_b) % self.prime
        signature = self.xp.min(hashes, axis=0)

        if self.use_gpu:
            signature = cp.asnumpy(signature)

        return signature

    def compute_batch(self, shingles_batch: List[np.ndarray]) -> np.ndarray:
        signatures = np.zeros((len(shingles_batch), self.num_perm), dtype=np.int64)
        for i, shingles in enumerate(shingles_batch):
            signatures[i] = self.compute_signature(shingles)
        return signatures


class LSHIndex:
    def __init__(self, threshold: float = 0.3, num_perm: int = 128):
        self.threshold = threshold
        self.num_perm = num_perm
        self.bands = 16
        self.rows = num_perm // self.bands
        self.signatures = []
        self.num_docs = 0
        self.hash_tables = [defaultdict(list) for _ in range(self.bands)]

    def _hash_band(self, band: np.ndarray) -> int:
        return int(hash(tuple(band)) % (2**31))

    def insert_batch(self, signatures: np.ndarray, start_idx: int):
        batch_size = signatures.shape[0]
        self.signatures.append(signatures)

        for band_idx in range(self.bands):
            start_row = band_idx * self.rows
            end_row = start_row + self.rows

            for doc_idx in range(batch_size):
                band = signatures[doc_idx, start_row:end_row]
                band_hash = self._hash_band(band)
                global_doc_id = start_idx + doc_idx
                self.hash_tables[band_idx][band_hash].append(global_doc_id)

        self.num_docs += batch_size

    def query_batch(self, signatures: np.ndarray, start_idx: int) -> List[set]:
        batch_size = signatures.shape[0]
        candidates = [set() for _ in range(batch_size)]

        for band_idx in range(self.bands):
            start_row = band_idx * self.rows
            end_row = start_row + self.rows

            for doc_idx in range(batch_size):
                query_doc_id = start_idx + doc_idx
                band = signatures[doc_idx, start_row:end_row]
                band_hash = self._hash_band(band)
                bucket = self.hash_tables[band_idx].get(band_hash, [])
                candidates[doc_idx].update(c for c in bucket if c < query_doc_id)

        return candidates


class SimilarityComputer:
    def __init__(self, threshold: float = 0.3, use_gpu: bool = GPU_AVAILABLE):
        self.threshold = threshold
        self.use_gpu = use_gpu and GPU_AVAILABLE
        self.xp = cp if self.use_gpu else np

    def compute_batch_similarities(self, query_sig: np.ndarray,
                                   candidate_sigs: np.ndarray) -> np.ndarray:
        if self.use_gpu:
            query_gpu = self.xp.array(query_sig)
            cands_gpu = self.xp.array(candidate_sigs)
            query_expanded = self.xp.tile(query_gpu, (len(candidate_sigs), 1))
            matches = self.xp.sum(query_expanded == cands_gpu, axis=1)
            sims = matches.astype(np.float32) / query_sig.shape[0]
            return cp.asnumpy(sims)
        else:
            query_expanded = np.tile(query_sig, (len(candidate_sigs), 1))
            matches = np.sum(query_expanded == candidate_sigs, axis=1)
            return matches.astype(np.float32) / query_sig.shape[0]


class UnionFind:
    def __init__(self, n: int):
        self.parent = list(range(n))
        self.rank = [0] * n

    def find(self, x: int) -> int:
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x: int, y: int):
        px, py = self.find(x), self.find(y)
        if px == py:
            return
        if self.rank[px] < self.rank[py]:
            px, py = py, px
        self.parent[py] = px
        if self.rank[px] == self.rank[py]:
            self.rank[px] += 1

    def get_clusters(self) -> Dict[int, int]:
        return {i: self.find(i) for i in range(len(self.parent))}


class FastMinHashClustering:
    def __init__(self, threshold: float = 0.3, shingle_size: int = 4,
                 num_perm: int = 128, chunk_size: int = 50000,
                 use_gpu: Optional[bool] = None):

        if use_gpu is None:
            use_gpu = GPU_AVAILABLE

        self.threshold = threshold
        self.chunk_size = chunk_size
        self.use_gpu = use_gpu and GPU_AVAILABLE

        self.preprocessor = TextPreprocessor(
            lowercase=True,
            remove_punctuation=True,
            remove_diacritics=True
        )
        self.shingler = ShingleGenerator(shingle_size, use_gpu)
        self.minhash = MinHashProcessor(num_perm, use_gpu)
        self.lsh_index = LSHIndex(threshold, num_perm)
        self.similarity_computer = SimilarityComputer(threshold, use_gpu)
        self.all_similarities = []

        mode = "GPU (CuPy)" if self.use_gpu else "CPU (NumPy)"
        print(f"Initialized in {mode} mode")

    def cluster(self, texts: List[str]) -> Tuple[Dict[int, int], List[Tuple[int, int, float]]]:
        n_docs = len(texts)
        n_chunks = (n_docs + self.chunk_size - 1) // self.chunk_size

        print(f"\nClustering {n_docs:,} documents in {n_chunks} chunks")
        print(f"threshold={self.threshold}, chunk_size={self.chunk_size:,}")

        start_time = time.time()

        for chunk_idx in tqdm(range(n_chunks), desc="Processing"):
            chunk_start = chunk_idx * self.chunk_size
            chunk_end = min(chunk_start + self.chunk_size, n_docs)
            chunk_texts = texts[chunk_start:chunk_end]

            processed = self.preprocessor.preprocess_batch(chunk_texts)
            shingles = self.shingler.generate_batch(processed)
            signatures = self.minhash.compute_batch(shingles)
            self.lsh_index.insert_batch(signatures, chunk_start)

            if chunk_start > 0:
                candidates = self.lsh_index.query_batch(signatures, chunk_start)

                for doc_idx, cand_set in enumerate(candidates):
                    if not cand_set:
                        continue

                    query_doc_id = chunk_start + doc_idx
                    query_sig = signatures[doc_idx]

                    cand_list = sorted(cand_set)
                    cand_sigs = []
                    for cand_id in cand_list:
                        batch_idx = cand_id // self.chunk_size
                        local_idx = cand_id % self.chunk_size
                        if batch_idx < len(self.lsh_index.signatures):
                            cand_sigs.append(self.lsh_index.signatures[batch_idx][local_idx])

                    if cand_sigs:
                        cand_sigs = np.array(cand_sigs)
                        sims = self.similarity_computer.compute_batch_similarities(
                            query_sig, cand_sigs
                        )

                        for cand_id, sim in zip(cand_list[:len(sims)], sims):
                            if sim >= self.threshold:
                                self.all_similarities.append((cand_id, query_doc_id, float(sim)))

        elapsed = time.time() - start_time
        print(f"\nFound {len(self.all_similarities):,} similarities in {elapsed:.2f}s")
        print(f"Throughput: {n_docs/elapsed:,.0f} docs/sec")

        print("Building clusters...")
        uf = UnionFind(n_docs)
        for doc1, doc2, _ in tqdm(self.all_similarities, desc="Clustering"):
            uf.union(doc1, doc2)

        clusters = uf.get_clusters()
        n_clusters = len(set(clusters.values()))

        total_time = time.time() - start_time
        print(f"\nCreated {n_clusters:,} clusters in {total_time:.2f}s total")

        return clusters, self.all_similarities


def reconstruct_poems(df):
    poem_to_clusters = defaultdict(set)
    poem_verse_counts = defaultdict(int)

    for _, row in df.iterrows():
        poem_id = row['idoriginal_poem']
        cluster_id = row['cluster_id']
        poem_verse_counts[poem_id] += 1
        if cluster_id != -1:
            poem_to_clusters[poem_id].add(cluster_id)

    print(f"\nReconstructed {len(poem_to_clusters)} poems")
    return poem_to_clusters, poem_verse_counts


def calculate_poem_cluster_similarity(clusters_a: Set[int], clusters_b: Set[int]) -> float:
    if not clusters_a or not clusters_b:
        return 0.0
    intersection = len(clusters_a & clusters_b)
    union = len(clusters_a | clusters_b)
    return intersection / union if union > 0 else 0.0


def cluster_poems(poem_to_clusters: Dict, similarity_threshold: float = 0.60):
    poem_ids = list(poem_to_clusters.keys())
    n_poems = len(poem_ids)

    edges = []
    for i in range(n_poems):
        for j in range(i + 1, n_poems):
            poem_a = poem_ids[i]
            poem_b = poem_ids[j]
            similarity = calculate_poem_cluster_similarity(
                poem_to_clusters[poem_a],
                poem_to_clusters[poem_b]
            )
            if similarity >= similarity_threshold:
                edges.append((poem_a, poem_b, similarity))

    class PoemUnionFind:
        def __init__(self, elements):
            self.parent = {e: e for e in elements}
            self.rank = {e: 0 for e in elements}

        def find(self, x):
            if self.parent[x] != x:
                self.parent[x] = self.find(self.parent[x])
            return self.parent[x]

        def union(self, x, y):
            px, py = self.find(x), self.find(y)
            if px == py:
                return
            if self.rank[px] < self.rank[py]:
                px, py = py, px
            self.parent[py] = px
            if self.rank[px] == self.rank[py]:
                self.rank[px] += 1

    uf = PoemUnionFind(poem_ids)
    for poem_a, poem_b, _ in edges:
        uf.union(poem_a, poem_b)

    poem_clusters = {poem_id: uf.find(poem_id) for poem_id in poem_ids}
    n_clusters = len(set(poem_clusters.values()))

    return poem_clusters, edges, n_clusters


def evaluate_clustering(y_true, y_pred):
    ari = adjusted_rand_score(y_true, y_pred)
    v_measure = v_measure_score(y_true, y_pred)
    return ari, v_measure


def calculate_perfect_reconstruction_rate(df, poem_clusters):
    poem_to_type = df.groupby('idoriginal_poem')['type_id'].first().to_dict()

    gt_to_poems = defaultdict(set)
    for poem_id, gt_type in poem_to_type.items():
        gt_to_poems[gt_type].add(poem_id)

    pred_to_poems = defaultdict(set)
    for poem_id, pred_cluster in poem_clusters.items():
        pred_to_poems[pred_cluster].add(poem_id)

    perfectly_reconstructed = 0
    total_gt_clusters = len(gt_to_poems)

    for gt_type, gt_poems in gt_to_poems.items():
        for pred_cluster, pred_poems in pred_to_poems.items():
            if gt_poems == pred_poems:
                perfectly_reconstructed += 1
                break

    reconstruction_rate = perfectly_reconstructed / total_gt_clusters if total_gt_clusters > 0 else 0
    return reconstruction_rate, perfectly_reconstructed, total_gt_clusters


def visualize_verse_grid_search(results_df, save_path=None):
    if save_path is None:
        save_path = RESULTS_DIR / 'verse_grid_search_results.png'

    ari_pivot = results_df.pivot(index='shingle_size', columns='threshold', values='ari')
    vmeasure_pivot = results_df.pivot(index='shingle_size', columns='threshold', values='v_measure')
    clusters_pivot = results_df.pivot(index='shingle_size', columns='threshold', values='n_clusters')
    similarities_pivot = results_df.pivot(index='shingle_size', columns='threshold', values='n_similarities')

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Verse-Level Clustering Grid Search Results', fontsize=18, fontweight='bold')

    col_labels = [f"{col:.0%}" for col in ari_pivot.columns]

    ax1 = axes[0, 0]
    sns.heatmap(ari_pivot, annot=True, fmt='.4f', cmap='viridis', ax=ax1,
                cbar_kws={'label': 'ARI'}, xticklabels=col_labels)
    ax1.set_xlabel('Similarity Threshold', fontweight='bold', fontsize=12)
    ax1.set_ylabel('Shingle Size', fontweight='bold', fontsize=12)
    ax1.set_title('Adjusted Rand Index (ARI)', fontweight='bold', fontsize=13)

    ax2 = axes[0, 1]
    sns.heatmap(vmeasure_pivot, annot=True, fmt='.4f', cmap='viridis', ax=ax2,
                cbar_kws={'label': 'V-measure'}, xticklabels=col_labels)
    ax2.set_xlabel('Similarity Threshold', fontweight='bold', fontsize=12)
    ax2.set_ylabel('Shingle Size', fontweight='bold', fontsize=12)
    ax2.set_title('V-measure', fontweight='bold', fontsize=13)

    ax3 = axes[1, 0]
    sns.heatmap(clusters_pivot, annot=True, fmt='.0f', cmap='viridis', ax=ax3,
                cbar_kws={'label': 'Clusters'}, xticklabels=col_labels)
    ax3.set_xlabel('Similarity Threshold', fontweight='bold', fontsize=12)
    ax3.set_ylabel('Shingle Size', fontweight='bold', fontsize=12)
    ax3.set_title('Number of Clusters', fontweight='bold', fontsize=13)

    ax4 = axes[1, 1]
    sns.heatmap(similarities_pivot, annot=True, fmt='.0f', cmap='viridis', ax=ax4,
                cbar_kws={'label': 'Similarities'}, xticklabels=col_labels)
    ax4.set_xlabel('Similarity Threshold', fontweight='bold', fontsize=12)
    ax4.set_ylabel('Shingle Size', fontweight='bold', fontsize=12)
    ax4.set_title('Number of Similarities Found', fontweight='bold', fontsize=13)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"\nVisualization saved to: {save_path}")
    plt.close()


def visualize_poem_grid_search(results_df, save_path=None):
    if save_path is None:
        save_path = RESULTS_DIR / 'poem_grid_search_results.png'
    
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Poem-Level Clustering Grid Search Results', fontsize=16, fontweight='bold')

    thresholds = results_df['threshold'].values
    thresholds_pct = [f"{t:.0%}" for t in thresholds]

    def normalize(vals):
        return (vals - np.min(vals)) / (np.max(vals) - np.min(vals))

    ax1 = axes[0, 0]
    norm_vals = normalize(results_df['ari'].values)
    colors = plt.cm.viridis(norm_vals)
    ax1.plot(thresholds_pct, results_df['ari'].values, marker='o', linewidth=2, markersize=8)
    for i, (x, y) in enumerate(zip(thresholds_pct, results_df['ari'].values)):
        ax1.text(i, y, f'{y:.4f}', ha='center', va='bottom', fontsize=9)
    ax1.set_xlabel('Similarity Threshold', fontweight='bold')
    ax1.set_ylabel('Adjusted Rand Index (ARI)', fontweight='bold')
    ax1.set_title('ARI vs Threshold')
    ax1.grid(True, alpha=0.3)

    ax2 = axes[0, 1]
    norm_vals = normalize(results_df['v_measure'].values)
    colors = plt.cm.viridis(norm_vals)
    ax2.plot(thresholds_pct, results_df['v_measure'].values, marker='o', linewidth=2, markersize=8)
    for i, (x, y) in enumerate(zip(thresholds_pct, results_df['v_measure'].values)):
        ax2.text(i, y, f'{y:.4f}', ha='center', va='bottom', fontsize=9)
    ax2.set_xlabel('Similarity Threshold', fontweight='bold')
    ax2.set_ylabel('V-measure', fontweight='bold')
    ax2.set_title('V-measure vs Threshold')
    ax2.grid(True, alpha=0.3)

    ax3 = axes[1, 0]
    prr_vals = results_df['perfect_reconstruction_rate'].values * 100
    norm_vals = normalize(prr_vals)
    colors = plt.cm.viridis(norm_vals)
    ax3.plot(thresholds_pct, prr_vals, marker='o', linewidth=2, markersize=8)
    for i, (x, y) in enumerate(zip(thresholds_pct, prr_vals)):
        ax3.plot(x, y, marker='o', color=colors[i], markersize=10)
        ax3.text(i, y, f'{y:.1f}%', ha='center', va='bottom', fontsize=9)
    ax3.set_xlabel('Similarity Threshold', fontweight='bold')
    ax3.set_ylabel('Perfect Reconstruction Rate (%)', fontweight='bold')
    ax3.set_title('Perfect Reconstruction Rate vs Threshold')
    ax3.grid(True, alpha=0.3)

    ax4 = axes[1, 1]
    n_clusters_vals = results_df['n_clusters'].values
    norm_vals = normalize(n_clusters_vals)
    colors = plt.cm.viridis(norm_vals)
    ax4.plot(thresholds_pct, n_clusters_vals, marker='o', linewidth=2, markersize=8)
    for i, (x, y) in enumerate(zip(thresholds_pct, n_clusters_vals)):
        ax4.plot(x, y, marker='o', color=colors[i], markersize=10)
        ax4.text(i, y, f'{y}', ha='center', va='bottom', fontsize=9)
    ax4.set_xlabel('Similarity Threshold', fontweight='bold')
    ax4.set_ylabel('Number of Poem Clusters', fontweight='bold')
    ax4.set_title('Number of Clusters vs Threshold')
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"\nVisualization saved to: {save_path}")
    plt.close()


def verse_level_grid_search(texts, df, thresholds, shingle_sizes, num_perm=128):
    results = []
    best_ari = -1
    best_threshold = None
    best_shingle_size = None
    best_clusters = None
    best_similarities = None

    print("\n" + "="*100)
    print("VERSE-LEVEL 2D GRID SEARCH")
    print("="*100)
    print(f"\nTesting thresholds: {[f'{t:.0%}' for t in thresholds]}")
    print(f"Testing shingle sizes: {shingle_sizes}\n")

    total_combinations = len(thresholds) * len(shingle_sizes)
    print(f"Total combinations: {total_combinations}\n")

    for shingle_size in shingle_sizes:
        for threshold in thresholds:
            print(f"\nTesting shingle_size={shingle_size}, threshold={threshold:.0%}...")

            clusterer = FastMinHashClustering(
                threshold=threshold,
                shingle_size=shingle_size,
                num_perm=num_perm,
                chunk_size=1
            )

            clusters, similarities = clusterer.cluster(texts)

            if 'idgroup' in df.columns:
                temp_df = df.copy()
                temp_df['cluster_id'] = temp_df.index.map(clusters)

                mask = temp_df['idgroup'].notna() & temp_df['cluster_id'].notna()
                y_true = temp_df.loc[mask, 'idgroup'].tolist()
                y_pred = temp_df.loc[mask, 'cluster_id'].tolist()

                ari, v_measure = evaluate_clustering(y_true, y_pred)
                n_gt_clusters = len(set(y_true))
            else:
                ari, v_measure = 0, 0
                n_gt_clusters = 0

            n_clusters = len(set(clusters.values()))

            results.append({
                'shingle_size': shingle_size,
                'threshold': threshold,
                'n_clusters': n_clusters,
                'n_similarities': len(similarities),
                'ari': ari,
                'v_measure': v_measure,
                'n_gt_clusters': n_gt_clusters
            })

            if ari > best_ari:
                best_ari = ari
                best_threshold = threshold
                best_shingle_size = shingle_size
                best_clusters = clusters
                best_similarities = similarities

            print(f"  ARI: {ari:.4f}, V-measure: {v_measure:.4f}, Clusters: {n_clusters}")

    results_df = pd.DataFrame(results)

    print("\n" + "="*100)
    print("VERSE-LEVEL GRID SEARCH SUMMARY")
    print("="*100)
    print(f"\n{'Shingle':<10} {'Threshold':<12} {'Clusters':<10} {'Similarities':<15} {'ARI':<8} {'V-measure':<12}")
    print("-" * 80)

    for _, result in results_df.iterrows():
        print(f"{result['shingle_size']:<10} "
              f"{result['threshold']:<12.0%} "
              f"{result['n_clusters']:<10} "
              f"{result['n_similarities']:<15} "
              f"{result['ari']:<8.4f} "
              f"{result['v_measure']:<12.4f}")

    print(f"\n{'='*100}")
    print("BEST VERSE-LEVEL PARAMETERS")
    print("="*100)
    print(f"\nBest parameters by ARI:")
    print(f"  Shingle size: {best_shingle_size}")
    print(f"  Threshold: {best_threshold:.0%}")
    best_result = results_df[(results_df['threshold'] == best_threshold) &
                              (results_df['shingle_size'] == best_shingle_size)].iloc[0]
    print(f"  ARI: {best_result['ari']:.4f}")
    print(f"  V-measure: {best_result['v_measure']:.4f}")
    print(f"  Number of clusters: {best_result['n_clusters']}")
    print(f"  Number of similarities found: {best_result['n_similarities']}")

    visualize_verse_grid_search(results_df)

    results_csv = RESULTS_DIR / 'verse_grid_search_results.csv'
    results_df.to_csv(results_csv, index=False)
    print(f"\nVerse grid search results saved to: {results_csv}")

    return best_clusters, best_similarities, best_threshold, best_shingle_size, results_df


def poem_level_grid_search(df, poem_to_clusters, thresholds):
    results = []
    best_ari = -1
    best_threshold = None
    best_poem_clusters = None

    print("\n" + "="*100)
    print("POEM-LEVEL GRID SEARCH")
    print("="*100)
    print(f"\nTesting thresholds: {[f'{t:.0%}' for t in thresholds]}\n")

    for threshold in thresholds:
        print(f"\nTesting threshold {threshold:.0%}...")

        poem_clusters, poem_edges, n_clusters = cluster_poems(poem_to_clusters, threshold)

        if 'type_id' in df.columns:
            poem_to_type = df.groupby('idoriginal_poem')['type_id'].first().to_dict()

            y_true = []
            y_pred = []
            for poem_id, predicted_cluster in poem_clusters.items():
                if poem_id in poem_to_type:
                    y_true.append(poem_to_type[poem_id])
                    y_pred.append(predicted_cluster)

            ari, v_measure = evaluate_clustering(y_true, y_pred)
            reconstruction_rate, n_perfect, n_total_gt = calculate_perfect_reconstruction_rate(df, poem_clusters)
        else:
            ari, v_measure = 0, 0
            reconstruction_rate, n_perfect, n_total_gt = 0, 0, 0

        results.append({
            'threshold': threshold,
            'n_clusters': n_clusters,
            'n_edges': len(poem_edges),
            'ari': ari,
            'v_measure': v_measure,
            'perfect_reconstruction_rate': reconstruction_rate,
            'n_perfect_clusters': n_perfect,
            'n_total_gt_clusters': n_total_gt
        })

        if ari > best_ari:
            best_ari = ari
            best_threshold = threshold
            best_poem_clusters = poem_clusters

        print(f"  ARI: {ari:.4f}, V-measure: {v_measure:.4f}, Clusters: {n_clusters}")

    results_df = pd.DataFrame(results)

    print("\n" + "="*100)
    print("POEM-LEVEL GRID SEARCH SUMMARY")
    print("="*100)
    print(f"\n{'Threshold':<12} {'Clusters':<10} {'Edges':<10} {'ARI':<8} {'V-measure':<12} {'Perfect Recon.':<15}")
    print("-" * 80)

    for _, result in results_df.iterrows():
        print(f"{result['threshold']:<12.0%} "
              f"{result['n_clusters']:<10} "
              f"{result['n_edges']:<10} "
              f"{result['ari']:<8.4f} "
              f"{result['v_measure']:<12.4f} "
              f"{result['perfect_reconstruction_rate']:<15.1%}")

    print(f"\n{'='*100}")
    print("BEST POEM-LEVEL THRESHOLD")
    print("="*100)
    print(f"\nBest threshold by ARI: {best_threshold:.0%}")
    best_result = results_df[results_df['threshold'] == best_threshold].iloc[0]
    print(f"  ARI: {best_result['ari']:.4f}")
    print(f"  V-measure: {best_result['v_measure']:.4f}")
    print(f"  Perfect reconstruction rate: {best_result['perfect_reconstruction_rate']:.1%}")
    print(f"    ({best_result['n_perfect_clusters']:.0f}/{best_result['n_total_gt_clusters']:.0f} GT clusters perfectly reconstructed)")

    visualize_poem_grid_search(results_df)

    results_csv = RESULTS_DIR / 'poem_grid_search_results.csv'
    results_df.to_csv(results_csv, index=False)
    print(f"\nPoem grid search results saved to: {results_csv}")

    return best_poem_clusters, best_threshold, results_df


def main():
    print("="*100)
    print("LOADING DATA")
    print("="*100)
    print(f"Results will be saved to: {RESULTS_DIR}")

    df = pd.read_csv(DATA_FILE)

    if 'verse' in df.columns:
        df['text'] = df['verse']
    elif 'text' not in df.columns:
        raise ValueError("Dataset must have either 'verse' or 'text' column")

    df['text'] = df['text'].fillna('').astype(str)
    print(f"\nLoaded {df.shape[0]:,} verses")

    texts = df['text'].tolist()

    verse_thresholds = [0.2, 0.3, 0.4, 0.5]
    shingle_sizes = [2, 3, 4, 5]

    best_clusters, best_similarities, best_verse_threshold, best_shingle_size, verse_results = verse_level_grid_search(
        texts, df, verse_thresholds, shingle_sizes, num_perm=128
    )

    df['cluster_id'] = df.index.map(best_clusters)

    sim_dict = defaultdict(list)
    for doc1, doc2, sim in best_similarities:
        sim_dict[doc1].append(sim)
        sim_dict[doc2].append(sim)

    df['certainty'] = df.index.map(
        lambda i: np.mean(sim_dict[i]) if i in sim_dict else 1.0
    )

    preprocessor = TextPreprocessor(lowercase=True, remove_punctuation=True, remove_diacritics=True)
    df['text_preprocessed'] = df['text'].apply(preprocessor.preprocess)

    verse_output = RESULTS_DIR / "dbbe_verse_clustered_results.csv"
    df.to_csv(verse_output, index=False)
    print(f"\n{verse_output} saved with best parameters (shingle_size={best_shingle_size}, threshold={best_verse_threshold:.0%})")

    if 'idoriginal_poem' in df.columns and 'type_id' in df.columns:
        poem_to_clusters, poem_verse_counts = reconstruct_poems(df)

        poem_thresholds = [0.50, 0.60, 0.70, 0.8]

        best_poem_clusters, best_poem_threshold, poem_results = poem_level_grid_search(
            df, poem_to_clusters, poem_thresholds
        )

        df['poem_cluster_id'] = df['idoriginal_poem'].map(best_poem_clusters)

        poem_output = RESULTS_DIR / "dbbe_poem_level_clusters.csv"
        df.to_csv(poem_output, index=False)
        print(f"\n{poem_output} saved with best threshold ({best_poem_threshold:.0%})")
    else:
        print("\n" + "="*100)
        print("SKIPPING POEM-LEVEL CLUSTERING")
        print("="*100)
        print("\nRequired columns 'idoriginal_poem' and/or 'type_id' not found")

    print("\n" + "="*100)
    print("ANALYSIS COMPLETE")
    print("="*100)
    print(f"All results saved to: {RESULTS_DIR}")


if __name__ == "__main__":
    main()

In [None]:
RESULTS_DIR = Path("dbbe_orthographic_results")
results_df = pd.read_csv(RESULTS_DIR / 'verse_grid_search_results.csv')


def visualize_verse_grid_search_larger_font(results_df, save_path=None):
    if save_path is None:
        save_path = RESULTS_DIR / 'verse_grid_search_results_large_font.png'

    ari_pivot = results_df.pivot(index='shingle_size', columns='threshold', values='ari')
    vmeasure_pivot = results_df.pivot(index='shingle_size', columns='threshold', values='v_measure')
    clusters_pivot = results_df.pivot(index='shingle_size', columns='threshold', values='n_clusters')
    similarities_pivot = results_df.pivot(index='shingle_size', columns='threshold', values='n_similarities')

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Verse-Level Clustering Grid Search Results', fontsize=22, fontweight='bold')  # larger

    col_labels = [f"{col:.0%}" for col in ari_pivot.columns]

    for ax, pivot, title, cbar_label in zip(
        axes.flat,
        [ari_pivot, vmeasure_pivot, clusters_pivot, similarities_pivot],
        ['Adjusted Rand Index (ARI)', 'V-measure', 'Number of Clusters', 'Number of Similarities Found'],
        ['ARI', 'V-measure', 'Clusters', 'Similarities']
    ):
        sns.heatmap(pivot, annot=True, fmt='.4f' if pivot is not clusters_pivot and pivot is not similarities_pivot else '.0f',
                    cmap='viridis', ax=ax, cbar_kws={'label': cbar_label}, annot_kws={'fontsize':16})
        ax.set_xlabel('Similarity Threshold', fontsize=14, fontweight='bold')
        ax.set_ylabel('Shingle Size', fontsize=14, fontweight='bold')
        ax.set_title(title, fontsize=16, fontweight='bold')

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    
visualize_verse_grid_search_larger_font(results_df)

In [None]:
df = pd.read_csv('dbbe_orthographic_results/dbbe_poem_level_clusters.csv')
output_file = Path("dbbe_orthographic_results/five_largest_clusters.txt")

poem_to_verses = defaultdict(list)
for _, row in df.iterrows():
    poem_id = row['idoriginal_poem']
    order = row['order'] if 'order' in df.columns else 0
    text = str(row['text']) if pd.notna(row['text']) else ''
    poem_to_verses[poem_id].append((order, text))

cluster_to_poems = defaultdict(list)
for _, row in df.iterrows():
    cid = row['poem_cluster_id']
    poem_id = row['idoriginal_poem']
    if poem_id not in cluster_to_poems[cid]:
        cluster_to_poems[cid].append(poem_id)

cluster_sizes = {cid: len(pids) for cid, pids in cluster_to_poems.items()}

top_clusters = sorted(cluster_sizes.items(), key=lambda x: x[1], reverse=True)[:5]

with open(output_file, 'w', encoding='utf-8') as f:
    for cid, size in top_clusters:
        f.write(f"CLUSTER ID={cid}, poems={size}\n")
        f.write("-" * 80 + "\n")
        for poem_id in cluster_to_poems[cid]:
            verses_sorted = [v for _, v in sorted(poem_to_verses[poem_id], key=lambda x: x[0])]
            f.write(f"Poem ID: {poem_id}\n")
            for verse_text in verses_sorted:
                rows = df[(df['idoriginal_poem'] == poem_id) & (df['text'] == verse_text)]
                prefix = "* " if (rows['poem_cluster_id'] == cid).any() else "  "
                f.write(f"{prefix}{verse_text}\n")
            f.write("-" * 40 + "\n")
        f.write("\n")

print(f"Five largest clusters saved to: {output_file}")

# 2. Full dataset

In [None]:
import re
import unicodedata
from typing import Dict
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasketch import MinHash, MinHashLSHForest
import multiprocessing as mp
import hashlib
import time
import matplotlib.pyplot as plt
import seaborn as sns
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from collections import defaultdict
from numba import njit, cuda
import psutil
import platform
import socket
from datetime import datetime
import threading
from pathlib import Path
import gc

class SystemResourceAnalyzer:
    def __init__(self):
        self.cpu_count_physical = psutil.cpu_count(logical=False) or mp.cpu_count()
        self.cpu_count_logical = psutil.cpu_count(logical=True) or mp.cpu_count()
        self.total_ram_gb = psutil.virtual_memory().total / (1024**3)
        self.available_ram_gb = psutil.virtual_memory().available / (1024**3)
        self.has_gpu = self._check_gpu()
        self.gpu_count = self._get_gpu_count() if self.has_gpu else 0
        self.gpu_memory_gb = self._get_gpu_memory() if self.has_gpu else 0
        
    def _check_gpu(self):
        try:
            cuda.detect()
            return True
        except:
            return False
    
    def _get_gpu_count(self):
        try:
            return len(cuda.gpus)
        except:
            return 0
    
    def _get_gpu_memory(self):
        try:
            if cuda.gpus:
                return cuda.current_context().get_memory_info()[1] / (1024**3)
            return 0
        except:
            return 0
    
    def get_optimal_workers(self, task_type='cpu_intensive'):
        if task_type == 'cpu_intensive':
            return min(self.cpu_count_logical, 64)
        elif task_type == 'io_intensive':
            return min(self.cpu_count_logical * 2, 128)
        elif task_type == 'memory_intensive':
            workers = int(self.available_ram_gb / 2)
            return max(min(workers, self.cpu_count_logical), 4)
        else:
            return self.cpu_count_logical
    
    def get_optimal_chunk_size(self, total_items, workers):
        base_chunk = max(50, total_items // (workers * 8))
        available_ram_per_worker_gb = self.available_ram_gb / workers * 0.7
        max_chunk_by_memory = int(available_ram_per_worker_gb * 100000)
        return min(base_chunk, max_chunk_by_memory, 50000)
    
    def should_use_gpu(self, data_size):
        if not self.has_gpu:
            return False
        return data_size > 10000 and self.gpu_memory_gb > 2
    
    def print_summary(self):
        print("="*80)
        print("SYSTEM RESOURCE ANALYSIS")
        print("="*80)
        print(f"CPU Cores (Physical): {self.cpu_count_physical}")
        print(f"CPU Cores (Logical):  {self.cpu_count_logical}")
        print(f"Total RAM:            {self.total_ram_gb:.2f} GB")
        print(f"Available RAM:        {self.available_ram_gb:.2f} GB")
        print(f"GPU Available:        {'Yes' if self.has_gpu else 'No'}")
        if self.has_gpu:
            print(f"GPU Count:            {self.gpu_count}")
            print(f"GPU Memory:           {self.gpu_memory_gb:.2f} GB per GPU")
        print("="*80)
        print(f"Optimal Workers (CPU Intensive):    {self.get_optimal_workers('cpu_intensive')}")
        print(f"Optimal Workers (Memory Intensive): {self.get_optimal_workers('memory_intensive')}")
        print(f"Optimal Workers (I/O Intensive):    {self.get_optimal_workers('io_intensive')}")
        print("="*80)

class ResourceMonitor:
    def __init__(self):
        self.monitoring = False
        self.thread = None
        self.peak_ram_gb = 0
        self.peak_cpu_percent = 0
        self.ram_samples = []
        self.cpu_samples = []
        self.process = psutil.Process()
        
    def _monitor_loop(self):
        while self.monitoring:
            ram_gb = self.process.memory_info().rss / (1024**3)
            self.ram_samples.append(ram_gb)
            self.peak_ram_gb = max(self.peak_ram_gb, ram_gb)
            
            try:
                cpu_percent = self.process.cpu_percent(interval=0.1)
                self.cpu_samples.append(cpu_percent)
                self.peak_cpu_percent = max(self.peak_cpu_percent, cpu_percent)
            except:
                pass
            
            time.sleep(0.5)
    
    def start(self):
        self.monitoring = True
        self.thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.thread.start()
    
    def stop(self):
        self.monitoring = False
        if self.thread:
            self.thread.join(timeout=2)
    
    def get_stats(self):
        return {
            'peak_ram_gb': self.peak_ram_gb,
            'avg_ram_gb': np.mean(self.ram_samples) if self.ram_samples else 0,
            'peak_cpu_percent': self.peak_cpu_percent,
            'avg_cpu_percent': np.mean(self.cpu_samples) if self.cpu_samples else 0
        }

class TimingLogger:
    def __init__(self):
        self.stages = {}
        self.current_stage = None
        self.stage_start = None
    
    def start_stage(self, name):
        self.current_stage = name
        self.stage_start = time.time()
    
    def end_stage(self):
        if self.current_stage and self.stage_start:
            duration = time.time() - self.stage_start
            self.stages[self.current_stage] = duration
            self.current_stage = None
            self.stage_start = None
    
    def get_summary(self):
        return self.stages.copy()

system_analyzer = SystemResourceAnalyzer()
resource_monitor = ResourceMonitor()
timing_logger = TimingLogger()

def get_system_info():
    info = {
        'hostname': socket.gethostname(),
        'platform': platform.platform(),
        'python_version': platform.python_version(),
        'processor': platform.processor(),
        'cpu_count_physical': system_analyzer.cpu_count_physical,
        'cpu_count_logical': system_analyzer.cpu_count_logical,
        'total_ram_gb': system_analyzer.total_ram_gb,
        'available_ram_gb': system_analyzer.available_ram_gb,
        'has_gpu': system_analyzer.has_gpu,
        'gpu_count': system_analyzer.gpu_count,
        'gpu_memory_gb': system_analyzer.gpu_memory_gb,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }
    return info

system_analyzer.print_summary()
resource_monitor.start()
script_start_time = time.time()

CLEAN_PATTERN = re.compile(r'[^\w\s]')
WHITESPACE_PATTERN = re.compile(r'\s+')

def preprocess_text(text: str, options: Dict[str, bool] = None) -> str:
    if options is None:
        options = {'lowercase': True, 'remove_diacritics': True, 'remove_punctuation': True}
    text = str(text)
    if options.get('lowercase', True):
        text = text.lower()
    if options.get('remove_diacritics', True):
        text = unicodedata.normalize('NFD', text)
        text = ''.join(char for char in text if unicodedata.category(char) != 'Mn')
        text = unicodedata.normalize('NFC', text)
    else:
        text = unicodedata.normalize('NFC', text)
    if options.get('remove_punctuation', True):
        text = CLEAN_PATTERN.sub('', text)
    text = WHITESPACE_PATTERN.sub(' ', text)
    return text.strip()

class UnionFind:
    __slots__ = ['parent', 'rank']
    
    def __init__(self, n):
        self.parent = list(range(n))
        self.rank = [0] * n
    
    def find(self, x):
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]
    
    def union(self, x, y):
        px, py = self.find(x), self.find(y)
        if px == py:
            return False
        if self.rank[px] < self.rank[py]:
            self.parent[px] = py
        elif self.rank[px] > self.rank[py]:
            self.parent[py] = px
        else:
            self.parent[py] = px
            self.rank[px] += 1
        return True
    
    def get_clusters(self):
        return np.array([self.find(i) for i in range(len(self.parent))], dtype=np.int32)

def get_ngrams_vectorized(text, n=4):
    if not text or len(text) < n:
        return set()
    text = str(text).lower()
    return set(text[i:i+n] for i in range(len(text)-n+1))

def compute_minhash_chunk(args):
    texts, start_idx, n_gram_size, num_perm, seed = args
    np.random.seed(seed)
    minhashes = []
    for text in texts:
        ngrams = get_ngrams_vectorized(text, n_gram_size)
        m = MinHash(num_perm=num_perm, seed=seed)
        if ngrams:
            for ngram in ngrams:
                m.update(ngram.encode('utf8'))
        minhashes.append(m)
    return minhashes

def compute_minhash_parallel(texts, n_gram_size=3, num_perm=128, n_cores=None):
    if n_cores is None:
        n_cores = system_analyzer.get_optimal_workers('cpu_intensive')
    
    chunk_size = system_analyzer.get_optimal_chunk_size(len(texts), n_cores)
    chunks = [(texts[i:i+chunk_size], i, n_gram_size, num_perm, 42) 
              for i in range(0, len(texts), chunk_size)]
    
    print(f"  Using {n_cores} workers with chunk size {chunk_size}")
    
    with mp.Pool(n_cores) as pool:
        results = list(tqdm(pool.imap(compute_minhash_chunk, chunks),
                          total=len(chunks), desc=f"MinHash (n={n_gram_size})", leave=False))
    
    minhashes = [mh for chunk_mhs in results for mh in chunk_mhs]
    return minhashes

def fast_hash(data):
    return int(hashlib.md5(data).hexdigest()[:16], 16)

def find_exact_duplicates_fast(texts):
    n_workers = system_analyzer.get_optimal_workers('io_intensive')
    chunk_size = system_analyzer.get_optimal_chunk_size(len(texts), n_workers)
    
    print(f"  Using {n_workers} workers for hashing")
    
    def hash_chunk(chunk_data):
        chunk_texts, start_idx = chunk_data
        local_hashes = {}
        for i, text in enumerate(chunk_texts):
            normalized = str(text).strip().lower()
            if not normalized:
                continue
            text_hash = fast_hash(normalized.encode('utf-8'))
            local_hashes.setdefault(text_hash, []).append(start_idx + i)
        return local_hashes
    
    chunks = [(texts[i:i+chunk_size], i) for i in range(0, len(texts), chunk_size)]
    
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        chunk_results = list(tqdm(executor.map(hash_chunk, chunks),
                                 total=len(chunks), desc="Hashing", leave=False))
    
    text_hashes = {}
    for chunk_hash_dict in chunk_results:
        for hash_val, indices in chunk_hash_dict.items():
            text_hashes.setdefault(hash_val, []).extend(indices)
    
    duplicate_groups = [indices for indices in text_hashes.values() if len(indices) > 1]
    return duplicate_groups

def stratified_sample(df, n_sample=15000):
    datasets = df['source_dataset'].unique()
    total_size = len(df)
    sample_indices = []
    
    for dataset in datasets:
        dataset_indices = df[df['source_dataset'] == dataset].index.tolist()
        dataset_size = len(dataset_indices)
        proportion = dataset_size / total_size
        n_from_dataset = int(n_sample * proportion)
        n_from_dataset = min(n_from_dataset, dataset_size)
        if n_from_dataset > 0:
            sampled = np.random.choice(dataset_indices, size=n_from_dataset, replace=False)
            sample_indices.extend(sampled)
    
    return sorted(sample_indices)

def compute_cluster_cohesion(minhashes, cluster_labels):
    unique_clusters = np.unique(cluster_labels)
    cohesions = []
    
    for cluster_id in unique_clusters:
        cluster_indices = np.where(cluster_labels == cluster_id)[0]
        if len(cluster_indices) < 2:
            continue
        
        if len(cluster_indices) > 50:
            sampled_indices = np.random.choice(cluster_indices, 50, replace=False)
        else:
            sampled_indices = cluster_indices
        
        sims = []
        for i in range(len(sampled_indices)):
            for j in range(i+1, len(sampled_indices)):
                sim = minhashes[sampled_indices[i]].jaccard(minhashes[sampled_indices[j]])
                sims.append(sim)
        
        if sims:
            cohesions.append(np.mean(sims))
    
    return np.mean(cohesions) if cohesions else 0.0

def compute_cluster_separation(minhashes, cluster_labels, n_samples=500):
    unique_clusters = np.unique(cluster_labels)
    if len(unique_clusters) < 2:
        return 1.0
    
    separations = []
    for _ in range(n_samples):
        c1, c2 = np.random.choice(unique_clusters, 2, replace=False)
        idx1 = np.random.choice(np.where(cluster_labels == c1)[0])
        idx2 = np.random.choice(np.where(cluster_labels == c2)[0])
        sim = minhashes[idx1].jaccard(minhashes[idx2])
        separations.append(1 - sim)
    
    return np.mean(separations) if separations else 0.0

def compute_silhouette_approximation(minhashes, cluster_labels, n_samples=1000):
    unique_clusters = np.unique(cluster_labels)
    if len(unique_clusters) < 2:
        return 0.0
    
    n_total = len(cluster_labels)
    if n_total > n_samples:
        sample_indices = np.random.choice(n_total, n_samples, replace=False)
    else:
        sample_indices = np.arange(n_total)
    
    silhouettes = []
    
    for idx in sample_indices:
        cluster_id = cluster_labels[idx]
        same_cluster = np.where(cluster_labels == cluster_id)[0]
        same_cluster = same_cluster[same_cluster != idx]
        
        if len(same_cluster) == 0:
            continue
        
        if len(same_cluster) > 20:
            same_cluster = np.random.choice(same_cluster, 20, replace=False)
        
        a = np.mean([1 - minhashes[idx].jaccard(minhashes[j]) 
                    for j in same_cluster])
        
        other_clusters = unique_clusters[unique_clusters != cluster_id]
        if len(other_clusters) == 0:
            continue
        
        min_b = float('inf')
        for other_id in other_clusters:
            other_cluster = np.where(cluster_labels == other_id)[0]
            
            if len(other_cluster) > 20:
                other_cluster = np.random.choice(other_cluster, 20, replace=False)
            
            b = np.mean([1 - minhashes[idx].jaccard(minhashes[j]) 
                        for j in other_cluster])
            min_b = min(min_b, b)
        
        s = (min_b - a) / max(a, min_b) if max(a, min_b) > 0 else 0
        silhouettes.append(s)
    
    return np.mean(silhouettes) if silhouettes else 0.0

def evaluate_single_config(args):
    shingle_size, threshold, texts, sample_indices, duplicate_groups = args
    
    try:
        sample_texts = [texts[i] for i in sample_indices]
        minhashes_sample = compute_minhash_parallel(
            sample_texts, 
            n_gram_size=shingle_size, 
            num_perm=128
        )
        
        forest = MinHashLSHForest(num_perm=128)
        for idx, mh in enumerate(minhashes_sample):
            forest.add(str(idx), mh)
        forest.index()
        
        n_sample = len(sample_indices)
        uf = UnionFind(n_sample)
        
        sample_set = set(sample_indices)
        for group in duplicate_groups:
            sample_group = [sample_indices.index(g) for g in group if g in sample_set]
            if len(sample_group) > 1:
                for i in range(1, len(sample_group)):
                    uf.union(sample_group[0], sample_group[i])
        
        top_k = 50
        merges = 0
        for idx in range(n_sample):
            if uf.find(idx) != idx:
                continue
            neighbors = forest.query(minhashes_sample[idx], top_k)
            for neighbor_str in neighbors[1:]:
                neighbor_idx = int(neighbor_str)
                if uf.find(idx) == uf.find(neighbor_idx):
                    continue
                sim = minhashes_sample[idx].jaccard(minhashes_sample[neighbor_idx])
                if sim >= threshold:
                    if uf.union(idx, neighbor_idx):
                        merges += 1
        
        cluster_labels = uf.get_clusters()
        unique_clusters, cluster_sizes = np.unique(cluster_labels, return_counts=True)
        
        n_clusters = len(unique_clusters)
        n_multi = np.sum(cluster_sizes > 1)
        n_singleton = np.sum(cluster_sizes == 1)
        avg_size = float(cluster_sizes.mean())
        max_size = int(cluster_sizes.max())
        
        cohesion = compute_cluster_cohesion(minhashes_sample, cluster_labels)
        separation = compute_cluster_separation(minhashes_sample, cluster_labels)
        silhouette = compute_silhouette_approximation(minhashes_sample, cluster_labels)
        
        return {
            'shingle_size': shingle_size,
            'threshold': threshold,
            'n_clusters': n_clusters,
            'n_multi_clusters': n_multi,
            'n_singletons': n_singleton,
            'avg_cluster_size': avg_size,
            'max_cluster_size': max_size,
            'cohesion': cohesion,
            'separation': separation,
            'silhouette': silhouette,
            'merges': merges
        }
    
    except Exception as e:
        print(f"Error at shingle={shingle_size}, threshold={threshold:.2f}: {e}")
        return None

def grid_search_parameters(texts, df, duplicate_groups, 
                          shingle_sizes=[2, 3,4,5],
                          threshold_range=(0.3, 0.9, 7),
                          n_sample=15000,
                          results_folder="full_orthographic_results",
                          max_workers=None):
    timing_logger.start_stage("01_verse_parameter_search")
    
    if max_workers is None:
        max_workers = system_analyzer.get_optimal_workers('cpu_intensive')
    
    print("\n" + "="*80)
    print("2D GRID SEARCH: SHINGLE SIZE × THRESHOLD")
    print("="*80)
    
    sample_indices = stratified_sample(df, n_sample)
    print(f"Sample size: {len(sample_indices):,}")
    
    thresholds = np.linspace(threshold_range[0], threshold_range[1], threshold_range[2])
    
    print(f"\nParameter grid:")
    print(f"  Shingle sizes: {shingle_sizes}")
    print(f"  Thresholds: {len(thresholds)} values from {thresholds[0]:.2f} to {thresholds[-1]:.2f}")
    print(f"  Total combinations: {len(shingle_sizes) * len(thresholds)}")
    
    args_list = []
    for shingle_size in shingle_sizes:
        for threshold in thresholds:
            args_list.append((shingle_size, threshold, texts, sample_indices, duplicate_groups))
    
    print(f"\nRunning grid search with {max_workers} workers...")
    start_time = time.time()
    
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(evaluate_single_config, args): args for args in args_list}
        
        with tqdm(total=len(futures), desc="Grid search") as pbar:
            for future in as_completed(futures):
                result = future.result()
                if result is not None:
                    results.append(result)
                pbar.update(1)
    
    print(f"Grid search complete in {time.time()-start_time:.1f}s")
    print(f"  Valid results: {len(results)} / {len(args_list)}")
    
    results_df = pd.DataFrame(results)
    
    print("\nComputing quality scores...")
    
    def normalize(series):
        min_val = series.min()
        max_val = series.max()
        if max_val - min_val < 1e-10:
            return pd.Series(0.5, index=series.index)
        return (series - min_val) / (max_val - min_val)
    
    silhouette_score = normalize(results_df['silhouette'])
    cohesion_score = normalize(results_df['cohesion'])
    separation_score = normalize(results_df['separation'])
    
    singleton_ratio = results_df['n_singletons'] / len(sample_indices)
    balance_score = 1 - singleton_ratio
    balance_score = np.clip(balance_score, 0, 1)
    
    results_df['quality_score'] = (
        silhouette_score * 0.25 +
        cohesion_score * 0.25 +
        separation_score * 0.25 +
        balance_score * 0.25
    )
    
    results_df = results_df.sort_values('quality_score', ascending=False)
    results_csv = os.path.join(results_folder, 'parameter_grid_search_results.csv')
    results_df.to_csv(results_csv, index=False)
    print(f"Results saved: {results_csv}")
    
    create_verse_grid_search_heatmap(results_df, results_folder)
    
    best_config = results_df.iloc[0]
    best_shingle = int(best_config['shingle_size'])
    best_threshold = float(best_config['threshold'])
    
    print("\n" + "="*80)
    print("TOP 5 CONFIGURATIONS (BY QUALITY SCORE)")
    print("="*80)
    
    for idx, (i, row) in enumerate(results_df.head(5).iterrows(), 1):
        print(f"\n#{idx}. Shingle size: {int(row['shingle_size'])}, Threshold: {row['threshold']:.3f}")
        print(f"     Quality score: {row['quality_score']:.3f}")
        print(f"     Silhouette: {row['silhouette']:.3f}, Cohesion: {row['cohesion']:.3f}, "
              f"Separation: {row['separation']:.3f}")
        print(f"     Clusters: {int(row['n_multi_clusters']):,}, Singletons: {int(row['n_singletons']):,}, "
              f"Avg size: {row['avg_cluster_size']:.1f}")
    
    print("\n" + "="*80)
    print("SELECTED CONFIGURATION (HIGHEST QUALITY)")
    print("="*80)
    print(f"Shingle size: {best_shingle}")
    print(f"Threshold: {best_threshold:.3f}")
    print(f"Quality score: {best_config['quality_score']:.3f}")
    print(f"  - Silhouette: {best_config['silhouette']:.3f}")
    print(f"  - Cohesion: {best_config['cohesion']:.3f}")
    print(f"  - Separation: {best_config['separation']:.3f}")
    print(f"Multi-member clusters: {int(best_config['n_multi_clusters']):,}")
    print(f"Singletons: {int(best_config['n_singletons']):,}")
    print("="*80)
    
    timing_logger.end_stage()
    return best_shingle, best_threshold, results_df

def create_verse_grid_search_heatmap(results_df, results_folder):
    print("\nCreating verse-level heatmap...")
    
    sns.set_palette("colorblind")
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    
    pivot_quality = results_df.pivot_table(
        values='quality_score',
        index='threshold',
        columns='shingle_size',
        aggfunc='first'
    )
    
    sns.heatmap(pivot_quality, annot=True, fmt='.3f', cmap='viridis', ax=ax,
               cbar_kws={'label': 'Quality Score'})
    ax.set_ylabel('Threshold', fontweight='bold', fontsize=12)
    ax.set_xlabel('Shingle Size', fontweight='bold', fontsize=12)
    ax.set_title('Verse-Level Quality Score Heatmap', fontweight='bold', fontsize=14)
    
    plt.tight_layout()
    plot_path = os.path.join(results_folder, 'verse_grid_search_heatmap.png')
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"Verse heatmap saved: {plot_path}")
    plt.close()

def cluster_with_lsh_forest(minhashes, duplicate_groups, threshold, top_k=100):
    n_docs = len(minhashes)
    uf = UnionFind(n_docs)
    
    exact_merges = 0
    for group in duplicate_groups:
        for i in range(1, len(group)):
            if uf.union(group[0], group[i]):
                exact_merges += 1
    
    forest = MinHashLSHForest(num_perm=len(minhashes[0].hashvalues))
    
    n_workers = system_analyzer.get_optimal_workers('io_intensive')
    chunk_size = system_analyzer.get_optimal_chunk_size(n_docs, n_workers)
    
    print(f"  Indexing with {n_workers} threads, chunk size {chunk_size}")
    
    def index_chunk(chunk_data):
        chunk_minhashes, start_idx = chunk_data
        local_forest = MinHashLSHForest(num_perm=len(minhashes[0].hashvalues))
        for i, mh in enumerate(chunk_minhashes):
            local_forest.add(str(start_idx + i), mh)
        return local_forest
    
    for idx, mh in enumerate(tqdm(minhashes, desc="Indexing", leave=False)):
        forest.add(str(idx), mh)
    forest.index()
    
    lsh_merges = 0
    verified_pairs = 0
    
    optimal_chunk = system_analyzer.get_optimal_chunk_size(n_docs, system_analyzer.get_optimal_workers('memory_intensive'))
    
    for start_idx in tqdm(range(0, n_docs, optimal_chunk), desc="Clustering"):
        end_idx = min(start_idx + optimal_chunk, n_docs)
        for idx in range(start_idx, end_idx):
            if uf.find(idx) != idx:
                continue
            neighbors = forest.query(minhashes[idx], top_k)
            for neighbor_str in neighbors[1:]:
                neighbor_idx = int(neighbor_str)
                if uf.find(idx) == uf.find(neighbor_idx):
                    continue
                verified_pairs += 1
                sim = minhashes[idx].jaccard(minhashes[neighbor_idx])
                if sim >= threshold:
                    if uf.union(idx, neighbor_idx):
                        lsh_merges += 1
    
    cluster_labels = uf.get_clusters()
    unique_clusters, cluster_sizes = np.unique(cluster_labels, return_counts=True)
    
    return cluster_labels, {
        'n_clusters': len(unique_clusters),
        'n_multi_clusters': np.sum(cluster_sizes > 1),
        'n_singletons': np.sum(cluster_sizes == 1),
        'avg_cluster_size': float(cluster_sizes.mean()),
        'max_cluster_size': int(cluster_sizes.max()),
        'exact_merges': exact_merges,
        'lsh_merges': lsh_merges,
        'threshold': threshold,
        'verified_pairs': verified_pairs
    }

@njit
def jaccard_numba(a_arr, b_arr):
    intersection = 0
    a_set = set(a_arr)
    b_set = set(b_arr)
    
    for item in a_set:
        if item in b_set:
            intersection += 1
    
    union = len(a_set) + len(b_set) - intersection
    if union == 0:
        return 0.0
    return intersection / union

@njit
def count_shared_verses(a_arr, b_arr):
    shared = 0
    a_set = set(a_arr)
    b_set = set(b_arr)
    
    for item in a_set:
        if item in b_set:
            shared += 1
    
    return shared

class PoemUnionFind:
    __slots__ = ['parent', 'rank']
    
    def __init__(self, elements):
        self.parent = {e: e for e in elements}
        self.rank = {e: 0 for e in elements}
    
    def find(self, x):
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]
    
    def union(self, x, y):
        px, py = self.find(x), self.find(y)
        if px == py:
            return False
        if self.rank[px] < self.rank[py]:
            px, py = py, px
        self.parent[py] = px
        if self.rank[px] == self.rank[py]:
            self.rank[px] += 1
        return True
    
    def get_clusters(self):
        clusters = defaultdict(set)
        for elem in self.parent.keys():
            clusters[self.find(elem)].add(elem)
        return dict(clusters)

def compute_similarity_batch(args):
    pairs_batch, poem_to_array_dict, min_shared = args
    
    results = []
    for p1, p2 in pairs_batch:
        arr1 = poem_to_array_dict[p1]
        arr2 = poem_to_array_dict[p2]
        
        shared = count_shared_verses(arr1, arr2)
        
        if shared >= min_shared:
            sim = jaccard_numba(arr1, arr2)
            results.append({
                'poem1': p1,
                'poem2': p2,
                'similarity': sim,
                'shared_verses': shared
            })
    
    return results

class PoemThresholdSelector:
    def __init__(self, sample_size: int = 15000, random_seed: int = 42, min_shared_verses: int = 1):
        self.sample_size = sample_size
        self.random_seed = random_seed
        self.min_shared_verses = min_shared_verses
        np.random.seed(random_seed)
    
    @staticmethod
    def reconstruct_poems_vectorized(df):
        valid_mask = df['cluster_id'] != -1
        df_valid = df[valid_mask].copy()
        
        df_valid['idoriginal_poem'] = df_valid['idoriginal_poem'].astype(str)
        
        grouped = df_valid.groupby('idoriginal_poem')['cluster_id'].apply(
            lambda x: np.array(sorted(set(x)), dtype=np.int32)
        )
        
        return grouped.to_dict()
    
    @staticmethod
    def build_inverted_index_fast(poem_to_clusters):
        cluster_to_poems = defaultdict(set)
        for poem_id, clusters in poem_to_clusters.items():
            for c in clusters:
                cluster_to_poems[c].add(poem_id)
        return dict(cluster_to_poems)
    
    def find_candidate_pairs_for_sample(self, sample_poems, poem_to_clusters, cluster_to_poems):
        sample_set = set(sample_poems)
        candidate_pairs = set()
        
        poem_potential_matches = defaultdict(set)
        
        print("  Building potential matches using inverted index...")
        
        for cluster_id, poems_in_cluster in cluster_to_poems.items():
            sample_poems_in_cluster = list(poems_in_cluster & sample_set)
            
            if len(sample_poems_in_cluster) < 2:
                continue
            
            for poem in sample_poems_in_cluster:
                poem_potential_matches[poem].update(sample_poems_in_cluster)
        
        n_workers = system_analyzer.get_optimal_workers('io_intensive')
        
        def process_poem_batch(poems_batch):
            local_pairs = set()
            for poem_id in poems_batch:
                poem_id_str = str(poem_id)
                for other_poem in poem_potential_matches.get(poem_id, set()):
                    other_poem_str = str(other_poem)
                    if other_poem_str > poem_id_str:
                        local_pairs.add((poem_id_str, other_poem_str))
            return local_pairs
        
        poem_chunks = np.array_split(sample_poems, n_workers * 4)
        
        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            chunk_results = list(tqdm(executor.map(process_poem_batch, poem_chunks), 
                                     total=len(poem_chunks), desc="Building sample pairs"))
        
        for chunk_pairs in chunk_results:
            candidate_pairs.update(chunk_pairs)
        
        return candidate_pairs
    
    def compute_sample_similarities(self, candidate_pairs, poem_to_array):
        pairs_list = list(candidate_pairs)
        
        if len(pairs_list) == 0:
            return pd.DataFrame()
        
        n_cores = system_analyzer.get_optimal_workers('cpu_intensive')
        chunk_size = system_analyzer.get_optimal_chunk_size(len(pairs_list), n_cores)
        chunks = [pairs_list[i:i+chunk_size] for i in range(0, len(pairs_list), chunk_size)]
        
        print(f"  Computing {len(pairs_list):,} pairs using {n_cores} cores with chunk size {chunk_size}...")
        
        poem_to_array_dict = {k: v for k, v in poem_to_array.items()}
        
        all_results = []
        with ProcessPoolExecutor(max_workers=n_cores) as executor:
            args_list = [(chunk, poem_to_array_dict, self.min_shared_verses) for chunk in chunks]
            futures = [executor.submit(compute_similarity_batch, args) for args in args_list]
            
            for future in tqdm(as_completed(futures), total=len(futures), desc="Computing similarities"):
                all_results.extend(future.result())
        
        return pd.DataFrame(all_results)
    
    def stratified_sample_poems(self, df, poem_to_clusters):
        has_source = 'source_dataset' in df.columns
        
        if has_source:
            poem_to_source = df.groupby('idoriginal_poem')['source_dataset'].first().to_dict()
        
        poem_metadata = []
        for poem_id, clusters in poem_to_clusters.items():
            metadata = {
                'poem_id': poem_id,
                'n_clusters': len(clusters)
            }
            
            if has_source:
                metadata['source'] = poem_to_source.get(poem_id, 'unknown')
            
            poem_metadata.append(metadata)
        
        poem_df = pd.DataFrame(poem_metadata)
        
        poem_df['size_bin'] = pd.cut(poem_df['n_clusters'], 
                                      bins=[0, 5, 10, 20, 50, np.inf],
                                      labels=['tiny', 'small', 'medium', 'large', 'huge'])
        
        sample_indices = []
        
        if has_source:
            print("  Stratifying by source dataset and poem size...")
            for (source, size_bin), group in poem_df.groupby(['source', 'size_bin']):
                n_in_group = len(group)
                proportion = n_in_group / len(poem_df)
                n_sample = max(1, int(self.sample_size * proportion))
                n_sample = min(n_sample, n_in_group)
                
                sampled = group.sample(n=n_sample, random_state=self.random_seed)
                sample_indices.extend(sampled['poem_id'].tolist())
        else:
            print("  Stratifying by poem size only...")
            for size_bin, group in poem_df.groupby('size_bin'):
                n_in_group = len(group)
                proportion = n_in_group / len(poem_df)
                n_sample = max(1, int(self.sample_size * proportion))
                n_sample = min(n_sample, n_in_group)
                
                sampled = group.sample(n=n_sample, random_state=self.random_seed)
                sample_indices.extend(sampled['poem_id'].tolist())
        
        if len(sample_indices) < self.sample_size:
            remaining = self.sample_size - len(sample_indices)
            available = set(poem_df['poem_id']) - set(sample_indices)
            if available:
                additional = np.random.choice(list(available), 
                                             size=min(remaining, len(available)), 
                                             replace=False)
                sample_indices.extend(additional)
        
        return sample_indices[:self.sample_size]
    
    def cluster_at_threshold(self, similarities_df, threshold, sample_poems, poem_to_array):
        valid_pairs = similarities_df[similarities_df['similarity'] >= threshold]
        
        uf = PoemUnionFind(sample_poems)
        
        for _, row in valid_pairs.iterrows():
            uf.union(row['poem1'], row['poem2'])
        
        poem_clusters = uf.get_clusters()
        cluster_assignments = {}
        for cluster_id, poems in poem_clusters.items():
            for poem in poems:
                cluster_assignments[poem] = cluster_id
        
        return cluster_assignments
    
    def compute_cluster_cohesion(self, poem_to_array, cluster_assignments, max_comparisons=10):
        poem_ids = list(poem_to_array.keys())
        cohesions = []
        
        clusters = defaultdict(list)
        for poem_id in poem_ids:
            cluster_id = cluster_assignments.get(poem_id)
            if cluster_id is not None:
                clusters[cluster_id].append(poem_id)
        
        for cluster_id, cluster_poems in clusters.items():
            if len(cluster_poems) < 2:
                continue
            
            if len(cluster_poems) > 20:
                sampled = np.random.choice(cluster_poems, 20, replace=False)
            else:
                sampled = cluster_poems
            
            sims = []
            for i in range(len(sampled)):
                for j in range(i+1, min(i+1+max_comparisons, len(sampled))):
                    sim = jaccard_numba(poem_to_array[sampled[i]], poem_to_array[sampled[j]])
                    sims.append(sim)
            
            if sims:
                cohesions.append(np.mean(sims))
        
        return np.mean(cohesions) if cohesions else 0.0
    
    def compute_cluster_separation(self, poem_to_array, cluster_assignments, n_samples=200):
        poem_ids = list(poem_to_array.keys())
        unique_clusters = set(cluster_assignments.values())
        
        if len(unique_clusters) < 2:
            return 1.0
        
        separations = []
        cluster_to_poems = defaultdict(list)
        for poem_id, cluster_id in cluster_assignments.items():
            cluster_to_poems[cluster_id].append(poem_id)
        
        unique_clusters = list(unique_clusters)
        for _ in range(n_samples):
            c1, c2 = np.random.choice(unique_clusters, 2, replace=False)
            
            p1 = np.random.choice(cluster_to_poems[c1])
            p2 = np.random.choice(cluster_to_poems[c2])
            
            sim = jaccard_numba(poem_to_array[p1], poem_to_array[p2])
            separations.append(1 - sim)
        
        return np.mean(separations) if separations else 0.0
    
    def compute_silhouette_approximation(self, poem_to_array, cluster_assignments, n_samples=300):
        poem_ids = list(poem_to_array.keys())
        unique_clusters = set(cluster_assignments.values())
        
        if len(unique_clusters) < 2:
            return 0.0
        
        if len(poem_ids) > n_samples:
            sampled_poems = np.random.choice(poem_ids, n_samples, replace=False)
        else:
            sampled_poems = poem_ids
        
        silhouettes = []
        cluster_to_poems = defaultdict(list)
        for poem_id, cluster_id in cluster_assignments.items():
            cluster_to_poems[cluster_id].append(poem_id)
        
        convergence_window = 30
        convergence_threshold = 0.01
        
        for i, poem_id in enumerate(sampled_poems):
            cluster_id = cluster_assignments[poem_id]
            same_cluster = [p for p in cluster_to_poems[cluster_id] if p != poem_id]
            
            if len(same_cluster) == 0:
                continue
            
            if len(same_cluster) > 10:
                same_cluster = np.random.choice(same_cluster, 10, replace=False)
            
            a = np.mean([1 - jaccard_numba(poem_to_array[poem_id], poem_to_array[p]) 
                        for p in same_cluster])
            
            other_clusters = [c for c in unique_clusters if c != cluster_id]
            if len(other_clusters) == 0:
                continue
            
            min_b = float('inf')
            for other_cluster in other_clusters:
                other_poems = cluster_to_poems[other_cluster]
                
                if len(other_poems) > 10:
                    other_poems = np.random.choice(other_poems, 10, replace=False)
                
                b = np.mean([1 - jaccard_numba(poem_to_array[poem_id], poem_to_array[p]) 
                            for p in other_poems])
                min_b = min(min_b, b)
            
            s = (min_b - a) / max(a, min_b) if max(a, min_b) > 0 else 0
            silhouettes.append(s)
            
            if i > convergence_window and i % 30 == 0:
                recent_mean = np.mean(silhouettes[-convergence_window:])
                prev_mean = np.mean(silhouettes[-2*convergence_window:-convergence_window])
                
                if abs(recent_mean - prev_mean) < convergence_threshold:
                    break
        
        return np.mean(silhouettes) if silhouettes else 0.0
    
    def evaluate_threshold(self, threshold, similarities_df, sample_poems, poem_to_array):
        cluster_assignments = self.cluster_at_threshold(
            similarities_df, threshold, sample_poems, poem_to_array
        )
        
        clusters = defaultdict(list)
        for poem_id, cluster_id in cluster_assignments.items():
            clusters[cluster_id].append(poem_id)
        
        n_clusters = len(clusters)
        cluster_sizes = [len(poems) for poems in clusters.values()]
        n_singletons = sum(1 for size in cluster_sizes if size == 1)
        avg_size = np.mean(cluster_sizes) if cluster_sizes else 0
        max_size = max(cluster_sizes) if cluster_sizes else 0
        
        cohesion = self.compute_cluster_cohesion(poem_to_array, cluster_assignments)
        separation = self.compute_cluster_separation(poem_to_array, cluster_assignments)
        silhouette = self.compute_silhouette_approximation(poem_to_array, cluster_assignments)
        
        n_pairs_above = len(similarities_df[similarities_df['similarity'] >= threshold])
        pct_pairs_above = (n_pairs_above / len(similarities_df) * 100) if len(similarities_df) > 0 else 0
        
        return {
            'threshold': threshold,
            'n_clusters': n_clusters,
            'n_singletons': n_singletons,
            'avg_cluster_size': avg_size,
            'max_cluster_size': max_size,
            'cohesion': cohesion,
            'separation': separation,
            'silhouette': silhouette,
            'n_pairs_above': n_pairs_above,
            'pct_pairs_above': pct_pairs_above
        }
    
    def grid_search_thresholds(self, similarities_df, sample_poems, poem_to_array):
        print("\n" + "="*70)
        print("ADAPTIVE GRID SEARCH: TWO-STAGE APPROACH")
        print("="*70)
        
        coarse_thresholds = np.linspace(0.01, 0.1, 7)
        print(f"Stage 1: Testing {len(coarse_thresholds)} coarse thresholds...")
        
        coarse_results = []
        for threshold in tqdm(coarse_thresholds, desc="Coarse search"):
            result = self.evaluate_threshold(threshold, similarities_df, sample_poems, poem_to_array)
            coarse_results.append(result)
        
        coarse_df = pd.DataFrame(coarse_results)
        
        def normalize(series):
            min_val = series.min()
            max_val = series.max()
            if max_val - min_val < 1e-10:
                return pd.Series(0.5, index=series.index)
            return (series - min_val) / (max_val - min_val)
        
        silhouette_score = normalize(coarse_df['silhouette'])
        cohesion_score = normalize(coarse_df['cohesion'])
        separation_score = normalize(coarse_df['separation'])
        singleton_ratio = coarse_df['n_singletons'] / len(sample_poems)
        balance_score = np.clip(1 - singleton_ratio, 0, 1)
        
        coarse_df['quality_score'] = (
            silhouette_score * 0.40 +
            cohesion_score * 0.30 +
            separation_score * 0.20 +
            balance_score * 0.10
        )
        
        best_idx = coarse_df['quality_score'].idxmax()
        best_coarse = coarse_df.loc[best_idx]
        best_thresh = best_coarse['threshold']
        
        print(f"  Best coarse threshold: {best_thresh:.3f} (quality: {best_coarse['quality_score']:.3f})")
        
        fine_range = 0.15
        fine_thresholds = np.linspace(
            max(0.3, best_thresh - fine_range),
            min(0.9, best_thresh + fine_range),
            9
        )
        
        print(f"Stage 2: Refining around {best_thresh:.3f} ± {fine_range}...")
        
        fine_results = []
        for threshold in tqdm(fine_thresholds, desc="Fine search"):
            if any(abs(r['threshold'] - threshold) < 0.01 for r in coarse_results):
                continue
            result = self.evaluate_threshold(threshold, similarities_df, sample_poems, poem_to_array)
            fine_results.append(result)
        
        all_results = coarse_results + fine_results
        results_df = pd.DataFrame(all_results)
        
        silhouette_score = normalize(results_df['silhouette'])
        cohesion_score = normalize(results_df['cohesion'])
        separation_score = normalize(results_df['separation'])
        singleton_ratio = results_df['n_singletons'] / len(sample_poems)
        balance_score = np.clip(1 - singleton_ratio, 0, 1)
        
        results_df['quality_score'] = (
            silhouette_score * 0.40 +
            cohesion_score * 0.30 +
            separation_score * 0.20 +
            balance_score * 0.10
        )
        
        results_df = results_df.sort_values('quality_score', ascending=False)
        results_df.to_csv('full_orthographic_results/poem_threshold_grid_search.csv', index=False)
        print(f"\nGrid search results saved")
        
        return results_df
    
    def plot_poem_grid_search_line(self, results_df, selected_threshold, results_folder):
        print("\nCreating poem-level line graph...")
        
        sns.set_palette("colorblind")
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
        
        thresholds = results_df['threshold'].values
        
        ax.plot(thresholds, results_df['quality_score'], 'o-', 
                linewidth=2, markersize=8, color='#0173B2', label='Quality Score')
        ax.axvline(selected_threshold, color='#CC0000', linestyle='--', linewidth=2,
                   label=f'Selected: {selected_threshold:.3f}')
        ax.set_xlabel('Jaccard Similarity Threshold', fontweight='bold', fontsize=12)
        ax.set_ylabel('Quality Score', fontweight='bold', fontsize=12)
        ax.set_title('Poem-Level Quality Score vs Threshold', fontweight='bold', fontsize=14)
        ax.legend(fontsize=11)
        ax.grid(alpha=0.3)
        
        best_idx = results_df['quality_score'].idxmax()
        ax.scatter(results_df.loc[best_idx, 'threshold'], 
                   results_df.loc[best_idx, 'quality_score'],
                   color='red', s=200, marker='*', edgecolors='black', linewidth=2, zorder=10)
        
        plt.tight_layout()
        plot_path = os.path.join(results_folder, 'poem_grid_search_line.png')
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        print(f"Poem line graph saved: {plot_path}")
        plt.close()
    
    def run_threshold_analysis(self, df):
        timing_logger.start_stage("02_poem_threshold_analysis")
        
        print("="*70)
        print("OPTIMIZED POEM-LEVEL THRESHOLD SELECTION")
        print("="*70)
        print(f"Minimum shared verses: {self.min_shared_verses}")
        
        print("\nStep 1: Reconstructing poems...")
        poem_to_clusters = self.reconstruct_poems_vectorized(df)
        print(f"  Found {len(poem_to_clusters):,} poems")
        
        print("\nStep 2: Building inverted index...")
        cluster_to_poems = self.build_inverted_index_fast(poem_to_clusters)
        print(f"  Found {len(cluster_to_poems):,} verse clusters")
        
        print(f"\nStep 3: Sampling {self.sample_size:,} poems...")
        sample_poems = self.stratified_sample_poems(df, poem_to_clusters)
        print(f"  Sampled {len(sample_poems):,} poems")
        
        print("\nStep 4: Finding candidate pairs in sample...")
        start_time = time.time()
        candidate_pairs = self.find_candidate_pairs_for_sample(
            sample_poems, poem_to_clusters, cluster_to_poems
        )
        print(f"  Found {len(candidate_pairs):,} candidate pairs in {time.time()-start_time:.1f}s")
        
        print("\nStep 5: Converting to arrays...")
        poem_to_array = {
            p: np.array(sorted(poem_to_clusters[p]), dtype=np.int32)
            for p in sample_poems
        }
        
        print(f"\nStep 6: Computing similarities...")
        start_time = time.time()
        similarities_df = self.compute_sample_similarities(candidate_pairs, poem_to_array)
        print(f"  Computed {len(similarities_df):,} similarities in {time.time()-start_time:.1f}s")
        
        similarities_df.to_csv('full_orthographic_results/poem_similarities_sample.csv', index=False)
        
        print("\nStep 7: Adaptive grid search over thresholds...")
        grid_results = self.grid_search_thresholds(similarities_df, sample_poems, poem_to_array)
        
        print("\n" + "="*70)
        print("THRESHOLD SELECTION BASED ON QUALITY METRICS")
        print("="*70)
        
        best_result = grid_results.iloc[0]
        threshold = float(best_result['threshold'])
        
        print(f"Selected Threshold:   {threshold:.4f}")
        print(f"Quality Score:        {best_result['quality_score']:.4f}")
        print(f"Silhouette:           {best_result['silhouette']:.4f}")
        print(f"Cohesion:             {best_result['cohesion']:.4f}")
        print(f"Separation:           {best_result['separation']:.4f}")
        print(f"Clusters:             {int(best_result['n_clusters']):,}")
        print(f"Singletons:           {int(best_result['n_singletons']):,}")
        print(f"Avg Cluster Size:     {best_result['avg_cluster_size']:.2f}")
        
        print("\nStep 8: Creating visualizations...")
        self.plot_poem_grid_search_line(grid_results, threshold, 'full_orthographic_results')
        
        summary = {
            'selected_threshold': threshold,
            'quality_score': best_result['quality_score'],
            'silhouette': best_result['silhouette'],
            'cohesion': best_result['cohesion'],
            'separation': best_result['separation'],
            'n_clusters': int(best_result['n_clusters']),
            'n_singletons': int(best_result['n_singletons']),
            'avg_cluster_size': best_result['avg_cluster_size'],
            'min_shared_verses': self.min_shared_verses,
            'sample_size': len(sample_poems),
            'total_poems': len(poem_to_clusters)
        }
        
        pd.DataFrame([summary]).to_csv('full_orthographic_results/poem_enhanced_threshold_summary.csv', index=False)
        print(f"Summary saved")
        
        print("\n" + "="*70)
        print("POEM THRESHOLD ANALYSIS COMPLETE")
        print("="*70)
        
        timing_logger.end_stage()
        return threshold, grid_results, similarities_df, poem_to_clusters

def cluster_all_poems_at_threshold(df, poem_threshold, poem_to_clusters, results_folder="full_orthographic_results"):
    timing_logger.start_stage("03_full_poem_clustering")
    
    print("\n" + "="*80)
    print("CLUSTERING ALL POEMS WITH OPTIMAL THRESHOLD")
    print("="*80)
    print(f"Threshold: {poem_threshold:.3f}")
    
    poem_to_dataset = df.groupby('idoriginal_poem')['source_dataset'].first().to_dict()
    
    cluster_to_poems = defaultdict(set)
    for poem_id, clusters in poem_to_clusters.items():
        for c in clusters:
            cluster_to_poems[c].add(poem_id)
    
    print("\nFinding cross-dataset candidate pairs...")
    datasets = df['source_dataset'].unique()
    
    poems_by_dataset = defaultdict(list)
    for poem_id, dataset in poem_to_dataset.items():
        poems_by_dataset[dataset].append(poem_id)
    
    all_pairs = set()
    
    n_workers = system_analyzer.get_optimal_workers('io_intensive')
    
    def process_dataset_pair(dataset_pair):
        dataset1, dataset2 = dataset_pair
        poems1 = poems_by_dataset[dataset1]
        poems2 = poems_by_dataset[dataset2]
        poems2_set = set(poems2)
        
        local_pairs = set()
        for poem_id in poems1:
            clusters = poem_to_clusters.get(poem_id, [])
            
            candidates = set()
            for cluster_id in clusters:
                if int(cluster_id) in cluster_to_poems:
                    candidates.update(cluster_to_poems[int(cluster_id)])
            
            candidates = candidates & poems2_set
            
            for other_poem in candidates:
                pair = tuple(sorted([poem_id, other_poem]))
                local_pairs.add(pair)
        
        return local_pairs
    
    dataset_pairs = []
    for i, dataset1 in enumerate(datasets):
        for dataset2 in datasets[i+1:]:
            dataset_pairs.append((dataset1, dataset2))
    
    print(f"  Processing {len(dataset_pairs)} dataset pairs with {n_workers} workers...")
    
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        pair_results = list(tqdm(executor.map(process_dataset_pair, dataset_pairs),
                                total=len(dataset_pairs), desc="Dataset pairs"))
    
    for pair_set in pair_results:
        all_pairs.update(pair_set)
    
    print(f"  Total candidate pairs: {len(all_pairs):,}")
    
    print("\nClustering poems...")
    poem_ids = list(poem_to_clusters.keys())
    uf = PoemUnionFind(poem_ids)
    
    merges = 0
    for p1, p2 in tqdm(all_pairs, desc="Processing pairs"):
        clusters1 = poem_to_clusters[p1]
        clusters2 = poem_to_clusters[p2]
        
        intersection = len(set(clusters1) & set(clusters2))
        union = len(set(clusters1) | set(clusters2))
        
        if union > 0:
            jaccard = intersection / union
            if jaccard >= poem_threshold:
                if uf.union(p1, p2):
                    merges += 1
    
    print(f"  Performed {merges:,} merges")
    
    poem_clusters = uf.get_clusters()
    cluster_assignments = {}
    for cluster_id, poems in poem_clusters.items():
        for poem in poems:
            cluster_assignments[poem] = cluster_id
    
    n_clusters = len(poem_clusters)
    cluster_sizes = [len(poems) for poems in poem_clusters.values()]
    n_singletons = sum(1 for size in cluster_sizes if size == 1)
    
    n_cross_dataset_clusters = 0
    cross_dataset_cluster_ids = set()
    for cluster_id, poems in poem_clusters.items():
        datasets = set(poem_to_dataset.get(p) for p in poems)
        if len(datasets) > 1:
            n_cross_dataset_clusters += 1
            cross_dataset_cluster_ids.add(cluster_id)
    
    print(f"\n  Total poem clusters: {n_clusters:,}")
    print(f"  Cross-dataset clusters: {n_cross_dataset_clusters:,}")
    print(f"  Singleton poems: {n_singletons:,}")
    print(f"  Avg cluster size: {np.mean(cluster_sizes):.2f}")
    print(f"  Max cluster size: {max(cluster_sizes)}")
    
    df['poem_cluster_id'] = df['idoriginal_poem'].astype(str).map(cluster_assignments)
    df['is_cross_dataset_poem_cluster'] = df['poem_cluster_id'].isin(cross_dataset_cluster_ids)
    
    output_csv = os.path.join(results_folder, "poems_clustered_full.csv")
    df.to_csv(output_csv, index=False)
    print(f"\nFull results saved: {output_csv}")
    
    poem_summary = {
        'n_verses': len(df),
        'n_poems': len(poem_to_clusters),
        'n_datasets': len(set(poem_to_dataset.values())),
        'best_jaccard_threshold': poem_threshold,
        'n_poem_clusters': len(set(cluster_assignments.values())),
        'n_cross_dataset_clusters': n_cross_dataset_clusters,
        'n_poems_in_cross_dataset_clusters': sum(df['is_cross_dataset_poem_cluster'])
    }
    
    summary_csv = os.path.join(results_folder, 'poem_clustering_full_summary.csv')
    pd.DataFrame([poem_summary]).to_csv(summary_csv, index=False)
    print(f"Summary saved: {summary_csv}")
    
    timing_logger.end_stage()
    
    print("\n" + "="*80)
    print("FULL POEM CLUSTERING COMPLETE")
    print("="*80)
    
    return df, poem_clusters, cluster_assignments, poem_summary

def print_example_clusters(df, results_folder="full_orthographic_results"):
    print("\n" + "="*80)
    print("EXAMPLE CLUSTERS")
    print("="*80)
    
    print("\n" + "-"*80)
    print("5 EXAMPLE VERSE-LEVEL CLUSTERS (multi-member)")
    print("-"*80)
    
    cluster_info = df[df['cluster_id'] != -1].groupby('cluster_id').agg({
        'verse': 'count',
        'source_dataset': lambda x: list(x.unique())
    }).rename(columns={'verse': 'size'})
    
    multi_clusters = cluster_info[cluster_info['size'] > 1].sort_values('size', ascending=False)
    
    for idx, (cluster_id, row) in enumerate(multi_clusters.head(5).iterrows(), 1):
        print(f"\nVerse Cluster {idx} (ID: {cluster_id})")
        print(f"  Size: {row['size']} verses")
        print(f"  Datasets: {', '.join(row['source_dataset'])}")
        
        cluster_verses = df[df['cluster_id'] == cluster_id]
        print(f"  Example verses:")
        for i, (_, verse_row) in enumerate(cluster_verses.head(3).iterrows(), 1):
            verse_text = str(verse_row['verse'])[:80]
            print(f"    {i}. [{verse_row['source_dataset']}] {verse_text}...")
    
    print("\n" + "-"*80)
    print("5 EXAMPLE POEM-LEVEL CLUSTERS (multi-member)")
    print("-"*80)
    
    if 'poem_cluster_id' in df.columns:
        poem_cluster_info = df[df['poem_cluster_id'].notna()].groupby('poem_cluster_id').agg({
            'idoriginal_poem': lambda x: len(set(x)),
            'source_dataset': lambda x: list(set(x))
        }).rename(columns={'idoriginal_poem': 'n_poems'})
        
        multi_poem_clusters = poem_cluster_info[poem_cluster_info['n_poems'] > 1].sort_values('n_poems', ascending=False)
        
        for idx, (cluster_id, row) in enumerate(multi_poem_clusters.head(5).iterrows(), 1):
            print(f"\nPoem Cluster {idx} (ID: {cluster_id})")
            print(f"  Size: {row['n_poems']} poems")
            print(f"  Datasets: {', '.join(row['source_dataset'])}")
            
            cluster_poems = df[df['poem_cluster_id'] == cluster_id]['idoriginal_poem'].unique()
            print(f"  Poems in cluster:")
            for i, poem_id in enumerate(cluster_poems[:5], 1):
                poem_data = df[df['idoriginal_poem'] == poem_id]
                dataset = poem_data['source_dataset'].iloc[0]
                n_verses = len(poem_data)
                print(f"    {i}. Poem {poem_id} [{dataset}] - {n_verses} verses")
                
                first_verse = poem_data.iloc[0]['verse'][:80]
                print(f"       First verse: {first_verse}...")
            
            if len(cluster_poems) > 5:
                print(f"    ... and {len(cluster_poems) - 5} more poems")
    else:
        print("  Poem clustering not yet completed")

def main():
    results_folder = "full_orthographic_results"
    os.makedirs(results_folder, exist_ok=True)
    print(f"Results will be saved to: {results_folder}/\n")
    
    clustered_file = os.path.join(results_folder, "clustered_optimized.csv")
    metrics_file = os.path.join(results_folder, "clustering_metrics.csv")
    
    if os.path.exists(clustered_file) and os.path.exists(metrics_file):
        print("="*80)
        print("FOUND EXISTING VERSE CLUSTERING RESULTS")
        print("="*80)
        print(f"Loading from: {clustered_file}")
        
        timing_logger.start_stage("00_load_existing_results")
        
        df = pd.read_csv(clustered_file)
        metrics = pd.read_csv(metrics_file).iloc[0].to_dict()
        
        print(f"\nLoaded {len(df):,} verses")
        print(f"Verse clusters: {metrics['n_clusters']:,}")
        print(f"Multi-member clusters: {metrics['n_multi_clusters']:,}")
        print(f"Singletons: {metrics['n_singletons']:,}")
        
        verse_summary = {
            'n_verses': len(df),
            'best_shingle_size': int(metrics['best_shingle_size']),
            'best_threshold': float(metrics['best_threshold']),
            'n_clusters': int(metrics['n_clusters']),
            'n_multi_clusters': int(metrics['n_multi_clusters']),
            'n_singletons': int(metrics['n_singletons']),
            'max_cluster_size': int(metrics['max_cluster_size'])
        }
        
        timing_logger.end_stage()
        
        print("\nSkipping verse clustering - jumping to poem-level analysis")
        print("="*80)
    
    else:
        print("="*80)
        print("NO EXISTING RESULTS FOUND - RUNNING FULL PIPELINE")
        print("="*80)
        
        timing_logger.start_stage("00_data_loading")
        
        df = pd.read_csv("concatenated.csv")
        df = df[df['source_dataset'].isin(['rhoby', 'dbbe', 'phi', 'papyri'])]
        df = df[df['verse'].fillna('').astype(str).str.len() >= 20]
        df['verse'] = df['verse'].apply(preprocess_text)
        df = df.reset_index(drop=True)
        df = df[df['verse'].str.strip().str.lower() != 'nan']
        texts = df['verse'].fillna('').astype(str).tolist()
        
        print(f"Verses: {len(texts):,}")
        
        timing_logger.end_stage()
        
        timing_logger.start_stage("01_exact_duplicates")
        
        duplicate_groups = find_exact_duplicates_fast(texts)
        print(f"Found {len(duplicate_groups):,} exact duplicate groups")
        
        timing_logger.end_stage()
        
        best_shingle, best_threshold, grid_results = grid_search_parameters(
            texts, df, duplicate_groups,
            shingle_sizes=[2, 3, 4, 5],
            threshold_range=(0.3, 0.85, 7),
            n_sample=15000,
            results_folder=results_folder,
            max_workers=system_analyzer.get_optimal_workers('cpu_intensive')
        )
        
        timing_logger.start_stage("02_minhash_computation")
        
        print(f"\nComputing MinHashes with optimal shingle size={best_shingle}...")
        minhashes = compute_minhash_parallel(texts, n_gram_size=best_shingle, num_perm=128)
        
        timing_logger.end_stage()
        
        timing_logger.start_stage("03_verse_clustering")
        
        print(f"\nClustering with threshold={best_threshold:.3f}...")
        cluster_labels, metrics = cluster_with_lsh_forest(
            minhashes, duplicate_groups, best_threshold, top_k=100
        )
        
        timing_logger.end_stage()
        
        timing_logger.start_stage("04_save_verse_results")
        
        df['cluster_id'] = cluster_labels
        output_csv = os.path.join(results_folder, "clustered_optimized.csv")
        df.to_csv(output_csv, index=False)
        print(f"Clustered data saved: {output_csv}")
        
        timing_logger.end_stage()
        
        verse_summary = {
            'n_verses': len(df),
            'best_shingle_size': best_shingle,
            'best_threshold': best_threshold,
            'n_clusters': metrics['n_clusters'],
            'n_multi_clusters': metrics['n_multi_clusters'],
            'n_singletons': metrics['n_singletons'],
            'max_cluster_size': metrics['max_cluster_size']
        }
        
        metrics.update({
            'total_time_minutes': (time.time() - script_start_time) / 60,
            'best_shingle_size': best_shingle,
            'best_threshold': best_threshold
        })
        metrics_csv = os.path.join(results_folder, "clustering_metrics.csv")
        pd.DataFrame([metrics]).to_csv(metrics_csv, index=False)
        print(f"Metrics saved: {metrics_csv}")
        
        print(f"\n{'='*70}")
        print("VERSE-LEVEL CLUSTERING COMPLETE")
        print(f"{'='*70}")
        print(f"Optimal shingle size: {best_shingle}")
        print(f"Optimal threshold: {best_threshold:.3f}")
        print(f"Multi-member clusters: {metrics['n_multi_clusters']:,}")
        print(f"Singletons: {metrics['n_singletons']:,}")
        print(f"Max cluster size: {metrics['max_cluster_size']}")
        print(f"All results saved to: {results_folder}/")
        print(f"{'='*70}")
        
        gc.collect()
    
    print("\n" + "="*80)
    print("POEM-LEVEL ANALYSIS")
    print("="*80)
    
    selector = PoemThresholdSelector(
        sample_size=15000, 
        random_seed=42, 
        min_shared_verses=1
    )
    
    poem_threshold, poem_grid_results, poem_similarities_df, poem_to_clusters = selector.run_threshold_analysis(df)
    
    gc.collect()
    
    df_final, poem_clusters, cluster_assignments, poem_summary = cluster_all_poems_at_threshold(
        df, 
        poem_threshold,
        poem_to_clusters,
        results_folder
    )
    
    print_example_clusters(df_final, results_folder)
    
    return verse_summary, poem_threshold

if __name__ == "__main__":
    verse_summary, poem_threshold = main()
    
    resource_monitor.stop()
    total_time = time.time() - script_start_time
    
    system_info = get_system_info()
    resource_stats = resource_monitor.get_stats()
    timing_summary = timing_logger.get_summary()
    
    report_lines = []
    report_lines.append("="*80)
    report_lines.append("COMPREHENSIVE ORTHOGRAPHIC CLUSTERING PERFORMANCE REPORT")
    report_lines.append("="*80)
    report_lines.append("")
    
    report_lines.append("SYSTEM INFORMATION")
    report_lines.append("-" * 80)
    report_lines.append(f"Hostname:            {system_info['hostname']}")
    report_lines.append(f"Platform:            {system_info['platform']}")
    report_lines.append(f"Python Version:      {system_info['python_version']}")
    report_lines.append(f"Processor:           {system_info['processor']}")
    report_lines.append(f"CPU Cores (Physical):{system_info['cpu_count_physical']}")
    report_lines.append(f"CPU Cores (Logical): {system_info['cpu_count_logical']}")
    report_lines.append(f"Total RAM:           {system_info['total_ram_gb']:.2f} GB")
    report_lines.append(f"Available RAM:       {system_info['available_ram_gb']:.2f} GB")
    report_lines.append(f"GPU Available:       {'Yes' if system_info['has_gpu'] else 'No'}")
    if system_info['has_gpu']:
        report_lines.append(f"GPU Count:           {system_info['gpu_count']}")
        report_lines.append(f"GPU Memory:          {system_info['gpu_memory_gb']:.2f} GB")
    report_lines.append(f"Timestamp:           {system_info['timestamp']}")
    report_lines.append("")
    
    report_lines.append("PEAK RESOURCE USAGE")
    report_lines.append("-" * 80)
    report_lines.append(f"Peak RAM Usage:      {resource_stats['peak_ram_gb']:.2f} GB")
    report_lines.append(f"Average RAM Usage:   {resource_stats['avg_ram_gb']:.2f} GB")
    report_lines.append(f"Peak CPU Percent:    {resource_stats['peak_cpu_percent']:.1f}%")
    report_lines.append(f"Average CPU Percent: {resource_stats['avg_cpu_percent']:.1f}%")
    report_lines.append("")
    
    report_lines.append("TIMING BREAKDOWN (BY STAGE)")
    report_lines.append("-" * 80)
    
    total_measured = sum(timing_summary.values())
    for stage_name, duration in timing_summary.items():
        pct = (duration / total_measured * 100) if total_measured > 0 else 0
        report_lines.append(f"{stage_name:.<50} {duration:>8.1f}s ({pct:>5.1f}%)")
    
    report_lines.append(f"{'TOTAL MEASURED TIME':.<50} {total_measured:>8.1f}s")
    report_lines.append(f"{'TOTAL WALL CLOCK TIME':.<50} {total_time:>8.1f}s ({total_time/60:>6.1f} min)")
    report_lines.append("")
    
    report_lines.append("DETAILED TIMING ANALYSIS")
    report_lines.append("-" * 80)
    
    verse_stages = [k for k in timing_summary.keys() if k.startswith(('00_', '01_', '02_', '03_', '04_'))]
    verse_time = sum(timing_summary.get(k, 0) for k in verse_stages)
    
    poem_stages = [k for k in timing_summary.keys() if k.startswith(('05_',)) or 'poem' in k.lower()]
    poem_time = sum(timing_summary.get(k, 0) for k in poem_stages)
    
    report_lines.append(f"Verse-Level Clustering:  {verse_time:>8.1f}s ({verse_time/60:>6.1f} min)")
    report_lines.append(f"Poem-Level Clustering:   {poem_time:>8.1f}s ({poem_time/60:>6.1f} min)")
    report_lines.append("")
    
    report_lines.append("CLUSTERING RESULTS SUMMARY")
    report_lines.append("-" * 80)
    report_lines.append("Verse-Level:")
    report_lines.append(f"  Total verses:             {verse_summary['n_verses']:,}")
    report_lines.append(f"  Total clusters:           {verse_summary['n_clusters']:,}")
    report_lines.append(f"  Multi-member clusters:    {verse_summary['n_multi_clusters']:,}")
    report_lines.append(f"  Singletons:               {verse_summary['n_singletons']:,}")
    report_lines.append(f"  Best shingle size:        {verse_summary['best_shingle_size']}")
    report_lines.append(f"  Best threshold:           {verse_summary['best_threshold']:.3f}")
    report_lines.append("")
    report_lines.append("Poem-Level:")
    report_lines.append(f"  Selected threshold:       {poem_threshold:.3f}")
    report_lines.append("")
    
    report_lines.append("PERFORMANCE METRICS")
    report_lines.append("-" * 80)
    if verse_time > 0:
        report_lines.append(f"Verse clustering throughput:  {verse_summary['n_verses'] / verse_time:.1f} verses/sec")
    report_lines.append(f"Overall processing rate:      {verse_summary['n_verses'] / total_time:.1f} verses/sec")
    report_lines.append("")
    
    report_lines.append("="*80)
    report_lines.append("END OF REPORT")
    report_lines.append("="*80)
    
    report_path = Path('full_orthographic_results') / 'clustering_performance_report.txt'
    with open(report_path, 'w') as f:
        f.write('\n'.join(report_lines))
    
    print(f"\nPerformance report saved to: {report_path}")
    print("="*80)
    print("Completed.")
    print("="*80)

In [7]:
import pickle
from pathlib import Path

CHECKPOINT_DIR = Path("/scratch/gent/vo/000/gvo00042/vsc48660/full_semantic_clustering_checkpoints_tmp2")  # adjust path if needed

with open(CHECKPOINT_DIR / 'timing_metadata.pkl', 'rb') as f:
    checkpoint_timing = pickle.load(f)

print(checkpoint_timing)


FileNotFoundError: [Errno 2] No such file or directory: '/scratch/gent/vo/000/gvo00042/vsc48660/full_semantic_clustering_checkpoints_tmp2/timing_metadata.pkl'