In [214]:
import pymupdf
import pymupdf4llm
import pathlib
import pandas as pd
import numpy as np
from pprint import pprint
import re
from collections import Counter

# pip install llama_index

# md_text = pymupdf4llm.to_markdown(fileinquestion)
# pathlib.Path("output.md").write_bytes(md_text.encode())
# llama_reader = pymupdf4llm.LlamaMarkdownReader()
# print(llama_reader())
# llama_docs = llama_reader.load_data(fileinquestion)

#tesseract ocr
# english
# russian
# chinese (new)
# mandarin (traditional)
# japanese
# Hindi
# Arabic
# French
# Hebrew
# German
# Korean
# Italian
# Polish
# Portuguese
# Spanish 
# Indonesian
#turkish
# Urdu

In [382]:

def extract_blocks_and_distributions(html_string): # Match each full <p> block
    """
    Parse absolutely-positioned ``<p>`` blocks out of an HTML string.

    Only ``<p>`` elements whose ``style`` attribute is exactly
    ``top:..pt;left:..pt;line-height:..pt`` are matched. For every block the
    first ``font-size`` / ``font-family`` declaration found in its inner HTML
    is used, and ``<b>`` / ``<i>`` tags anywhere inside mark it bold / italic.

    Parameters:
        html_string (str): HTML text to scan.

    Returns:
        tuple:
            - list[dict]: One record per block with keys 'top', 'left',
              'line_height', 'font_size', 'font_family', 'bold', 'italic',
              'text', 'word_count', 'word_density'.
            - Counter: font size -> number of blocks using it.
            - Counter: line height -> number of blocks using it.
    """
    block_pattern = re.findall(
        r'<p style="top:([\d.]+)pt;left:([\d.]+)pt;line-height:([\d.]+)pt">(.*?)</p>',
        html_string,
        flags=re.DOTALL
    )
    results, font_sizes, line_heights = [], [], []

    for top, left, line_height, inner_html in block_pattern:
        line_height = float(line_height)

        font_match = re.search(r'font-size:([\d.]+)pt', inner_html)
        font_size = float(font_match.group(1)) if font_match else None

        font_family_match = re.search(r'font-family:([^;"]+)', inner_html)
        font_family = font_family_match.group(1).strip() if font_family_match else None

        is_bold = bool(re.search(r'<b>', inner_html))
        is_italic = bool(re.search(r'<i>', inner_html))

        # Strip every tag to get the visible text of the block.
        clean_text = re.sub(r'<[^>]+>', '', inner_html).strip()
        word_count = len(clean_text.split())
        word_density = word_count / line_height if line_height > 0 else 0

        results.append({
            "top": float(top),
            "left": float(left),
            "line_height": line_height,
            "font_size": font_size,
            "font_family": font_family,
            "bold": is_bold,
            "italic": is_italic,
            "text": clean_text,
            "word_count": word_count,
            "word_density": word_density
        })

        # Record each block exactly once in the distributions (the line height
        # used to be appended twice per block, doubling every count).
        line_heights.append(line_height)
        if font_size is not None:
            font_sizes.append(font_size)
        else:
            print("⚠️ Font size not found in:", inner_html[:60])

    return results, Counter(font_sizes), Counter(line_heights)

# Regular expressions for the list / section markers a line may start with.
# Each one is anchored at the start of the string it is matched against.
heading_patterns = [
    # --- numeric markers ---
    r'^\d+\.',                # "1."
    r'^\d+\.\d+(\.\d+)*',     # "1.1", "2.3.4"
    r'^\d+\)',                # "1)"
    # --- alphabetic / roman markers ---
    r'^[A-Z]\.',              # "A."
    r'^[a-z]\)',              # "a)"
    r'^[ivxlcdm]+\)',         # "i)", "ii)", "iv)"
    r'^[IVXLCDM]+\)',         # "I)", "II)"
    r'^[a-z]\.',              # "a."
    r'^[IVXLCDM]+\.',         # "I."
    # --- bullets ---
    r'^•\s*',                 # "•" bullet
    r'^-\s*',                 # "-" bullet
    r'^\*\s*'                 # "*" bullet
]

# Pre-compiled once, in the same order as heading_patterns.
compiled_patterns = list(map(re.compile, heading_patterns))

def extract_crucial_pattern_lines(dflist):
    """
    Collect every row whose first space-separated token starts with one of the
    markers in ``compiled_patterns``.

    Parameters:
        dflist (list of pd.DataFrame): One DataFrame per page. Pages that are
            empty or have no 'text' column are skipped.

    Returns:
        pd.DataFrame: Copies of the matching rows with an extra 'page' column
        holding the position of the page in ``dflist``; an empty DataFrame
        when nothing matches.
    """
    hits = []

    for page_num, page_df in enumerate(dflist):
        if page_df.empty or 'text' not in page_df.columns:
            continue

        for _, line in page_df.iterrows():
            leading_token = line['text'].strip().split(' ')[0]
            # A row is recorded once, however many patterns it satisfies.
            if any(regex.match(leading_token) for regex in compiled_patterns):
                tagged = line.copy()
                tagged['page'] = page_num
                hits.append(tagged)

    if not hits:
        return pd.DataFrame()  # empty if nothing found
    return pd.DataFrame(hits).reset_index(drop=True)

def find_alternate_page_repeats_splitold(dflist, mid):
    """
    Legacy variant of ``find_alternate_page_repeats_split``.

    Looks for lines of page ``mid`` (when ``mid`` is even) and of page
    ``mid - 1`` (when that index is odd) that also occur, with the same
    stripped text and a 'top' within 1.0, on the pages two positions before
    and/or after them.

    NOTE(review): when ``mid`` is odd neither page is inspected and two empty
    DataFrames are returned; this legacy behaviour is kept as is.

    Parameters:
        dflist (list of pd.DataFrame): One DataFrame per page with 'text' and
            'top' columns.
        mid (int): Index of the page to start from.

    Returns:
        tuple(pd.DataFrame, pd.DataFrame): (even_matches, odd_matches); each is
        empty when the page has no neighbour two pages away.
    """
    def get_matches(df_base, *neighbours):
        # Accepts one or two neighbour pages. The previous fixed
        # (df_base, df1, df2) signature raised TypeError whenever only one
        # neighbour existed.
        matched = []
        for _, row in df_base.iterrows():
            text = row['text'].strip()
            top = row['top']

            def match(df):
                return any(
                    (abs(top - r['top']) <= 1.0) and (r['text'].strip() == text)
                    for _, r in df.iterrows()
                )

            if all(match(df) for df in neighbours):
                matched.append(row)

        return pd.DataFrame(matched)

    def repeats_for(page_idx):
        before = dflist[page_idx - 2] if page_idx - 2 >= 0 else None
        after = dflist[page_idx + 2] if page_idx + 2 < len(dflist) else None
        neighbours = [df for df in (before, after) if df is not None]
        if not neighbours:
            return pd.DataFrame()
        return get_matches(dflist[page_idx], *neighbours)

    even_matches, odd_matches = pd.DataFrame(), pd.DataFrame()

    # Mid page even (bounds-checked so an empty or short dflist does not raise)
    if mid % 2 == 0 and 0 <= mid < len(dflist):
        even_matches = repeats_for(mid)

    # Mid - 1 page odd
    odd_page_idx = mid - 1
    if odd_page_idx >= 0 and odd_page_idx % 2 == 1 and odd_page_idx < len(dflist):
        odd_matches = repeats_for(odd_page_idx)

    return even_matches, odd_matches

def find_alternate_page_repeats_split(dflist, mid):
    """
    Find lines that repeat on alternate pages (e.g. running headers/footers).

    One even-indexed and one odd-indexed page next to ``mid`` are inspected:
    ``mid`` and ``mid - 1`` respectively when ``mid`` is even, ``mid - 1`` and
    ``mid`` when ``mid`` is odd. A line of the inspected page counts as a
    repeat when every available non-empty neighbour two pages before/after
    contains a line with the same stripped text and a 'top' within 1.0.

    Previously an odd ``mid`` skipped both pages, so nothing was ever detected
    for documents whose ``page_count // 2`` is odd. Results for even ``mid``
    are unchanged.

    Parameters:
        dflist (list of pd.DataFrame): One DataFrame per page with 'text' and
            'top' columns (pages may be empty).
        mid (int): Index of the page around which to look.

    Returns:
        tuple(pd.DataFrame, pd.DataFrame): (even_matches, odd_matches), each
        with a fresh 0..n-1 index; empty when the page index is out of range
        or no usable neighbour exists.
    """
    def is_valid_df(df):
        return df is not None and not df.empty

    def get_matches(df_base, *others):
        matched = []
        for _, row in df_base.iterrows():
            text = row['text'].strip()
            top = row['top']

            def match(df):
                return any(
                    (abs(top - r['top']) <= 1.0) and (r['text'].strip() == text)
                    for _, r in df.iterrows()
                )

            if all(match(df) for df in others):
                matched.append(row)

        return pd.DataFrame(matched)

    def repeats_around(page_idx):
        # Out-of-range indices (e.g. mid - 1 == -1) simply yield no matches.
        if not (0 <= page_idx < len(dflist)):
            return pd.DataFrame()
        before = dflist[page_idx - 2] if page_idx - 2 >= 0 else None
        after = dflist[page_idx + 2] if page_idx + 2 < len(dflist) else None
        neighbours = [df for df in (before, after) if is_valid_df(df)]
        if not neighbours:
            return pd.DataFrame()
        return get_matches(dflist[page_idx], *neighbours)

    # Pick the even and the odd page adjacent to mid, whatever mid's parity.
    if mid % 2 == 0:
        even_idx, odd_idx = mid, mid - 1
    else:
        even_idx, odd_idx = mid - 1, mid

    even_matches = repeats_around(even_idx)
    odd_matches = repeats_around(odd_idx)

    return even_matches.reset_index(drop=True), odd_matches.reset_index(drop=True)



def clean_pages_of_even_odd_repeats(dflist, even_df, odd_df, tolerance=1.0):
    """
    Cleans even-indexed pages using even_df and odd-indexed pages using odd_df.
    If even_df == odd_df, treats all pages the same.

    A row is removed when its stripped text equals a repeat's text and its
    'top' lies within ``tolerance`` of that repeat's 'top' (the repeat's 'top'
    is rounded to one decimal first).

    Parameters:
        dflist (list of pd.DataFrame): One DataFrame per page with 'text' and
            'top' columns; empty pages are passed through unchanged.
        even_df (pd.DataFrame): Repeated lines to strip from even-indexed pages.
        odd_df (pd.DataFrame): Repeated lines to strip from odd-indexed pages.
        tolerance (float): Maximum difference in 'top' for a positional match.

    Returns:
        list of pd.DataFrame: New DataFrames (inputs are not modified), each
        with a fresh 0..n-1 index.
    """
    def to_clean_set(df):
        if df is None or df.empty:
            return set()
        return set((row['text'].strip(), round(row['top'], 1)) for _, row in df.iterrows())

    even_set = to_clean_set(even_df)
    odd_set = to_clean_set(odd_df)
    same_repeats = even_set == odd_set

    cleaned_pages = []

    for idx, df in enumerate(dflist):
        compare_set = even_set if same_repeats or idx % 2 == 0 else odd_set

        # Nothing to remove: skip the row-wise pass entirely. Doing this
        # explicitly avoids DataFrame.apply's empty-frame fallback, which does
        # not return a boolean mask and can drop the page's columns.
        if df.empty or not compare_set:
            cleaned_pages.append(df.reset_index(drop=True))
            continue

        keep = [
            not any(
                (abs(top - rep_top) <= tolerance) and (text.strip() == rep_text)
                for rep_text, rep_top in compare_set
            )
            for top, text in zip(df['top'], df['text'])
        ]
        mask = pd.Series(keep, index=df.index, dtype=bool)
        cleaned_pages.append(df[mask].reset_index(drop=True))
    return cleaned_pages
def merge_paragraphs_by_font(
    df,
    font_min=8.0,
    font_max=14.0,
    font_tolerance=1.0,
    line_gap=2.0
):
    """
    Fold runs of adjacent lines into paragraphs.

    Rows whose font size lies outside [font_min, font_max] are discarded first.
    Each remaining row is then compared with the row directly before it: it
    joins the running paragraph when the two font sizes differ by at most
    ``font_tolerance`` and the difference of their 'top' values is at most the
    previous row's 'line_height' plus ``line_gap``; otherwise it opens a new
    paragraph. Rows are processed in the order given (no sorting).

    Parameters:
        df (pd.DataFrame): Lines with 'top', 'font_size', 'line_height', 'text' columns.
        font_min (float): Smallest font size kept.
        font_max (float): Largest font size kept.
        font_tolerance (float): Allowed font-size difference between neighbouring lines.
        line_gap (float): Extra vertical slack added to the previous line height.

    Returns:
        List[dict]: One dict per paragraph with 'top' and 'font_size' of its
        first line, the space-joined 'text', and 'line_count'.
    """
    if df.empty:
        return []

    # Keep only lines inside the accepted font range.
    within_range = (df['font_size'] >= font_min) & (df['font_size'] <= font_max)
    lines = df[within_range].reset_index(drop=True)
    if lines.empty:
        return []

    active = {
        'top': lines.loc[0, 'top'],
        'font_size': lines.loc[0, 'font_size'],
        'text': lines.loc[0, 'text'],
        'line_count': 1
    }
    finished = []

    for pos in range(1, len(lines)):
        above = lines.loc[pos - 1]
        here = lines.loc[pos]

        fonts_agree = abs(here['font_size'] - above['font_size']) <= font_tolerance
        close_enough = abs(here['top'] - above['top']) <= (above['line_height'] + line_gap)

        if not (fonts_agree and close_enough):
            # Close the running paragraph and start a new one at this line.
            finished.append(active)
            active = {
                'top': here['top'],
                'font_size': here['font_size'],
                'text': here['text'],
                'line_count': 1
            }
            continue

        active['text'] += ' ' + here['text']
        active['line_count'] += 1

    finished.append(active)
    return finished


def get_top_fonts(font_counter, top_n=3):
    """Return the ``top_n`` most frequently used font sizes, most frequent first."""
    ranked_sizes = []
    for size, _occurrences in font_counter.most_common(top_n):
        ranked_sizes.append(size)
    return ranked_sizes
def create_font_level_map(font_counter):
    """
    Map the largest font sizes to H1, H2, H3 (in that order).

    Sizes beyond the third largest are not included in the mapping. When fewer
    than three distinct sizes exist, only as many levels as there are sizes are
    assigned (previously this raised IndexError because the keys were indexed
    before the length check on the values was evaluated).

    Parameters:
        font_counter (Counter | dict): font size -> count.

    Returns:
        dict: {font_size: "H1" | "H2" | "H3"} with at most three entries.
    """
    sorted_fonts = sorted(font_counter.keys(), reverse=True)
    # zip stops at the shorter sequence, so 0, 1 or 2 sizes are handled naturally.
    return dict(zip(sorted_fonts, ("H1", "H2", "H3")))
def classify_font_size(size, font_to_level):
    """Look up the level mapped to ``size``; sizes without a mapping are "Body"."""
    if size in font_to_level:
        return font_to_level[size]
    return "Body"
def label_font_levels(cleaned_dflist, font_to_level):
    """
    Add a 'level' column to every page DataFrame that has a 'font_size' column.

    The DataFrames in ``cleaned_dflist`` are modified in place; the same list
    object is returned. Pages without a 'font_size' column are left untouched.
    """
    for page_df in cleaned_dflist:
        if 'font_size' in page_df.columns:
            page_df['level'] = page_df['font_size'].apply(
                lambda size: classify_font_size(size, font_to_level)
            )
    return cleaned_dflist

def extract_heading_summary(cleaned_dflist, levels=('H1', 'H2')):
    """
    Gather the rows whose 'level' is one of ``levels`` from all pages.

    Pages without a 'level' column are ignored. The result has the columns
    'page_num' (position of the page in ``cleaned_dflist``), 'level' and
    'text'; an empty DataFrame is returned when no page has a 'level' column.
    """
    per_page = []

    for page_num, page_df in enumerate(cleaned_dflist):
        if 'level' in page_df.columns:
            wanted = page_df.loc[page_df['level'].isin(levels)].copy()
            wanted['page_num'] = page_num
            per_page.append(wanted[['page_num', 'level', 'text']])

    if not per_page:
        return pd.DataFrame()
    return pd.concat(per_page, ignore_index=True)

def filter_gibberish_rows(df, text_column='text', min_alpha_ratio=0.3, min_length=3):
    """
    Removes rows from df where the text is mostly non-alphabetic (e.g., --------, ...., ====),
    or is too short to be meaningful.

    Frames that are empty or lack ``text_column`` (for example a page with no
    text blocks) are returned as-is with a fresh index instead of raising.
    Non-string cell values (e.g. NaN) are treated as gibberish and dropped.

    Parameters:
        df (pd.DataFrame): The input DataFrame with a text column.
        text_column (str): Column name that contains the text.
        min_alpha_ratio (float): Minimum ratio of alphabetic chars to keep the row.
        min_length (int): Minimum total length of string to keep it.

    Returns:
        pd.DataFrame: Cleaned DataFrame with gibberish lines removed.
    """
    if df.empty or text_column not in df.columns:
        return df.reset_index(drop=True)

    def is_gibberish(text):
        if not isinstance(text, str):
            return True
        # An empty string is never meaningful; checking it here also prevents
        # a division by zero below when min_length <= 0.
        if not text or len(text.strip()) < min_length:
            return True
        alpha_count = sum(c.isalpha() for c in text)
        return (alpha_count / len(text)) < min_alpha_ratio

    # Explicit boolean mask: an all-object or empty mask would otherwise be
    # interpreted by pandas as a list of column labels.
    keep = [not is_gibberish(t) for t in df[text_column]]
    mask = pd.Series(keep, index=df.index, dtype=bool)
    return df[mask].reset_index(drop=True)

def extract_table_with_titleold(page, max_title_distance=50): #################Old
    """
    Legacy helper: pull the first table off a page and guess its title.

    The title guess is the text of a block that ends above the table's top
    edge, no further than ``max_title_distance`` from it; among several such
    blocks the one whose own top edge is lowest on the page wins.

    Parameters:
        page: Page object providing ``find_tables()`` and ``get_text("dict")``
            (a PyMuPDF page in this notebook).
        max_title_distance (float): Max vertical distance (in pt) to search above the table for a title.

    Returns:
        tuple:
            - pd.DataFrame: The extracted table (if found), else None.
            - str: The possible table title (if found), else None.
    """
    finder = page.find_tables()
    print(f"{len(finder.tables)} table(s) found.")

    if not finder.tables:
        return None, None

    first_table = finder[0]
    rows = first_table.extract()
    table_df = pd.DataFrame(rows)

    print("First table extracted:")
    pprint(rows)

    # Scan text blocks that finish above the table's top edge.
    table_top = first_table.bbox[1]
    candidates = []

    for block in page.get_text("dict").get("blocks", []):
        if "lines" not in block:
            continue
        block_bottom = block["bbox"][3]
        if not (block_bottom < table_top):
            continue
        if not (abs(block_bottom - table_top) <= max_title_distance):
            continue
        pieces = [span["text"] for line in block["lines"] for span in line["spans"]]
        joined = " ".join(pieces).strip()
        if joined:
            candidates.append((block["bbox"][1], joined))

    if not candidates:
        print("No clear title found above the table.")
        return table_df, None

    # The block starting lowest on the page is taken as the closest one.
    title = max(candidates, key=lambda entry: entry[0])[1]
    print("Possible table title:", title)
    return table_df, title

def extract_table_with_title(page, max_distance=50):
    """
    Extracts the first table and possible title text (above or below),
    along with formatting metadata (font size, bold, italic, underline).

    Candidate spans are taken from text blocks that end at most
    ``max_distance`` above the table's top edge, or start at most
    ``max_distance`` below its bottom edge. Candidates are ranked bold first,
    then larger font size, then smaller distance to the table edge; a
    candidate above the table is preferred over one below.

    Parameters:
        page: Page object providing ``find_tables()`` and ``get_text("dict")``
            (a PyMuPDF page in this notebook).
        max_distance (float): Max vertical distance in points to search above or below the table.

    Returns:
        tuple:
            - pd.DataFrame: The extracted table, or None when the page has no table.
            - dict: Info about possible title (or footer), with formatting
              (keys 'y', 'text', 'font_size', 'bold', 'italic', 'underline'),
              or None when nothing suitable is found.
    """
    tables = page.find_tables()
    print(f"{len(tables.tables)} table(s) found.")

    if not tables.tables:
        return None, None

    table = tables[0]
    table_data = table.extract()
    table_df = pd.DataFrame(table_data)
    print("First table extracted:")
    pprint(table_data)

    bbox = table.bbox  # (x0, y0, x1, y1)
    text_dict = page.get_text("dict")
    blocks = text_dict.get("blocks", [])

    def collect_nearby_text(blocks, ref_y, direction="above"):
        nearby = []
        for block in blocks:
            if "lines" not in block:
                continue
            # Compare the block edge that faces the table: its bottom when
            # looking above the table, its top when looking below.
            block_y = block["bbox"][3] if direction == "above" else block["bbox"][1]
            is_above = direction == "above" and block_y < ref_y and (ref_y - block_y) <= max_distance
            is_below = direction == "below" and block_y > ref_y and (block_y - ref_y) <= max_distance
            if is_above or is_below:
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if text:
                            nearby.append({
                                "y": block_y,
                                "text": text,
                                "font_size": span.get("size"),
                                "bold": "bold" in span.get("font", "").lower(),
                                "italic": "italic" in span.get("font", "").lower(),
                                # NOTE(review): in PyMuPDF's documented span flags, bit value 4
                                # appears to mean "serifed", not underline — confirm before
                                # relying on this field.
                                "underline": span.get("flags", 0) & 4 != 0
                            })
        return nearby

    def rank_candidates(candidates, ref_y):
        # bold > font size > closest to the table edge. Distance is measured
        # from ref_y; ranking on the absolute page coordinate used to favour
        # the *farthest* block when searching above the table.
        if not candidates:
            return None
        return max(
            candidates,
            key=lambda x: (x["bold"], x["font_size"] or 0, -abs(x["y"] - ref_y))
        )

    above_texts = collect_nearby_text(blocks, ref_y=bbox[1], direction="above")
    below_texts = collect_nearby_text(blocks, ref_y=bbox[3], direction="below")

    title_above = rank_candidates(above_texts, bbox[1])
    title_below = rank_candidates(below_texts, bbox[3])

    # Choose the better one by some priority (e.g. favor above)
    chosen_title = title_above or title_below

    if chosen_title:
        print("Possible table label:")
        pprint(chosen_title)
    else:
        print("No meaningful title/label found near the table.")

    return table_df, chosen_title

def merge_consecutive_same_font_and_leftoldnew(dflist, target_fonts):
    """
    Merges lines within each page where:
    - font_size ∈ target_fonts
    - the lines share the same 'left' and the same font_size
    Adds a 'parabool' column to mark whether a line is a merged paragraph.

    For each distinct 'left' value the rows with that value (and a target
    font) are walked in index order; a run of rows with equal font_size is
    folded into the first row of the run. Rows with a different 'left' lying
    between them do not interrupt a run.

    The input DataFrames are never modified: every page is copied first
    (previously empty pages and pages lacking 'left'/'font_size' had the
    'parabool' column written into the caller's DataFrame).

    Parameters:
        dflist (list of pd.DataFrame): Each DataFrame is a page.
        target_fonts (list): List of font sizes to consider for paragraph merging.

    Returns:
        list of pd.DataFrame: New DataFrames with merged paragraphs and 'parabool' flag.
    """
    updated_dflist = []

    for df in dflist:
        skip_merge = df.empty or 'left' not in df.columns or 'font_size' not in df.columns

        df = df.copy()
        df['parabool'] = False
        if skip_merge:
            updated_dflist.append(df)
            continue

        drop_indices = set()

        for left_value in sorted(df['left'].unique()):
            # Get indices where left matches
            group_df = df[(df['left'] == left_value) & (df['font_size'].isin(target_fonts))]
            indices = group_df.index.tolist()
            i = 0

            while i < len(indices):
                current_idx = indices[i]
                current_font = df.loc[current_idx, 'font_size']

                merged = False
                j = i + 1

                while j < len(indices):
                    next_idx = indices[j]
                    if df.loc[next_idx, 'font_size'] != current_font:
                        break
                    # Merge text into the first row of the run
                    df.at[current_idx, 'text'] += ' ' + df.loc[next_idx, 'text']
                    drop_indices.add(next_idx)
                    merged = True
                    j += 1

                df.at[current_idx, 'parabool'] = merged
                i = j  # skip merged rows

        # Drop merged lines
        df = df.drop(index=list(drop_indices)).reset_index(drop=True)
        updated_dflist.append(df)

    return updated_dflist
def merge_consecutive_same_font_and_left(dflist, target_fonts):
    """
    Merges lines within each page where:
    - font_size ∈ target_fonts
    - the lines share the same 'left' and the same font_size
    - the line that starts a paragraph is not bold, and the lines appended to
      it are neither bold nor italic
    Adds a 'parabool' column to mark whether a line is a merged paragraph.

    For each distinct 'left' value the rows with that value (and a target
    font) are walked in index order; rows with a different 'left' lying
    between them do not interrupt a run.

    NOTE(review): an italic row may start a paragraph but is never appended to
    one — the italic check was only relaxed for the starting row. Confirm
    whether that asymmetry is intended.

    The input DataFrames are never modified: every page is copied first
    (previously empty pages and pages lacking 'left'/'font_size' had the
    'parabool' column written into the caller's DataFrame).

    Parameters:
        dflist (list of pd.DataFrame): Each DataFrame is a page; pages that are
            merged need 'left', 'font_size', 'text', 'bold' and 'italic' columns.
        target_fonts (list): List of font sizes to consider for paragraph merging.

    Returns:
        list of pd.DataFrame: New DataFrames with merged paragraphs and 'parabool' flag.
    """
    updated_dflist = []

    for df in dflist:
        skip_merge = df.empty or 'left' not in df.columns or 'font_size' not in df.columns

        df = df.copy()
        df['parabool'] = False
        if skip_merge:
            updated_dflist.append(df)
            continue

        drop_indices = set()

        for left_value in sorted(df['left'].unique()):
            group_df = df[(df['left'] == left_value) & (df['font_size'].isin(target_fonts))]
            indices = group_df.index.tolist()
            i = 0

            while i < len(indices):
                current_idx = indices[i]
                current_font = df.loc[current_idx, 'font_size']

                # Bold rows never start a paragraph (italics are ignored for now)
                if df.loc[current_idx, 'bold']:
                    df.at[current_idx, 'parabool'] = False
                    i += 1
                    continue

                j = i + 1
                merged = False

                while j < len(indices):
                    next_idx = indices[j]
                    next_font = df.loc[next_idx, 'font_size']

                    # Stop if font differs or next is bold/italic
                    if next_font != current_font or df.loc[next_idx, 'bold'] or df.loc[next_idx, 'italic']:
                        break

                    # Merge
                    df.at[current_idx, 'text'] += ' ' + df.loc[next_idx, 'text']
                    drop_indices.add(next_idx)
                    merged = True
                    j += 1

                df.at[current_idx, 'parabool'] = merged
                i = j

        # Drop merged rows
        df = df.drop(index=list(drop_indices)).reset_index(drop=True)
        updated_dflist.append(df)

    return updated_dflist


def generate_font_stats(dflist):
    """
    Summarise font usage per page.

    For every page that is non-empty and has a 'font_size' column, rows are
    grouped by ('font_size', 'font_family') and one record is produced per
    group: the page position, the group keys, the row count, the summed
    'word_count' and the mean 'word_density' rounded to two decimals (each is
    0 when its column is absent).

    Returns:
        pd.DataFrame: One row per (page, font size, font family) group with
        columns 'page_number', 'font_size', 'font_family', 'count',
        'total_words', 'avg_word_density'.
    """
    records = []

    for page_number, page_df in enumerate(dflist):
        if page_df.empty or 'font_size' not in page_df.columns:
            continue

        for (font_size, font_family), rows in page_df.groupby(['font_size', 'font_family']):
            if 'word_count' in rows.columns:
                total_words = rows['word_count'].sum()
            else:
                total_words = 0

            if 'word_density' in rows.columns:
                avg_word_density = rows['word_density'].mean()
            else:
                avg_word_density = 0

            records.append({
                'page_number': page_number,
                'font_size': font_size,
                'font_family': font_family,
                'count': len(rows),
                'total_words': total_words,
                'avg_word_density': round(avg_word_density, 2)
            })

    return pd.DataFrame(records)

def get_global_font_counter(font_stats_df):
    """
    Build a document-wide font-size Counter from per-page statistics.

    The 'count' column of ``font_stats_df`` is summed per 'font_size' and the
    totals are returned as a Counter keyed by font size.
    """
    totals_by_size = font_stats_df.groupby('font_size')['count'].sum()
    size_to_total = dict(totals_by_size)
    return Counter(size_to_total)
def recalculate_word_stats(dflist):
    """
    Refresh the per-row word statistics of every page.

    'word_count' becomes the number of whitespace-separated tokens in the
    row's text, and 'word_density' becomes word_count / font_size (0 when the
    font size is falsy). Pages lacking a 'text' or 'font_size' column are
    passed through as the same object; all other pages are copied before the
    columns are written.

    Parameters:
        dflist (list of pd.DataFrame): Page DataFrames, ideally with 'text'
                                       and 'font_size' columns.

    Returns:
        list of pd.DataFrame: Pages in the original order with 'word_count'
                              and 'word_density' recalculated where possible.
    """
    def count_words(value):
        return len(str(value).split())

    def words_per_font_unit(row):
        if row['font_size']:
            return row['word_count'] / row['font_size']
        return 0

    refreshed = []

    for page_df in dflist:
        has_required = 'text' in page_df.columns and 'font_size' in page_df.columns
        if not has_required:
            refreshed.append(page_df)  # nothing to recompute on this page
            continue

        page_copy = page_df.copy()
        page_copy['word_count'] = page_copy['text'].apply(count_words)
        page_copy['word_density'] = page_copy.apply(words_per_font_unit, axis=1)
        refreshed.append(page_copy)

    return refreshed



def get_rare_large_fonts(font_counter, method='rms', freq_filter='average'):
    """
    Pick out font sizes that are both bigger than the body font and rarely used.

    The body font is the count-weighted RMS of all sizes (``method='rms'``) or
    their count-weighted mean (any other ``method`` value). A size qualifies
    when it is strictly larger than the body font and its count is strictly
    below the frequency threshold.

    Parameters:
        font_counter (Counter): font_size -> count
        method (str): 'rms', or anything else for the weighted average
        freq_filter (str|float): 'average' or 'median' of the counts, or a
            numeric threshold given directly

    Returns:
        body_font (float): Body font size rounded to two decimals
        rare_large_fonts (list): Qualifying font sizes, largest first

    Raises:
        ValueError: If ``freq_filter`` is neither a supported name nor a number.
    """
    sizes = np.array(list(font_counter.keys()), dtype=float)
    usage = np.array([int(font_counter[size]) for size in sizes])
    total_usage = np.sum(usage)

    # Step 1: weighted estimate of the body font
    if method == 'rms':
        body_font = np.sqrt(np.sum((sizes ** 2) * usage) / total_usage)
    else:  # 'average'
        body_font = np.sum(sizes * usage) / total_usage

    # Step 2: frequency cut-off
    named_thresholds = {'average': np.mean, 'median': np.median}
    if isinstance(freq_filter, str) and freq_filter in named_thresholds:
        freq_threshold = named_thresholds[freq_filter](usage)
    elif isinstance(freq_filter, (int, float)):
        freq_threshold = freq_filter
    else:
        raise ValueError("Invalid freq_filter. Use 'average', 'median', or a numeric value.")

    # Step 3: keep sizes that are large and rare
    rare_large_fonts = []
    for size in font_counter:
        if size > body_font and font_counter[size] < freq_threshold:
            rare_large_fonts.append(float(size))
    rare_large_fonts.sort(reverse=True)

    return round(body_font, 2), rare_large_fonts

def map_fonts_to_heading_levels(rare_fonts):
    """
    Assign heading levels H1-H3 to up to six font sizes, largest first.

    Level layout by number of sizes:
    - 1 → H1
    - 2 → H1, H2
    - 3 → H1, H2, H3
    - 4 → H1, H2, H3, H3
    - 5 → H1, H2, H2, H3, H3
    - 6 → H1, H1, H2, H2, H3, H3
    - 7+ → only the six largest sizes are mapped, using the 6-layout

    Parameters:
        rare_fonts (list[float]): Font sizes; they are sorted descending here.

    Returns:
        dict: Mapping {font_size: heading_level}; empty for an empty input.
    """
    layouts = {
        1: ['H1'],
        2: ['H1', 'H2'],
        3: ['H1', 'H2', 'H3'],
        4: ['H1', 'H2', 'H3', 'H3'],
        5: ['H1', 'H2', 'H2', 'H3', 'H3'],
        6: ['H1', 'H1', 'H2', 'H2', 'H3', 'H3'],
    }

    largest_first = sorted(rare_fonts, reverse=True)[:6]  # cap at 6 fonts
    # Any count without its own layout (including 0) falls back to the 6-layout;
    # zip then simply pairs as many sizes as there are.
    levels = layouts.get(len(largest_first), layouts[6])
    return dict(zip(largest_first, levels))

def analyze_word_density_patterns(dflist, method='word_density', stat='average', threshold_factor=1.5):
    """
    Score every line with a density metric and flag the outliers.

    Rows in which 'word_count', 'font_size' or 'line_height' is missing or
    falsy are skipped. A line is "dense" when its score exceeds
    central * threshold_factor and "sparse" when it falls below
    central / threshold_factor.

    Parameters:
        dflist (list of pd.DataFrame): Page DataFrames with 'word_count', 'font_size', 'line_height'.
        method (str): Metric to compute. One of:
            - 'word_density'     : word_count / line_height
            - 'density_by_font'  : word_count / font_size
            - 'weighted_density' : (word_count ** 2) / font_size
            - 'font_scaled'      : (word_count / line_height) * font_size
        stat (str): Central reference, 'average' or 'median'.
        threshold_factor (float): Multiplier/divisor applied to the central value.

    Returns:
        dict: {
            'metric_name': str,
            'central_value': float (rounded to 3 decimals),
            'dense_lines': list of (page_idx, row_idx),
            'sparse_lines': list of (page_idx, row_idx)
        }

    Raises:
        ValueError: For an unknown ``method`` (raised when the first usable
            row is scored) or an unknown ``stat``.
    """
    formulas = {
        'word_density': lambda wc, fs, lh: wc / lh,
        'density_by_font': lambda wc, fs, lh: wc / fs,
        'weighted_density': lambda wc, fs, lh: (wc ** 2) / fs,
        'font_scaled': lambda wc, fs, lh: (wc / lh) * fs,
    }

    scores, locations = [], []

    for page_idx, page_df in enumerate(dflist):
        for row_idx, row in page_df.iterrows():
            wc = row.get('word_count')
            fs = row.get('font_size')
            lh = row.get('line_height')
            if not all([wc, fs, lh]) or fs == 0 or lh == 0:
                continue

            formula = formulas.get(method)
            if formula is None:
                raise ValueError(f"Invalid method: {method}")

            scores.append(formula(wc, fs, lh))
            locations.append((page_idx, row_idx))

    scores = np.array(scores)

    # Central tendency
    if stat not in ('average', 'median'):
        raise ValueError("Stat must be 'average' or 'median'")
    central = np.mean(scores) if stat == 'average' else np.median(scores)

    lower_thresh = central / threshold_factor
    upper_thresh = central * threshold_factor

    # One pass over the scores, sorting locations into the two outlier lists.
    dense_lines, sparse_lines = [], []
    for score, loc in zip(scores, locations):
        if score > upper_thresh:
            dense_lines.append(loc)
        if score < lower_thresh:
            sparse_lines.append(loc)

    return {
        'metric_name': method,
        'central_value': round(central, 3),
        'dense_lines': dense_lines,
        'sparse_lines': sparse_lines
    }

def is_center_aligned(left, text, page_width, font_size, tolerance=15): ######make a freaking plot on desmos if required!?!?!?!?!
    """
    Return how far the estimated centre of a line is from the page centre.

    The text width is estimated as ``len(text)`` characters of width
    ``0.5 * font_size``. Despite the name, the result is the absolute offset
    (not a boolean): the comparison against the tolerance is currently
    disabled, so ``tolerance`` (a percentage of ``page_width``) is converted
    but not used.
    """
    tolerance_pt = page_width * (tolerance / 100)  # currently unused, see docstring
    estimated_width = len(text) * (0.5 * font_size)  # rough estimate
    line_centre = left + estimated_width / 2
    page_centre = page_width / 2
    offset = abs(page_centre - line_centre)
    return offset #<= tolerance
# is_center_aligned(78.2, dflist[0].iloc[0]['text'], width, 14.3 )
# dflist[0]
# is_center_aligned(121.8, dflist[0].iloc[0]['text'], width, 10)

# is_center_aligned(141.3, dflist[0].iloc[0]['text'], width, 6)
# is_center_aligned(236.7, dflist[0].iloc[0]['text'], width, 14.3)

# body_font, large_fonts = get_large_fonts(font_counter, method='rms')
# page = doc[2]
# table_df, table_label = extract_table_with_title(page)

# if table_label:
#     print("Detected Table Title:", table_label["text"])
# extract_blocks_and_distributions,
# find_alternate_page_repeats_split,
# clean_pages_of_even_odd_repeats,
# merge_paragraphs_by_font


In [448]:
# Candidate input PDFs tried during development. Every assignment overwrites the
# previous one, so only the LAST line ("AAMine/jess401.pdf") is actually used.
fileinquestion = "C1A/input/E0CCG5S239.pdf" # single-page doc E0CCG5S239
fileinquestion = "C1A/input/TOPJUMP-PARTY-INVITATION-20161003-V01.pdf"
fileinquestion = "C1A/input/STEMPathwaysFlyer.pdf"
fileinquestion = "C1A/input/E0H1CM114.pdf"
fileinquestion = "AAMine/killer.pdf"
fileinquestion = "C1A/input/E0H1CM114.pdf"
fileinquestion = "AAMine/killer.pdf"
fileinquestion = "C1A/input/E0CCG5S312.pdf"
fileinquestion = "AAMine/jess401.pdf"


# E0CCG5S312: easy, 12 pages
# E0H1CM114: hard
# Open the document and read page geometry from the first page.
doc = pymupdf.open(fileinquestion)  
page = doc[0]  # first page

width = page.rect.width
height = page.rect.height
doctoc = doc.get_toc()
docmetadata = doc.metadata
totalpage= doc.page_count

# Module-level accumulators; `repeated_all` and `all_font_sizes` are not used again in this cell.
dflist , repeated_all, all_font_sizes , font_counter = [], [] ,set() , Counter()
mid = totalpage//2

# If the headings follow a "digit.digit" numbering scheme, try to use that flow...
# Build one DataFrame of text blocks per page from the page's HTML rendering.
for i in range(totalpage):
    page = doc.load_page(i)
    html = page.get_text("html")
    # NOTE(review): `font_counter` and `line_counter` are rebound on every iteration,
    # so after the loop they hold whatever was returned for the last page only.
    blocks, font_counter, line_counter = extract_blocks_and_distributions(html)
    dflist.append(pd.DataFrame(blocks))
# Second name for the same list object; `dflist` is rebound further down.
# NOTE(review): whether `dfl` stays "raw" depends on the helpers returning new lists - confirm.
dfl = dflist
dfcrucial = extract_crucial_pattern_lines(dflist)
dfcrucial  # bare expression; not rendered mid-cell under IPython's default display settings

dflist[0]
get_top_fonts(font_counter)

# Detect lines repeated on alternating pages and strip them, then drop gibberish rows.
even_df, odd_df =       find_alternate_page_repeats_split(dflist, mid=totalpage // 2)
dflist =                clean_pages_of_even_odd_repeats(dflist, even_df, odd_df)
dflist =        [filter_gibberish_rows(df) for df in dflist]



# Font statistics over the cleaned pages.
font_stats_df = generate_font_stats(dflist)
# NOTE(review): the sorted frame below is not assigned to anything, so `font_stats_df`
# itself stays in its original order.
font_stats_df.sort_values(by=['page_number', 'font_size'], ascending=[True, False]).reset_index(drop=True)
font_counter = get_global_font_counter(font_stats_df)
topfonts = get_top_fonts(font_counter)
# font_level_map =  create_font_level_map(font_counter) #classify_font_size(16, font_level_map)

target_fonts = list(font_counter.keys())

# Merge consecutive lines (helper name suggests: same font and same left offset),
# then recompute the word statistics on the merged rows.
dflist = merge_consecutive_same_font_and_left(dflist, target_fonts)
dflist = recalculate_word_stats(dflist)


font_counter
body_font, large_fonts = get_rare_large_fonts(font_counter, method='rms', freq_filter='average')

# even_df, odd_df = find_alternate_page_repeats_split(dflist, mid=totalpage // 2)
# dflist = clean_pages_of_even_odd_repeats(dflist, even_df, odd_df)
# font_level_map =  create_font_level_map(font_counter) #classify_font_size(16, font_level_map)
font_heading_map = map_fonts_to_heading_levels(large_fonts)

dflist = label_font_levels(dflist, font_heading_map) # returns the dflist
# Word-density analysis; the result dict is read below via the keys
# 'metric_name', 'central_value', 'dense_lines' and 'sparse_lines'.
result = analyze_word_density_patterns(
    dflist,
    method='density_by_font',
    stat='median',
    threshold_factor=1.8
)

print(f"Metric: {result['metric_name']}")
print(f"Central Value: {result['central_value']}")
print(f"Dense Lines: {len(result['dense_lines'])}, Sparse Lines: {len(result['sparse_lines'])}")

# for i in range(totalpage):
#     extract_table_with_title(doc[i])
# Inspection expressions left over from interactive use.
dflist[0]
dfl[0]
dfcrucial.loc[dfcrucial.bold==True]

dfcrucial
extract_heading_summary(dflist, levels=(['H1','H2','H3']))
dflist[1]


Metric: density_by_font
Central Value: 0.821
Dense Lines: 55, Sparse Lines: 54


Unnamed: 0,top,left,line_height,font_size,font_family,bold,italic,text,word_count,word_density,parabool,level
0,71.0,172.2,20.0,20.0,"RotisSansSerif,serif",True,False,Belgium and Sri Lanka,4,0.2,False,H3
1,203.1,57.8,12.0,12.0,"Times New Roman,serif",False,False,I have a simple equation in mind. Sharing powe...,24,2.0,True,Body
2,591.7,49.8,10.0,10.0,"GaramondKursivHalbfett,serif",True,True,Ethnic: A social,3,0.3,False,Body
3,603.7,49.8,10.0,10.0,"GaramondAntiqua,serif",False,False,division based on shared culture. People belon...,38,3.8,True,Body
4,97.8,172.2,11.5,11.5,"GaramondAntiqua,serif",False,False,"Belgium is a small country in Europe, smaller ...",133,11.565217,True,Body
5,371.4,190.2,11.5,11.5,"GaramondAntiqua,serif",False,False,The minority French-speaking,3,0.26087,False,Body
6,97.8,355.8,11.5,11.5,"GaramondAntiqua,serif",False,False,to tensions between the Dutch- speaking and Fr...,123,10.695652,True,Body
7,251.3,373.8,11.5,11.5,"GaramondAntiqua,serif",False,False,Let us compare this to the,6,0.521739,False,Body
8,468.1,447.4,14.0,14.0,"Arial Narrow,sans-serif",False,False,Communities and regions of Belgium,5,0.357143,True,Body
9,665.8,215.6,7.0,7.0,"Arial,sans-serif",False,False,Walloon (French-speaking),2,0.285714,False,Body


Unnamed: 0,top,left,line_height,font_size,font_family,bold,italic,text,word_count,word_density,parabool,level,page
0,391.5,72.0,10.0,10.0,"NimbusRomNo9L,serif",True,False,1. Introduction,2,0.2,False,Body,0
1,499.8,72.0,10.0,10.0,"NimbusRomNo9L,serif",True,False,2. Hermite Wavelet,3,0.3,False,Body,1
2,670.1,72.0,10.0,10.0,"NimbusRomNo9L,serif",True,False,3. Method For Solution,4,0.4,False,Body,1
3,137.4,72.0,10.0,10.0,"NimbusRomNo9L,serif",True,False,4. Convergence Analysis,3,0.3,False,Body,2
4,207.3,72.0,10.0,10.0,"NimbusRomNo9L,serif",True,False,5. Simulations and Results,4,0.4,False,Body,2


In [246]:
# Rebuild the per-page block DataFrames from scratch.
# Relies on `doc` and `totalpage` having been defined by a previously executed cell.
dflist = []
for i in range(totalpage):
    page = doc.load_page(i)
    html = page.get_text("html")
    # NOTE(review): `font_counter` / `line_counter` are rebound each iteration, so after
    # the loop they hold the values returned for the last page only.
    blocks, font_counter, line_counter = extract_blocks_and_distributions(html)
    dflist.append(pd.DataFrame(blocks))

# Find lines repeated on alternating pages and remove them from every page.
even_df, odd_df = find_alternate_page_repeats_split(dflist, mid=totalpage // 2)
cleaned_dflist = clean_pages_of_even_odd_repeats(dflist, even_df, odd_df)

def find_repeating_rows(dflist, center_page, offsets=[1, 2], tolerance=1.0):  # deprecated
    """
    Collect rows of the centre page that also occur on its neighbours.

    For every offset, the pages ``center_page - offset`` and
    ``center_page + offset`` are inspected; an offset is ignored when either
    neighbour falls outside ``dflist``. A centre-page row counts as repeated
    when both neighbours contain a row with the same stripped text whose
    ``top`` differs by at most ``tolerance``.

    Returns a DataFrame of the repeated rows (likely headers/footers),
    de-duplicated on ``text``, ``top`` and ``font_size``.
    """
    center_df = dflist[center_page]

    def appears_on(page_df, wanted_text, wanted_top):
        # True as soon as one row matches on position first, then on text.
        for _, candidate in page_df.iterrows():
            if (abs(wanted_top - candidate['top']) <= tolerance) and (candidate['text'].strip() == wanted_text):
                return True
        return False

    hits = []
    for step in offsets:
        prev_idx = center_page - step
        next_idx = center_page + step
        if prev_idx >= 0 and next_idx < len(dflist):
            prev_df = dflist[prev_idx]
            next_df = dflist[next_idx]
            for _, line in center_df.iterrows():
                stripped = line['text'].strip()
                line_top = line['top']
                if appears_on(prev_df, stripped, line_top) and appears_on(next_df, stripped, line_top):
                    hits.append(line)

    # The same row can be found through several offsets; keep one copy of each.
    return pd.DataFrame(hits).drop_duplicates(subset=["text", "top", "font_size"])

# Repeated rows around the middle page; `mid` comes from a previously executed cell.
# The result is not used by the active code below (only by commented-out lines).
repeated_rows_df = find_repeating_rows(dflist, center_page=mid)
# cleaned_dflist = clean_pages_of_repeats(dflist, repeated_rows_df)
# repeated_rows_df
# cleaned_dflist
# # for i,clean in enumerate(cleaned_dflist):
#     # clean.to_csv(f"analysis/hope12/f{i}.csv",index=False,encoding="utf-8")

# Drop gibberish rows, build the font -> heading-level map and label every page.
cleaned_dflist = [filter_gibberish_rows(df) for df in cleaned_dflist]
get_top_fonts(font_counter)
font_level_map =  create_font_level_map(font_counter) #classify_font_size(16, font_level_map)
cleaned_dflist = label_font_levels(cleaned_dflist, font_level_map)
extract_heading_summary(cleaned_dflist, levels=(['H1','H2']))
# Leftover inspection expressions from interactive use.
font_counter
i=0
# repeated_rows_df
cleaned_dflist[1]
font_level_map
# Last expression of the cell: the H1/H2 heading summary.
extract_heading_summary(cleaned_dflist, levels=(['H1','H2']))


Unnamed: 0,page_num,level,text
0,2,H1,Revision History
1,3,H1,Table of Contents
2,4,H1,Acknowledgements
3,5,H1,1. Introduction to the Foundation Level Extens...
4,6,H1,2. Introduction to Foundation Level Agile Test...
5,6,H2,2.1 Intended Audience
6,6,H2,2.2 Career Paths for Testers
7,6,H2,2.3 Learning Objectives
8,7,H2,2.4 Entry Requirements
9,7,H2,2.5 Structure and Course Duration


In [234]:
# Heading extraction run on the "hard" sample document.
fileinquestion = "C1A/input/E0H1CM114.pdf"
# E0CCG5S312: easy, 12 pages
doc = pymupdf.open(fileinquestion)
doctoc = doc.get_toc()
docmetadata = doc.metadata
totalpage= doc.page_count

dflist , repeated_all, all_font_sizes , font_counter = [], [] ,set() , Counter()
mid = totalpage//2

# If the headings follow a "digit.digit" numbering scheme, try to use that flow...

# Build one DataFrame of text blocks per page from the page's HTML rendering.
for i in range(totalpage):
    page = doc.load_page(i)
    html = page.get_text("html")
    # NOTE(review): `font_counter` is rebound on every iteration, so the font level map
    # built below sees only the last page's value - confirm this is intended.
    blocks, font_counter, line_counter = extract_blocks_and_distributions(html)
    dflist.append(pd.DataFrame(blocks))

# Find lines repeated on alternating pages and remove them from every page.
even_df, odd_df = find_alternate_page_repeats_split(dflist, mid=totalpage // 2)
cleaned_dflist = clean_pages_of_even_odd_repeats(dflist, even_df, odd_df)

# Merge the lines of each page into paragraphs and tag every paragraph with its page index.
paragraphs = []
for i, df in enumerate(cleaned_dflist):
    merged = merge_paragraphs_by_font(df, font_min=8.0, font_max=14.0)  # You can define ranges
    for para in merged:
        para['page'] = i
    # Extend with the whole page result. Extending with `para` (the last dict) would
    # add that dict's keys as strings, and fail outright if the first page yields nothing.
    paragraphs.extend(merged)

paragraph_df = pd.DataFrame(paragraphs)
# Experiment: re-merge the last page (`df` is left over from the loop) with explicit tolerances.
merged = merge_paragraphs_by_font(df, font_min=8.0, font_max=14.0, font_tolerance=1.0, line_gap=3.0)

# repeated_rows_df = find_repeating_rows(dflist, center_page=mid)
# cleaned_dflist = clean_pages_of_repeats(dflist, repeated_rows_df)
# repeated_rows_df
# cleaned_dflist
# # for i,clean in enumerate(cleaned_dflist):
#     # clean.to_csv(f"analysis/hope12/f{i}.csv",index=False,encoding="utf-8")

# Drop gibberish rows, build the font -> heading-level map and label every page.
cleaned_dflist = [filter_gibberish_rows(df) for df in cleaned_dflist]
get_top_fonts(font_counter)
font_level_map =  create_font_level_map(font_counter) #classify_font_size(16, font_level_map)
cleaned_dflist = label_font_levels(cleaned_dflist, font_level_map)
extract_heading_summary(cleaned_dflist, levels=(['H1','H2']))



Unnamed: 0,page_num,level,text
0,0,H2,The Ontario Digital Library will make Ontario ...
1,0,H2,all Ontario citizens have access to the knowle...
2,0,H2,learners and effective contributors towards On...
3,1,H1,Summary
4,1,H2,"St., Suite 303, Toronto, ON M5C 1M3. Proposal..."
...,...,...,...
217,13,H2,-study guides
218,13,H2,-study skill development
219,13,H2,-web-based curricula
220,13,H2,-internet search guides


In [None]:
# Merge the lines of every cleaned page into paragraphs, tagged with their page index.
# Start from an empty list so the cell works on a fresh kernel and re-running it
# does not append the same paragraphs a second time.
merged_paras = []
for i, page_df in enumerate(cleaned_dflist):
    paras = merge_paragraphs_by_font(page_df)
    for p in paras:
        p['page'] = i  # Add page number
    merged_paras.extend(paras)

# Convert to DataFrame for analysis
paragraph_df = pd.DataFrame(merged_paras)


In [88]:
# EXPERIMENT: run table extraction on every page and preview each table found.
# Relies on `doc` and `totalpage` from a previously executed cell.
for i in range(totalpage):
    page = doc[i]
    table_df, title = extract_table_with_title(page)

    if table_df is not None:
        print(f"\nTitle: {title}\n")
        print(table_df.head())


0 table(s) found.
0 table(s) found.
1 table(s) found.
First table extracted:
[['Version', 'Date', 'Remarks'],
 ['0.1', '18 JUNE 2013', 'Initial version'],
 ['0.2', '23 JULY 2013', 'WG reviewed and confirmed'],
 ['0.3', '6 NOV 2013', 'amended population and diagram'],
 ['0.7', '11 DEC 2013', 'Amended Business Outcomes and Chapters matching'],
 ['0.8', '20 DEC 2013', 'Working group updates on 0.7'],
 ['1.0', '31 MAY 2014', 'GA release for Agile Extension']]
Possible table title: Revision History

Title: Revision History

         0             1                                                2
0  Version          Date                                          Remarks
1      0.1  18 JUNE 2013                                  Initial version
2      0.2  23 JULY 2013                        WG reviewed and confirmed
3      0.3    6 NOV 2013                   amended population and diagram
4      0.7   11 DEC 2013  Amended Business Outcomes and Chapters matching
0 table(s) found.
0 table(s) fo

In [None]:
#GRAVEYARD

# page = doc[1] #not index, its page
# for page in reversed(doc):
# for page in doc.pages(start, stop, step):
locationn= "analysis/hope1"
alldfs,thetexts , linkstotal , linkstotalnext , annotstotal , html = [] , [] , [] , [] , [] , []
def dumpshit(doc):
    """
    Dump per-page text, links and HTML of ``doc`` into the module-level lists.

    For every page (in iteration order) this appends:
      * the plain text to ``thetexts``;
      * the result of ``page.get_links()`` to ``linkstotal``;
      * the chain of link objects reached from ``page.first_link`` via
        ``.next`` to ``linkstotalnext`` (first link included, no trailing None);
      * the HTML rendering, with every ``<img ...>`` tag replaced by the
        literal ``<image>``, to ``html``.
    The processed HTML is also written to ``<locationn>/f<i>.html`` (UTF-8).
    The output directory is created if it does not exist.

    Parameters:
        doc: Iterable of page objects exposing ``get_text``, ``get_links``
            and ``first_link`` (e.g. an opened PyMuPDF document).

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    out_dir = pathlib.Path(locationn)
    out_dir.mkdir(parents=True, exist_ok=True)

    for i,page in enumerate(doc): # iterate the document pages
        plain_text = page.get_text() # plain text of the page
        thetexts.append(plain_text)
        linkstotal.append(page.get_links())

        # Walk the linked list of links: record the current link BEFORE advancing,
        # otherwise the first link is skipped and a trailing None gets stored.
        link = page.first_link  # a `Link` object or `None`
        linktemp = []
        while link:
            linktemp.append(link)
            link = link.next # the last link has `None` in its `next`
        linkstotalnext.append(linktemp)
        # for annot in page.annots():
            # print(f'Annotation on page: {page.number} with type: {annot.type} and rect: {annot.rect}')
        # for field in page.widgets():
            # print(f'Widget on page: {page.number} with type: {field.type} and rect: {field.rect}')
        page_html = page.get_text("html")
            # Option strings accepted by get_text (notes copied from the PyMuPDF docs):
            # "text": (default) plain text with line breaks. No formatting, no text position details, no images.
            # "blocks": generate a list of text blocks (= paragraphs).
            # "words": generate a list of words (strings not containing spaces).
            # "html": creates a full visual version of the page including any images. This can be displayed with your internet browser.
            # "dict" / "json": same information level as HTML, but provided as a Python dictionary or resp. JSON string. See TextPage.extractDICT() for details of its structure.
            # "rawdict" / "rawjson": a super-set of "dict" / "json". It additionally provides character detail information like XML. See TextPage.extractRAWDICT() for details of its structure.
            # "xhtml": text information level as the TEXT version but includes images. Can also be displayed by internet browsers.
            # "xml": contains no images, but full position and font information down to each single text character. Use an XML module to interpret.
        # with open("output.html", "w") as f:
        # Replace every <img ...> tag with a short literal placeholder.
        page_html = re.sub(r'<img[^>]*>', '<image>', page_html)
        html.append(page_html)
        with open(out_dir / f"f{i}.html",'w',encoding="utf-8") as f:
            f.write(page_html)
 
        # Font
        # Alignment
        # cases
        #indentation, new pages, 
        # 1 1.0
        #toc/index - 
        # blocks of code
        # 
# for i in range(12)
# Scratch: look at the HTML rendering of the first page.
page= doc[0]
text = page.get_text("html")
# pprint(text)
# Compares two "blocks" extractions of the same page (index 1) with each other.
print(doc[1].get_text("blocks") == doc[1].get_text("blocks"))
htmx=(doc[0].get_text("html"))
# Write with an explicit encoding, matching the per-page HTML writer in this cell,
# so the result does not depend on the platform's default encoding.
with open ("html.html", "w", encoding="utf-8") as f:
    f.write(htmx)
htmx

In [None]:
#OLDDDDD
def clean_pages_of_even_odd_repeats(dflist, even_df, odd_df, tolerance=1.0):
    """
    Remove repeated (header/footer-like) lines from every page.

    Even-indexed pages are cleaned against ``even_df`` and odd-indexed pages
    against ``odd_df``. When both frames describe the same set of
    ``(stripped text, top rounded to 1 decimal)`` pairs, ``even_df`` is used
    for all pages.

    A page row is dropped when some repeat row has the same stripped text and
    a ``top`` within ``tolerance`` of the page row's ``top``.

    Parameters:
        dflist (list[pd.DataFrame]): One frame per page with 'top' and 'text' columns.
        even_df (pd.DataFrame): Repeated rows for even-indexed pages.
        odd_df (pd.DataFrame): Repeated rows for odd-indexed pages.
        tolerance (float): Maximum allowed difference in 'top'.

    Returns:
        list[pd.DataFrame]: New frames, one per input page, each with a fresh
        0..n-1 index. Empty pages are passed through with their columns intact.
    """
    def _repeat_keys(repeat_df):
        # (top, stripped text) per repeated line, computed once instead of
        # re-walking the repeat frame for every page row.
        return [(rep['top'], rep['text'].strip()) for _, rep in repeat_df.iterrows()]

    even_keys = _repeat_keys(even_df)
    odd_keys = _repeat_keys(odd_df)
    # Check if both are the same
    same_repeats = (
        {(rep_text, round(rep_top, 1)) for rep_top, rep_text in even_keys}
        == {(rep_text, round(rep_top, 1)) for rep_top, rep_text in odd_keys}
    )

    cleaned_pages = []
    for idx, df in enumerate(dflist):
        keys = even_keys if (same_repeats or idx % 2 == 0) else odd_keys

        # Nothing to compare: keep the page as is. This also avoids building a
        # non-boolean mask from an empty frame, which could drop its columns.
        if df.empty or not keys:
            cleaned_pages.append(df.reset_index(drop=True))
            continue

        keep = [
            not any(
                (abs(top - rep_top) <= tolerance) and (text.strip() == rep_text)
                for rep_top, rep_text in keys
            )
            for top, text in zip(df['top'], df['text'])
        ]
        mask = pd.Series(keep, index=df.index, dtype=bool)
        cleaned_pages.append(df[mask].reset_index(drop=True))
    return cleaned_pages

def clean_pages_of_repeats(dflist, repeated_df, tolerance=1.0):  # deprecated
    """
    Remove from each page the rows that match a row of ``repeated_df``.

    A page row matches when some repeated row has the same stripped text and
    a ``top`` within ``tolerance`` of the page row's ``top``. Useful for
    cleaning common headers/footers.

    Parameters:
        dflist (list[pd.DataFrame]): One frame per page with 'top' and 'text' columns.
        repeated_df (pd.DataFrame): Rows considered repeated.
        tolerance (float): Maximum allowed difference in 'top'.

    Returns:
        list[pd.DataFrame]: New frames, one per input page, each with a fresh
        0..n-1 index. Empty pages are passed through with their columns intact.
    """
    # (top, stripped text) per repeated line, computed once instead of
    # re-walking repeated_df for every row of every page.
    repeat_keys = [(rep['top'], rep['text'].strip()) for _, rep in repeated_df.iterrows()]

    cleaned_pages = []
    for df in dflist:
        # Nothing to compare: keep the page as is. This also avoids building a
        # non-boolean mask from an empty frame, which could drop its columns.
        if df.empty or not repeat_keys:
            cleaned_pages.append(df.reset_index(drop=True))
            continue

        keep = [
            not any(
                (abs(top - rep_top) <= tolerance) and (text.strip() == rep_text)
                for rep_top, rep_text in repeat_keys
            )
            for top, text in zip(df['top'], df['text'])
        ]
        mask = pd.Series(keep, index=df.index, dtype=bool)
        cleaned_pages.append(df[mask].reset_index(drop=True))

    return cleaned_pages


def find_repeating_rows(dflist, center_page, offsets=[1, 2], tolerance=1.0):  # deprecated
    """
    Collect rows of the centre page that also occur on its neighbours.

    For every offset, the pages ``center_page - offset`` and
    ``center_page + offset`` are inspected; an offset is ignored when either
    neighbour falls outside ``dflist``. A centre-page row counts as repeated
    when both neighbours contain a row with the same stripped text whose
    ``top`` differs by at most ``tolerance``.

    Returns a DataFrame of the repeated rows (likely headers/footers),
    de-duplicated on ``text``, ``top`` and ``font_size``.
    """
    center_df = dflist[center_page]

    def appears_on(page_df, wanted_text, wanted_top):
        # True as soon as one row matches on position first, then on text.
        for _, candidate in page_df.iterrows():
            if (abs(wanted_top - candidate['top']) <= tolerance) and (candidate['text'].strip() == wanted_text):
                return True
        return False

    hits = []
    for step in offsets:
        prev_idx = center_page - step
        next_idx = center_page + step
        if prev_idx >= 0 and next_idx < len(dflist):
            prev_df = dflist[prev_idx]
            next_df = dflist[next_idx]
            for _, line in center_df.iterrows():
                stripped = line['text'].strip()
                line_top = line['top']
                if appears_on(prev_df, stripped, line_top) and appears_on(next_df, stripped, line_top):
                    hits.append(line)

    # The same row can be found through several offsets; keep one copy of each.
    return pd.DataFrame(hits).drop_duplicates(subset=["text", "top", "font_size"])

def merge_paragraphs_by_font0(df, font_tolerance=1, line_gap=2.0):
    """
    Groups consecutive lines with similar font size into paragraphs.

    Rows are walked in their existing order (no sorting is applied). A row is
    appended to the current paragraph when its font size differs from the
    previous ROW's font size by at most ``font_tolerance``; otherwise a new
    paragraph is started.

    Parameters:
        df (pd.DataFrame): A single-page DataFrame containing 'top', 'font_size', 'text' columns.
        font_tolerance (float): Allowed deviation in font size to consider lines as same style.
        line_gap (float): Currently unused - the vertical-gap check is disabled
            in the body. Kept so existing callers keep working.

    Returns:
        List of dicts: Each dict represents a merged paragraph block with the
        keys 'top', 'font_size' (both taken from the paragraph's first line)
        and 'text' (the lines joined with single spaces).
    """
    if df.empty:
        return []

    # Sort by vertical position
    # df_sorted = df.sort_values(by='top').reset_index(drop=True)
    # Use a fresh 0..n-1 index: the lookups below are meant to be positional, and
    # label-based .loc on a filtered/sorted frame would raise KeyError or visit
    # the rows out of order.
    df_sorted = df.reset_index(drop=True)
    paragraphs = []
    current_para = {
        'top': df_sorted.loc[0, 'top'],
        'font_size': df_sorted.loc[0, 'font_size'],
        'text': df_sorted.loc[0, 'text']
    }

    for i in range(1, len(df_sorted)):
        prev = df_sorted.loc[i - 1]
        curr = df_sorted.loc[i]

        same_font = (abs(curr['font_size'] - prev['font_size']) <= font_tolerance)
        # close_enough = abs(curr['top'] - prev['top']) <= (prev['line_height'] + line_gap)

        if same_font : #and close_enough:
            current_para['text'] += ' ' + curr['text']
        else:
            paragraphs.append(current_para)
            current_para = {
                'top': curr['top'],
                'font_size': curr['font_size'],
                'text': curr['text']
            }

    # Flush the paragraph that was still being built when the rows ran out.
    paragraphs.append(current_para)
    return paragraphs

In [146]:
# Print the raw HTML rendering of the second page (index 1) for manual inspection.
# Relies on `doc` from a previously executed cell and rebinds the global `page`.
page = doc[1]
print(page.get_text('html'))


<div id="page0" style="width:612.0pt;height:792.0pt">
<p style="top:94.6pt;left:90.0pt;line-height:20.0pt"><span style="font-family:Arial,sans-serif;font-size:20.0pt;color:#000000">Ontario&#x2019;s Digital Library </span></p>
<p style="top:122.0pt;left:90.0pt;line-height:16.0pt"><span style="font-family:Arial,sans-serif;font-size:16.0pt;color:#000000">A Critical Component for Implementing Ontario&#x2019;s Road Map to </span></p>
<p style="top:140.3pt;left:90.0pt;line-height:16.0pt"><span style="font-family:Arial,sans-serif;font-size:16.0pt;color:#000000">Prosperity Strategy </span></p>
<p style="top:199.3pt;left:96.0pt;line-height:12.0pt"><span style="font-family:Arial,sans-serif;font-size:12.0pt;color:#000000">Summary </span></p>
<p style="top:229.7pt;left:96.0pt;line-height:11.0pt"><span style="font-family:Arial,sans-serif;font-size:11.0pt;color:#000000">The purpose of this </span><b><span style="font-family:Arial,sans-serif;font-size:11.0pt;color:#000000">Request for Proposal</span>