# In [None]:
"""
PDF Text Extractor and Cleaner using PyMuPDF and Unstructured

This script:
1. Extracts structured text from a PDF using the `unstructured` library.
2. Skips unwanted pages (a configurable number of leading pages and, optionally, the last page).
3. Cleans common encoding issues using `ftfy` and regex.
4. Computes statistics on chunk and paragraph sizes.
5. Saves cleaned output to CSV.
"""

from typing import Any
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
import re
import fitz  # PyMuPDF
import ftfy
import pandas as pd
import numpy as np
import os


def clean_text(text: str) -> str:
    """Clean OCR and encoding artifacts from PDF text.

    Processing order:

    1. ``ftfy.fix_text`` is applied to the raw text.
    2. A table of literal mojibake sequences that may still be present is
       replaced with the intended punctuation.
    3. Page markers are removed: "Page N of M", "Page N" / "Pg N" / "P N"
       (case-insensitive) and lines consisting only of digits.
    4. Whitespace is normalised: every line break, together with any
       whitespace around it (including blank lines), becomes a single
       newline; runs of two or more spaces/tabs become one space.

    Args:
        text: Raw text of one extracted element.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    cleaned = ftfy.fix_text(text)

    # Literal leftovers -> intended character. The possessive entry keeps the
    # trailing "s" so that "John‚Äôs" becomes "John's" rather than "John'".
    replacements = {
        '‚Äôs': "'s", '‚Äô': "'", '‚Äú': '"', '‚Äù': '"',
        '‚Äì': '–', '‚Ä¶': '…', '‚Ä¢': '•', '‚Äò': "'",
        'Äô': "'", 'Ä¢': '•', 'â€"': '–', 'â€œ': '"', 'â€�': '"',
    }
    # The keys contain no regex metacharacters, so a plain substring replace
    # is sufficient; dict order is preserved so longer sequences go first.
    for broken, fixed in replacements.items():
        cleaned = cleaned.replace(broken, fixed)

    # "Page N of M" must be removed before the shorter "Page N" rule;
    # otherwise the short rule eats "Page N" and leaves " of M" behind.
    cleaned = re.sub(r'\bPage\s+\d+\s+of\s+\d+\b', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\b(Page|Pg|P)\s*\d+\b', '', cleaned, flags=re.IGNORECASE)
    # Lines that contain nothing but a number (stand-alone page numbers).
    cleaned = re.sub(r'^\d+$', '', cleaned, flags=re.MULTILINE)
    # Any remaining "‚Ä" + symbol sequence not covered by the table above.
    cleaned = re.sub(r'‚Ä[^\w\s]', '', cleaned)

    # Collapse line breaks first (blank lines and surrounding spaces become a
    # single newline), then collapse horizontal whitespace only. Collapsing
    # all whitespace runs first would turn paragraph breaks into spaces.
    cleaned = re.sub(r'[^\S\n]*\n\s*', '\n', cleaned)
    cleaned = re.sub(r'[^\S\n]{2,}', ' ', cleaned)

    return cleaned.strip()


def extract_pdf_text(
    pdf_path: str, skip_first_pages: int = 6, skip_last_page: bool = True
) -> tuple[list[Any], int]:
    """Extract structured, chunked text elements from a PDF.

    The page count is read with PyMuPDF; the text is partitioned and chunked
    by title with ``unstructured``'s ``partition_pdf``. Elements are then
    filtered by the page number recorded in their metadata.

    Args:
        pdf_path: Path to the PDF file.
        skip_first_pages: Number of leading pages to drop; only elements
            whose page number is greater than this value are kept.
        skip_last_page: When True, elements on the final page are dropped.

    Returns:
        A ``(elements, total_pages)`` tuple: the retained elements and the
        total number of pages in the PDF (including skipped pages).
        Elements without ``metadata``/``text`` attributes or without a page
        number are dropped.
    """
    # Context manager guarantees the document is closed even if reading the
    # page count fails.
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count

    raw_elements = partition_pdf(
        filename=pdf_path,
        extract_images_in_pdf=False,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=1100,
        new_after_n_chars=900,
        combine_text_under_n_chars=400,
    )

    # Page numbers are treated as 1-based (consistent with the lower bound:
    # skip_first_pages=6 keeps pages 7 and up). The upper bound is inclusive,
    # so only the final page is dropped when skip_last_page is set, and
    # nothing is dropped at the end otherwise.
    last_included_page = total_pages - 1 if skip_last_page else total_pages

    filtered_elements = []
    for el in raw_elements:
        if not (hasattr(el, 'metadata') and hasattr(el, 'text')):
            continue
        page_number = el.metadata.page_number
        if page_number and skip_first_pages < page_number <= last_included_page:
            filtered_elements.append(el)

    return filtered_elements, total_pages


def analyze_chunks(elements: list[Any], total_pages: int) -> None:
    """Print every cleaned chunk followed by size statistics.

    Args:
        elements: Extracted elements; each must expose a ``text`` attribute.
        total_pages: Divisor for the "average page size" figure. The value
            is used as given, so if it counts pages that were filtered out
            the per-page average is correspondingly lower.

    Prints the number of chunks and the average chunk, paragraph and page
    sizes in characters. Averages are reported as 0.00 when there is nothing
    to average (no chunks, no paragraphs, or ``total_pages`` not positive).
    """
    chunk_lengths: list[int] = []
    paragraph_lengths: list[int] = []

    for idx, el in enumerate(elements, 1):
        cleaned = clean_text(el.text)
        chunk_lengths.append(len(cleaned))

        # Paragraph boundaries (blank lines) have to be found in the raw
        # text: clean_text normalises whitespace, so its output never
        # contains two consecutive newlines to split on.
        for raw_paragraph in re.split(r'\n\s*\n', el.text):
            paragraph = clean_text(raw_paragraph)
            if paragraph:
                paragraph_lengths.append(len(paragraph))

        print(f"Chunk {idx}:\n{cleaned}\n{'-' * 40}\n")

    # Guard the averages so empty input prints 0.00 instead of nan (with a
    # NumPy warning) or raising ZeroDivisionError.
    avg_chunk = float(np.mean(chunk_lengths)) if chunk_lengths else 0.0
    avg_paragraph = float(np.mean(paragraph_lengths)) if paragraph_lengths else 0.0
    avg_page = sum(chunk_lengths) / total_pages if total_pages > 0 else 0.0

    print(f"Total number of chunks: {len(chunk_lengths)}")
    print(f"Average chunk size: {avg_chunk:.2f} characters")
    print(f"Average paragraph size: {avg_paragraph:.2f} characters")
    print(f"Average page size: {avg_page:.2f} characters")


def save_chunks_to_csv(elements: list[Any], output_file: str = "cleaned_chunks.csv") -> None:
    """Save cleaned text chunks to a single-column CSV.

    Args:
        elements: Extracted elements; those without a ``text`` attribute
            are skipped.
        output_file: Destination path of the CSV file.

    The file always has a ``text`` header row, even when there are no
    elements to write, so the output schema does not depend on the data.
    """
    texts = [clean_text(el.text) for el in elements if hasattr(el, 'text')]
    # Building the frame from a column mapping (rather than a list of row
    # dicts) keeps the "text" column when the list is empty.
    df = pd.DataFrame({'text': texts})
    df.to_csv(output_file, index=False)
    print(f" Cleaned chunks saved to: {output_file}")


if __name__ == "__main__":
    # Input PDF (educational funding guidance document) and CSV destination.
    PDF_PATH = "FYYP"
    OUTPUT_FILE = "chunks.csv"

    # Run the pipeline only when the input exists; otherwise fail with a
    # clear message before any extraction work starts.
    if os.path.exists(PDF_PATH):
        elements, total_pages = extract_pdf_text(PDF_PATH)
        analyze_chunks(elements, total_pages)
        save_chunks_to_csv(elements, OUTPUT_FILE)
    else:
        raise FileNotFoundError(f"PDF not found at: {PDF_PATH}")
