## Installation des dépendances

## Imports

In [None]:
import io
import re
from typing import Dict, List, Optional
from dataclasses import dataclass
from PyPDF2 import PdfReader
from pathlib import Path

## Définition des classes et fonctions

In [None]:
@dataclass
class ParsedPage:
    """Result of parsing a single page."""
    # Page index; pdf_text_parser numbers pages starting at 1.
    page_number: int
    # Markdown text produced for the page.
    markdown_content: str
    # pdf_text_parser stores len(markdown_content) here.
    char_count: int
    # pdf_text_parser stores the whitespace-separated token count of
    # markdown_content here.
    word_count: int
    # True when table-like structure was detected in the page text.
    has_tables: bool
    # Heuristic extraction score in the 0-1 range (1 is best).
    extraction_quality: float

In [None]:
def clean_text_from_redundant_elements(
    text: str,
    headers: List[str],
    footers: List[str],
    page_patterns: List[str]
) -> str:
    """Strip recurring boilerplate (headers, footers, page numbers) from text.

    Args:
        text: The text to clean.
        headers: Literal strings removed wherever they occur.
        footers: Literal strings removed wherever they occur, after the
            headers have been processed.
        page_patterns: Regular expressions; every match is deleted.

    Returns:
        The text with all listed fragments and pattern matches removed.
    """
    result = text

    # Headers and footers are both plain substrings, so a single pass covers
    # them; headers still go first, as the removal order can matter when
    # fragments overlap.
    for fragment in [*headers, *footers]:
        result = result.replace(fragment, "")

    # Page-number markers are regular expressions rather than literals.
    for regex in page_patterns:
        result = re.sub(regex, "", result)

    return result

In [None]:
def _is_likely_header(line: str, all_lines: List[str], index: int) -> bool:
    """D√©tecte si une ligne est probablement un header."""
    if len(line) > 80:
        return False
    
    if len(line) < 3:
        return False
    
    if line.isupper() and len(line) > 5:
        return True
    
    if re.match(r"^(\d+\.)+\s*\w+|^[IVXLC]+\.\s*\w+", line):
        return True
    
    if len(line) < 50 and not re.search(r"[.!?,;:]$", line):
        if index + 1 < len(all_lines):
            next_line = all_lines[index + 1].strip()
            if not next_line or len(next_line) > len(line):
                return True
    
    return False

In [None]:
def _determine_header_level(text: str) -> int:
    """D√©termine le niveau de header (1-4)."""
    if re.match(r"^\d+\.\d+\.\d+", text):
        return 4
    if re.match(r"^\d+\.\d+", text):
        return 3
    if re.match(r"^\d+\.", text):
        return 2
    
    if text.isupper():
        if len(text) < 20:
            return 1
        return 2
    
    return 3

In [None]:
def _detect_table_structure(text: str) -> bool:
    """D√©tecte si le texte contient des structures tabulaires."""
    patterns = [
        r"\|.*\|.*\|",
        r"\t.*\t.*\t",
        r"^\s*\S+\s{3,}\S+\s{3,}\S+",
    ]
    
    for pattern in patterns:
        if re.search(pattern, text, re.MULTILINE):
            return True
    
    return False

In [None]:
def _calculate_extraction_quality(raw_text: str, cleaned_text: str) -> float:
    """Calcule un score de qualit√© de l'extraction."""
    if not raw_text:
        return 0.0
    
    conservation_ratio = len(cleaned_text) / len(raw_text) if raw_text else 0
    
    if not cleaned_text:
        return 0.0
    
    normal_chars = len(
        re.findall(
            r"[a-zA-Z0-9√†√¢√§√©√®√™√´√Ø√Æ√¥√π√ª√º√ß√Ä√Ç√Ñ√â√à√ä√ã√è√é√î√ô√õ√ú√á\s.,!?;:\-\'\"()\[\]{}]",
            cleaned_text,
        )
    )
    
    char_quality = normal_chars / len(cleaned_text) if cleaned_text else 0
    quality = conservation_ratio * 0.3 + char_quality * 0.7
    
    return min(1.0, max(0.0, quality))

In [None]:
def _text_to_markdown(text: str, page_num: int) -> str:
    """Convertit du texte brut en Markdown structur√©."""
    if not text or not text.strip():
        return ""
    
    lines = text.split("\n")
    markdown_lines = []
    in_list = False
    
    for i, line in enumerate(lines):
        stripped = line.strip()
        
        if not stripped:
            if in_list:
                in_list = False
            markdown_lines.append("")
            continue
        
        if _is_likely_header(stripped, lines, i):
            level = _determine_header_level(stripped)
            markdown_lines.append(f"\n{'#' * level} {stripped}\n")
            continue
        
        if re.match(r"^[\-\‚Ä¢\*\‚Üí\‚ñ∫]\s+", stripped):
            in_list = True
            content = re.sub(r"^[\-\‚Ä¢\*\‚Üí\‚ñ∫]\s+", "", stripped)
            markdown_lines.append(f"- {content}")
            continue
        
        if re.match(r"^\d+[\.\)]\s+", stripped):
            in_list = True
            markdown_lines.append(stripped)
            continue
        
        markdown_lines.append(stripped)
    
    result = "\n".join(markdown_lines)
    result = re.sub(r"\n{3,}", "\n\n", result)
    
    return result.strip()

In [None]:
def pdf_text_parser(pdf_path: str) -> Dict:
    """Extract the text of a PDF and convert it to Markdown.

    Each page is extracted, cleaned of page-number markers, converted to
    Markdown and scored. Pages that fail are reported on stdout and skipped.
    Progress and summary lines are printed as a side effect.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dictionary with the keys:

        * ``"error"``: None on success, otherwise an error message;
        * ``"parsed_pages"``: list of ``ParsedPage`` objects, one per page
          that was parsed successfully;
        * ``"parsed_markdown"``: Markdown of all non-empty pages joined by
          ``---`` separators;
        * ``"parsing_stats"``: page, character and word counts plus the
          average quality (all zero when an error occurred).

        Failures, including a missing or unreadable file, are reported through
        the ``"error"`` key instead of being raised.
    """
    print(f"📝 Extraction texte par parsing: {pdf_path}")

    # Default configuration: no known headers or footers, only page markers.
    headers = []
    footers = []
    page_patterns = [r"Page\s+\d+"]

    try:
        # The file is read inside the try block so that a missing or
        # unreadable file yields the error dictionary rather than an
        # exception.
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()

        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
        total_pages = len(pdf_reader.pages)

        parsed_pages = []
        all_markdown_parts = []

        for page_num in range(1, total_pages + 1):
            try:
                # Page lookup is inside the per-page handler so that one
                # damaged page is skipped instead of aborting the document.
                page = pdf_reader.pages[page_num - 1]
                raw_text = page.extract_text() or ""
                cleaned_text = clean_text_from_redundant_elements(
                    raw_text, headers, footers, page_patterns
                )
                markdown_content = _text_to_markdown(cleaned_text, page_num)

                parsed_pages.append(
                    ParsedPage(
                        page_number=page_num,
                        markdown_content=markdown_content,
                        char_count=len(markdown_content),
                        word_count=len(markdown_content.split()),
                        has_tables=_detect_table_structure(cleaned_text),
                        extraction_quality=_calculate_extraction_quality(
                            raw_text, cleaned_text
                        ),
                    )
                )

                # Empty pages stay in the statistics but not in the output.
                if markdown_content.strip():
                    all_markdown_parts.append(markdown_content)

            except Exception as e:
                # Best effort: report the page and carry on with the next.
                print(f"   ⚠️ Erreur page {page_num}: {str(e)}")
                continue

        full_markdown = "\n\n---\n\n".join(all_markdown_parts)

        total_chars = sum(p.char_count for p in parsed_pages)
        total_words = sum(p.word_count for p in parsed_pages)
        avg_quality = (
            sum(p.extraction_quality for p in parsed_pages) / len(parsed_pages)
            if parsed_pages
            else 0.0
        )

        print(f"   ✅ {len(parsed_pages)} pages parsées")
        print(f"   📊 {total_chars} caractères, {total_words} mots")
        print(f"   📊 Qualité moyenne: {avg_quality:.0%}")

        return {
            "error": None,
            "parsed_pages": parsed_pages,
            "parsed_markdown": full_markdown,
            "parsing_stats": {
                "pages_parsed": len(parsed_pages),
                "total_chars": total_chars,
                "total_words": total_words,
                "average_quality": round(avg_quality, 2),
            },
        }

    except Exception as e:
        # Same shape as the success result so callers can read any key.
        return {
            "error": f"Erreur lors du parsing: {str(e)}",
            "parsed_pages": [],
            "parsed_markdown": "",
            "parsing_stats": {
                "pages_parsed": 0,
                "total_chars": 0,
                "total_words": 0,
                "average_quality": 0.0,
            },
        }

## Configuration du fichier PDF

In [None]:
# Name of the PDF file to process, looked up in the ressources/ folder.
pdf_filename = "pv_1995-09-29.Zeendoc.pdf"  # Change this to the name of your PDF file
ressources_dir = Path("ressources")
pdf_path = ressources_dir / pdf_filename

# Check that the file exists; if it does not, list the files that are
# available so the name above can be corrected.
if pdf_path.exists():
    print(f"✅ Fichier trouvé: {pdf_path}")
else:
    print(f"❌ Fichier non trouvé: {pdf_path}")
    print("\nFichiers disponibles dans ressources/:")
    # is_dir() rather than exists(): iterdir() fails on a non-directory.
    if ressources_dir.is_dir():
        # Sorted so the listing is the same on every run.
        for file in sorted(ressources_dir.iterdir()):
            if file.is_file():
                print(f"  - {file.name}")

## Extraction du PDF vers Markdown

In [None]:
# Run the extraction and print either the error or the summary statistics.
result = pdf_text_parser(str(pdf_path))

if result["error"]:
    print(f"\n❌ Erreur: {result['error']}")
else:
    print("\n✅ Extraction réussie!")
    print("\nStatistiques:")
    stats = result["parsing_stats"]
    print(f"  - Pages: {stats['pages_parsed']}")
    print(f"  - Caractères: {stats['total_chars']}")
    print(f"  - Mots: {stats['total_words']}")
    print(f"  - Qualité: {stats['average_quality']:.0%}")

## Affichage du contenu Markdown

In [None]:
# Print the extracted Markdown framed by a banner, or report that nothing
# was extracted.
if not result["parsed_markdown"]:
    print("Aucun contenu extrait.")
else:
    # One print call; sep="\n" yields the same output as four separate calls.
    print(
        "\n" + "=" * 50,
        "CONTENU MARKDOWN EXTRAIT:",
        "=" * 50 + "\n",
        result["parsed_markdown"],
        sep="\n",
    )

## Sauvegarde du Markdown dans un fichier

In [None]:
# Save the result as a .md file in the ressources/ folder.
if result["parsed_markdown"]:
    # Swap only the final extension. A plain str.replace(".pdf", ".md") is
    # case-sensitive and touches every occurrence, so a name such as "x.PDF"
    # would stay unchanged and the Markdown would overwrite the source PDF.
    output_filename = str(Path(pdf_filename).with_suffix(".md"))
    output_path = Path("ressources") / output_filename

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(result["parsed_markdown"])

    print(f"\n💾 Fichier Markdown sauvegardé: {output_path}")
else:
    print("\n⚠️ Aucun contenu à sauvegarder.")

## Détails par page (optionnel)

In [None]:
# Print the statistics recorded for each parsed page.
if result["parsed_pages"]:
    print("\n" + "="*50)
    print("DÉTAILS PAR PAGE:")
    print("="*50)

    for page in result["parsed_pages"]:
        print(f"\nPage {page.page_number}:")
        print(f"  - Caractères: {page.char_count}")
        print(f"  - Mots: {page.word_count}")
        print(f"  - Tableaux détectés: {'Oui' if page.has_tables else 'Non'}")
        print(f"  - Qualité: {page.extraction_quality:.0%}")