In [7]:
# %pip install pymupdf4llm  # one-time setup: uncomment and run if the package is missing
import pymupdf4llm
import os
import json
from pathlib import Path
import time
from datetime import datetime

In [8]:
def convert_pdfs_to_markdown_advanced(input_folder, output_folder, create_metadata=True, chunk_size=None):
    """
    Convert every PDF in a folder to Markdown, with optional metadata and chunks.

    For each file directly inside ``input_folder`` whose name ends in ``.pdf``
    (case-insensitive), the Markdown produced by ``pymupdf4llm.to_markdown`` is
    written to ``<output_folder>/<name>.md``. Files are processed in
    case-insensitive name order so runs are reproducible across platforms.

    A failure on one file is printed and recorded, and processing continues
    with the next file. After the loop a ``conversion_log.json`` summarising
    all files is written to ``output_folder`` and a summary is printed.

    Args:
        input_folder (str): Path to folder containing PDF files
        output_folder (str): Path to folder where markdown files will be saved
            (created if missing)
        create_metadata (bool): Whether to write
            ``metadata/<name>_metadata.json`` for each converted document
        chunk_size (int): Optional chunk size in characters. When truthy, each
            document is also split with ``split_text_into_chunks`` and written
            to ``chunks/<name>_chunk_NNN.md``. ``None`` or ``0`` disables it.

    Returns:
        None. If the input folder contains no PDF files, a message is printed
        and the function returns before writing a conversion log.

    Raises:
        OSError: If ``input_folder`` cannot be listed (e.g. FileNotFoundError),
            or if the output folders or the conversion log cannot be written.
    """

    # List the input first so a missing/unreadable input folder fails before
    # any output folders are created on disk.
    pdf_files = sorted(
        (name for name in os.listdir(input_folder) if name.lower().endswith('.pdf')),
        key=lambda name: (name.casefold(), name),
    )

    # Create output folders
    os.makedirs(output_folder, exist_ok=True)
    metadata_folder = None
    if create_metadata:
        metadata_folder = os.path.join(output_folder, "metadata")
        os.makedirs(metadata_folder, exist_ok=True)

    if not pdf_files:
        print("No PDF files found in the input folder!")
        return

    # The chunks folder does not depend on the file being processed, so it is
    # created once here instead of once per PDF.
    chunks_folder = None
    if chunk_size:
        chunks_folder = os.path.join(output_folder, "chunks")
        os.makedirs(chunks_folder, exist_ok=True)

    total_files = len(pdf_files)

    print(f"Found {total_files} PDF files to convert:")
    for pdf_file in pdf_files:
        print(f"  - {pdf_file}")

    print("\nStarting conversion...")

    successful_conversions = 0
    failed_conversions = []
    conversion_log = []

    for i, pdf_file in enumerate(pdf_files, 1):
        pdf_path = os.path.join(input_folder, pdf_file)

        # Create filenames
        base_name = os.path.splitext(pdf_file)[0]
        md_filename = base_name + '.md'
        md_path = os.path.join(output_folder, md_filename)

        try:
            print(f"[{i}/{total_files}] Converting: {pdf_file}...")

            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments.
            start_time = time.perf_counter()

            # Convert PDF to markdown
            md_text = pymupdf4llm.to_markdown(pdf_path)

            # Save markdown file
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(md_text)

            # Covers conversion plus writing the main markdown file.
            conversion_time = time.perf_counter() - start_time

            # Create metadata if requested
            if create_metadata:
                metadata = {
                    "source_file": pdf_file,
                    "markdown_file": md_filename,
                    "conversion_date": datetime.now().isoformat(),
                    "conversion_time_seconds": round(conversion_time, 2),
                    "character_count": len(md_text),
                    "word_count": len(md_text.split()),
                    "line_count": len(md_text.split('\n')),
                    "file_size_bytes": os.path.getsize(pdf_path)
                }

                metadata_path = os.path.join(metadata_folder, base_name + '_metadata.json')
                with open(metadata_path, 'w', encoding='utf-8') as f:
                    json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Optional: Create chunked versions for RAG
            if chunk_size:
                # Split text into chunks
                chunks = split_text_into_chunks(md_text, chunk_size)

                for j, chunk in enumerate(chunks, 1):
                    chunk_filename = f"{base_name}_chunk_{j:03d}.md"
                    chunk_path = os.path.join(chunks_folder, chunk_filename)

                    # Each chunk file starts with a small header identifying
                    # its source document and its position in the sequence.
                    with open(chunk_path, 'w', encoding='utf-8') as f:
                        f.write(f"# {base_name} - Chunk {j}\n\n")
                        f.write(f"Source: {pdf_file}\n")
                        f.write(f"Chunk: {j}/{len(chunks)}\n\n")
                        f.write("---\n\n")
                        f.write(chunk)

            print(f"  ✓ Successfully converted to: {md_filename}")
            successful_conversions += 1

            # Log conversion details
            conversion_log.append({
                "file": pdf_file,
                "status": "success",
                "conversion_time": conversion_time,
                "character_count": len(md_text)
            })

            # NOTE(review): short pause between files kept from the original
            # implementation; its purpose is not evident here -- confirm
            # whether it is still needed.
            time.sleep(0.1)

        except Exception as e:
            # Best-effort batch: record the failure and continue with the
            # remaining files instead of aborting the whole run.
            print(f"  ✗ Error converting {pdf_file}: {e}")
            failed_conversions.append(pdf_file)

            conversion_log.append({
                "file": pdf_file,
                "status": "failed",
                "error": str(e)
            })

    # Save conversion log
    log_path = os.path.join(output_folder, "conversion_log.json")
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump({
            "conversion_date": datetime.now().isoformat(),
            "total_files": total_files,
            "successful": successful_conversions,
            "failed": len(failed_conversions),
            "details": conversion_log
        }, f, indent=2, ensure_ascii=False)

    # Summary
    separator = '=' * 50
    print(f"\n{separator}")
    print("CONVERSION SUMMARY:")
    print(separator)
    print(f"Total files processed: {total_files}")
    print(f"Successful conversions: {successful_conversions}")
    print(f"Failed conversions: {len(failed_conversions)}")

    if failed_conversions:
        print("\nFailed files:")
        for failed_file in failed_conversions:
            print(f"  - {failed_file}")

    print(f"\nMarkdown files saved to: {output_folder}")
    print(f"Conversion log saved to: {log_path}")

def split_text_into_chunks(text, chunk_size=1000):
    """Split text into chunks of at most chunk_size characters, on word boundaries.

    The text is split on whitespace and the words are re-joined with single
    spaces, so original line breaks and repeated spaces are not preserved in
    the chunks.

    Args:
        text (str): Text to split.
        chunk_size (int): Maximum length of each chunk in characters, counting
            the single spaces between words.

    Returns:
        list[str]: Chunks in reading order. Empty or whitespace-only text gives
        an empty list. A single word longer than ``chunk_size`` is never cut;
        it becomes a chunk of its own that exceeds the limit.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # exact length of ' '.join(current_chunk)

    for word in text.split():
        # A separating space is only needed when the chunk already holds a
        # word. Charging it for the first word too would make the first chunk
        # one character tighter than every later chunk.
        added_length = len(word) + (1 if current_chunk else 0)

        if current_chunk and current_length + added_length > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += added_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [13]:
# Configuration: folders used by the converter.
# These are absolute, machine-specific paths -- update them to your actual
# folder locations before running on another machine.
input_folder, output_folder = (
    r'h:\Projects\UVB\PREP MASTER\simple_rag_scratch\pdfs',    # source PDFs are read from here
    r'h:\Projects\UVB\PREP MASTER\simple_rag_scratch\output',  # generated files are written here
)

In [14]:
# Convert every PDF in input_folder, writing per-document metadata and
# 2000-character chunk files (for RAG) alongside the markdown output.
convert_pdfs_to_markdown_advanced(
    input_folder,
    output_folder,
    create_metadata=True,
    chunk_size=2000,
)

Found 328 PDF files to convert:
  - Accord_de_don_et_de_financement_additionnel_du_projet_filet_sociaux_entre_le_Burkina_Faso_et_l_association_international_de_développement.pdf
  - Accord_de_financement_du_projet_d_urgence_de_développement_territoriale_et_de_résidences_de_financement_additionnel.pdf
  - Accord_de_lassemblée_législative_de_la_transition_pour_lorganisation_des_assises_nationales.pdf
  - Accord_de_prêt_de_financement_partiel_du_projet_d_électrification_de_la_connexion_électricité_PEDECEL.pdf
  - approbation_des_statuts_de_la_Société_Burkinabé_de_Télédiffusion.pdf
  - Attributions_composition_organisation_et_fonctionnement_du_Conseil_supérieur_de_la_communication.pdf
  - attributions_Organisation_et_fonctionnement_du_Médiateur_du_Faso.pdf
  - Attribution_CompositionOrganisation_et_Fonctionnement_de_la_commission_de_la_réconciliation_nationale_et_des_réformes.pdf
  - Attribution_composition_organisation_et_fonctionnement_du_conseil_dorientation_et_de_suivit_de_la_transitio