# Text Extraction Using Docling

In [2]:
from docling.document_converter import DocumentConverter

# Input PDF and the Markdown file that will receive its extracted text.
source = '/home/toni11/Documents/Programming_Projects/AI Projects/GPT2_GEAS/Datasets/Laws/Republic Act No. 9292.pdf'
output_path = 'RA9292.md'

# Convert the PDF with docling, then render the resulting document as Markdown.
converter = DocumentConverter()
result = converter.convert(source)
markdown_text = result.document.export_to_markdown()

# Write the Markdown text to disk as UTF-8.
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(markdown_text)

print(f"Markdown file saved to: {output_path}")

Markdown file saved to: RA9292.md


In [1]:
import os
from pathlib import Path
from docling.document_converter import DocumentConverter

def _write_text_atomically(target, text):
    """Write ``text`` to ``target`` as UTF-8 via a temporary sibling file.

    The content is first written to ``<name>.tmp`` and only renamed onto
    ``target`` once the write has finished, so ``target`` never exists in a
    half-written state.
    """
    staging = target.with_name(target.name + ".tmp")
    try:
        with open(staging, 'w', encoding='utf-8') as f:
            f.write(text)
        staging.replace(target)
    finally:
        # The staging file is still around only if the write or rename failed
        # (including an interrupt); remove it so no stray file is left behind.
        if staging.exists():
            staging.unlink()


def extract_pdfs_to_markdown(source_dir="Datasets", dest_dir="Datasets_md", converter=None):
    """
    Extract PDFs from the subdirectories of ``source_dir`` and save them as
    Markdown files in the corresponding subdirectories of ``dest_dir``.

    Only the immediate subdirectories of ``source_dir`` are scanned, and only
    files directly inside them whose extension is ``.pdf`` (any letter case)
    are converted. A PDF whose ``.md`` counterpart already exists is skipped,
    so the function can be re-run to resume an interrupted batch. A failure on
    one PDF is reported and does not stop the remaining conversions.

    Args:
        source_dir: Directory whose subdirectories hold the PDFs
            (str or path-like). Defaults to ``"Datasets"``.
        dest_dir: Directory that receives the mirrored subdirectories of
            ``.md`` files (str or path-like); created if missing.
            Defaults to ``"Datasets_md"``.
        converter: Object providing ``convert(path)`` whose result exposes
            ``.document.export_to_markdown()``. When ``None``, a new
            ``DocumentConverter()`` is created.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    source_dir = Path(source_dir)
    dest_dir = Path(dest_dir)

    # Validate the input location before paying for converter construction.
    if not source_dir.exists():
        print(f"Source directory '{source_dir}' does not exist.")
        return
    if not source_dir.is_dir():
        print(f"Source path '{source_dir}' is not a directory.")
        return

    if converter is None:
        converter = DocumentConverter()

    # parents=True so a nested destination such as "out/Datasets_md" works.
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Sorted traversal keeps the processing order (and the log) reproducible.
    for subdir_path in sorted(source_dir.iterdir()):
        if not subdir_path.is_dir():
            continue

        directory = subdir_path.name
        print(f"Processing directory: {directory}")

        # Mirror the subdirectory under the destination root.
        dest_subdir = dest_dir / directory
        dest_subdir.mkdir(exist_ok=True)

        # Compare the suffix case-insensitively so "report.PDF" is not missed
        # on case-sensitive filesystems.
        pdf_files = sorted(
            entry for entry in subdir_path.iterdir()
            if entry.is_file() and entry.suffix.lower() == ".pdf"
        )

        if not pdf_files:
            print(f"  No PDF files found in {directory}")
            continue

        for pdf_file in pdf_files:
            # Same base name, ".md" extension.
            md_filename = pdf_file.stem + ".md"
            md_filepath = dest_subdir / md_filename

            # An existing .md marks the PDF as already converted.
            if md_filepath.exists():
                print(f"  Skipping {pdf_file.name} - {md_filename} already exists")
                continue

            try:
                print(f"  Converting {pdf_file.name} to {md_filename}")
                result = converter.convert(str(pdf_file))
                markdown_content = result.document.export_to_markdown()

                # Atomic write: a failed or interrupted write must not leave a
                # partial .md that the skip check above would treat as done.
                _write_text_atomically(md_filepath, markdown_content)

                print(f"  ✓ Successfully converted {pdf_file.name}")
            except Exception as e:
                # Best-effort batch: report the failure and move on.
                print(f"  ✗ Error converting {pdf_file.name}: {str(e)}")
    
def main():
    """Announce the run, convert the PDFs, then print a completion message."""
    print("Starting PDF to Markdown extraction...")
    extract_pdfs_to_markdown()
    # Printed unconditionally; per-file failures are reported by the extractor itself.
    print("Extraction process completed!")


# Runs both as a script and as a notebook cell, since __name__ is "__main__" in each case.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Starting PDF to Markdown extraction...
Processing directory: Chemistry
  Converting General Chemistry_ The Essential Concepts_Glossary.pdf to General Chemistry_ The Essential Concepts_Glossary.md


Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


  ✓ Successfully converted General Chemistry_ The Essential Concepts_Glossary.pdf
  Converting General chemistry for engineers-Glossary.pdf to General chemistry for engineers-Glossary.md
  ✓ Successfully converted General chemistry for engineers-Glossary.pdf
Processing directory: Econ
  No PDF files found in Econ
Processing directory: EngMan
  No PDF files found in EngMan
Processing directory: EngMan_TQM
  Converting Six Sigma Definition - What is Lean Six Sigma_ASQ.pdf to Six Sigma Definition - What is Lean Six Sigma_ASQ.md
  ✓ Successfully converted Six Sigma Definition - What is Lean Six Sigma_ASQ.pdf
  Converting DMAIC - The 5 Phases of Lean Six Sigma .pdf to DMAIC - The 5 Phases of Lean Six Sigma .md
  ✓ Successfully converted DMAIC - The 5 Phases of Lean Six Sigma .pdf
  Converting ISO-9001-2015.pdf to ISO-9001-2015.md
  ✓ Successfully converted ISO-9001-2015.pdf
Processing directory: Laws
  Converting ECE_Code_of_Ethics.pdf to ECE_Code_of_Ethics.md
  ✓ Successfully converted ECE