In [1]:
pip install PyPDF2

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import glob
# Added imports for PDF text extraction
try:
    import PyPDF2
    PDF_LIBRARY = "PyPDF2"
except ImportError:
    try:
        import fitz  # PyMuPDF
        PDF_LIBRARY = "PyMuPDF"
    except ImportError:
        PDF_LIBRARY = None
        print("⚠️  No PDF library found. Please install: pip install PyPDF2 or pip install PyMuPDF")

def extract_pdf_text(pdf_path):
    """
    Extract text content from PDF file
    """
    try:
        if PDF_LIBRARY == "PyPDF2":
            return extract_with_pypdf2(pdf_path)
        elif PDF_LIBRARY == "PyMuPDF":
            return extract_with_pymupdf(pdf_path)
        else:
            return "❌ No PDF library available for text extraction"
    except Exception as e:
        return f"❌ Error extracting text: {str(e)}"

def extract_with_pypdf2(pdf_path):
    """Extract text using PyPDF2"""
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                if page_text.strip():
                    text += f"--- Page {page_num + 1} ---\n"
                    text += page_text + "\n\n"
    except Exception as e:
        text = f"❌ PyPDF2 extraction failed: {str(e)}"
    return text

def extract_with_pymupdf(pdf_path):
    """Extract text using PyMuPDF"""
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_text = page.get_text()
            if page_text.strip():
                text += f"--- Page {page_num + 1} ---\n"
                text += page_text + "\n\n"
        doc.close()
    except Exception as e:
        text = f"❌ PyMuPDF extraction failed: {str(e)}"
    return text

def create_output_folder(base_folder):
    """
    Create a new folder for TXT files
    """
    output_folder = os.path.join(base_folder, "marx_chapters_txt_extracted")
    
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"📁 Created output folder: {output_folder}")
    else:
        print(f"📁 Using existing output folder: {output_folder}")
    
    return output_folder

def create_txt_files(base_folder):
    """
    Create TXT files for all PDF files in marx_chapters folders
    """
    
    # Create output folder for TXT files
    output_folder = create_output_folder(base_folder)
    
    # Find all marx_chapters folders
    chapter_folders = glob.glob(os.path.join(base_folder, "marx_chapters_v*"))
    
    if not chapter_folders:
        print("❌ No marx_chapters_v* folders found")
        return
    
    total_created = 0
    
    print(f"Found {len(chapter_folders)} chapter folders")
    print("-" * 50)
    
    for folder in sorted(chapter_folders):
        folder_name = os.path.basename(folder)
        print(f"📁 Processing: {folder_name}")
        
        # Create subfolder in output directory
        output_subfolder = os.path.join(output_folder, folder_name)
        if not os.path.exists(output_subfolder):
            os.makedirs(output_subfolder)
        
        # Find all PDF files in this folder
        pdf_files = glob.glob(os.path.join(folder, "*.pdf"))
        
        if not pdf_files:
            print(f"   ⚠️  No PDF files found in {folder_name}")
            continue
        
        folder_count = 0
        
        for pdf_file in pdf_files:
            # Get filename without extension
            pdf_name = os.path.splitext(os.path.basename(pdf_file))[0]
            txt_file_path = os.path.join(output_subfolder, f"{pdf_name}.txt")
            
            # Skip if TXT file already exists
            if os.path.exists(txt_file_path):
                print(f"   ⏭️  Skipped: {pdf_name}.txt (already exists)")
                continue
            
            try:
                print(f"   🔄 Extracting: {pdf_name}...")
                
                # Extract PDF content
                pdf_content = extract_pdf_text(pdf_file)
                
                # Create TXT file with extracted content
                with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write(f"Title: {pdf_name}\n")
                    txt_file.write(f"Source: {os.path.basename(pdf_file)}\n")
                    txt_file.write(f"Folder: {folder_name}\n")
                    txt_file.write(f"Extraction Library: {PDF_LIBRARY}\n")
                    txt_file.write("\n" + "=" * 50 + "\n\n")
                    txt_file.write("Content:\n\n")
                    txt_file.write(pdf_content)
                
                print(f"   ✅ Created: {pdf_name}.txt")
                folder_count += 1
                total_created += 1
                
            except Exception as e:
                print(f"   ❌ Failed: {pdf_name}.txt - {e}")
        
        print(f"   📊 {folder_name}: {folder_count} files processed")
    
    print("-" * 50)
    print(f"🎉 Complete! Created {total_created} TXT files")
    print(f"📂 Output location: {output_folder}")

def check_dependencies():
    """
    Check if PDF extraction libraries are available
    """
    print("Checking PDF extraction libraries...")
    
    if PDF_LIBRARY:
        print(f"✅ Using {PDF_LIBRARY} for PDF text extraction")
        return True
    else:
        print("❌ No PDF extraction library found!")
        print("Please install one of the following:")
        print("   pip install PyPDF2")
        print("   pip install PyMuPDF")
        return False

def main():
    print("PDF to TXT Converter for Marx Chapters")
    print("=" * 50)
    
    # Check dependencies first
    if not check_dependencies():
        return
    
    # Default to Downloads folder
    downloads_folder = os.path.expanduser("~/Downloads")
    
    folder_path = input(f"Enter folder path (Enter for default): {downloads_folder}\nPath: ").strip()
    
    if not folder_path:
        folder_path = downloads_folder
    
    if not os.path.exists(folder_path):
        print(f"❌ Path does not exist: {folder_path}")
        return
    
    print(f"📂 Using path: {folder_path}")
    
    create_txt_files(folder_path)

if __name__ == "__main__":
    main()

PDF to TXT Converter for Marx Chapters
Checking PDF extraction libraries...
✅ Using PyPDF2 for PDF text extraction


Enter folder path (Enter for default): /Users/sienn/Downloads
Path:  /Users/sienn/Downloads


📂 Using path: /Users/sienn/Downloads
📁 Created output folder: /Users/sienn/Downloads/marx_chapters_txt_extracted
Found 42 chapter folders
--------------------------------------------------
📁 Processing: marx_chapters_v1
   🔄 Extracting: v1_Poems_to_Jenny_06_Transformation...
   ✅ Created: v1_Poems_to_Jenny_06_Transformation.txt
   🔄 Extracting: v1_Letter_04_to_Arnold_Ruge_Mar5_1842...
   ✅ Created: v1_Letter_04_to_Arnold_Ruge_Mar5_1842.txt
   🔄 Extracting: v1_Justification_Correspondent_from_Mosel...
   ✅ Created: v1_Justification_Correspondent_from_Mosel.txt
   🔄 Extracting: v1_Comments_on_Latest_Prussian_Censorship_Instruction...
   ✅ Created: v1_Comments_on_Latest_Prussian_Censorship_Instruction.txt
   🔄 Extracting: v1_Democritean_Epicurean_Philosophy_08_Part2_Ch2_Qualities_of_Atom...
   ✅ Created: v1_Democritean_Epicurean_Philosophy_08_Part2_Ch2_Qualities_of_Atom.txt
   🔄 Extracting: v1_Democritean_Epicurean_Philosophy_14_Notes_Part_Two...
   ✅ Created: v1_Democritean_Epicurean_Phi