In [5]:
from langchain_community.document_loaders import PDFPlumberLoader

loader = PDFPlumberLoader("books/Abnormal Psychology ( PDFDrive ).pdf")

In [2]:
#loader.load()

In [6]:
number_of_pages=len(loader.load())

# new approach

In [3]:
import pdfplumber
import matplotlib.pyplot as plt
import numpy as np
import cv2
from IPython.display import display, Markdown
import os

class RobustPDFExtractor:
    def __init__(self, file_path):
        self.file_path = file_path
        self.pdf = pdfplumber.open(file_path)
        self.total_pages = len(self.pdf.pages)

    def __del__(self):
        if self.pdf:
            self.pdf.close()

    def _get_safe_bbox(self, page):
        """Ensure bounding box stays within valid page boundaries"""
        return (
            max(page.cropbox[0], 0),
            max(page.cropbox[1], 0),
            min(page.cropbox[2], page.width),
            min(page.cropbox[3], page.height)
        )

    def _detect_content_type(self, page):
        """Identify page content type using multiple criteria"""
        try:
            # Detect tables
            if len(page.find_tables()) > 0:
                return "table-page"
            
            # Detect diagrams/images
            if len(page.images) > 1:
                return "diagram-page"
            
            # Detect low-text pages
            if len(page.extract_text()) < 100:
                return "low-text-page"
            
            return "text-page"
        except:
            return "unknown"

    def _detect_columns(self, page):
        """Multi-layered column detection with table awareness"""
        try:
            # Visual detection with OpenCV
            img = page.to_image(resolution=200).original.convert('L')
            edges = cv2.Canny(np.array(img), 50, 150, apertureSize=3)
            
            # Enhanced line detection parameters
            lines = cv2.HoughLinesP(
                edges, 1, np.pi/720, 100,
                minLineLength=int(page.height*0.7),
                maxLineGap=2
            )
            
            if lines is not None:
                verticals = []
                for line in lines:
                    x1, y1, x2, y2 = line[0]
                    # Filter lines in central area and near vertical
                    if abs(x1 - x2) < 5 and (0.35*page.width < x1 < 0.65*page.width):
                        verticals.append(x1)
                
                if verticals:
                    # Use mode to handle multiple similar lines
                    vertical_mode = np.median(verticals)
                    return vertical_mode

            # Text-based gap detection fallback
            words = page.extract_words(x_tolerance=2, y_tolerance=1)
            if len(words) > 10:
                x_coords = sorted([w['x0'] for w in words] + [w['x1'] for w in words])
                gaps = [(x_coords[i] - x_coords[i-1], x_coords[i-1]) 
                       for i in range(1, len(x_coords))]
                
                # Dynamic gap threshold (15% of page width)
                valid_gaps = [g for g in gaps if g[0] > page.width*0.15]
                if valid_gaps:
                    best_gap = max(valid_gaps, key=lambda x: x[0])
                    return best_gap[1] + best_gap[0]/2

            return None
        except Exception as e:
            print(f"Column detection error: {str(e)}")
            return None

    def _extract_columns(self, page, split_x):
        """Safe column extraction with overlap protection"""
        try:
            left_bbox = (
                max(0, split_x - 20),
                page.bbox[1],
                min(page.width, split_x - 5),
                page.bbox[3]
            )
            right_bbox = (
                max(0, split_x + 5),
                page.bbox[1],
                min(page.width, split_x + 20),
                page.bbox[3]
            )
            
            left = page.crop(left_bbox).extract_text(
                layout=True,
                x_tolerance=1.5,
                y_tolerance=1,
                use_text_flow=True
            )
            right = page.crop(right_bbox).extract_text(
                layout=True,
                x_tolerance=1.5,
                y_tolerance=1,
                use_text_flow=True
            )
            
            return f"{left}\n\n{right}", 2
        except Exception as e:
            return f"Column extraction error: {str(e)}", 0

    def _process_table_page(self, page):
        """Handle pages with tables and mixed content"""
        try:
            # Extract tables
            tables = page.find_tables()
            table_content = "\n\n".join([
                "\n".join([" | ".join(row) for row in table.extract()]) 
                for table in tables
            ])
            
            # Extract non-table text
            non_table_area = page.filter(lambda obj: obj["object_type"] != "table")
            text_content = non_table_area.extract_text(
                layout=True,
                x_tolerance=2
            )
            
            # Detect columns in remaining text
            split_x = self._detect_columns(non_table_area)
            if split_x:
                columns = self._extract_columns(non_table_area, split_x)[0]
                return f"Tables:\n{table_content}\n\nText Columns:\n{columns}", 2
            
            return f"Tables:\n{table_content}\n\nText:\n{text_content}", 2
        except Exception as e:
            return f"Table processing error: {str(e)}", 0

    def process_page(self, page_num):
        """Main processing with comprehensive error handling"""
        try:
            page = self.pdf.pages[page_num]
            content_type = self._detect_content_type(page)
            
            if content_type == "table-page":
                return self._process_table_page(page)
            
            if content_type in ["diagram-page", "low-text-page"]:
                return "[NON-TEXT CONTENT]", 0
            
            split_x = self._detect_columns(page)
            if split_x and (0.35*page.width < split_x < 0.65*page.width):
                return self._extract_columns(page, split_x)
                
            # Fallback extraction
            return page.crop(self._get_safe_bbox(page)).extract_text(
                layout=True,
                x_tolerance=2,
                use_text_flow=True
            ), 1
            
        except Exception as e:
            return f"Page processing error: {str(e)}", 0

def visualize_extraction(extractor, page_num):
    """Enhanced visualization with content analysis"""
    try:
        page = extractor.pdf.pages[page_num-1]
        im = page.to_image(resolution=150)
        
        # Detect and draw columns
        split_x = extractor._detect_columns(page)
        if split_x:
            im.draw_line([(split_x, 0), (split_x, page.height)], 
                        stroke="red", stroke_width=3)
        
        # Draw content type overlay
        content_type = extractor._detect_content_type(page)
        im.draw_text((50, 50), f"Content Type: {content_type}", fill="blue")
        
        plt.figure(figsize=(15, 20))
        plt.imshow(im.annotated)
        plt.axis('off')
        plt.show()
        
    except Exception as e:
        print(f"Visualization error: {str(e)}")

def extract_pdf(input_path, output_path, max_pages=None):
    extractor = RobustPDFExtractor(input_path)
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            total = max_pages or extractor.total_pages
            for i in range(total):
                text, cols = extractor.process_page(i)
                f.write(f"=== Page {i+1} ({cols} cols) ===\n{text}\n\n")
                print(f"Processed {i+1}/{total}", end='\r')
        print(f"\nSaved to {os.path.abspath(output_path)}")
    finally:
        del extractor

# Usage Example
if __name__ == "__main__":
    PDF_PATH = "books/Abnormal Psychology ( PDFDrive ).pdf"
    OUTPUT_PATH = "enhanced_output.txt"
    
    extractor = RobustPDFExtractor(PDF_PATH)
    
    # Visual analysis for problematic pages
    for page in [24, 29, 30]:
        print(f"Visual analysis for page {page}:")
        visualize_extraction(extractor, page)
    
    # Full extraction
    print("\nStarting comprehensive extraction...")
    extract_pdf(PDF_PATH, OUTPUT_PATH, max_pages=30)
    
    # Display results
    print("\nExtraction preview:")
    with open(OUTPUT_PATH, 'r', encoding='utf-8') as f:
        display(Markdown(f"```\n{f.read(2000)}\n...```"))

Visual analysis for page 24:
Visualization error: 'PageImage' object has no attribute 'draw_text'
Visual analysis for page 29:
Visualization error: 'PageImage' object has no attribute 'draw_text'
Visual analysis for page 30:
Visualization error: 'PageImage' object has no attribute 'draw_text'

Starting comprehensive extraction...
Processed 30/30
Saved to /Users/mac/Documents/mental_engine_chatbot/therapist_creation/enhanced_output.txt

Extraction preview:


```
=== Page 1 (0 cols) ===
[NON-TEXT CONTENT]

=== Page 2 (0 cols) ===
[NON-TEXT CONTENT]

=== Page 3 (2 cols) ===
Tables:
 | www.CengageBrain.com

Text:
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                                         
            BBuuyy tthhee wwaayy  yyoouu wwaanntt aanndd  ssaavvee..                     
                                                                                         
                                                                                         
                                                                                         
                      GGeett tthhee bbeesstt ggrraaddee iinn tthhee sshhoorrtteesstt ttiimmee ppoossssiibbllee!!
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                          
...```

In [None]:
# extracts everything perfectly but does not extract double column pages properly

import pdfplumber
import numpy as np
import cv2
from IPython.display import display, Markdown
import os

class AdvancedPDFExtractor:
    def __init__(self, file_path):
        self.file_path = file_path
        self.pdf = pdfplumber.open(file_path)
        self.total_pages = len(self.pdf.pages)

    def __del__(self):
        self.pdf.close()

    def _detect_content_type(self, page):
        """Enhanced content type detection with horizontal line handling"""
        try:
            # Check for horizontal lines characteristic of TOC
            horizontal_lines = self._detect_horizontal_lines(page)
            if len(horizontal_lines) > 3:  # Typical TOC has multiple section dividers
                return "toc-page"

            # Existing detection logic
            if len(page.find_tables()) > 0:
                return "table-page"
            if len(page.images) > 2:  # Ignore single-line images
                return "diagram-page"
            if len(page.extract_text()) < 300:  # Increased text threshold
                return "low-text-page"
            
            return "text-page"
        except Exception as e:
            print(f"Content detection error: {str(e)}")
            return "unknown"

    def _detect_horizontal_lines(self, page):
        """Identify horizontal divider lines"""
        try:
            img = page.to_image(resolution=150).original.convert('L')
            edges = cv2.Canny(np.array(img), 50, 150)
            lines = cv2.HoughLinesP(edges, 1, np.pi/2, threshold=50,
                                  minLineLength=page.width*0.5,
                                  maxLineGap=2)
            return lines if lines is not None else []
        except Exception as e:
            print(f"Line detection error: {str(e)}")
            return []

    def _process_toc_page(self, page):
        """Handle table of contents with horizontal dividers"""
        try:
            # Extract text ignoring horizontal lines
            text = page.extract_text(
                layout=True,
                x_tolerance=2,
                y_tolerance=1,
                use_text_flow=True
            )
            return f"[TOC PAGE]\n{text}", 1
        except Exception as e:
            return f"TOC processing error: {str(e)}", 0

    def _process_mixed_page(self, page):
        """Handle pages with tables followed by columns"""
        try:
            # Extract tables
            tables = page.find_tables()
            table_text = "\n\n".join([str(table.extract()) for table in tables])
            
            # Process remaining area for columns
            non_table_area = page.filter(lambda obj: obj["object_type"] != "table")
            split_x = self._detect_columns(non_table_area)
            
            if split_x:
                columns = self._extract_columns(non_table_area, split_x)[0]
                return f"Tables:\n{table_text}\n\nColumns:\n{columns}", 2
                
            # Fallback to text extraction
            text = non_table_area.extract_text(layout=True)
            return f"Tables:\n{table_text}\n\nText:\n{text}", 2
        except Exception as e:
            return f"Mixed page error: {str(e)}", 0

    def _detect_columns(self, page):
        """Improved column detection ignoring horizontal lines"""
        try:
            # Visual detection with line filtering
            img = page.to_image(resolution=200).original.convert('L')
            edges = cv2.Canny(np.array(img), 50, 150)
            
            # Detect vertical lines only
            lines = cv2.HoughLinesP(edges, 1, np.pi/720, 100,
                                   minLineLength=page.height*0.7,
                                   maxLineGap=2)
            verticals = []
            if lines is not None:
                for line in lines:
                    x1, y1, x2, y2 = line[0]
                    if abs(x1 - x2) < 5 and (0.3*page.width < x1 < 0.7*page.width):
                        verticals.append(x1)
            
            if verticals:
                return np.median(verticals)

            # Text-based detection
            words = page.extract_words(x_tolerance=2)
            if len(words) > 15:
                x_coords = sorted([w['x0'] for w in words] + [w['x1'] for w in words])
                gaps = [(x_coords[i] - x_coords[i-1], x_coords[i-1]) 
                       for i in range(1, len(x_coords))]
                valid_gaps = [g for g in gaps if g[0] > page.width*0.2]
                if valid_gaps:
                    return max(valid_gaps, key=lambda x: x[0])[1] + (max(valid_gaps)[0]/2)
            
            return None
        except Exception as e:
            print(f"Column detection error: {str(e)}")
            return None

    def process_page(self, page_num):
        """Final processing pipeline"""
        try:
            page = self.pdf.pages[page_num]
            content_type = self._detect_content_type(page)
            
            if content_type == "toc-page":
                return self._process_toc_page(page)
            if content_type == "table-page":
                return self._process_mixed_page(page)
            if content_type == "diagram-page":
                return "[DIAGRAM PAGE]", 0
            if content_type == "low-text-page":
                return self._process_toc_page(page)  # Try TOC processing
            
            split_x = self._detect_columns(page)
            if split_x:
                return self._extract_columns(page, split_x)
                
            return page.extract_text(layout=True, x_tolerance=2), 1
            
        except Exception as e:
            return f"Processing error: {str(e)}", 0

def visualize_extraction(extractor, page_num):
    """Enhanced visualization with content analysis"""
    try:
        page = extractor.pdf.pages[page_num-1]
        im = page.to_image(resolution=150)
        
        # Detect and draw columns
        split_x = extractor._detect_columns(page)
        if split_x:
            im.draw_line([(split_x, 0), (split_x, page.height)], 
                        stroke="red", stroke_width=3)
        
        # Draw content type overlay
        content_type = extractor._detect_content_type(page)
        im.draw_text((50, 50), f"Content Type: {content_type}", fill="blue")
        
        plt.figure(figsize=(15, 20))
        plt.imshow(im.annotated)
        plt.axis('off')
        plt.show()
        
    except Exception as e:
        print(f"Visualization error: {str(e)}")

def extract_pdf(input_path, output_path, max_pages=None):
    extractor = AdvancedPDFExtractor(input_path)
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            total = max_pages or extractor.total_pages
            for i in range(total):
                text, cols = extractor.process_page(i)
                f.write(f"=== Page {i+1} ({cols} cols) ===\n{text}\n\n")
                print(f"Processed {i+1}/{total}", end='\r')
        print(f"\nSaved to {os.path.abspath(output_path)}")
    finally:
        del extractor
# Usage Example
if __name__ == "__main__":
    PDF_PATH = "books/Abnormal Psychology ( PDFDrive ).pdf"
    OUTPUT_PATH = "final_output.txt"
    
    extractor = AdvancedPDFExtractor(PDF_PATH)
    
    # Process problematic pages
    for page_num in [12, 13, 14, 15, 16, 17, 18, 19, 20, 22]:
        text, cols = extractor.process_page(page_num-1)
        print(f"=== Page {page_num} ({cols} cols) ===")
        print(text[:500])  # Show first 500 characters
    
    # Full extraction
    extract_pdf(PDF_PATH, OUTPUT_PATH, max_pages=30)

=== Page 12 (1 cols) ===
[TOC PAGE]
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                       
=== Page 13 (1 cols) ===
[TOC PAGE]
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                              

In [1]:
import pdfplumber
import numpy as np
import cv2
import re
from IPython.display import display, Markdown
import os

class UniversalPDFExtractor:
    def __init__(self, file_path):
        self.file_path = file_path
        self.pdf = pdfplumber.open(file_path)
        self.total_pages = len(self.pdf.pages)

    def __del__(self):
        self.pdf.close()

    def _detect_content_type(self, page):
        """Precise content classification with priority handling"""
        try:
            # First check for tables
            if len(page.find_tables()) > 0:
                return "table-page"
            
            # Check for TOC patterns (horizontal lines + chapter numbers)
            if self._is_toc_page(page):
                return "toc-page"
            
            # Check for diagrams/images
            if len(page.images) > 2 and len(page.extract_text()) < 100:
                return "diagram-page"
            
            # Finally check column layout
            if self._has_column_layout(page):
                return "multi-column"
            
            return "single-column"
        
        except Exception as e:
            print(f"Content detection error: {str(e)}")
            return "unknown"

    def _is_toc_page(self, page):
        """Identify TOC pages using line patterns and text structure"""
        try:
            # Check for multiple horizontal lines
            h_lines = self._detect_horizontal_lines(page)
            if len(h_lines) < 3:
                return False
                
            # Check for chapter/page number patterns
            text = page.extract_text()
            toc_patterns = [
                r"chapter\s+\d+", 
                r"\d+\s+\.\.\.+\s+\d{1,3}",
                r"contents",
                r"section\s+[IVXL]+"
            ]
            return any(re.search(p, text, re.I) for p in toc_patterns)
            
        except:
            return False

    def _has_column_layout(self, page):
        """Column detection with multiple verification methods"""
        try:
            # Visual verification
            if self._detect_vertical_lines(page):
                return True
                
            # Text gap verification
            words = page.extract_words()
            if len(words) < 20:
                return False
                
            x_coords = [w['x0'] for w in words] + [w['x1'] for w in words]
            hist = np.histogram(x_coords, bins=10)
            peaks = np.where(hist[0] > len(words)/4)[0]
            return len(peaks) >= 2
        except:
            return False

    def _detect_vertical_lines(self, page):
        """Robust vertical line detection"""
        try:
            img = page.to_image(resolution=200).original.convert('L')
            edges = cv2.Canny(np.array(img), 50, 150)
            lines = cv2.HoughLinesP(edges, 1, np.pi/720, 100,
                                   minLineLength=page.height*0.7,
                                   maxLineGap=2)
            if not lines:
                return False
                
            verticals = [l for l in lines if abs(l[0][0] - l[0][2]) < 5]
            return len(verticals) >= 1
        except:
            return False

    def _detect_horizontal_lines(self, page):
        """Horizontal line detection for TOC"""
        try:
            img = page.to_image(resolution=150).original.convert('L')
            edges = cv2.Canny(np.array(img), 50, 150)
            return cv2.HoughLinesP(edges, 1, np.pi/2, 50,
                                  minLineLength=page.width*0.5,
                                  maxLineGap=2)
        except:
            return []

    def _process_toc_page(self, page):
        """TOC processing with line awareness"""
        try:
            # Extract text around lines
            return page.extract_text(
                layout=True, 
                x_tolerance=1,
                y_tolerance=1,
                use_text_flow=True
            ), 1
        except:
            return "TOC CONTENT", 1

    def _process_columns(self, page):
        """Precision column extraction"""
        try:
            # Find exact split point
            words = page.extract_words()
            x_coords = sorted([w['x0'] for w in words] + [w['x1'] for w in words])
            gaps = [(x_coords[i]-x_coords[i-1], x_coords[i-1]) 
                   for i in range(1, len(x_coords))]
            best_gap = max([g for g in gaps if g[0] > page.width*0.15], default=None)
            
            if best_gap:
                split_x = best_gap[1] + best_gap[0]/2
                left = page.crop((0, 0, split_x-10, page.height)).extract_text(layout=True)
                right = page.crop((split_x+10, 0, page.width, page.height)).extract_text(layout=True)
                return f"{left}\n\n{right}", 2
            return page.extract_text(layout=True), 1
        except:
            return page.extract_text(), 1

    def _process_table_page(self, page):
        """Table processing with column fallback"""
        try:
            # Extract tables
            tables = page.find_tables()
            table_content = "\n\n".join([str(t.extract()) for t in tables])
            
            # Process remaining content
            non_table = page.filter(lambda obj: obj["object_type"] != "table")
            if self._has_column_layout(non_table):
                text_content, _ = self._process_columns(non_table)
            else:
                text_content = non_table.extract_text(layout=True)
            
            return f"TABLES:\n{table_content}\n\nCONTENT:\n{text_content}", 2
        except:
            return page.extract_text(), 1

    def process_page(self, page_num):
        """Main processing router"""
        try:
            page = self.pdf.pages[page_num]
            content_type = self._detect_content_type(page)
            
            handlers = {
                "toc-page": self._process_toc_page,
                "table-page": self._process_table_page,
                "multi-column": self._process_columns,
                "diagram-page": lambda p: ("[DIAGRAM CONTENT]", 0),
                "single-column": lambda p: (p.extract_text(layout=True), 1)
            }
            
            return handlers.get(content_type, lambda p: (p.extract_text(), 1))(page)
            
        except Exception as e:
            return f"ERROR: {str(e)}", 0

def visualize_layout(extractor, page_num):
    """Enhanced visual debugging"""
    try:
        page = extractor.pdf.pages[page_num-1]
        im = page.to_image(resolution=150)
        
        # Show content type
        content_type = extractor._detect_content_type(page)
        im.draw_text((20, 20), f"Type: {content_type}", fill="red")
        
        # Draw detected elements
        if content_type == "multi-column":
            split_x = extractor._process_columns(page)[1]
            im.draw_line((split_x, 0), (split_x, page.height), stroke="blue")
        
        plt.figure(figsize=(15, 20))
        plt.imshow(im.annotated)
        plt.axis('off')
        plt.show()
    except Exception as e:
        print(f"Visualization error: {str(e)}")

def extract_pdf(input_path, output_path, max_pages=None):
    extractor = UniversalPDFExtractor(input_path)
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            total = max_pages or extractor.total_pages
            for i in range(total):
                text, cols = extractor.process_page(i)
                f.write(f"=== Page {i+1} ({cols} cols) ===\n{text}\n\n")
        print(f"Extraction complete: {output_path}")
    finally:
        del extractor

# Usage
if __name__ == "__main__":
    PDF_PATH = "books/Abnormal Psychology ( PDFDrive ).pdf"
    OUTPUT_PATH = "final_extraction.txt"
    
    extractor = UniversalPDFExtractor(PDF_PATH)
    extract_pdf(PDF_PATH, OUTPUT_PATH)
    
    # Verify problem pages
    for p in [12, 13, 22, 24, 29, 30]:
        print(f"\nPage {p} preview:")
        text, _ = extractor.process_page(p-1)
        print(text[:500])

Extraction complete: final_extraction.txt

Page 12 preview:
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                  

Page 13 preview:
                                                                                         
                                                                                         
                                                                                         
                                                                                         
                                                             

In [10]:
import pdfplumber
import numpy as np
import cv2
import re
from scipy import stats

class UniversalPDFExtractor:
    def __init__(self, file_path):
        self.file_path = file_path
        self.pdf = pdfplumber.open(file_path)  # This might raise exceptions
        self.total_pages = len(self.pdf.pages)

    def __del__(self):
        # Safe destruction with existence check
        if hasattr(self, 'pdf') and self.pdf:
            self.pdf.close()

    def _get_page_stats(self, page):
        """Calculate key page statistics for dynamic processing"""
        words = page.extract_words()
        return {
            'word_count': len(words),
            'x_coords': [w['x0'] for w in words] + [w['x1'] for w in words],
            'width': page.width,
            'height': page.height
        }

    def _detect_column_split(self, stats):
        """Multi-method column detection with dynamic thresholds"""
        # Method 1: Text gap analysis
        sorted_x = sorted(stats['x_coords'])
        gaps = [(sorted_x[i] - sorted_x[i-1], sorted_x[i-1]) 
               for i in range(1, len(sorted_x))]
        max_gap = max(gaps, key=lambda x: x[0]) if gaps else (0, 0)
        
        # Dynamic gap threshold (20% of page width)
        if max_gap[0] > stats['width'] * 0.2:
            return max_gap[1] + max_gap[0]/2

        # Method 2: Visual line detection
        try:
            img = page.to_image(resolution=150).original.convert('L')
            edges = cv2.Canny(np.array(img), 50, 150)
            lines = cv2.HoughLinesP(edges, 1, np.pi/720, 100,
                                   minLineLength=stats['height']*0.7,
                                   maxLineGap=2)
            if lines:
                verts = [l[0][0] for l in lines if abs(l[0][0]-l[0][2]) < 5]
                if verts:
                    return stats.mode(verts).mode[0]
        except:
            pass

        # Method 3: Text density histogram
        hist = np.histogram(stats['x_coords'], bins=np.linspace(0, stats['width'], 10))
        peaks = np.argsort(hist[0])[-2:]
        if abs(hist[1][peaks[0]] - hist[1][peaks[1]]) > stats['width']*0.3:
            return np.mean([hist[1][peaks].max(), hist[1][peaks].min()])

        return None

    def _extract_columns(self, page, split_x):
        """Column-aware text extraction with overlap protection"""
        buffer = max(10, page.width * 0.02)  # Dynamic buffer size
        left = page.crop((0, 0, split_x - buffer, page.height)).extract_text(layout=True, x_tolerance=1.5)
        right = page.crop((split_x + buffer, 0, page.width, page.height)).extract_text(layout=True, x_tolerance=1.5)
        return f"{left}\n\n{right}"

    def _process_special_content(self, page):
        """Handle pages with tables/diagrams without breaking columns"""
        try:
            # Extract tables first
            tables = page.find_tables()
            table_text = "\n".join([str(t.extract()) for t in tables])
            
            # Process remaining text with column detection
            non_table = page.filter(lambda obj: obj["object_type"] != "table")
            stats = self._get_page_stats(non_table)
            split_x = self._detect_column_split(stats)
            
            if split_x:
                columns = self._extract_columns(non_table, split_x)
                return f"Tables:\n{table_text}\n\nColumns:\n{columns}"
            return f"Tables:\n{table_text}\n\nText:\n{non_table.extract_text()}"
        except:
            return page.extract_text()

    def process_page(self, page_num):
        """Universal processing with dynamic adaptation"""
        try:
            page = self.pdf.pages[page_num]
            stats = self._get_page_stats(page)
            
            # First check for special content
            if len(page.find_tables()) > 0 or len(page.images) > 2:
                return self._process_special_content(page), 2
                
            # Column detection
            split_x = self._detect_column_split(stats)
            
            if split_x and (0.3*stats['width'] < split_x < 0.7*stats['width']):
                return self._extract_columns(page, split_x), 2
                
            # Fallback to full layout extraction
            return page.extract_text(layout=True, x_tolerance=2), 1
            
        except Exception as e:
            return f"ERROR: {str(e)}", 0

def extract_pdf(input_path, output_path):
    extractor = UniversalPDFExtractor(input_path)
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            for i in range(extractor.total_pages):
                text, cols = extractor.process_page(i)
                f.write(f"=== Page {i+1} ({cols} cols) ===\n{text}\n\n")
        print(f"Extraction complete: {output_path}")
    finally:
        # Explicit cleanup instead of relying on __del__
        if hasattr(extractor, 'pdf'):
            extractor.pdf.close()


# Usage
if __name__ == "__main__":
    extract_pdf("books/Abnormal Psychology ( PDFDrive ).pdf", "output.txt")

Extraction complete: output.txt


In [14]:
import pdfplumber
import numpy as np
import cv2
from scipy import stats

class RobustPDFExtractor:
    def __init__(self, file_path):
        self.file_path = file_path
        self.pdf = pdfplumber.open(file_path)
        self.total_pages = len(self.pdf.pages)

    def __del__(self):
        if hasattr(self, 'pdf') and self.pdf:
            self.pdf.close()

    def _get_valid_bbox(self, page):
        """Get safe bounding box coordinates"""
        return (
            max(page.cropbox[0], 0),
            max(page.cropbox[1], 0),
            min(page.cropbox[2], page.width),
            min(page.cropbox[3], page.height)
        )

    def _detect_columns(self, page):
        """Improved column detection with statistical validation"""
        try:
            # Method 1: Visual line detection with mode validation
            img = page.to_image(resolution=150).original.convert('L')
            edges = cv2.Canny(np.array(img), 50, 150)
            lines = cv2.HoughLinesP(edges, 1, np.pi/720, 100,
                                   minLineLength=int(page.height*0.7),
                                   maxLineGap=2)
            verts = []
            if lines:
                verts = [l[0][0] for l in lines 
                         if abs(l[0][0]-l[0][2]) < 5 and 
                         (0.3*page.width < l[0][0] < 0.7*page.width)]
                if verts:
                    mode_x = stats.mode(np.round(verts, -1)).mode[0]
                    if 0.3*page.width < mode_x < 0.7*page.width:
                        return mode_x

            # Method 2: Text gap analysis with density check
            words = page.extract_words()
            if len(words) > 15:
                x_coords = sorted([w['x0'] for w in words] + [w['x1'] for w in words])
                gaps = [(x_coords[i] - x_coords[i-1], x_coords[i-1]) 
                       for i in range(1, len(x_coords))]
                
                # Dynamic gap threshold (20% width) with density validation
                valid_gaps = [g for g in gaps 
                             if g[0] > page.width*0.2 and 
                             (0.3*page.width < g[1] < 0.7*page.width)]
                
                if valid_gaps:
                    best_gap = max(valid_gaps, key=lambda x: x[0])
                    split_x = best_gap[1] + best_gap[0]/2
                    # Validate split position
                    left_density = len([w for w in words if w['x1'] < split_x])
                    right_density = len([w for w in words if w['x0'] > split_x])
                    if abs(left_density - right_density)/(left_density + right_density) < 0.3:
                        return split_x

            return None
        except Exception as e:
            print(f"Detection error: {str(e)}")
            return None

    def _safe_crop(self, page, bbox):
        """Safe cropping with bounds checking"""
        x0, y0, x1, y1 = (
            max(bbox[0], page.bbox[0]),
            max(bbox[1], page.bbox[1]),
            min(bbox[2], page.bbox[2]),
            min(bbox[3], page.bbox[3])
        )
        return page.crop((x0, y0, x1, y1))

    def process_page(self, page_num):
        """Robust page processing with error handling"""
        try:
            page = self.pdf.pages[page_num]
            valid_bbox = self._get_valid_bbox(page)
            base_page = page.within_bbox(valid_bbox)
            
            split_x = self._detect_columns(base_page)
            cols = 1  # Default column count
            
            if split_x and (0.3*base_page.width < split_x < 0.7*base_page.width):
                try:
                    # Extract with protected buffer zones
                    left = self._safe_crop(base_page, (0, 0, split_x-15, base_page.height)).extract_text(layout=True, x_tolerance=1.5)
                    right = self._safe_crop(base_page, (split_x+15, 0, base_page.width, base_page.height)).extract_text(layout=True, x_tolerance=1.5)
                    return f"{left}\n\n{right}", 2
                except:
                    cols = 1  # Fallback if extraction fails
                    
            # Fallback to full page extraction
            text = base_page.extract_text(
                layout=True, 
                x_tolerance=2,
                y_tolerance=1,
                use_text_flow=True
            )
            return text.strip() or "CONTENT_NOT_EXTRACTED", cols
            
        except Exception as e:
            return f"PAGE_EXTRACTION_ERROR: {str(e)}", 0

def extract_pdf(input_path, output_path):
    extractor = RobustPDFExtractor(input_path)
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            for i in range(extractor.total_pages):
                text, cols = extractor.process_page(i)
                f.write(f"=== Page {i+1} ({cols} cols) ===\n{text}\n\n")
        print(f"Extraction complete: {output_path}")
    finally:
        if hasattr(extractor, 'pdf'):
            extractor.pdf.close()

# Usage
if __name__ == "__main__":
    extract_pdf("books/Abnormal Psychology ( PDFDrive ).pdf", "output.txt")

Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: Th

Exception ignored in: <function UniversalPDFExtractor.__del__ at 0x298331f80>
Traceback (most recent call last):
  File "/var/folders/fs/mbhwhwmx3w18xqxklt9pw6br0000gn/T/ipykernel_3365/3073551090.py", line 14, in __del__
AttributeError: 'UniversalPDFExtractor' object has no attribute 'pdf'


Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Detection error: Th

In [11]:

for pdf in os.listdir('books'):
    PDF_PATH = os.path.join("books", pdf)
    OUTPUT_PATH= '{pdf}.txt'
    print(PDF_PATH)


books/Before You See Your First Client_ 55 Things Counselors, Therapists and Human Service Workers Need to Know ( PDFDrive ).pdf
books/Abnormal Psychology_ An Integrative Approach ( PDFDrive ).pdf
books/.DS_Store
books/Abnormal Psychology ( PDFDrive ).pdf
books/71.pdf
books/Crisis Intervention Strategies ( PDFDrive ).pdf
books/Counseling and Psychotherapy with Children and Adolescents_ Theory and Practice for School and Clinical Settings ( PDFDrive ).pdf
books/Diagnostic and statistical manual of mental disorders _ DSM-5 ( PDFDrive ).pdf


In [None]:
import pdfplumber

try:
    with pdfplumber.open(PDF_PATH) as pdf:
        pdf_length=len(pdf.pages)
except Exception as e:
    print("Error opening PDF:", e)


In [1]:
text= """
This text provides a comprehensive and evidence-based introduction to psychiatric
       mental health assessment and diagnosis in advanced nursing practice.
         Taking a clinical, case-based approach, this textbook is designed to support
       graduate nursing students who are studying psychiatric mental health nursing as
       they develop their reasoning and decision-making skills. It presents:
                                                             
       ·· Therapeutic communication and psychiatric interviewing techniques,
          alongside basic psychiatric terminologies.         
       ·· The major psychiatric diagnoses, drawing on the DSM-5.
       ·· A step-by-step guide to conducting a comprehensive psychiatric mental
          health assessment.                                 
       ·· Case examples demonstrating assessment across major psychopathologies.
       ·· Good practice for conducting mental health evaluations.
       This is an essential text for all those undertaking psychiatric mental health nurse
       practitioner programs and a valuable reference for advanced practice nurses in
       clinical practice.                                    
                                                             
       Kunsook S. Bernstein is Professor Emerita at Hunter College School of Nursing,
       City University of New York, where she taught advanced psychiatric nurse prac-
       titioner students and coordinated the psychiatric mental health nurse practitioner
       program. Her primary area of research is Asian American immigrants’ mental
       health and healthcare disparities.                    
       Robert Kaplan is Advanced Senior Lecturer and Co-Principal Investigator
       of the Writing Research Lab in the Program in Writing and Rhetoric at Stony
       Brook University, State University of New York. He teaches research writing to
       advanced undergraduate STEM students and is a science editor for public health
       researchers.

"""

In [4]:
tokens = text.split()
tokens

['This',
 'text',
 'provides',
 'a',
 'comprehensive',
 'and',
 'evidence-based',
 'introduction',
 'to',
 'psychiatric',
 'mental',
 'health',
 'assessment',
 'and',
 'diagnosis',
 'in',
 'advanced',
 'nursing',
 'practice.',
 'Taking',
 'a',
 'clinical,',
 'case-based',
 'approach,',
 'this',
 'textbook',
 'is',
 'designed',
 'to',
 'support',
 'graduate',
 'nursing',
 'students',
 'who',
 'are',
 'studying',
 'psychiatric',
 'mental',
 'health',
 'nursing',
 'as',
 'they',
 'develop',
 'their',
 'reasoning',
 'and',
 'decision-making',
 'skills.',
 'It',
 'presents:',
 '··',
 'Therapeutic',
 'communication',
 'and',
 'psychiatric',
 'interviewing',
 'techniques,',
 'alongside',
 'basic',
 'psychiatric',
 'terminologies.',
 '··',
 'The',
 'major',
 'psychiatric',
 'diagnoses,',
 'drawing',
 'on',
 'the',
 'DSM-5.',
 '··',
 'A',
 'step-by-step',
 'guide',
 'to',
 'conducting',
 'a',
 'comprehensive',
 'psychiatric',
 'mental',
 'health',
 'assessment.',
 '··',
 'Case',
 'examples',
 '