# In [2]:
import pdfplumber
import json
import statistics

def _group_words_into_lines(words, line_tolerance):
    """Cluster words into lines by comparing each word's 'top' to the previous word's.

    `words` must be a non-empty list of word dicts in reading order; a word
    joins the current line when its 'top' differs from the previous word's
    'top' by less than `line_tolerance`.
    """
    lines = []
    current_line = [words[0]]
    for word in words[1:]:
        if abs(word['top'] - current_line[-1]['top']) < line_tolerance:
            current_line.append(word)
        else:
            lines.append(current_line)
            current_line = [word]
    lines.append(current_line)
    return lines


def _lines_to_paragraphs(lines, indent_threshold, short_line_threshold, gap_threshold):
    """Merge the lines of one column into paragraph strings.

    `lines` must be a non-empty list of non-empty word lists. A line opens a
    new paragraph when it is the first line of the column, or when any of the
    layout checks below fires.
    """
    # The median start/end of all lines approximates the flush column margins
    # and is not skewed by a few indented or short lines.
    left_margin = statistics.median([line[0]['x0'] for line in lines])
    right_margin = statistics.median([line[-1]['x1'] for line in lines])

    paragraphs = []
    current_text = ""
    prev_line = None

    for line in lines:
        line_text = " ".join(w['text'] for w in line)

        if prev_line is None:
            starts_paragraph = True
        else:
            starts_paragraph = (
                # A: the line is indented relative to the column's left margin.
                (line[0]['x0'] - left_margin) > indent_threshold
                # B: the previous line stopped short of the right margin.
                or (right_margin - prev_line[-1]['x1']) > short_line_threshold
                # C: a large vertical gap separates the line from the previous one.
                or (line[0]['top'] - prev_line[0]['bottom']) > gap_threshold
                # D: a short all-caps line is treated as a heading.
                or (line_text.isupper() and len(line_text.split()) < 5)
            )

        if starts_paragraph:
            if current_text:
                paragraphs.append(current_text.strip())
            current_text = line_text
        elif current_text.endswith('-'):
            # Re-join a word that was hyphenated across the line break.
            current_text = current_text[:-1] + line_text
        else:
            current_text += " " + line_text

        prev_line = line

    # Flush the last paragraph of the column.
    if current_text:
        paragraphs.append(current_text.strip())
    return paragraphs


def extract_clean_paragraphs(pdf_path, json_output_path, *, header_height=50,
                             line_tolerance=5, indent_threshold=10,
                             short_line_threshold=10, gap_threshold=12):
    """Extract paragraphs from a two-column PDF and save them as JSON.

    Each page is split at its horizontal midpoint into a left and a right
    column, the top `header_height` units of the page are skipped, and the
    words of each column (left first, then right) are grouped into lines and
    then into paragraphs. The result, a list of
    ``{"page_number": int, "paragraphs": [str, ...]}`` dicts (pages numbered
    from 1), is written to `json_output_path` as UTF-8 JSON.

    Args:
        pdf_path: Path of the PDF to read.
        json_output_path: Path of the JSON file to write.
        header_height: Height of the band at the top of each page to ignore.
        line_tolerance: Maximum difference in 'top' for two consecutive words
            to be considered part of the same line.
        indent_threshold: Distance from the column's left margin beyond which
            a line counts as indented (starts a paragraph).
        short_line_threshold: Distance from the column's right margin beyond
            which the previous line counts as short (ends a paragraph).
        gap_threshold: Vertical gap between lines beyond which a new
            paragraph starts.

    Returns:
        None. Progress and errors are reported with `print`; exceptions are
        not propagated to the caller.
    """
    extracted_data = []

    # Phase 1: read the PDF. Kept separate from the write phase so that a
    # missing-file error here can only refer to the input PDF.
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                # page.bbox is (x0, top, x1, bottom).
                x0, top, x1, bottom = page.bbox
                header_cutoff = top + header_height
                page_paragraphs = []

                # If nothing lies below the header band there is nothing to
                # crop; record the page as empty instead of failing the run.
                if header_cutoff < bottom:
                    midpoint = x0 + (x1 - x0) / 2
                    column_boxes = (
                        (x0, header_cutoff, midpoint, bottom),   # left column
                        (midpoint, header_cutoff, x1, bottom),   # right column
                    )
                    for col_box in column_boxes:
                        crop = page.crop(bbox=col_box)
                        words = crop.extract_words(
                            x_tolerance=3, y_tolerance=3, keep_blank_chars=False
                        )
                        if not words:
                            continue
                        lines = _group_words_into_lines(words, line_tolerance)
                        page_paragraphs.extend(
                            _lines_to_paragraphs(
                                lines,
                                indent_threshold,
                                short_line_threshold,
                                gap_threshold,
                            )
                        )

                extracted_data.append({
                    "page_number": page_number,
                    "paragraphs": page_paragraphs
                })
    except FileNotFoundError:
        print(f"Error: The PDF file '{pdf_path}' was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    # Phase 2: write the JSON output.
    try:
        with open(json_output_path, 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, indent=4, ensure_ascii=False)
    except OSError as e:
        print(f"Error: Could not write the output file '{json_output_path}': {e}")
        return
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    print(f"Extraction complete. Saved to {json_output_path}")


# Input PDF to process and the JSON file that receives the extracted paragraphs.
input_pdf, output_json = (
    "comprehensive-clinical-nephrology-20-1490.pdf",
    "nephro_chunks.json",
)

# Run the extraction using the paths defined above.
extract_clean_paragraphs(pdf_path=input_pdf, json_output_path=output_json)

# Cell output: KeyboardInterrupt