In [7]:
import os
import cv2
import numpy as np
import fitz  # PyMuPDF
from pdf2image import convert_from_path, pdfinfo_from_path
from PIL import Image
import pytesseract
from datetime import datetime

def is_valid_pdf(pdf_path):
    """Return True if the PDF at ``pdf_path`` opens and has at least one page.

    Args:
        pdf_path: Filesystem path of the PDF to probe.

    Returns:
        bool: True when PyMuPDF (``fitz``) opens the document and reports a
        non-zero page count; False when the document has no pages or when
        opening it raises any ordinary exception.
    """
    try:
        with fitz.open(pdf_path) as doc:
            return len(doc) > 0
    except Exception:
        # Best-effort probe: any failure to open or parse means "not usable".
        # Catching ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate during long batch runs.
        return False

def _extract_embedded_figures(pdf_path, output_folder, results):
    """Save every image embedded in the PDF and record it in ``results``.

    Opens the document with PyMuPDF and, for each image listed on each page,
    writes the extracted bytes to ``page_<page>_fig_<index>.<ext>`` inside
    ``output_folder``.  A failure on a single image is appended to
    ``results['errors']`` and the loop continues; a failure to open the
    document or to load a page propagates to the caller.
    """
    with fitz.open(pdf_path) as doc:
        for page_index in range(len(doc)):
            page_no = page_index + 1  # reported page numbers are 1-based
            page = doc.load_page(page_index)

            for img_index, img in enumerate(page.get_images(full=True)):
                try:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    img_path = os.path.join(
                        output_folder,
                        f"page_{page_no}_fig_{img_index+1}.{base_image['ext']}"
                    )
                    with open(img_path, "wb") as f:
                        f.write(base_image["image"])
                    results['figures'].append({
                        'page': page_no,
                        'path': img_path,
                        'type': 'embedded_figure'
                    })
                except Exception as e:
                    results['errors'].append(f"Page {page_no} figure error: {str(e)}")


def _extract_page_tables(page, page_num, output_folder, results):
    """Detect tables on one rendered page and write each crop as a PNG.

    ``page`` is one of the images returned by ``convert_from_path`` and
    ``page_num`` is its 0-based index.  The page is round-tripped through a
    temporary PNG so OpenCV can load it; that file is always removed, even
    when saving or reading fails.  Exceptions propagate to the caller.
    """
    temp_path = os.path.join(output_folder, f"temp_page_{page_num}.png")
    try:
        page.save(temp_path, "PNG")
        img = cv2.imread(temp_path)
    finally:
        if os.path.exists(temp_path):
            os.remove(temp_path)

    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # surface it so the page is not silently skipped.
        raise ValueError("rendered page image could not be read")

    for i, table in enumerate(detect_tables(img)):
        table_path = os.path.join(
            output_folder,
            f"page_{page_num+1}_table_{i+1}.png"
        )
        # cv2.imwrite reports failure through its return value; only record
        # tables whose image actually reached disk.
        if not cv2.imwrite(table_path, table['image']):
            results['errors'].append(
                f"Page {page_num+1} table {i+1} error: could not write {table_path}"
            )
            continue
        results['tables'].append({
            'page': page_num + 1,
            'path': table_path,
            'bbox': table['bbox'],
            'type': 'table'
        })


def extract_tables_and_figures(pdf_path, output_folder, dpi=300):
    """Extract embedded figures and detected tables from one PDF.

    The two stages are independent: a failure in one is recorded in
    ``errors`` and does not prevent the other from running.

    Args:
        pdf_path: Path of the PDF to process.
        output_folder: Directory that receives the extracted images; created
            if it does not exist.
        dpi: Resolution passed to ``convert_from_path`` when rendering pages
            for table detection.

    Returns:
        dict with three lists:
            ``'tables'``  - dicts with ``page``, ``path``, ``bbox``, ``type``;
            ``'figures'`` - dicts with ``page``, ``path``, ``type``;
            ``'errors'``  - human-readable error strings.

    Raises:
        OSError: if ``output_folder`` cannot be created.  Extraction errors
        themselves are collected in ``errors`` rather than raised.
    """
    os.makedirs(output_folder, exist_ok=True)
    results = {
        'tables': [],
        'figures': [],
        'errors': []
    }

    # Stage 1: embedded figures via PyMuPDF.
    try:
        _extract_embedded_figures(pdf_path, output_folder, results)
    except Exception as e:
        results['errors'].append(f"PyMuPDF error: {str(e)}")

    # Stage 2: render pages with pdf2image and look for tables.
    try:
        # Probe the file first so an unreadable PDF fails before rendering.
        pdfinfo_from_path(pdf_path)
        pages = convert_from_path(pdf_path, dpi=dpi)

        for page_num, page in enumerate(pages):
            try:
                _extract_page_tables(page, page_num, output_folder, results)
            except Exception as e:
                results['errors'].append(f"Page {page_num+1} processing error: {str(e)}")
    except Exception as e:
        results['errors'].append(f"PDF2Image error: {str(e)}")

    return results

def detect_tables(img, min_width=200, min_height=100, line_length=50):
    """Find ruled (bordered) tables in a page image by detecting long lines.

    The image is binarised with an inverted Otsu threshold, long horizontal
    and vertical strokes are isolated with morphological opening, and the
    bounding box of every external contour of the combined line mask that
    exceeds the minimum size is returned as a table candidate.  Only this
    line-based method is implemented, so tables without drawn borders are
    presumably not found.

    Args:
        img: Page image as a NumPy array; either 3-channel BGR (as produced
            by ``cv2.imread``) or already single-channel grayscale.
        min_width: A candidate must be wider than this many pixels.
        min_height: A candidate must be taller than this many pixels.
        line_length: Length in pixels of the structuring elements used to
            pick out horizontal and vertical lines.

    Returns:
        list of dicts, each with ``'image'`` (the crop of ``img``) and
        ``'bbox'`` (``[x, y, w, h]``).  Empty when ``img`` is None or has no
        pixels.
    """
    if img is None or img.size == 0:
        return []

    # Accept grayscale input as-is; cvtColor(BGR2GRAY) rejects 2-D arrays.
    gray = img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Inverted so ink (lines, text) becomes foreground for the morphology.
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Opening with a 1-pixel-thick kernel keeps only long straight strokes
    # in that direction.
    horz_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (line_length, 1))
    vert_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, line_length))
    horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horz_kernel, iterations=2)
    vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vert_kernel, iterations=2)
    table_mask = cv2.add(horizontal, vertical)

    contours, _ = cv2.findContours(table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    tables = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > min_width and h > min_height:
            tables.append({
                'image': img[y:y+h, x:x+w],
                'bbox': [x, y, w, h]
            })

    return tables

def process_pdf_folder(input_folder, output_base):
    """Run table/figure extraction over every ``.pdf`` file in a folder.

    Files are processed in sorted (lexicographic) name order.  Each PDF gets
    its own sub-directory of ``output_base`` named after the file stem, which
    receives the extracted images and a ``metadata.json`` summary.

    Args:
        input_folder: Directory to scan; only names ending in ``.pdf``
            (case-insensitive) are considered.
        output_base: Root directory for the per-paper output folders; created
            if it does not exist.

    Returns:
        list of dicts, one per PDF.  Processed PDFs yield the metadata dict
        (``paper``, ``status`` = ``'success'``, ``tables_extracted``,
        ``figures_extracted``, ``errors``, ``timestamp``); skipped or crashed
        PDFs yield ``paper``, ``status`` = ``'failed'`` and ``error``.

    Raises:
        OSError: if ``output_base`` cannot be created or ``input_folder``
        cannot be listed.
    """
    import json  # local import: the visible top-level imports do not include json

    os.makedirs(output_base, exist_ok=True)
    all_results = []

    pdf_files = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith('.pdf')
    )

    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_folder, pdf_file)
        paper_id = os.path.splitext(pdf_file)[0]
        output_folder = os.path.join(output_base, paper_id)

        print(f"\nProcessing: {pdf_file}")

        if not is_valid_pdf(pdf_path):
            print(f"Skipping invalid/corrupted PDF: {pdf_file}")
            all_results.append({
                'paper': pdf_file,
                'status': 'failed',
                'error': 'Invalid/corrupted PDF'
            })
            continue

        try:
            results = extract_tables_and_figures(pdf_path, output_folder)

            # 'status' mirrors the key used by failure records so every
            # entry in the returned list can be inspected the same way.
            metadata = {
                'paper': pdf_file,
                'status': 'success',
                'tables_extracted': len(results['tables']),
                'figures_extracted': len(results['figures']),
                'errors': results['errors'],
                'timestamp': datetime.now().isoformat()
            }

            metadata_path = os.path.join(output_folder, 'metadata.json')
            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2)

            all_results.append(metadata)
            print(f"Extracted {len(results['tables'])} tables and {len(results['figures'])} figures")

        except Exception as e:
            # One bad paper must not abort the whole batch.
            print(f"Fatal error processing {pdf_file}: {str(e)}")
            all_results.append({
                'paper': pdf_file,
                'status': 'failed',
                'error': str(e)
            })

    return all_results

if __name__ == "__main__":
    # Source PDFs and the root under which each run's output tree is created.
    INPUT_FOLDER = "/Users/Snigdha/Desktop/NEU/NLP/Project/PeerRead-master/data/acl_2017/train/pdfs"
    OUTPUT_FOLDER = "/Users/Snigdha/Desktop/NEU/NLP/Project/PeerRead-master/data/acl_2017/train/output"

    # Each run writes into its own extraction_<YYYYmmdd_HHMMSS> directory.
    timestamp = format(datetime.now(), "%Y%m%d_%H%M%S")
    final_output = os.path.join(OUTPUT_FOLDER, f"extraction_{timestamp}")

    # Walk the input folder and extract from every PDF found there.
    print(f"Starting extraction from {INPUT_FOLDER}")
    results = process_pdf_folder(INPUT_FOLDER, final_output)

    # A record counts as successful unless it is explicitly marked 'failed'.
    success = len([r for r in results if r.get('status') != 'failed'])
    print(f"\nExtraction complete! Processed {success}/{len(results)} PDFs successfully")
    print(f"Results saved to: {final_output}")

Starting extraction from /Users/Snigdha/Desktop/NEU/NLP/Project/PeerRead-master/data/acl_2017/train/pdfs

Processing: 104.pdf
Extracted 17 tables and 7 figures

Processing: 105.pdf
Extracted 10 tables and 6 figures

Processing: 107.pdf
Extracted 12 tables and 2 figures

Processing: 108.pdf
Extracted 9 tables and 0 figures

Processing: 117.pdf
Extracted 7 tables and 0 figures

Processing: 12.pdf
Extracted 10 tables and 1 figures

Processing: 122.pdf
Extracted 3 tables and 3 figures

Processing: 128.pdf
Extracted 3 tables and 0 figures

Processing: 130.pdf
Extracted 5 tables and 25 figures

Processing: 134.pdf
Extracted 2 tables and 0 figures

Processing: 145.pdf
Extracted 0 tables and 5 figures

Processing: 150.pdf
Extracted 11 tables and 1 figures

Processing: 16.pdf
Extracted 8 tables and 3 figures

Processing: 169.pdf
Extracted 13 tables and 0 figures

Processing: 178.pdf
Extracted 14 tables and 3 figures

Processing: 18.pdf
Extracted 5 tables and 0 figures

Processing: 180.pdf
Extra

In [2]:
import os
import cv2
import numpy as np
import fitz  # PyMuPDF
from pdf2image import convert_from_path, pdfinfo_from_path
from PIL import Image
import pytesseract
from datetime import datetime

def is_valid_pdf(pdf_path):
    """Return True if the PDF at ``pdf_path`` opens and has at least one page.

    Args:
        pdf_path: Filesystem path of the PDF to probe.

    Returns:
        bool: True when PyMuPDF (``fitz``) opens the document and reports a
        non-zero page count; False when the document has no pages or when
        opening it raises any ordinary exception.
    """
    try:
        with fitz.open(pdf_path) as doc:
            return len(doc) > 0
    except Exception:
        # Best-effort probe: any failure to open or parse means "not usable".
        # Catching ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate during long batch runs.
        return False

def _extract_embedded_figures(pdf_path, output_folder, results):
    """Save every image embedded in the PDF and record it in ``results``.

    Opens the document with PyMuPDF and, for each image listed on each page,
    writes the extracted bytes to ``page_<page>_fig_<index>.<ext>`` inside
    ``output_folder``.  A failure on a single image is appended to
    ``results['errors']`` and the loop continues; a failure to open the
    document or to load a page propagates to the caller.
    """
    with fitz.open(pdf_path) as doc:
        for page_index in range(len(doc)):
            page_no = page_index + 1  # reported page numbers are 1-based
            page = doc.load_page(page_index)

            for img_index, img in enumerate(page.get_images(full=True)):
                try:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    img_path = os.path.join(
                        output_folder,
                        f"page_{page_no}_fig_{img_index+1}.{base_image['ext']}"
                    )
                    with open(img_path, "wb") as f:
                        f.write(base_image["image"])
                    results['figures'].append({
                        'page': page_no,
                        'path': img_path,
                        'type': 'embedded_figure'
                    })
                except Exception as e:
                    results['errors'].append(f"Page {page_no} figure error: {str(e)}")


def _extract_page_tables(page, page_num, output_folder, results):
    """Detect tables on one rendered page and write each crop as a PNG.

    ``page`` is one of the images returned by ``convert_from_path`` and
    ``page_num`` is its 0-based index.  The page is round-tripped through a
    temporary PNG so OpenCV can load it; that file is always removed, even
    when saving or reading fails.  Exceptions propagate to the caller.
    """
    temp_path = os.path.join(output_folder, f"temp_page_{page_num}.png")
    try:
        page.save(temp_path, "PNG")
        img = cv2.imread(temp_path)
    finally:
        if os.path.exists(temp_path):
            os.remove(temp_path)

    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # surface it so the page is not silently skipped.
        raise ValueError("rendered page image could not be read")

    for i, table in enumerate(detect_tables(img)):
        table_path = os.path.join(
            output_folder,
            f"page_{page_num+1}_table_{i+1}.png"
        )
        # cv2.imwrite reports failure through its return value; only record
        # tables whose image actually reached disk.
        if not cv2.imwrite(table_path, table['image']):
            results['errors'].append(
                f"Page {page_num+1} table {i+1} error: could not write {table_path}"
            )
            continue
        results['tables'].append({
            'page': page_num + 1,
            'path': table_path,
            'bbox': table['bbox'],
            'type': 'table'
        })


def extract_tables_and_figures(pdf_path, output_folder, dpi=300):
    """Extract embedded figures and detected tables from one PDF.

    The two stages are independent: a failure in one is recorded in
    ``errors`` and does not prevent the other from running.

    Args:
        pdf_path: Path of the PDF to process.
        output_folder: Directory that receives the extracted images; created
            if it does not exist.
        dpi: Resolution passed to ``convert_from_path`` when rendering pages
            for table detection.

    Returns:
        dict with three lists:
            ``'tables'``  - dicts with ``page``, ``path``, ``bbox``, ``type``;
            ``'figures'`` - dicts with ``page``, ``path``, ``type``;
            ``'errors'``  - human-readable error strings.

    Raises:
        OSError: if ``output_folder`` cannot be created.  Extraction errors
        themselves are collected in ``errors`` rather than raised.
    """
    os.makedirs(output_folder, exist_ok=True)
    results = {
        'tables': [],
        'figures': [],
        'errors': []
    }

    # Stage 1: embedded figures via PyMuPDF.
    try:
        _extract_embedded_figures(pdf_path, output_folder, results)
    except Exception as e:
        results['errors'].append(f"PyMuPDF error: {str(e)}")

    # Stage 2: render pages with pdf2image and look for tables.
    try:
        # Probe the file first so an unreadable PDF fails before rendering.
        pdfinfo_from_path(pdf_path)
        pages = convert_from_path(pdf_path, dpi=dpi)

        for page_num, page in enumerate(pages):
            try:
                _extract_page_tables(page, page_num, output_folder, results)
            except Exception as e:
                results['errors'].append(f"Page {page_num+1} processing error: {str(e)}")
    except Exception as e:
        results['errors'].append(f"PDF2Image error: {str(e)}")

    return results

def detect_tables(img, min_width=200, min_height=100, line_length=50):
    """Find ruled (bordered) tables in a page image by detecting long lines.

    The image is binarised with an inverted Otsu threshold, long horizontal
    and vertical strokes are isolated with morphological opening, and the
    bounding box of every external contour of the combined line mask that
    exceeds the minimum size is returned as a table candidate.  Only this
    line-based method is implemented, so tables without drawn borders are
    presumably not found.

    Args:
        img: Page image as a NumPy array; either 3-channel BGR (as produced
            by ``cv2.imread``) or already single-channel grayscale.
        min_width: A candidate must be wider than this many pixels.
        min_height: A candidate must be taller than this many pixels.
        line_length: Length in pixels of the structuring elements used to
            pick out horizontal and vertical lines.

    Returns:
        list of dicts, each with ``'image'`` (the crop of ``img``) and
        ``'bbox'`` (``[x, y, w, h]``).  Empty when ``img`` is None or has no
        pixels.
    """
    if img is None or img.size == 0:
        return []

    # Accept grayscale input as-is; cvtColor(BGR2GRAY) rejects 2-D arrays.
    gray = img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Inverted so ink (lines, text) becomes foreground for the morphology.
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Opening with a 1-pixel-thick kernel keeps only long straight strokes
    # in that direction.
    horz_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (line_length, 1))
    vert_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, line_length))
    horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horz_kernel, iterations=2)
    vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vert_kernel, iterations=2)
    table_mask = cv2.add(horizontal, vertical)

    contours, _ = cv2.findContours(table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    tables = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > min_width and h > min_height:
            tables.append({
                'image': img[y:y+h, x:x+w],
                'bbox': [x, y, w, h]
            })

    return tables

def process_pdf_folder(input_folder, output_base):
    """Run table/figure extraction over every ``.pdf`` file in a folder.

    Files are processed in sorted (lexicographic) name order.  Each PDF gets
    its own sub-directory of ``output_base`` named after the file stem, which
    receives the extracted images and a ``metadata.json`` summary.

    Args:
        input_folder: Directory to scan; only names ending in ``.pdf``
            (case-insensitive) are considered.
        output_base: Root directory for the per-paper output folders; created
            if it does not exist.

    Returns:
        list of dicts, one per PDF.  Processed PDFs yield the metadata dict
        (``paper``, ``status`` = ``'success'``, ``tables_extracted``,
        ``figures_extracted``, ``errors``, ``timestamp``); skipped or crashed
        PDFs yield ``paper``, ``status`` = ``'failed'`` and ``error``.

    Raises:
        OSError: if ``output_base`` cannot be created or ``input_folder``
        cannot be listed.
    """
    import json  # local import: the visible top-level imports do not include json

    os.makedirs(output_base, exist_ok=True)
    all_results = []

    pdf_files = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith('.pdf')
    )

    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_folder, pdf_file)
        paper_id = os.path.splitext(pdf_file)[0]
        output_folder = os.path.join(output_base, paper_id)

        print(f"\nProcessing: {pdf_file}")

        if not is_valid_pdf(pdf_path):
            print(f"Skipping invalid/corrupted PDF: {pdf_file}")
            all_results.append({
                'paper': pdf_file,
                'status': 'failed',
                'error': 'Invalid/corrupted PDF'
            })
            continue

        try:
            results = extract_tables_and_figures(pdf_path, output_folder)

            # 'status' mirrors the key used by failure records so every
            # entry in the returned list can be inspected the same way.
            metadata = {
                'paper': pdf_file,
                'status': 'success',
                'tables_extracted': len(results['tables']),
                'figures_extracted': len(results['figures']),
                'errors': results['errors'],
                'timestamp': datetime.now().isoformat()
            }

            metadata_path = os.path.join(output_folder, 'metadata.json')
            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2)

            all_results.append(metadata)
            print(f"Extracted {len(results['tables'])} tables and {len(results['figures'])} figures")

        except Exception as e:
            # One bad paper must not abort the whole batch.
            print(f"Fatal error processing {pdf_file}: {str(e)}")
            all_results.append({
                'paper': pdf_file,
                'status': 'failed',
                'error': str(e)
            })

    return all_results

if __name__ == "__main__":
    # Source PDFs and the root under which each run's output tree is created.
    INPUT_FOLDER = "/Users/Snigdha/Desktop/NEU/NLP/Project/PeerRead-master/data/acl_2017/train/pdfs-1"
    OUTPUT_FOLDER = "/Users/Snigdha/Desktop/NEU/NLP/Project/PeerRead-master/data/acl_2017/train/output"

    # Each run writes into its own extraction_<YYYYmmdd_HHMMSS> directory.
    timestamp = format(datetime.now(), "%Y%m%d_%H%M%S")
    final_output = os.path.join(OUTPUT_FOLDER, f"extraction_{timestamp}")

    # Walk the input folder and extract from every PDF found there.
    print(f"Starting extraction from {INPUT_FOLDER}")
    results = process_pdf_folder(INPUT_FOLDER, final_output)

    # A record counts as successful unless it is explicitly marked 'failed'.
    success = len([r for r in results if r.get('status') != 'failed'])
    print(f"\nExtraction complete! Processed {success}/{len(results)} PDFs successfully")
    print(f"Results saved to: {final_output}")

Starting extraction from /Users/Snigdha/Desktop/NEU/NLP/Project/PeerRead-master/data/acl_2017/train/pdfs-1

Processing: 1607.01400v1.pdf
Extracted 10 tables and 0 figures

Processing: 1612.04858v1.pdf
Extracted 27 tables and 11 figures

Processing: 1811.04422v1.pdf
Extracted 0 tables and 0 figures

Processing: 1906.06821v2.pdf
Extracted 0 tables and 0 figures

Processing: 1907.08908v1.pdf
Extracted 3 tables and 5 figures

Processing: 1909.03550v1.pdf
Extracted 9 tables and 6 figures

Processing: 2007.01503v1.pdf
Extracted 5 tables and 5 figures

Processing: 2007.14206v1.pdf
Extracted 15 tables and 6 figures

Processing: 2104.10201v2.pdf
Extracted 5 tables and 0 figures

Processing: 2206.13446v1.pdf
Extracted 24 tables and 1 figures

Processing: 2401.04155v2.pdf
Extracted 60 tables and 9 figures

Processing: 2402.06196v3.pdf
Extracted 48 tables and 50 figures

Processing: 2405.15251v1.pdf
Extracted 8 tables and 3 figures

Extraction complete! Processed 13/13 PDFs successfully
Results sa