In [13]:
import pdfplumber
import pandas as pd
import os
import re
import fitz



# Function to clear previous CSV files in the output directory
def clear_previous_outputs(output_dir):
    for filename in os.listdir(output_dir):
        if filename.endswith(".csv"):
            try:
                os.remove(os.path.join(output_dir, filename))
                print(f"Deleted: {filename}")
            except PermissionError:
                print(f"Failed to delete: {filename}. It may be open in another program.")
# Function to extract text within a bounding box
def extract_text_within_bbox(page, bbox):
    return page.within_bbox(bbox).extract_text()

def get_table_bboxes(page):
    table_bboxes = []
    for table in page.find_tables():
        table_bboxes.append(table.bbox)  # Add bounding box for each table found
    return table_bboxes
    
# Function to process the PDF and extract all table titles
def extract_all_titles(pdf, page_number):
    extracted_titles = []
    page = pdf.pages[page_number]
    table_bboxes = get_table_bboxes(page)
    if table_bboxes:
        titles = extract_titles_above_tables(page, table_bboxes)
        if titles:
            extracted_titles.extend([title for title in titles])
    return extracted_titles
    
# Function to extract possible table titles
# Function to extract titles from a page, given the table bounding boxes
def extract_titles_above_tables(page, table_bboxes):
    titles = []
    for table_bbox in table_bboxes:
        title_bbox = (0, 0, page.width, table_bbox[1])  # Bounding box from top of page to top of table
        title_text = extract_text_within_bbox(page, title_bbox)
        if title_text:
            title_lines = title_text.split('\n')
            collecting = False
            combined_title = []
            for line in title_lines:
                line = line.strip()
                if line.startswith("Table"):
                    collecting = True
                    combined_title = [line]
                elif collecting:
                    if line and not line.startswith("Table"):
                        combined_title.append(line)
                    else:
                        collecting = False
                        if combined_title:
                            titles.append(" ".join(combined_title))
                            print(combined_title)
                            combined_title = []
            # Ensure to add the last title if any
            if combined_title:
                titles.append(" ".join(combined_title))
    return titles
    

def get_text_from_fitz(fitz_page, bbox):
    """Extract text from a specified bounding box using PyMuPDF."""
    rect = fitz.Rect(bbox[0], bbox[1], bbox[2], bbox[3])
    text = fitz_page.get_text("text", clip=rect)  # Extract text within the bounding box
    return text.strip()
def sanitize_filename(filename):
    sanitized = re.sub(r'[<>:"/\\|?.,*)(=-]', '_', filename)  # Replace invalid characters
    sanitized = re.sub(r'\s+', '_', sanitized)  # Replace spaces with underscores
    sanitized = sanitized.replace('__','_')
    return sanitized[:255]  # Limit to 255 characters
    

def extract_tables_from_pdf(pdf_path):
    try:
        with pdfplumber.open(pdf_path) as pdf:
            document = fitz.open(pdf_path)
            for page_number in range(0,len(pdf.pages)):
                page = pdf.pages[page_number]
                pdfplumber_page = pdf.pages[page_number]
                tables = page.find_tables()
                fitz_page = document.load_page(page_number)
                titles = extract_all_titles(pdf, page_number)
                for table_index, table in enumerate(tables):
                    row_table = []            
                    for rows in table.rows:
                        row_cells = []
                        for  bbox in rows.cells:
                            if bbox:
                                _text = get_text_from_fitz(fitz_page, bbox)
                                row_cells.append(_text)
                            else:
                                row_cells.append('')
                        row_table.append(row_cells)
                        
                    # corrected_table = correct_table_cells(filtered_table, text_elements)
                    corrected_table = row_table.copy()
                            # Create a DataFrame from the corrected table
                    if corrected_table and len(corrected_table) > 1:  # Ensure there is data to save
                        df = pd.DataFrame(corrected_table[1:], columns=corrected_table[0])  # Use the first row as header
        
                        # Ensure all column names are strings
                        df.columns = [str(col) if col is not None else "Unnamed" for col in df.columns]
                        
                        # Find the title for the current table
                        table_title = titles[table_index]
                        
                        # Generate a file name based on the table title and append pagenumber and table number
                        sanitized_title = sanitize_filename(f"{table_title}_Page{page_number + 1}_Table{table_index + 1}")
                        table_name = f"{sanitized_title}.csv"
                        
                        # Save the DataFrame to CSV
                        file_path = os.path.join(output_dir, table_name)
                        
                        print('file will be stored as ' ,file_path)
                        df.to_csv(file_path, index=False)
        return "Success",200
    except Exception as e:
        return "Error Occured"+ str(e), 500

pdf_path = "New folder/fourth.pdf"  
output_dir = "extracted_tables"  
# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
empty_output_dir = True
if empty_output_dir:
    clear_previous_outputs(output_dir)
extract_tables_from_pdf(pdf_path)     

Deleted: Table_3_5___Typical_current_consumption_in_Run__with_different_codes_running_from_flash__ART_enable__Cache_ON_Prefetch_OFF__and_power_supplied_by_external_SMPS__V___1_05_V__DD12_Page1_Table1.csv
Deleted: Table_36__Typical_current_consumption_in_Run_and_Low_power_run_modes__with_different_codes_running_from_flash__ART_disable_Page1_Table2.csv
file will be stored as  extracted_tables/Table_3_5__Typical_current_consumption_in_Run_with_different_codes_running_from_flash_ART_enable_Cache_ON_Prefetch_OFF_and_power_supplied_by_external_SMPS_V__1_05_V_DD12_Page1_Table1.csv
file will be stored as  extracted_tables/Table_36_Typical_current_consumption_in_Run_and_Low_power_run_modes_with_different_codes_running_from_flash_ART_disable_Page1_Table2.csv


('Success', 200)