In [25]:
import fitz  # PyMuPDF

# Open the document (PDF in this case).
# NOTE(review): this absolute, user-specific path only resolves on the author's
# machine; consider a configurable, relative path.
doc = fitz.open(r"C:\Users\pavan\OneDrive\Desktop\table.pdf")
# Load the first page (0-indexed)
page = doc[0]

# find_tables() returns a TableFinder object; its .tables attribute is the
# list of table objects detected on the page.
table_finder = page.find_tables()
tabs = table_finder.tables

# Display the number of tables found
print(f"{len(tabs)} table(s) found on page 1")

# Inspect the first table only when at least one was detected: indexing
# tabs[0] on an empty list would raise IndexError, and the count printed
# above already tells the reader that nothing was found.
if tabs:
    tab = tabs[0]

    # Inspect the table attributes
    print(f"Table object: {tab}")
    print(f"Number of rows: {len(tab.rows)}")

    # The column count is taken from the first row's number of cells
    if tab.rows:
        num_columns = len(tab.rows[0].cells)
        print(f"Number of columns: {num_columns}")

    # Walk the rows and pull the text lying inside each cell's bounding box.
    # Missing cells (None) and cells with no text are dropped, so the entries
    # of row_data do NOT keep their original column positions.
    for row_idx, row in enumerate(tab.rows):
        row_data = []
        for cell in row.cells:
            if cell is None:
                continue  # No cell at this grid position
            bbox = fitz.Rect(cell)  # Rectangle built from the cell's bounding box
            cell_text = page.get_text("text", clip=bbox).strip()
            if cell_text:  # Only keep non-empty text
                row_data.append(cell_text)

        # Only print rows with meaningful content (row numbers are 1-based)
        if row_data:
            print(f"Row {row_idx + 1}: {row_data}")


1 table(s) found on page 1
Table object: <pymupdf.table.Table object at 0x00000216AE1493D0>
Number of rows: 14
Number of columns: 18
Row 1: ['Results']
Row 2: ['Ballots']
Row 3: ['Disability', 'Ballots']
Row 4: ['Participants', 'Incomplete/']
Row 5: ['y\nCategory', 'Completed', 'Accuracy', 'Time to']
Row 6: ['Terminated']
Row 7: ['complete']
Row 11: ['Blind', '5', '1', '4', '34.5%, n=1', '1199 sec, n=1']
Row 12: ['Low Vision', '5', '2', '3', '98.3% n=2 \n(97.7%, n=3)', '1716 sec, n=3 \n(1934 sec, n=2)']
Row 13: ['Dexterity', '5', '4', '1', '98.3%, n=4', '1672.1 sec, n=4']
Row 14: ['Mobility', '3', '3', '0', '95.4%, n=3', '1416 sec, n=3']


In [7]:
import fitz  # PyMuPDF

# Open the PDF and pick its second page (index 1, because pages are 0-based).
doc = fitz.open(r"C:\Users\pavan\OneDrive\Desktop\table1.pdf")
page = doc[1]

# find_tables() returns a TableFinder; the detected tables are in its .tables list.
table_finder = page.find_tables()
tables = table_finder.tables

# Report how many tables were detected
print(f"{len(tables)} table(s) found on page 2")

# Dump every detected table, one printed line per non-blank row.
for table_index, table in enumerate(tables):
    print(f"\nTable {table_index + 1}:")

    rows = table.rows
    print(f"Number of rows: {len(rows)}")

    # The column count is read off the first row, when there is one
    if rows:
        num_columns = len(rows[0].cells)
        print(f"Number of columns: {num_columns}")

    for row_idx, row in enumerate(rows):
        # One entry per grid cell: the stripped text clipped to the cell's
        # bounding box, or a single space when the cell is missing or has
        # no text, so every column position is preserved.
        row_data = [
            ' ' if cell is None
            else (page.get_text("text", clip=fitz.Rect(cell)).strip() or ' ')
            for cell in row.cells
        ]

        # Skip rows made up entirely of placeholders
        if any(text != ' ' for text in row_data):
            print(f"Row {row_idx + 1}: {row_data}")



2 table(s) found on page 2

Table 1:
Number of rows: 14
Number of columns: 5
Row 1: ['Speed (mph)', 'Driver', 'Car', 'Engine\nDate', ' ']
Row 2: ['407.447', 'Craig Breedlove', 'Spirit of America', 'GE J47', '8/5/63']
Row 3: ['413.199', 'Tom Green', 'Wingfoot Express', 'WE J46', '10/2/64']
Row 4: ['434.22', 'Art Arfons', 'Green Monster', 'GE J79', '10/5/64']
Row 5: ['468.719', 'Craig Breedlove', 'Spirit of America', 'GE J79', '10/13/64']
Row 6: ['526.277', 'Craig Breedlove', 'Spirit of America', 'GE J79', '10/15/65']
Row 7: ['536.712', 'Art Arfons', 'Green Monster', 'GE J79', '10/27/65']
Row 8: ['555.127', 'Craig Breedlove', 'Spirit of America, Sonic 1', 'GE J79', '11/2/65']
Row 9: ['576.553', 'Art Arfons', 'Green Monster', 'GE J79', '11/7/65']
Row 10: ['600.601', 'Craig Breedlove', 'Spirit of America, Sonic 1', 'GE J79', '11/15/65']
Row 11: ['622.407', 'Gary Gabelich', 'Blue Flame', 'Rocket', '10/23/70']
Row 12: ['633.468', 'Richard Noble', 'Thrust 2', 'RR RG 146', '10/4/83']
Row 13: [