# Table Extractor

In [1]:
!pip install paddlepaddle
!pip install paddleocr

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp311-cp311-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Downloading paddlepaddle-2.6.2-cp311-cp311-manylinux1_x86_64.whl (126.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: opt-einsum, astor, paddlepaddle
  Attempting uninstall: opt-einsum
    Found existing installation: opt_einsum 3.4.0
    Uninstalling opt_einsum-3.4.0:
      Successfully uninstalled opt_einsum-3.4.0
Successful

In [3]:
from paddleocr import PaddleOCR
import cv2
import json
import csv
import numpy as np

def _collect_words(ocr_result):
    """Flatten an OCR result into ``{'text', 'x', 'y'}`` records.

    Each detection is indexed as ``word[0]`` (bounding box: a sequence of
    ``(x, y)`` points) and ``word[1][0]`` (recognised text). The record's
    position is the mean of the bounding-box points.
    """
    words = []
    for line in ocr_result or []:
        # PaddleOCR 2.x yields None for an image in which no text was detected.
        if not line:
            continue
        for word in line:
            bbox = word[0]
            words.append({
                'text': word[1][0],
                'x': np.mean([point[0] for point in bbox]),
                'y': np.mean([point[1] for point in bbox]),
            })
    return words


def _group_into_rows(words, y_threshold):
    """Cluster records into rows by y, then order each row by x.

    Records are visited in ascending ``(y, x)`` order; a new row starts when
    the y-distance to the *previous* record exceeds ``y_threshold``.
    """
    rows = []
    current_row = []
    previous_y = None
    for word in sorted(words, key=lambda w: (w['y'], w['x'])):
        if previous_y is not None and abs(word['y'] - previous_y) > y_threshold:
            rows.append(sorted(current_row, key=lambda w: w['x']))
            current_row = []
        current_row.append(word)
        previous_y = word['y']
    if current_row:
        rows.append(sorted(current_row, key=lambda w: w['x']))
    return rows


def process_image(image_path, y_threshold=10, json_path='table_data.json',
                  csv_path='table_data.csv', ocr=None):
    """Run OCR on a table image and export the recognised cells.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        y_threshold: Maximum y-distance between consecutive detections for
            them to be placed in the same row. Adjust to the table's row height.
        json_path: Where to write the table as JSON; ``None`` skips the export.
        csv_path: Where to write the table as CSV; ``None`` skips the export.
        ocr: Optional, already constructed OCR engine exposing ``.ocr(image)``.
            Pass one to avoid re-initialising PaddleOCR for every image.
            Defaults to ``PaddleOCR(lang='en', show_log=False)``.

    Returns:
        A list of rows in ascending y order; each row is a list of the
        recognised text strings in ascending x order. Empty if no text was
        detected.

    Raises:
        FileNotFoundError: If the image cannot be read (missing file or a
            format OpenCV cannot decode).
    """
    # Read the image first so a bad path fails fast, before any OCR model setup.
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(
            f"Could not read image (missing or undecodable): {image_path!r}"
        )
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    if ocr is None:
        ocr = PaddleOCR(lang='en', show_log=False)

    # Perform OCR and arrange the detections into a 2D table of strings.
    words = _collect_words(ocr.ocr(gray))
    final_table = [
        [cell['text'] for cell in row]
        for row in _group_into_rows(words, y_threshold)
    ]

    # Export to JSON
    if json_path is not None:
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(final_table, f)

    # Export to CSV (newline='' lets the csv module control line endings)
    if csv_path is not None:
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(final_table)

    return final_table

# Run the extraction on a sample image when this cell is executed.
if __name__ == '__main__':
    # NOTE(review): hard-coded absolute path (looks like a Colab upload location) —
    # adjust to wherever the input image lives in your environment.
    process_image('/content/table_4.png')

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:16<00:00, 237.22it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:18<00:00, 535.32it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:14<00:00, 145.87it/s]




# Inference: Visualise the Extracted Table

In [None]:
import json
import csv
from tabulate import tabulate

def reconstruct_from_json(json_file):
    """Load a table from a JSON file and print it as a grid.

    Args:
        json_file: Path of the JSON file to read. Its parsed content is
            handed to ``tabulate`` as the table rows.

    Returns:
        None. The rendered table is written to standard output.
    """
    with open(json_file) as handle:
        rows = json.loads(handle.read())
    rendered = tabulate(rows, tablefmt='grid')
    print(rendered)

# def reconstruct_from_csv(csv_file):
#     with open(csv_file) as f:
#         reader = csv.reader(f)
#         data = [row for row in reader]
#     print(tabulate(data, headers='firstrow', tablefmt='grid'))

if __name__ == '__main__':
    # Example usage: print the table stored in table_data.json as a grid.
    # The file is expected to exist in the working directory (the extraction
    # cell writes a file with this name).
    reconstruct_from_json('table_data.json')
    # CSV variant, disabled because reconstruct_from_csv is commented out:
    # reconstruct_from_csv('table_data.csv')

+----------+------------+----------+----------+
| Beyond   | Move       | Really   | Mind     |
+----------+------------+----------+----------+
| 102      | Technology | 649.31   | 417      |
+----------+------------+----------+----------+
| 260.74   | Project    | Remember | 364      |
+----------+------------+----------+----------+
| 785.18   | Stock      | 269      | Become   |
+----------+------------+----------+----------+
| Father   | 651.64     | Word     | 246      |
+----------+------------+----------+----------+
| Actually | Serious    | 491      | Although |
+----------+------------+----------+----------+
| 512.01   | 263.16     | Free     | 619      |
+----------+------------+----------+----------+
