In [None]:
# Install the PDF, layout, OCR and LLM packages requested by this notebook.
# NOTE(review): no package here is version-pinned, so a fresh run may resolve to
# different releases each time - consider pinning once a working set is known.
# NOTE(review): `torch` is installed here without a matching `torchvision`; that may be
# related to the "operator torchvision::nms does not exist" error recorded in a later
# cell of this notebook - TODO confirm the installed torch/torchvision pair.
!pip install pdf2image layoutparser camelot-py[cv] paddleocr paddlepaddle-gpu opencv-python torch langchain langchain_community

# Builds detectron2 from the GitHub source at tag v0.4.
# NOTE(review): presumably needed as a layoutparser backend - verify it is still used.
!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'

In [None]:
# Installs both the CPU (`paddlepaddle`) and GPU (`paddlepaddle-gpu`) wheels; the GPU
# wheel is also already requested by the first install cell of this notebook.
# NOTE(review): presumably only one of the two builds should be installed - TODO confirm
# which one the runtime needs and drop the other.
!pip install paddlepaddle
!pip install paddlepaddle-gpu
# NOTE(review): poppler-utils is presumably the system dependency behind pdf2image - verify.
!apt-get install -y poppler-utils

In [1]:
import os
import time

import cv2
import numpy as np
from pdf2image import convert_from_path, pdfinfo_from_path
from paddleocr import PaddleOCR

# Configuration: input PDF, scratch directories, render resolution, and how many
# pages are rasterized per pdf2image call.
pdf_path = "/kaggle/input/d2k-dataset/TIMPL-Annual-Report-2023-24.pdf"
temp_image_dir = "./temp_images"
temp_table_dir = "./temp_tables"
dpi = 300
page_batch_size = 10  # pages held in memory at once while converting

# Fail early with a clear message instead of a converter error further down.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"PDF not found: {pdf_path}")

os.makedirs(temp_image_dir, exist_ok=True)
os.makedirs(temp_table_dir, exist_ok=True)

# ----------------------------
# STEP 1: Convert PDF pages to images
# ----------------------------
# Pages are rendered in batches of `page_batch_size` rather than all at once, so
# memory use is bounded by one batch instead of by the length of the document.
# Output filenames (page_1.jpg, page_2.jpg, ...) use the 1-based page number.
print("Converting PDF pages to images...")
start_time = time.time()
total_pages = pdfinfo_from_path(pdf_path)["Pages"]
image_paths = []
for batch_start in range(1, total_pages + 1, page_batch_size):
    batch_end = min(batch_start + page_batch_size - 1, total_pages)
    # NOTE: after the loop `pages` holds only the final batch, not the whole document.
    pages = convert_from_path(pdf_path, dpi=dpi, first_page=batch_start, last_page=batch_end)
    for page_number, page in enumerate(pages, start=batch_start):
        image_file = os.path.join(temp_image_dir, f"page_{page_number}.jpg")
        page.save(image_file, "JPEG")
        image_paths.append(image_file)
print(f"Converted {len(image_paths)} pages in {time.time() - start_time:.2f} seconds.")

# ----------------------------
# STEP 2: Initialize PaddleOCR (GPU requested)
# ----------------------------
# NOTE(review): `use_gpu=True` is passed unconditionally; whether this falls back to
# CPU when no GPU build is present depends on the installed paddleocr - verify.
ocr = PaddleOCR(use_gpu=True, lang="en")

# ----------------------------
# STEP 3: Define table detection using OpenCV
# ----------------------------
def _keep_long_lines(binary, kernel_shape):
    """Erode then dilate ``binary`` with a rectangular kernel of ``kernel_shape`` (width, height).

    Only foreground runs at least as large as the kernel survive the erosion; the
    dilation then restores them to their original extent.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_shape)
    return cv2.dilate(cv2.erode(binary, kernel), kernel)


def detect_tables_opencv(image, line_scale=30, min_width=50, min_height=50):
    """
    Detects table regions in a scanned image using morphological operations.

    Long horizontal and vertical strokes are isolated from an inverted adaptive
    threshold of the image, merged into a single mask, and the bounding box of each
    external contour of that mask is reported as a candidate table.

    Args:
        image: Page image as an array. 3-channel input is treated as BGR and
            4-channel input as BGRA; 2-D or single-channel input is used directly
            as the grayscale plane.
        line_scale: A stroke must span at least ``dimension // line_scale`` pixels
            to count as a ruling line (larger value -> shorter lines accepted).
        min_width: Boxes must be strictly wider than this many pixels.
        min_height: Boxes must be strictly taller than this many pixels.

    Returns:
        A list of bounding boxes (x, y, w, h) for potential tables, in the order
        the contours were found. Empty if ``image`` is None or has no pixels.

    Raises:
        ValueError: If ``line_scale`` is smaller than 1.
    """
    if line_scale < 1:
        raise ValueError(f"line_scale must be >= 1, got {line_scale!r}")
    # cv2.imread returns None on failure; treat that and zero-sized arrays as "no tables".
    if image is None or image.size == 0:
        return []

    # Reduce to a single grayscale plane without assuming the input is 3-channel BGR.
    if image.ndim == 2:
        gray = image
    elif image.shape[2] == 1:
        gray = image[:, :, 0]
    elif image.shape[2] == 4:
        gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Invert so dark ink becomes foreground, then binarize with a local mean threshold.
    thresh = cv2.adaptiveThreshold(~gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                   cv2.THRESH_BINARY, blockSize=15, C=-2)

    rows, cols = thresh.shape[:2]
    horizontal = _keep_long_lines(thresh, (max(1, cols // line_scale), 1))
    vertical = _keep_long_lines(thresh, (1, max(1, rows // line_scale)))

    # Combine horizontal and vertical lines to obtain a mask of table structures.
    table_mask = cv2.add(horizontal, vertical)

    # findContours returns (contours, hierarchy) on OpenCV 4.x and
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list in both.
    contours = cv2.findContours(table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    table_boxes = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Filter out small regions that are unlikely to be tables.
        if w > min_width and h > min_height:
            table_boxes.append((x, y, w, h))
    return table_boxes

# ----------------------------
# STEP 4: Process each page: detect tables, run OCR, and save table images
# ----------------------------
def _ocr_text_lines(ocr_result):
    """Flatten an OCR result into a list of recognized text strings.

    ``ocr_result`` is iterated as a list of per-image results, each a list of
    entries whose second element is indexed as ``entry[1][0]`` for the text.
    ``None`` or empty values at either level are skipped.
    NOTE(review): the exact PaddleOCR result layout is assumed from this indexing -
    confirm against the installed paddleocr version.
    """
    text_lines = []
    for page_result in ocr_result or []:
        for line in page_result or []:
            if len(line) > 1 and line[1] is not None:
                text_lines.append(line[1][0])
    return text_lines


all_page_text = []  # per-page text assembled from the detected tables
all_tables = []     # one record per detected table region

print("Processing pages for table extraction...")
for image_path in image_paths:
    print(f"\nProcessing {image_path}...")
    image = cv2.imread(image_path)
    if image is None:
        print(f"Warning: Could not read image from {image_path}.")
        continue

    # Order the boxes top-to-bottom, then left-to-right, so table numbering and the
    # assembled page text follow reading order rather than contour-discovery order.
    table_boxes = sorted(detect_tables_opencv(image), key=lambda b: (b[1], b[0]))
    print(f"Detected {len(table_boxes)} potential table region(s).")

    page_stem = os.path.splitext(os.path.basename(image_path))[0]
    page_text = ""
    for idx, box in enumerate(table_boxes, start=1):
        x, y, w, h = box
        table_region = image[y:y+h, x:x+w]

        # Save the table image to the temp_table_dir; imwrite signals failure by
        # returning False, so only report success when the write actually happened.
        table_img_filename = f"{page_stem}_table_{idx}.jpg"
        table_img_path = os.path.join(temp_table_dir, table_img_filename)
        if cv2.imwrite(table_img_path, table_region):
            print(f"Saved table region {idx} as {table_img_path}.")
        else:
            print(f"Warning: Could not save table region {idx} to {table_img_path}.")

        # Run OCR on the table region.
        ocr_result = ocr.ocr(table_region, cls=True)
        table_text_lines = _ocr_text_lines(ocr_result)
        # Report on extracted lines, not on the truthiness of the raw result: a
        # non-empty result such as [None] still means nothing was recognized.
        if not table_text_lines:
            print("No OCR result returned for this table region.")

        table_text = "\n".join(table_text_lines)
        print(f"Extracted text from table {idx}:\n{table_text}\n")
        all_tables.append({
            "page": image_path,
            "bbox": box,
            "table_text": table_text,
            "table_image_path": table_img_path
        })
        page_text += f"\n[Table {idx} Detected]\n" + table_text + "\n"

    all_page_text.append(page_text)

document_text = "\n\n".join(all_page_text)

# ----------------------------
# STEP 5: Output results
# ----------------------------
print("\nFinal Extracted Table Text (excerpt):")
for idx, table in enumerate(all_tables, start=1):
    print(f"\nTable {idx} (from {table['page']}):")
    print("Bounding Box:", table['bbox'])
    print("Extracted Text:\n", table['table_text'])
    print("Table Image Saved at:", table['table_image_path'])

ModuleNotFoundError: No module named 'paddleocr'

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch import softmax
import torch

RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):
operator torchvision::nms does not exist