# Documentation

This notebook processes cropped table images, extracts the tables they contain, and saves the results as Excel files. Extraction is done with Img2Table, a Python library for identifying and extracting tables from PDFs and images, which relies on OpenCV for image processing. The cropped images are the outputs of the model that was trained with a random state of 88 for splitting the data into training, test, and validation sets.

## Installing Requirements & Importing Libraries

In [None]:
! pip install img2table
! apt install tesseract-ocr
! pip install pytesseract

Collecting img2table
  Downloading img2table-1.2.8-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.4/91.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting pymupdf>=1.19.1 (from img2table)
  Downloading PyMuPDF-1.23.20-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Collecting xlsxwriter>=3.0.6 (from img2table)
  Downloading XlsxWriter-3.1.9-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.23.9 (from pymupdf>=1.19.1->img2table)
  Downloading PyMuPDFb-1.23.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xlsxwriter, PyMuPDFb, pymupdf,

## Cloning Github Repository

In [None]:
! git clone https://github.com/xavctn/img2table.git

Cloning into 'img2table'...
remote: Enumerating objects: 2520, done.[K
remote: Counting objects: 100% (1575/1575), done.[K
remote: Compressing objects: 100% (869/869), done.[K
remote: Total 2520 (delta 935), reused 1102 (delta 662), pack-reused 945[K
Receiving objects: 100% (2520/2520), 6.31 MiB | 17.64 MiB/s, done.
Resolving deltas: 100% (1304/1304), done.


## OCR Engines to Choose From


In [None]:
# Tesseract OCR
# (the `tesseract-ocr` system package is installed in the requirements cell)
from img2table.ocr import TesseractOCR

# One worker thread, English language data.
tesseract_ocr = TesseractOCR(n_threads=1, lang="eng")

In [None]:
! pip install img2table[paddle]

# PaddleOCR
from img2table.ocr import PaddleOCR

paddle_ocr = PaddleOCR(lang="en", kw={"use_dilation": True})

In [None]:
! pip install img2table[easyocr]

# EasyOCR
from img2table.ocr import EasyOCR

easyocr = EasyOCR(lang=["en"], kw={"gpu": False})

In [None]:
!pip install --upgrade img2table

time: 15.4 s (started: 2023-11-10 17:35:04 +00:00)


In [None]:
! pip install doctr

# docTR
from img2table.ocr import DocTR

doctr = DocTR(detect_language=True, kw={"detect_orientation": True})

In [None]:
# Google Vision OCR
from img2table.ocr import VisionOCR

vision_ocr = VisionOCR(api_key="***")

In [None]:
# AWS Textract OCR
from img2table.ocr import TextractOCR

textract_ocr = TextractOCR(aws_access_key_id="***",
                           aws_secret_access_key="***",
                           aws_session_token="***",
                           region="eu-west-1")

In [None]:
# Azure Cognitive Services OCR
from img2table.ocr import AzureOCR

azure_ocr = AzureOCR(endpoint="abc.azure.com",
                           subscription_key="***")

## Setting the Working Directory

In [None]:
import os

# This version of the notebook is written for Google Colab; later cells build
# their paths from `my_wd`.
my_wd = "/content"
os.chdir(my_wd)
print(f"Changed working directory to: {os.getcwd()}")

Changed working directory to: /content


## Mounting Google Drive

In [None]:
from google.colab import drive

# Google Drive is attached at <my_wd>/drive
drive_mount_point = os.path.join(my_wd, "drive")
drive.mount(drive_mount_point)

Mounted at /content/drive


## Loading and Unzipping the Cropped Images from Google Drive

In [None]:
# define the google drive location
drive_location = "drive/your_path.zip"
# create the final path
zip_file_path = os.path.join(my_wd, drive_location)

# keep the name of the zipped folder
zip_basename = os.path.splitext(os.path.basename(drive_location))[0]

# unzip the folder
!unzip -q "{zip_file_path}" -d "{zip_basename}"

In [None]:
from tqdm import tqdm
from img2table.document import Image as Img2TableImage
from img2table.ocr import TesseractOCR, PaddleOCR
import os
from IPython.display import display_html, display
from PIL import Image as PILImage
import sys
sys.path.append('/content/img2table/examples/')
from utils import display_borderless_tables
import pandas as pd


def process_and_save_images(input_folder, output_folder, processing_function, ocr):
    """Annotate every PNG in a folder and export its detected tables to Excel.

    For each ``.png`` file found directly inside ``input_folder`` the function:

    1. loads it as an img2table image,
    2. calls ``processing_function(img, ocr)`` and saves the returned array as an
       image with the same file name inside ``output_folder``,
    3. extracts the tables (borderless tables and implicit rows enabled,
       ``min_confidence=10``) and writes them to
       ``<output_folder>/excel files/<image file name>.xlsx``, one sheet per table
       named ``Table_1``, ``Table_2``, ...

    Args:
        input_folder: Folder that contains the PNG images to process.
        output_folder: Folder that receives the annotated images; it is created
            if missing, together with its ``excel files`` subfolder.
        processing_function: Callable taking ``(img, ocr)`` and returning
            something ``PIL.Image.fromarray`` accepts.
        ocr: img2table OCR instance handed to both ``processing_function`` and
            the table extraction.
    """
    # Sorted so the processing order (and the progress bar) is reproducible;
    # os.listdir returns entries in arbitrary order.
    input_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.png'))

    # Output locations: annotated images at the top level, workbooks in a subfolder
    os.makedirs(output_folder, exist_ok=True)
    excel_folder = os.path.join(output_folder, 'excel files')
    os.makedirs(excel_folder, exist_ok=True)

    for input_file in tqdm(input_files, desc="Processing images", unit="image"):
        # Load the image with img2table
        image_path = os.path.join(input_folder, input_file)
        img = Img2TableImage(image_path)

        # Build and save the annotated picture under the original file name
        processed_img = processing_function(img, ocr)
        output_file = os.path.join(output_folder, input_file)
        PILImage.fromarray(processed_img).save(output_file)

        # Extract the tables with whichever OCR engine was passed in
        tables = img.extract_tables(ocr=ocr, borderless_tables=True, implicit_rows=True, min_confidence=10)

        # ExcelWriter.save() is deprecated and no longer exists in pandas 2.0.
        # The context manager writes and closes the workbook on exit, even if a
        # sheet fails to write.
        excel_path = os.path.join(excel_folder, f"{input_file}.xlsx")
        with pd.ExcelWriter(excel_path, engine='xlsxwriter') as excel_writer:
            # One sheet per detected table
            for i, table in enumerate(tables, start=1):
                table.df.to_excel(excel_writer, sheet_name=f'Table_{i}', index=False, header=False)

    print("Processing and saving completed.")

    # Define the processing function
def process_image(img, ocr):
    """Return what ``display_borderless_tables`` produces for ``img`` with ``ocr``.

    Thin adapter that gives the helper the ``(img, ocr)`` call shape expected by
    ``process_and_save_images``.
    """
    annotated = display_borderless_tables(img=img, ocr=ocr)
    return annotated


## Running the Extraction with TesseractOCR

In [None]:
# Input: the cropped images; output: the results of the Tesseract run
input_folder = os.path.join(my_wd, 'your_path/cropped_images')
output_folder = os.path.join(my_wd, 'Img2Table_results_on_cropped_images_tesseractOCR')

# OCR engine used for this run
ocr = TesseractOCR(n_threads=1, lang="eng")

# Annotate every image and export its tables to Excel
process_and_save_images(input_folder, output_folder, process_image, ocr)

  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_wr

Processing and saving completed.





## Zipping the Results

In [None]:
# Define the path
folder_name = "your_path"

# Create the final path
zip_folder_path = os.path.join(my_wd, folder_name + ".zip")

# Zip the folder
!zip -r -q "{zip_folder_path}" "{folder_name}"

## Running the Extraction with PaddleOCR

In [None]:
! pip install img2table[paddle]

# PaddleOCR
from img2table.ocr import PaddleOCR


Collecting paddlepaddle (from img2table[paddle])
  Downloading paddlepaddle-2.6.0-cp310-cp310-manylinux1_x86_64.whl (125.7 MB)
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m79.5/125.7 MB[0m [31m52.7 MB/s[0m eta [36m0:00:01[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
# Input: the cropped images; output: the results of the PaddleOCR run
input_folder = os.path.join(my_wd, 'your_path/cropped_images')
output_folder = os.path.join(my_wd, 'your output folder path')

# OCR engine used for this run
ocr = PaddleOCR(lang="en", kw={"use_dilation": True})

# Annotate every image and export its tables to Excel
process_and_save_images(input_folder, output_folder, process_image, ocr)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:14<00:00, 276kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:15<00:00, 642kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:12<00:00, 174kiB/s]
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
  excel_writer.save()
 

Processing and saving completed.
time: 16min 28s (started: 2024-01-04 20:24:42 +00:00)





## Zipping the Results

In [None]:
# Define the path
folder_name = "your_path"

# Create the final path
zip_folder_path = os.path.join(my_wd, folder_name + ".zip")

# Zip the folder
!zip -r -q "{zip_folder_path}" "{folder_name}"

time: 909 ms (started: 2024-01-04 20:43:30 +00:00)
