<a href="https://colab.research.google.com/github/Strojove-uceni/2024-final-pr-team/blob/main/TabuVision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install modules

In [None]:
!apt-get update
!apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-ces
!wget https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata/
!pip install pytesseract opencv-python pillow numpy scikit-image
!pip install pdf2image
!apt-get install -y poppler-utils
!pip install ultralytics

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (185.125.190.83)] [Connected to cloud.r-project.org (3.171.85.                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
0% [2 InRelease 43.1 kB/128 kB 34%] [Connecting to security.ubuntu.com (185.125.190.83)] [Connected                                                                                                     Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [2 InRelease 79.3 kB/128 kB 62%] [Waiting for headers] [3 InRelease 0 B/3,626 B 0%] [Connected to0% [2 InRelease 79.3 kB/128 kB 62%] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.10% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                   

## Download files from the cloud

In [None]:
!pip install gdown

# Sdílený odkaz na složku Drive
shared_folder_url = "https://drive.google.com/drive/folders/1iv-WoXXZHMABZWwk4y3YCojuTmzGbYkh?usp=drive_link"

# ID složky (získáno z odkazu sdílení)
folder_id = "1iv-WoXXZHMABZWwk4y3YCojuTmzGbYkh"

# Stažení složky pomocí gdown
!gdown --folder "$folder_id" -O TabuVision

# Zobrazení stažených souborů
!ls TabuVision


Retrieving folder contents
Retrieving folder 1zigpFrXSmVEqQquok5KN9v0WbG6DyJFE backend
Retrieving folder 1WlR3cLduzeF1bEk1YcpPRXJNQAz_fpWF TableDetection_utils
Processing file 11EgsPte-c4LrrzYdkbqeOFv1vVV1tX-7 RotateDetection.py
Processing file 1qvhosKQuvSXTtFXqycZ2wOF_7xWQSSaE SkewDetection.py
Retrieving folder 1ZFVxW78hp5sWNzXv8xnHwM52f4yy62w2 weights
Processing file 1v5gbjRz99BOasajuGZbGG3VYePrtmUPV table_detection_yolov8m.pt
Processing file 1SfaF2TN3Hj6_Xxo-xqBTxQC2lw14Os9D table_structure_detection_yolo8m.pt
Processing file 1TfAzkdQN2nfEu84OqmhXHKJ_I4dgjBny ContentDetection.py
Processing file 1QaDlXfrQVnlrNcBiKZQDl0yrZnLpoT_c ModelHandler.py
Processing file 1fhfX6JBFhtISA_J7We_DA2gPoMx21CHM prediction.JPEG
Processing file 1ENATimUUq1G2hwUW8MVjLB8cWkWGERvs StructureDetection.py
Processing file 1uSniouyWthtaW61eR0_NGVDTCnqBC8AN TableDetection.py
Retrieving folder 13fxnJ9pIRhzudlBroy5eSbNJfCaV-5Gw images
Processing file 1dhGF7WyYFHXICS0aJfw69DwcDDE83HLk 245.jpg
Retrieving folder 1Pn9

In [None]:
# NOTE(review): no cell visible in this notebook creates 'shared_project' (the download
# cell writes to 'TabuVision') — confirm this directory exists on a fresh runtime.
!ls shared_project

17.jpg		 PDFs			 PMC497044_table_0.xml	table_test.pdf
backend		 PMC1079800_table_2.jpg  prediction.JPEG	TabuVision.py
inspector.ipynb  PMC1079800_table_2.xml  __pycache__		test_image.png
output.html	 PMC497044_table_0.jpg	 Table.py		utils


# TabuVision
## TabuVision demo

In [None]:
from backend.utils.TableExtractor import TableExtractorCluster, extract_cells
from backend.TableDetection import TableDetection
from backend.StructureDetection import StructureDetection
from backend.ContentDetection import ContentDetection
from PIL import Image
from pathlib import Path
from backend.utils.utils import PDFFormatToPIL, clean_dir_files
import os


class TabuVision:
    """
    Table transformation pipeline.

    A file is split into page images, tables are detected on each page, the structure of
    every table is predicted, cell content is read, and the result is serialized in the
    requested output format. The heavy lifting is delegated to classes from the backend folder.
    """

    def __init__(self, format: str, debug: bool = False):
        """
        Build all pipeline units and prepare the working directories.

        :param format: output format of the extracted tables. Only 'html' is handled by
            `to_pipeline`; any other value yields empty per-page results.
        :param debug: boolean flag whether to print logs, show log images and other information.
        """

        # Keep the flag on the instance: `to_pipeline` and `extract` branch on `self.debug`.
        self.debug = debug
        self.table_name = None

        # Initialize detection units
        self.TableDetectionUnit = TableDetection(debug=debug)
        self.StructureDetectionUnit = StructureDetection(debug=debug)
        self.ContentDetectionUnit = ContentDetection(debug=debug)

        # Initialize table extractor
        self.TableExtractorClusterUnit = TableExtractorCluster(debug=debug)

        # Set attributes
        self.allowed_suffix_image = ['.jpeg', '.jpg', '.png']
        self.cache_dir = 'cache'
        self.output_dir = 'output'
        self.format = format

        # Set allowed file formats
        # In case of adding new formats, you only need to specify the file suffix and
        # provide a function that takes a file_path as input and returns a list of PIL.Image objects.
        PDFToImage = PDFFormatToPIL(debug=debug)
        self.allowed_suffix_others = {'.pdf': PDFToImage}

        # Clean cache and output dirs
        self.setup_dirs()

    def __call__(self, filepath: str, table_name: str = 'table'):
        """
        Shortcut for `run`: starts table extraction for one file.

        :param filepath: filepath of the file to be processed.
        :param table_name: name of the table to be processed (optional, same default as `run`).
        :return: extracted tables in given format (see `run`).
        """
        return self.run(filepath, table_name)

    def setup_dirs(self):
        """
        Setup cache and output directories: create each one if it is missing,
        otherwise clean the files it already contains.
        """

        for directory in (self.cache_dir, self.output_dir):
            if os.path.exists(directory):
                clean_dir_files(directory)
            else:
                os.makedirs(directory)

    def run(self, filepath: str, table_name: str = 'table'):
        """
        Run file extraction and pass every page to the table extraction pipeline.

        :param filepath: filepath of the file to be processed.
        :param table_name: name of the table to be processed (optional).
        :return: list with one entry per page, each entry being the result of `to_pipeline`
            for that page (a list of tables, or None when no table was detected on the page).
            Returns None when no page image could be extracted from the file.
        """

        self.table_name = table_name

        # Extract page images from file
        images = self.extract(filepath)

        if not images:
            print(f'No tables found for file {filepath}!')
            return None

        # Pass images to extraction pipeline, one result per page
        return [self.to_pipeline(image) for image in images]

    def to_pipeline(self, page_img: "Image.Image" = None):
        """
        Complete pipeline of processing image of the page and extracting tables.

        :param page_img: Input image of the page.
        :return: list of extracted tables in given format, or None when no table is detected.
            The list stays empty when `self.format` is not 'html'.
        """

        # Step 1: Detect the tables
        table_images = self.TableDetectionUnit.to_pipeline(page_img)

        if table_images is None or len(table_images) == 0:
            print('No tables detected!')
            return None

        # Analyse structure of each table; tables are numbered from 1 within the page
        processed_table_list = []

        for table_idx, table_img in enumerate(table_images, start=1):

            # Step 2: Detect table structure and return predicted objects (class, bbox)
            predicted_objects = self.StructureDetectionUnit.to_pipeline(table_img)

            # Step 3: Retrieve table from predicted objects
            table_object = self.TableExtractorClusterUnit(
                predicted_objects,
                f'{self.table_name}_{table_idx}',
                image_size=table_img.size
            )

            # Show detected table structure
            if self.debug:
                table_object.plot_table(image=table_img)

            # Step 4: Extract cell content
            # Detects content of each cell using OCR.
            # Parameter 'fill_on_error' indicates whether cell image should be retrieved when OCR detection fails.
            table_object = extract_cells(
                table_img,
                table_object,
                mode='ocr',
                fill_on_error=True,
                ContentDetectionUnit=self.ContentDetectionUnit,
                cache_dir=self.cache_dir,
                log_progress=True
            )

            # Step 5: Build table in given format out of general table object.
            # Only 'html' is implemented; other formats add nothing to the result.
            if self.format == 'html':
                table_html = table_object.to_html(
                    file_name=f'{self.output_dir}/{table_object.filename}_.html',
                    cache_dir=self.cache_dir
                )
                processed_table_list.append(table_html)

        return processed_table_list

    def extract(self, file_path: str):
        """
        Extract pages from a file in format of PIL.Image list. Valid file formats can be either images or more
        complex files (containing more pages) - for example PDF file.

        :param file_path: path to the file to be extracted.
        :return: list of pages in PIL.Image format, or None when converting a non-image file fails.
        :raises ValueError: when the file suffix is neither a supported image nor a registered format.
        """

        file_path = Path(file_path)
        file_suffix = file_path.suffix.lower()

        # Image file: a single page
        if file_suffix in self.allowed_suffix_image:
            if self.debug:
                print(f"Processing image file: {file_path}")

            return [Image.open(file_path)]

        # Other file types: delegate to the converter registered for the suffix
        if file_suffix in self.allowed_suffix_others:
            try:
                transformation_func = self.allowed_suffix_others[file_suffix]
                return transformation_func(file_path)

            except Exception as e:
                # Best effort: report the problem and let `run` handle the missing pages.
                print(f'During extracting file with suffix {file_suffix} following error occurred: {e}.')
                return None

        raise ValueError(f"Unsupported file type: {file_path.suffix}")

Processing image file: /content/TabuVision/images/245.jpg
Image name: /content/TabuVision/images/245.jpg
Prediction speed: {'preprocess': 13.765335083007812, 'inference': 3503.561496734619, 'postprocess': 1.394510269165039}
Image has been saved to file 'prediction.JPEG'
Deskewed image: detected angle -0.4162766037009366deg.
Deskewed image: detected angle -0.4162766037009366deg.


In [None]:
# *printing HTML code*
from IPython.display import HTML

def display_pretty_table(table_html):
    """
    Render an HTML table in the notebook output, preceded by a stylesheet.

    :param table_html: HTML markup of the table to show.
    """
    # Stylesheet emitted in front of the table markup.
    page_css = """
            <style>
          body {
            font-family: Arial, sans-serif;
            background-color: #f9f9f9;
            margin: 20px;
          }

          table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            background-color: white;
            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
            border-radius: 8px;
            overflow: hidden;
          }

          th, td {
            padding: 12px 15px;
            text-align: left;
          }

          th {
            background-color: #f2f2f2;
            color: #333;
            font-weight: bold;
            text-transform: uppercase;
            font-size: 14px;
            border-bottom: 2px solid #e0e0e0;
          }

          tr {
            border-bottom: 1px solid #e0e0e0;
          }

          tr:nth-of-type(even) {
            background-color: #f9f9f9;
          }

          td {
            color: #555;
            font-size: 14px;
          }

          caption {
            margin-bottom: 10px;
            font-size: 18px;
            font-weight: bold;
            color: #333;
          }
        </style>
        """
    # Stylesheet and table are separated by a single space.
    rendered_markup = ' '.join((page_css, table_html))
    display(HTML(rendered_markup))


## Let's initialize TabuVision
Just specify the output format of the extracted tables.

In [None]:
# Pipeline instance that serializes extracted tables as HTML.
TabuVisionApp = TabuVision(format='html')

## TabuVision can extract a table from an image ...

In [None]:
# Sample image from the downloaded TabuVision folder (the download log lists images/245.jpg).
# The previous '/Users/...' path only existed on the author's machine.
output_list = TabuVisionApp(
    filepath='/content/TabuVision/images/245.jpg',
    table_name='tabuvision_demo',
)

In [None]:
# TabuVision.run returns None when no page could be extracted, and a page entry is None
# when no table was detected on that page, so both levels are guarded before iterating.
table_idx = 1
for page in output_list or []:
    for table_html in page or []:
        print(f'Extracted table #{table_idx}:')
        display_pretty_table(table_html)
        print('\n\n')
        table_idx += 1

## ... or process a whole PDF

In [None]:
# The previous path ended in '.pn', which TabuVision.extract rejects as an unsupported file
# type, and it pointed to the author's machine. Use a PDF from the downloaded folder instead.
# NOTE(review): the first PDF found is used — replace with a specific file if a particular one is intended.
pdf_paths = sorted(Path('/content/TabuVision').rglob('*.pdf'))
if not pdf_paths:
    raise FileNotFoundError('No PDF file found under /content/TabuVision.')

output_list = TabuVisionApp(
    filepath=str(pdf_paths[0]),
    table_name='tabuvision_demo',
)

In [None]:
# TabuVision.run returns None when no page could be extracted, and a page entry is None
# when no table was detected on that page, so both levels are guarded before iterating.
table_idx = 1
for page in output_list or []:
    for table_html in page or []:
        print(f'Extracted table #{table_idx}:')
        display_pretty_table(table_html)
        print('\n\n')
        table_idx += 1