In [None]:
import pdf2image
import os
import glob
import cv2
import numpy as np
import layoutparser as lp
import matplotlib.pyplot as plt

### Open pdf

In [None]:
# Source PDF that the following cells convert to images and analyze.
path_pdf = "/resources/datasets/image_recognition/sentencias/FALLO_TOMO CAF 023440_2022_CS001.pdf"

In [None]:
# Convert the PDF with pdf2image's default settings; the last expression shows
# how many items the conversion returned (presumably one per page — confirm
# against the pdf2image documentation).
doc = pdf2image.convert_from_path(path_pdf)
len(doc)

### Create png by page

In [None]:
# Folder that receives the PNG files generated from the PDF.
OUTPUT_PATH = '/resources/datasets/image_recognition/sentencias/images/'

In [None]:
def create_image_from_pdf(pdf_path: str, OUTPUT_PATH: str):
    """Render a PDF to PNG files and give the first generated file a stable name.

    ``pdf2image.convert_from_path`` is asked to write 600-dpi PNG files into
    ``OUTPUT_PATH`` and to return only their paths. The first returned path is
    then renamed to ``<pdf file name>.png`` inside the same folder; any other
    generated file keeps the name pdf2image gave it.

    Args:
        pdf_path: Path of the PDF to convert.
        OUTPUT_PATH: Folder that receives the PNG files. It is created when it
            does not exist yet.

    Raises:
        IndexError: If the conversion returned no file path.
    """
    # Make sure the destination exists before the converter writes into it.
    if OUTPUT_PATH:
        os.makedirs(OUTPUT_PATH, exist_ok=True)

    page_paths = pdf2image.convert_from_path(
        pdf_path,
        dpi=600,
        output_folder=OUTPUT_PATH,
        fmt='png',
        paths_only=True
    )
    if not page_paths:
        raise IndexError(f'no image was generated from {pdf_path!r}')

    target_name = f'{os.path.basename(pdf_path)}.png'
    # os.path.join avoids the doubled separator that plain concatenation
    # produces when OUTPUT_PATH already ends with "/".
    target_path = os.path.join(OUTPUT_PATH, target_name)
    os.rename(page_paths[0], target_path)

In [None]:
# Convert the source PDF into PNG files stored under OUTPUT_PATH.
create_image_from_pdf(path_pdf, OUTPUT_PATH)

### Open images and detect figures

In [None]:
def apply_image_detect(img_array):
    """Run the PubLayNet layout model on an image and return what it detects.

    The Detectron2 model is built on the first call and stored on the function
    object, so later calls reuse it instead of constructing it again for
    every image.

    Args:
        img_array: Image to analyze, passed unchanged to ``model.detect``.

    Returns:
        The value returned by ``model.detect`` for ``img_array``.
    """
    model = getattr(apply_image_detect, "_model", None)
    if model is None:
        model = lp.Detectron2LayoutModel("lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
                                         extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
                                         label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})
        # Cache on the function itself: no extra module-level name is needed.
        apply_image_detect._model = model
    return model.detect(img_array)
    
def transform_images(folder_images_path: str):
    """Load every image file matching a glob pattern as a NumPy array.

    Args:
        folder_images_path: Glob pattern selecting the image files, for
            example ``/data/images/*.png``.

    Returns:
        A list with one array per readable file, ordered by file path so that
        positions in the list are the same on every run.
    """
    imgs_array = []
    # glob gives no ordering guarantee; sort so later positional indexing is stable.
    for file in sorted(glob.glob(folder_images_path)):
        image = cv2.imread(file)
        # cv2.imread reports an unreadable file by returning None instead of raising.
        if image is None:
            continue
        imgs_array.append(np.asarray(image))
    return imgs_array

def classifier(img_array):
    """Tell whether the layout model detects at least one figure in an image.

    Every detected block is inspected, not only the first one.

    Args:
        img_array: Image handed to ``apply_image_detect``.

    Returns:
        True if any detected block has type ``'Figure'``; False otherwise,
        including when nothing is detected at all.
    """
    layout = apply_image_detect(img_array)
    return any(block_type == 'Figure' for block_type in layout.get_info('type'))

In [None]:
# Load every PNG written to OUTPUT_PATH; reusing the constant keeps this cell in
# sync with the folder the conversion step wrote to.
images = transform_images(os.path.join(OUTPUT_PATH, "*.png"))

In [None]:
# Keep only the images in which the classifier reports a figure; classifier is
# called once per image.
img_fig = [img for img in images if classifier(img)]

In [None]:
# Number of images flagged as containing a figure.
len(img_fig)

In [None]:
# Optional visual check, currently disabled: uncomment to preview one of the
# flagged images.
# plt.figure(figsize=(10,40))
# plt.imshow(img_fig[4])

### Analyze images

In [None]:
def get_images_boxes(img_array):
    """Return the bounding box of the first 'Figure' block detected in an image.

    The figure row is selected by position, so it is found no matter where it
    appears among the detected blocks.

    Args:
        img_array: Image handed to ``apply_image_detect``.

    Returns:
        Dict with integer keys ``'x_1'``, ``'y_1'``, ``'x_2'`` and ``'y_2'``
        taken from the first detected figure (values truncated with ``int``).

    Raises:
        KeyError: If no block of type ``'Figure'`` was detected.
    """
    df_lyt = apply_image_detect(img_array).to_dataframe()
    df_lyt_fig = df_lyt[df_lyt['type'] == 'Figure']
    if df_lyt_fig.empty:
        raise KeyError("no 'Figure' block was detected in the image")

    # .iloc is positional: the filtered frame keeps the original row labels,
    # so label 0 exists only when the very first detected block is a figure.
    first_fig = df_lyt_fig.iloc[0]
    return {key: int(first_fig[key]) for key in ('x_1', 'y_1', 'x_2', 'y_2')}

In [None]:
# Pick one flagged image to experiment with. The index 4 is hard-coded, so this
# cell needs at least five entries in img_fig.
img_test = img_fig[4]
# Bounding box of a 'Figure' block detected in that image.
box = get_images_boxes(img_test)
box

In [None]:
# Draw the detected box on an RGB copy of the image: cv2.imread yields BGR
# channel order while matplotlib expects RGB, and drawing on a copy keeps the
# annotation out of img_test, which the following cells analyze.
img_test_boxed = cv2.cvtColor(img_test, cv2.COLOR_BGR2RGB)
cv2.rectangle(img_test_boxed, (box['x_1'], box['y_1']), (box['x_2'], box['y_2']), (255, 0, 0), 0)
plt.figure(figsize=(10,40))
plt.imshow(img_test_boxed);

### Analyze figures in images

In [None]:
def face_recognition(img_array, box_fig,
                     cascade_path="/resources/datasets/image_recognition/haarcascade_frontalface_default.xml"):
    """Crop a region of an image and detect faces inside it with a Haar cascade.

    Args:
        img_array: Image to crop. The crop is taken from this argument, not
            from any notebook-level variable.
        box_fig: Dict with keys ``'x_1'``, ``'y_1'``, ``'x_2'`` and ``'y_2'``
            delimiting the region to crop.
        cascade_path: Path of the cascade XML file given to
            ``cv2.CascadeClassifier``.

    Returns:
        Tuple ``(fig, faces)``: ``fig`` is the cropped slice of ``img_array``
        and ``faces`` is the result of ``detectMultiScale`` on its grayscale
        version, so face coordinates are relative to the crop.
    """
    face_cascade = cv2.CascadeClassifier(cascade_path)
    # Slicing keeps a link to img_array: in-place edits of fig show up in it.
    fig = img_array[box_fig['y_1']:box_fig['y_2'], box_fig['x_1']:box_fig['x_2']]
    gray = cv2.cvtColor(fig, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 5)
    return fig, faces

def blurear(img_array, list_area, kernel_size=(1001, 1001)):
    """Blur rectangular areas of an image in place with a Gaussian filter.

    Args:
        img_array: Image to modify; the listed areas are overwritten in it.
        list_area: Iterable of ``(x, y, w, h)`` rectangles to blur. An empty
            iterable leaves the image untouched.
        kernel_size: Kernel size passed to ``cv2.GaussianBlur``. Defaults to
            the ``(1001, 1001)`` used originally.

    Returns:
        The same ``img_array`` object, after the areas were blurred.
    """
    for (x, y, w, h) in list_area:
        rows = slice(y, y + h)
        cols = slice(x, x + w)
        img_array[rows, cols] = cv2.GaussianBlur(img_array[rows, cols], kernel_size, 0)
    return img_array

In [None]:
# Crop the detected figure region and look for faces inside that crop.
fig, faces = face_recognition(img_test, box)

In [None]:
# fig is a slice of img_test, so blurring it in place also changes img_test.
blurear(fig,faces)
plt.figure(figsize=(10,40))
# cv2.imread yields BGR channel order; matplotlib expects RGB.
plt.imshow(cv2.cvtColor(img_test, cv2.COLOR_BGR2RGB));