In [None]:
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import pytesseract
from shapely import LineString, Point, Polygon

# Dataset ingest

In [None]:
class Caja(object):
    """Data structure describing one labelled segment (bounding box).

    The box is stored as two corners: ``(x_1, y_1)`` is the corner given by the
    annotation and ``(x_2, y_2)`` is that corner shifted by the box width and
    height.

    Args:
        archivo: Name of the image file the segment belongs to.
        bounding_box: Mapping with the keys ``'x'``, ``'y'``, ``'w'`` and ``'h'``.
        etiqueta: Label assigned to the segment.
        contenido: Content annotated for the segment.

    Raises:
        KeyError: If ``bounding_box`` is a dict lacking one of the four keys.
    """
    def __init__(self, archivo, bounding_box, etiqueta, contenido):
        self.file = archivo
        self.x_1 = bounding_box['x']
        self.y_1 = bounding_box['y']
        self.x_2 = bounding_box['x'] + bounding_box['w']
        self.y_2 = bounding_box['y'] + bounding_box['h']
        self.content = contenido
        self.label = etiqueta

    def to_json(self):
        """Return the attributes as a new dict (attribute name -> value)."""
        # Hand out a copy: returning ``self.__dict__`` itself would let callers
        # modify this object just by editing the returned dict.
        return dict(self.__dict__)

    def to_pd_series(self):
        """Return the attributes as a pandas Series indexed by attribute name."""
        return pd.Series(self.__dict__)

In [None]:
def get_segments_from_annotations(path_json, df=None):
    """Collect every annotated segment from the JSON files in ``path_json``.

    Each ``*.json`` file is expected to hold a ``'Notas'`` list; every note maps
    a segment label to a list of details carrying ``'bounding_box'`` and
    ``'text'``. One row is produced per detail.

    Args:
        path_json: Directory containing the annotation files.
        df: Unused; kept only so existing two-argument calls keep working.

    Returns:
        A DataFrame with the columns ``file, x_1, y_1, x_2, y_2, label,
        content`` (in that order). ``file`` is the annotation file name with
        the ``.json`` suffix replaced by ``.tif``.

    Raises:
        KeyError: If a JSON file has no ``'Notas'`` entry.
    """
    features = ['file', 'x_1', 'y_1', 'x_2', 'y_2', 'label', 'content']
    records = []

    # Sorted so the row order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(path_json)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(path_json, filename)
        # JSON is UTF-8 by specification; do not rely on the locale encoding.
        with open(file_path, "r", encoding="utf-8") as json_file:
            datos = json.load(json_file)

        tif_name = filename.replace('.json', '.tif')
        for nota in datos['Notas']:
            for segmento in nota:
                for detalle in nota[segmento]:
                    try:
                        caja = Caja(tif_name, detalle['bounding_box'], segmento, detalle['text'])
                    except (KeyError, TypeError):
                        # Malformed detail: report it and keep going. The message
                        # uses tif_name because `caja` does not exist when the
                        # constructor is what failed.
                        print(f'Archivo: {tif_name} | Segmento: {segmento}  | Estado ERROR!')
                        continue
                    records.append(caja.to_json())

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic anyway.
    return pd.DataFrame(records, columns=features)

In [None]:
def load_images_from_dir(path_tif, df=None):
    """Load every ``.tif`` image found in ``path_tif`` with OpenCV.

    Args:
        path_tif: Directory containing the images.
        df: Unused; kept only so existing two-argument calls keep working.

    Returns:
        A DataFrame with one row per readable image and the columns ``file``
        (file name), ``bitmap`` (array returned by ``cv2.imread``), ``height``
        and ``width`` (first and second entries of the array shape).
    """
    records = []

    # Sorted so the row order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(path_tif)):
        # Only TIFF files are images; anything else in the directory is skipped
        # entirely instead of producing a row.
        if not filename.endswith(".tif"):
            continue

        tif_file = os.path.join(path_tif, filename)
        imagen_cv = cv2.imread(tif_file)
        if imagen_cv is None:
            # cv2.imread signals an unreadable file by returning None.
            print(f'Archivo: {filename} | Estado ERROR!')
            continue

        records.append({
            'file': filename,
            'bitmap': imagen_cv,
            'height': imagen_cv.shape[0],
            'width': imagen_cv.shape[1],
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=['file', 'bitmap', 'height', 'width'])

def show_image(imagen_cv):
    """Display an image previously loaded with cv2, without axes.

    Args:
        imagen_cv: Image array as returned by ``cv2.imread``.
    """
    # OpenCV stores colour images as BGR while matplotlib expects RGB, so
    # three-channel images are converted before plotting. Other shapes
    # (e.g. single-channel) are passed through untouched.
    if imagen_cv.ndim == 3 and imagen_cv.shape[2] == 3:
        imagen_cv = cv2.cvtColor(imagen_cv, cv2.COLOR_BGR2RGB)
    plt.axis('off')
    plt.imshow(imagen_cv)
    
def draw_rectangle_annotations(s_imagen, df_segmentos):
    """Show an image with the bounding boxes of its segments drawn on top.

    Args:
        s_imagen: Image record with the entries ``'file'`` and ``'bitmap'``.
        df_segmentos: Segment table with the columns ``file, x_1, y_1, x_2,
            y_2``; only rows whose ``file`` equals the image's are drawn.
    """
    img_file = s_imagen['file']
    # Draw on a copy: cv2.rectangle writes into the array it is given, and the
    # stored bitmap is reused later, so it must stay free of overlays.
    lienzo = s_imagen['bitmap'].copy()
    segmentos = df_segmentos[df_segmentos['file'] == img_file]

    # Select the coordinates by column name rather than by tuple position so the
    # function does not depend on the column order of df_segmentos.
    esquinas = segmentos[['x_1', 'y_1', 'x_2', 'y_2']]
    for x_1, y_1, x_2, y_2 in esquinas.itertuples(index=False, name=None):
        # Cast to plain ints before handing the corners to OpenCV.
        cv2.rectangle(lienzo, (int(x_1), int(y_1)), (int(x_2), int(y_2)), (36, 255, 12), 2)
    show_image(lienzo)

In [None]:
# Directories holding the JSON annotations and the TIFF images.
path_json, path_tif = 'iaxidentidad_json', 'iaxidentidad_tif'

# Both loaders ignore the frame they receive and return a freshly built one,
# so an empty placeholder is passed inline.
df_segmentos = get_segments_from_annotations(path_json, pd.DataFrame())
df_imagenes = load_images_from_dir(path_tif, pd.DataFrame())

In [None]:
# Preview the first rows of the segment table.
df_segmentos.head()

In [None]:
# Preview the first rows of the image table.
df_imagenes.head()

In [None]:
# Use the first loaded image record as the working example for the cells below.
articulo = df_imagenes.iloc[0]
articulo

In [None]:
# Display the example image as loaded.
show_image(articulo['bitmap'])

In [None]:
# Display the example image with its annotated bounding boxes overlaid.
draw_rectangle_annotations(articulo, df_segmentos)

# Obtener tokens

In [None]:
# Show the example image record again before running OCR on it.
articulo

In [None]:
# Tesseract command-line options handed to pytesseract:
#   -l spa   -> use the Spanish language data
#   --psm 3  -> page segmentation mode 3 (per the Tesseract docs: fully automatic
#               page segmentation without orientation/script detection)
#   --oem 3  -> OCR engine mode 3 (per the Tesseract docs: default engine selection)
mi_cfg = r"-l spa --psm 3 --oem 3"

In [None]:
#cajas = pytesseract.image_to_boxes(imagen_cv, config=mi_cfg)

In [None]:
# Run Tesseract on the example image with the options in mi_cfg and request the
# result as a pandas DataFrame. Its 'left', 'top', 'width' and 'height' columns
# are the ones get_poly_from_xywh reads further down.
df_data = pytesseract.image_to_data(articulo['bitmap'], config=mi_cfg, output_type=pytesseract.Output.DATAFRAME)

In [None]:
# Inspect the OCR output table.
df_data

In [None]:
import shapely

In [None]:
def get_poly_from_xywh(row):
    """Add a 'poligono' entry holding the rectangle described by the row.

    The rectangle is built from the row's 'left', 'top', 'width' and 'height'
    values as a Shapely Polygon whose vertices are, in order: (left, top),
    (left + width, top), (left + width, top + height), (left, top + height).

    Args:
        row: Mapping/Series with 'left', 'top', 'width' and 'height'.

    Returns:
        The same ``row`` object, with 'poligono' set.
    """
    x_min, y_min = row['left'], row['top']
    x_max = row['left'] + row['width']
    y_max = row['top'] + row['height']

    corners = [
        Point(x_min, y_min),
        Point(x_max, y_min),
        Point(x_max, y_max),
        Point(x_min, y_max),
    ]
    row['poligono'] = Polygon([[corner.x, corner.y] for corner in corners])
    return row

def get_poly_from_xyxy(row):
    """Add a 'poligono' entry holding the rectangle described by the row.

    The rectangle is built from the row's 'x_1', 'y_1', 'x_2' and 'y_2' values
    as a Shapely Polygon whose vertices are, in order: (x_1, y_1), (x_2, y_1),
    (x_2, y_2), (x_1, y_2).

    Args:
        row: Mapping/Series with 'x_1', 'y_1', 'x_2' and 'y_2'.

    Returns:
        The same ``row`` object, with 'poligono' set.
    """
    x_a, y_a = row['x_1'], row['y_1']
    x_b, y_b = row['x_2'], row['y_2']

    corners = [
        Point(x_a, y_a),
        Point(x_b, y_a),
        Point(x_b, y_b),
        Point(x_a, y_b),
    ]
    row['poligono'] = Polygon([[corner.x, corner.y] for corner in corners])
    return row

In [None]:
# Attach a 'poligono' column to the OCR table, one rectangle per row.
df_data = df_data.apply(get_poly_from_xywh, axis=1)
df_data

In [None]:
# Attach a 'poligono' column to the segment table, one rectangle per row.
df_segmentos = df_segmentos.apply(get_poly_from_xyxy, axis=1)
df_segmentos