In [None]:
import numpy as np
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import pytesseract
from shapely import LineString, Point, Polygon, box

# Importar datos desde CSV

In [None]:
# Relative paths to the segments CSV and the images CSV.
data_segmentos = '../01_ingest_labeled_data/output_csv/data_segments.csv'
data_imagenes = '../01_ingest_labeled_data/output_csv/data_images.csv'

# Read each file, then discard its 'Unnamed: 0' column.
df_segmentos = pd.read_csv(data_segmentos)
df_segmentos = df_segmentos.drop(columns=['Unnamed: 0'])

df_imagenes = pd.read_csv(data_imagenes)
df_imagenes = df_imagenes.drop(columns=['Unnamed: 0'])

In [None]:
# Preview the first rows of the segments table.
df_segmentos.head()

In [None]:
# Preview the first rows of the images table.
df_imagenes.head()

## Procesamiento de un artículo puntual

In [None]:
# Take the first row of the images table as the single article processed below.
articulo = df_imagenes.iloc[0]
articulo

In [None]:
# Keep only the segment rows whose 'file' value matches the selected article.
es_del_articulo = df_segmentos['file'] == articulo['file']
segmentos_articulo = df_segmentos.loc[es_del_articulo]
segmentos_articulo

# Obtener tokens

In [None]:
# Directory (relative path) that the article image file names are resolved against.
path_images = '../01_ingest_labeled_data/iaxidentidad_tif'

In [None]:
# Command-line options handed to Tesseract through pytesseract's `config` argument.
# NOTE(review): per the Tesseract CLI docs, `-l spa` selects the Spanish language data,
# `--psm 3` is fully automatic page segmentation without orientation/script detection,
# and `--oem 3` lets Tesseract pick its default OCR engine — confirm against the installed version.
mi_cfg = r"-l spa --psm 3 --oem 3"

In [None]:
# Run OCR on the selected article's image file and get the result as a DataFrame.
ruta_imagen = path_images + '/' + articulo['file']
df_tokens = pytesseract.image_to_data(
    ruta_imagen,
    config=mi_cfg,
    output_type=pytesseract.Output.DATAFRAME,
)

In [None]:
# Show column dtypes and non-null counts of the OCR output.
df_tokens.info()

## Limpieza

In [None]:
def confidence_filter(df_data, confianza_min):
    """Keep only the rows whose detection confidence is strictly above a threshold.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame with a 'conf' column whose values can be cast to float.
    confianza_min : float
        Minimum confidence; rows with ``conf <= confianza_min`` are dropped.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_data`` that pass the threshold, with their original index.

    Prints the number of rows removed.
    """
    confianzas = df_data['conf'].astype('float')
    conservados = df_data[confianzas > confianza_min]
    # Boolean-mask selection only drops rows, so the size difference is the removed count.
    print('Eliminados ', len(df_data) - len(conservados))
    return conservados

def blank_filter(df_data):
    """Drop the rows whose text is missing or contains only whitespace.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame with a 'text' column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose 'text' is neither NaN/None nor
        blank after stripping whitespace, with their original index.

    Prints the number of rows removed.
    """
    text = df_data['text']
    # NaN never compares equal to anything (not even itself), so missing values
    # must be detected with notna() rather than with `!= NaN`.
    # astype(str) keeps the whitespace check working when the column holds
    # non-string values such as numbers.
    mask_not_blank = text.notna() & (text.astype(str).str.strip() != '')
    print('Eliminados ', int((~mask_not_blank).sum()))
    # Return a copy so callers can add columns without SettingWithCopyWarning.
    return df_data[mask_not_blank].copy()

def height_filter(df_data, ref_quantile=.90, high_factor=6, low_factor=5):
    """Drop the rows whose height is an outlier relative to a reference quantile.

    The reference height is the ``ref_quantile`` quantile of the 'height'
    column. A row is kept when its height is strictly between
    ``round(reference / low_factor)`` and ``round(reference * high_factor)``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame with a numeric 'height' column.
    ref_quantile : float, default 0.90
        Quantile of 'height' used as the reference.
    high_factor : float, default 6
        Multiplier applied to the reference to get the upper bound.
    low_factor : float, default 5
        Divisor applied to the reference to get the lower bound.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_data`` inside the bounds, with their original index.
        When no reference can be computed (empty frame or all heights
        missing), a copy of ``df_data`` is returned unfiltered.

    Prints the number of rows removed.
    """
    heights = df_data['height']
    reference = heights.quantile(ref_quantile) if len(heights) else float('nan')
    if pd.isna(reference):
        # round(nan) raises ValueError; without a reference there is nothing to filter against.
        print('Eliminados ', 0)
        return df_data.copy()

    high_thresh = round(reference * high_factor)
    low_thresh = round(reference / low_factor)
    mask_height = (heights > low_thresh) & (heights < high_thresh)
    print('Eliminados ', int((~mask_height).sum()))
    return df_data[mask_height]

In [None]:
# Keep only the OCR rows whose confidence is above 20 (hard-coded threshold).
# Note: this rebinds df_tokens, so the unfiltered OCR output is no longer available afterwards.
df_tokens = confidence_filter(df_tokens, 20)

In [None]:
# Remove the rows that blank_filter considers blank; df_tokens is rebound to the filtered frame.
df_tokens = blank_filter(df_tokens)

## Generar polígonos de bounding boxes a partir de coordenadas

In [None]:
# Build one rectangle per OCR row from its left/top corner and its width/height.
# Work on an explicit copy: df_tokens comes out of boolean-mask filtering, and
# adding a column to such a slice raises SettingWithCopyWarning.
df_tokens = df_tokens.copy()
# A column-wise zip avoids row-wise apply(axis=1), which is slow and fails on an
# empty frame (it returns a DataFrame that cannot be assigned to a single column).
df_tokens['poligono'] = [
    box(left, top, left + width, top + height)
    for left, top, width, height in zip(
        df_tokens['left'], df_tokens['top'], df_tokens['width'], df_tokens['height']
    )
]
df_tokens

In [None]:
# Build one rectangle per labelled segment from its (x_1, y_1) and (x_2, y_2) corners.
# A column-wise zip avoids row-wise apply(axis=1), which is slow and fails on an
# empty frame (it returns a DataFrame that cannot be assigned to a single column).
df_segmentos['poligono'] = [
    box(x_1, y_1, x_2, y_2)
    for x_1, y_1, x_2, y_2 in zip(
        df_segmentos['x_1'], df_segmentos['y_1'], df_segmentos['x_2'], df_segmentos['y_2']
    )
]
df_segmentos

In [None]:
# Polygon and label of every segment that belongs to the selected article.
segmentos_articulo = df_segmentos.loc[
    df_segmentos['file'] == articulo['file'],
    ['poligono', 'label'],
]

In [None]:
fig, ax = plt.subplots()
ax.set_aspect('equal', adjustable='box')

# Layers are drawn in order: OCR token boxes (solid blue) first,
# then the article's labelled segments (dashed red) on top.
capas = (
    (df_tokens['poligono'], {'color': 'b'}),
    (segmentos_articulo['poligono'], {'color': 'r', 'linestyle': '--'}),
)
for poligonos, estilo in capas:
    for poligono in poligonos:
        # Split the outline's (x, y) pairs into one x sequence and one y sequence.
        ax.plot(*zip(*poligono.exterior.coords), **estilo)

# The y limits run from the article height down to 0, so the y axis is drawn inverted.
plt.axis([0, articulo['width'], articulo['height'], 0])
plt.show()

## Obtener etiqueta de tokens a partir de etiquetado manual

In [None]:
def get_label_token(poligono_token, segmentos_articulo):
    """Return the label of the segment that overlaps a token the most.

    Parameters
    ----------
    poligono_token : geometry
        Object exposing ``intersects(other)`` and ``intersection(other).area``
        (a shapely geometry in this notebook).
    segmentos_articulo : pandas.DataFrame
        Frame with a 'poligono' column (geometries) and a 'label' column.

    Returns
    -------
    The 'label' of the intersecting segment with the largest intersection
    area, or -1 when no segment intersects the token. On equal areas the
    first segment in row order wins.
    """
    mejor_etiqueta = -1
    mejor_area = -1
    for _, fila in segmentos_articulo.iterrows():
        zona = fila['poligono']
        if not poligono_token.intersects(zona):
            continue
        area_comun = poligono_token.intersection(zona).area
        # Starting from -1 means an intersecting segment with zero common area still
        # beats "no label"; strict comparison keeps the earliest segment on ties.
        if area_comun > mejor_area:
            mejor_etiqueta, mejor_area = fila['label'], area_comun
    return mejor_etiqueta

In [None]:
# Polygon and label of every segment that belongs to the selected article.
segmentos_articulo = df_segmentos.loc[
    df_segmentos['file'] == articulo['file'],
    ['poligono', 'label'],
]
# Label each token with the segment it overlaps the most (-1 when none intersects).
df_tokens['label'] = df_tokens['poligono'].apply(get_label_token, args=(segmentos_articulo,))

In [None]:
# Display the token table, now including the 'label' column.
df_tokens

## Debug

In [None]:
import matplotlib.colors as mcolors

In [None]:
# Palette: matplotlib's Tableau colours followed by pure red and pure green.
colores = [*mcolors.TABLEAU_COLORS.values(), '#ff0000', '#00ff00']
colores

In [None]:
# Label names that get a colour in the debug plot. The last entry, '-1', is the string
# form of the value get_label_token returns when a token intersects no segment.
partes = ['Diario', 'Fecha', 'Volanta', 'Copete', 'Destacado', 'Título', 'Cuerpo', 'Fotografía', 'Epígrafe', 'Firma', 'Página', '-1']

In [None]:
# Map each label name to a colour, pairing the two lists position by position.
color_dict = {parte: color for parte, color in zip(partes, colores)}

In [None]:
plt.rcParams['figure.figsize'] = [30, 30]
fig, ax = plt.subplots()
ax.set_aspect('equal', adjustable='box')

# One pass per label present in the tokens, in value_counts() order.
etiquetas = df_tokens['label'].value_counts().keys().to_list()
for etiqueta in etiquetas:
    color = color_dict[str(etiqueta)]
    # Unlabelled tokens (-1) are drawn with a thicker line so they stand out.
    grosor = 5 if etiqueta == -1 else 1

    # Token boxes carrying this label: solid outline.
    poligonos_tokens = df_tokens.loc[df_tokens['label'] == etiqueta, 'poligono']
    for poligono in poligonos_tokens:
        ax.plot(*zip(*poligono.exterior.coords), color=color, linewidth=grosor)

    # Labelled segments carrying the same label: dashed outline in the same colour.
    poligonos_segmentos = segmentos_articulo.loc[segmentos_articulo['label'] == etiqueta, 'poligono']
    for poligono in poligonos_segmentos:
        ax.plot(*zip(*poligono.exterior.coords), color=color, linewidth=grosor, linestyle='--')

# The y limits run from the article height down to 0, so the y axis is drawn inverted.
plt.axis([0, articulo['width'], articulo['height'], 0])
plt.savefig("output_debug.jpg")
plt.show()