<a href="https://colab.research.google.com/github/Rodrigo-Lara-Gilles/Rodrigo-Lara-Gilles/blob/main/Pr%C3%A1ctico01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Código Completo

In [None]:
#############################################
# INSTALLATION AND SETUP (SECTION 1)
#############################################
# Installs the system packages and Python libraries imported further down.
# whoosh is added for text indexing, alongside an example of PDF form handling.
# Tesseract OCR engine, driven from Python through pytesseract.
!apt-get install -y tesseract-ocr
# Poppler command-line utilities (presumably required by pdf2image - confirm).
!apt-get install -y poppler-utils
!pip install pytesseract pdf2image PyMuPDF Pillow requests
# Spanish language data for Tesseract; the OCR helper requests lang='spa'.
!apt-get install -y tesseract-ocr-spa
!pip install tabulate pdfplumber camelot-py ghostscript ipywidgets
!pip install whoosh  # (A) used to index and search text

########################
# IMPORTS (SECCIÓN 2)
########################
import os
import json
import fitz
import requests
import pdfplumber
import camelot
import pytesseract
from pytesseract import Output
from PIL import Image
from pdf2image import convert_from_path
from tabulate import tabulate
from google.colab import files
from IPython.display import display, clear_output
import ipywidgets as widgets
import shutil
# (A) Librería para indexación:
from whoosh import index
from whoosh.fields import Schema, TEXT, ID

###################################
# FUNCIONES AUXILIARES (SECCIÓN 3)
###################################

# Sin cambios, salvo que ahora devolvemos tablas en estructura separada.
def descargar_pdf(url, output_file="temp.pdf", timeout=30):
    """Download the resource at ``url`` and save it to ``output_file``.

    Args:
        url: Address fetched with an HTTP GET request.
        output_file: Local path the response body is written to.
        timeout: Seconds to wait for the server before the request is
            aborted.

    Returns:
        ``output_file``, once the response body has been written.

    Raises:
        ValueError: If the server answers with a status code other than 200.
        requests.RequestException: If the request itself fails, including
            when ``timeout`` is exceeded.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that stops responding, leaving the download hanging.
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise ValueError(f"No se pudo descargar PDF. Estado: {r.status_code}")
    with open(output_file, "wb") as f:
        f.write(r.content)
    return output_file

def calcular_precision_aproximada(texto):
    """Return the share of alphanumeric characters in ``texto``.

    Leading and trailing whitespace is ignored. Blank input yields ``0``;
    otherwise the ratio of alphanumeric characters to total characters is
    returned, rounded to two decimals.
    """
    limpio = texto.strip()
    total = len(limpio)
    if total == 0:
        return 0
    alfanumericos = len([ch for ch in limpio if ch.isalnum()])
    return round(alfanumericos / total, 2)

def bounding_boxes_a_tabla(img, threshold_vertical=10, threshold_horizontal=60, lang="spa"):
    """OCR an image and lay the recognised words out as an ASCII grid table.

    Words are grouped into rows by their ``top`` coordinate and, within each
    row, merged into cells whenever the horizontal gap to the previous word
    does not exceed ``threshold_horizontal``.

    Args:
        img: Image handed to ``pytesseract.image_to_data``.
        threshold_vertical: A word whose ``top`` differs from the previous
            word's ``top`` by less than this value stays in the same row.
        threshold_horizontal: A gap larger than this value between the right
            edge of one word and the left edge of the next starts a new cell.
        lang: Tesseract language code used for recognition.

    Returns:
        The table rendered by ``tabulate`` in ``grid`` format, with headers
        ``Col1..ColN``.
    """
    # Force the text column to be read as strings: when every recognised token
    # looks numeric, pandas would otherwise infer a numeric dtype and the
    # " ".join calls below would raise TypeError.
    df = pytesseract.image_to_data(
        img,
        output_type=Output.DATAFRAME,
        lang=lang,
        pandas_config={"dtype": {"text": str}},
    )
    df = df.dropna(subset=["text"])
    df = df[df.conf != -1]
    df = df.assign(text=df["text"].astype(str))
    # Whitespace-only tokens carry no content and would only add blank cells.
    df = df[df["text"].str.strip() != ""]
    df = df.sort_values(by="top").reset_index(drop=True)

    # Group words into rows, comparing each word with the one just before it.
    filas = []
    last_top = None
    for _, row in df.iterrows():
        if last_top is not None and abs(row["top"] - last_top) < threshold_vertical:
            filas[-1].append(row)
        else:
            filas.append([row])
        last_top = row["top"]

    # Within each row, walk left to right and split into cells on wide gaps.
    tabla_final = []
    for fila in filas:
        orden = sorted(fila, key=lambda w: w["left"])
        celdas = [[orden[0]["text"]]]
        last_right = orden[0]["left"] + orden[0]["width"]
        for w in orden[1:]:
            if w["left"] - last_right > threshold_horizontal:
                celdas.append([w["text"]])
            else:
                celdas[-1].append(w["text"])
            last_right = w["left"] + w["width"]
        tabla_final.append([" ".join(celda) for celda in celdas])

    # Pad short rows so that every row has the same number of columns.
    max_cols = max((len(r) for r in tabla_final), default=0)
    headers = [f"Col{i+1}" for i in range(max_cols)]
    ajustada = [r + [""] * (max_cols - len(r)) for r in tabla_final]
    return tabulate(ajustada, headers=headers, tablefmt="grid")

def extraer_tablas_camelot(pdf_path, page_number):
    """Extract the tables of one PDF page with Camelot as ASCII grids.

    This is a best-effort helper: any failure while reading or rendering the
    tables is logged and reported as "no tables" so the caller can fall back
    to another extractor.

    Args:
        pdf_path: Path of the PDF file.
        page_number: Page to analyse; converted to ``str`` for Camelot's
            ``pages`` argument.

    Returns:
        A list with one ``tabulate`` grid string per detected table, or an
        empty list when nothing was found or extraction failed.
    """
    import logging

    try:
        tables = camelot.read_pdf(pdf_path, pages=str(page_number), flavor="lattice")
        return [tabulate(t.df.values.tolist(), tablefmt="grid") for t in tables]
    except Exception as exc:
        # A bare ``except:`` would also trap KeyboardInterrupt and SystemExit,
        # making a long extraction impossible to interrupt.
        logging.getLogger(__name__).warning(
            "Camelot no pudo extraer tablas de la página %s: %s", page_number, exc
        )
        return []

def extraer_tablas_pdfplumber(plumber_page):
    """Render every table found by ``plumber_page.extract_tables()``.

    Returns a list with one ``tabulate`` grid string per table; the list is
    empty when ``extract_tables()`` yields nothing.
    """
    encontradas = plumber_page.extract_tables()
    if not encontradas:
        return []
    return [tabulate(filas, tablefmt="grid") for filas in encontradas]

# (B) Nueva función para guardar tablas en un JSON adicional
def guardar_tablas_separadas(tablas, carpeta_salida, nombre_pag):
    """Write the tables of one page to ``tablas_pag_<nombre_pag>.json``.

    Each table string is stored as an object holding its 1-based position
    (``tabla_num``) and its text split into lines (``contenido``).

    Args:
        tablas: Table strings to store.
        carpeta_salida: Directory that receives the JSON file.
        nombre_pag: Page label embedded in the file name.

    Returns:
        The path of the JSON file, or ``None`` when ``tablas`` is empty and
        nothing is written.
    """
    if not tablas:
        return None
    registros = [
        {"tabla_num": numero, "contenido": tabla.split("\n")}
        for numero, tabla in enumerate(tablas, start=1)
    ]
    # One JSON file is produced per page.
    destino = os.path.join(carpeta_salida, f"tablas_pag_{nombre_pag}.json")
    with open(destino, "w", encoding="utf-8") as salida:
        json.dump(registros, salida, indent=2, ensure_ascii=False)
    return destino

# (C) Nueva función para extraer formularios/anexos (campos PDF):
def extraer_formularios(doc):
    """Collect the form fields (widgets) found on every page of ``doc``.

    Args:
        doc: Iterable of pages, each exposing a ``widgets()`` method that
            yields objects with ``field_name`` and ``field_value``.

    Returns:
        A list of dicts with the keys ``pagina`` (1-based page number),
        ``campo_name`` and ``campo_value``; empty when no page has widgets.
    """
    formularios = []
    for numero_pagina, page in enumerate(doc, start=1):
        # ``widgets`` is a method: it has to be called to obtain the fields.
        # Iterating the bound method itself raises TypeError.
        for w in page.widgets():
            formularios.append({
                "pagina": numero_pagina,
                "campo_name": w.field_name,
                "campo_value": w.field_value
            })
    return formularios

# (D) Funciones para indexar y buscar con Whoosh
def crear_indice_y_indexar(carpeta_indice, texto_global):
    """Create a Whoosh index in ``carpeta_indice`` holding one document.

    The schema has a stored ``id`` field and a non-stored ``content`` field;
    the single document added uses the id ``"documento_pdf"``.

    Args:
        carpeta_indice: Directory for the index; created, together with any
            missing parent directories, when it does not exist.
        texto_global: Text indexed under the ``content`` field.
    """
    # makedirs avoids the check-then-create race of exists() + mkdir() and
    # also works when intermediate directories are missing.
    os.makedirs(carpeta_indice, exist_ok=True)
    schema = Schema(id=ID(stored=True), content=TEXT(stored=False))
    ix = index.create_in(carpeta_indice, schema)
    # As a context manager the writer commits on success and cancels if
    # add_document raises.
    with ix.writer() as writer:
        writer.add_document(id="documento_pdf", content=texto_global)

def buscar_en_indice(carpeta_indice, consulta):
    """Search the ``content`` field of the index stored in ``carpeta_indice``.

    Args:
        carpeta_indice: Directory of an existing Whoosh index.
        consulta: Query string parsed with ``QueryParser``.

    Returns:
        A list with the stored fields of up to ten matching documents.
    """
    from whoosh.qparser import QueryParser

    indice = index.open_dir(carpeta_indice)
    consulta_parseada = QueryParser("content", indice.schema).parse(consulta)
    with indice.searcher() as buscador:
        coincidencias = buscador.search(consulta_parseada, limit=10)
        # The hits are read while the searcher is still open.
        campos = []
        for hit in coincidencias:
            campos.append(hit.fields())
    return campos

######################################
# PROCESAMIENTO DEL PDF (SECCIÓN 4)
######################################
def procesar_pdf(pdf_path, carpeta_salida, idioma="spa"):
    """Extract text, tables and form fields from a PDF and write the results.

    For every page the embedded text is used when present; such pages are also
    scanned for tables (Camelot first, pdfplumber as a fallback), and any
    tables found are saved to a per-page JSON file. Pages without embedded
    text are rendered to an image and passed through
    ``bounding_boxes_a_tabla``. Finally ``resultado.json`` and
    ``resultado.txt`` are written into ``carpeta_salida`` and a Whoosh index
    is built under ``carpeta_salida/indice_whoosh``.

    Args:
        pdf_path: Path of the PDF to process.
        carpeta_salida: Output directory; created when missing.
        idioma: Accepted for interface compatibility. NOTE(review): this
            function does not forward it to the OCR helper.

    Returns:
        Tuple ``(json_path, texto_path)`` with the paths of the two result
        files.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ValueError: If the document yields no pages.
    """
    from contextlib import closing

    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"No se encontró: {pdf_path}")
    os.makedirs(carpeta_salida, exist_ok=True)

    pags_ocr = 0
    pags_texto = 0
    info_paginas = []

    # closing() guarantees both handles are released even when a page fails
    # halfway through processing.
    with closing(fitz.open(pdf_path)) as doc, \
            closing(pdfplumber.open(pdf_path)) as plumber_pdf:
        npages = doc.page_count
        md = doc.metadata or {}

        # (C) Form fields are collected for the whole document up front.
        formularios_detectados = extraer_formularios(doc)

        for i in range(npages):
            page_num = i + 1
            txt_raw = doc[i].get_text().strip()
            plumber_page = plumber_pdf.pages[i]
            contenido = ""
            ocr_flag = False
            tablas_pagina = []

            if txt_raw:
                pags_texto += 1
                contenido = txt_raw
                # (B) Tables are extracted and stored separately; pdfplumber
                # is only consulted when Camelot returns nothing.
                tablas_pagina = extraer_tablas_camelot(pdf_path, page_num)
                if not tablas_pagina:
                    tablas_pagina = extraer_tablas_pdfplumber(plumber_page)
            else:
                # No embedded text: render the page and run OCR on the image.
                images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
                if images:
                    contenido = bounding_boxes_a_tabla(images[0])
                ocr_flag = True
                pags_ocr += 1

            info_paginas.append({
                "pagina": page_num,
                "texto": contenido,
                "ocr": ocr_flag,
                "precision_aproximada": calcular_precision_aproximada(contenido)
            })

            # (B) Tables, when present, go to an additional per-page JSON.
            if tablas_pagina:
                guardar_tablas_separadas(tablas_pagina, carpeta_salida, str(page_num))

    total_pags = pags_ocr + pags_texto
    if total_pags == 0:
        raise ValueError("No se procesaron páginas.")

    data_final = {
        "archivo_procesado": os.path.basename(pdf_path),
        "metadata_pdf": {
            "titulo": md.get("title", ""),
            "autor": md.get("author", ""),
            "num_paginas": npages
        },
        "estadisticas": {
            "paginas_totales": total_pags,
            "paginas_con_ocr": pags_ocr,
            "paginas_texto_digital": pags_texto,
            "ocr_ratio": round(pags_ocr / total_pags, 2)
        },
        "contenido_paginas": info_paginas,
        "formularios": formularios_detectados  # (C) detected form fields
    }

    json_path = os.path.join(carpeta_salida, "resultado.json")
    with open(json_path, "w", encoding="utf-8") as fj:
        json.dump(data_final, fj, indent=2, ensure_ascii=False)

    texto_path = os.path.join(carpeta_salida, "resultado.txt")
    lineas_txt = [f"[Página {p['pagina']}]\n{p['texto']}" for p in info_paginas]
    with open(texto_path, "w", encoding="utf-8") as ft:
        ft.write("\n\n".join(lineas_txt))

    # (D) Build a search index over the text of every page.
    indice_dir = os.path.join(carpeta_salida, "indice_whoosh")
    todo_el_texto = "\n".join(p["texto"] for p in info_paginas)
    crear_indice_y_indexar(indice_dir, todo_el_texto)

    return json_path, texto_path

#########################
# INTERFAZ (SECCIÓN 5)
#########################
# Path of the PDF chosen by the user. The click handlers declare this name as
# ``global``; initialising it here means it exists before any of them runs.
pdf_local_path = None

# Static prompt and a status line reused for errors and progress messages.
lbl_info = widgets.Label(value="Escoge método para tu PDF:")
lbl_error = widgets.Label(value="", layout=widgets.Layout(width="50%"))

# Main actions. "Procesar PDF" starts disabled and is enabled by the handlers
# once a PDF has been downloaded or uploaded.
btn_url = widgets.Button(description="Ingresar URL")
btn_upload = widgets.Button(description="Subir Archivo")
btn_procesar = widgets.Button(description="Procesar PDF", disabled=True)

# Text box for the PDF address used by the URL flow.
txt_url = widgets.Text(description="URL PDF:", layout=widgets.Layout(width='50%'))

def show_main_buttons():
    """Clear the cell output and show the main menu.

    The status label is emptied, then the two labels are displayed, followed
    by the three action buttons.
    """
    clear_output()
    lbl_error.value = ""
    grupos = (
        (lbl_info, lbl_error),
        (btn_url, btn_upload, btn_procesar),
    )
    for grupo in grupos:
        display(*grupo)

def on_btn_url_click(b):
    """Show the URL form: text box, download button and status label.

    Args:
        b: The clicked button (unused).
    """
    lbl_error.value = ""

    def on_descargar_click(_):
        """Download the PDF from the typed URL and enable processing."""
        global pdf_local_path
        url = txt_url.value.strip()
        if not url:
            lbl_error.value = "Ingresa una URL."
            return
        try:
            destino = descargar_pdf(url, "temp.pdf")
        except Exception as e:
            lbl_error.value = f"Error: {e}"
            return
        # The global is only updated after a successful download, so a failed
        # attempt does not replace a previously selected PDF.
        pdf_local_path = destino
        lbl_error.value = "Descargado con éxito."
        btn_procesar.disabled = False

    btn_descargar = widgets.Button(description="Descargar PDF")
    btn_descargar.on_click(on_descargar_click)
    clear_output()
    display(widgets.HTML("<h4>Ingresa la URL del PDF</h4>"), txt_url, btn_descargar, lbl_error)
    display(btn_procesar)

def on_btn_upload_click(b):
    """Show the upload form and save the uploaded PDF to the working directory.

    Args:
        b: The clicked button (unused).
    """
    lbl_error.value = ""
    file_uploader = widgets.FileUpload(accept=".pdf", multiple=False)

    def on_upload_change(change):
        """Write the uploaded file to disk and enable processing."""
        global pdf_local_path
        up_file = file_uploader.value
        if not up_file:
            return
        # ipywidgets 7 exposes a dict keyed by file name; ipywidgets 8 exposes
        # a tuple of dicts that carry the name under the "name" key.
        if isinstance(up_file, dict):
            nombre, info = next(iter(up_file.items()))
        else:
            info = up_file[0]
            nombre = info["name"]
        # The name comes from the browser: keep only its final component so
        # the file cannot be written outside the working directory.
        fname = os.path.basename(nombre) or "subido.pdf"
        with open(fname, 'wb') as f:
            f.write(info['content'])
        pdf_local_path = fname
        lbl_error.value = f"Archivo '{fname}' subido."
        btn_procesar.disabled = False

    file_uploader.observe(on_upload_change, names='value')
    clear_output()
    display(widgets.HTML("<h4>Subir PDF local</h4>"), file_uploader, lbl_error)
    display(btn_procesar)

def on_btn_procesar_click(b):
    """Process the selected PDF, zip the results and offer them for download.

    The output folder is named after the first ten characters of the PDF's
    base name. ``resultado.zip`` contains the JSON result, the text result and
    a copy of the original PDF.

    Args:
        b: The clicked button (unused).
    """
    import subprocess

    # Looked up through globals() so that a click before any PDF has been
    # selected reports "No hay PDF." instead of raising NameError.
    ruta_pdf = globals().get("pdf_local_path")
    if not ruta_pdf or not os.path.exists(ruta_pdf):
        lbl_error.value = "No hay PDF."
        return
    base_name = os.path.splitext(os.path.basename(ruta_pdf))[0]
    folder_name = base_name[:10]
    os.makedirs(folder_name, exist_ok=True)

    try:
        json_path, txt_path = procesar_pdf(ruta_pdf, folder_name)

        original_pdf_path = os.path.join(folder_name, "original.pdf")
        shutil.copy(ruta_pdf, original_pdf_path)

        # -j stores the files without their directory components.
        subprocess.run(["zip","-j","resultado.zip",
                        json_path, txt_path, original_pdf_path], check=True)
        files.download("resultado.zip")
    except Exception as e:
        # Surface the failure in the UI, as the download handler does.
        lbl_error.value = f"Error: {e}"
        return

    btn_procesar.disabled = True
    # show_main_buttons() resets the status label, so the completion message
    # has to be set afterwards to remain visible.
    show_main_buttons()
    lbl_error.value = "Proceso completado."

# Register the click handler of each main button.
btn_url.on_click(on_btn_url_click)
btn_upload.on_click(on_btn_upload_click)
btn_procesar.on_click(on_btn_procesar_click)

# Render the main menu as soon as the cell runs.
show_main_buttons()

Label(value='Escoge método para tu PDF:')

Label(value='', layout=Layout(width='50%'))

Button(description='Ingresar URL', style=ButtonStyle())

Button(description='Subir Archivo', style=ButtonStyle())

Button(description='Procesar PDF', disabled=True, style=ButtonStyle())