In [1]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import re
import chardet
import urllib.request
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import collections
from datetime import datetime

In [None]:
import ssl
# Lower security level to allow weak DH keys
ssl_context = ssl.create_default_context()
ssl_context.set_ciphers('DEFAULT:@SECLEVEL=1')


In [2]:
# --- Configuration ---
POPPLER_PATH = '/usr/bin'
TESSERACT_CMD = '/usr/bin/tesseract'
BASE_PATH = '/workspace'
PATH_DATA = BASE_PATH + '/data/'
PATH_PDF = BASE_PATH + '/pdf/'
PATH_TXT = BASE_PATH + '/text/'
PATH_TMP = BASE_PATH + '/temp/'

In [3]:
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

In [None]:
# --- Utility Functions ---

def detect_encoding(filepath):
    with open(filepath, 'rb') as rawdata:
        return chardet.detect(rawdata.read(10000)).get("encoding")

def download_pdfs(data_list, path_pdf):
    pdf_list = [join(path_pdf, i) for i in listdir(path_pdf)]
    pdf_list = [w.replace('.pdf', '') for w in pdf_list]
    pdf_list = [w.replace(path_pdf, 'https://prodapp2.seace.gob.pe/portalseace-uiwd-pub/DownloadContratosFileServlet?fileName=') for w in pdf_list]
    todownload_list = list(set(data_list) - set(pdf_list))
    print(f"PDFs to download: {len(todownload_list)}")
    for idx, url in enumerate(todownload_list, 1):
        pdf_name = url.rsplit('=', 1)[-1]
        try:
            req = urllib.request.Request(url)
            with urllib.request.urlopen(req, context=ssl_context) as response, open(join(path_pdf, pdf_name ), 'wb') as out_file:
                out_file.write(response.read())
            print(f"[{idx}/{len(todownload_list)}] Downloaded: {pdf_name}")
        except Exception as e:
            print(f"[{idx}/{len(todownload_list)}] Failed: {pdf_name} ({e})")

def pdf_to_txt(path_pdf, path_txt, path_tmp, poppler_path):
    os.chdir(path_tmp)
    pdf_list = [join(path_pdf, i) for i in listdir(path_pdf)]
    pdf_list = [w.replace(path_pdf, '').replace('.pdf', '') for w in pdf_list]
    parsed_list = [join(path_txt, i) for i in listdir(path_txt)]
    parsed_list = [w.replace(path_txt, '').replace('.txt', '') for w in parsed_list]
    diff_list = list(set(pdf_list) - set(parsed_list))
    for idx, file in enumerate(diff_list, 1):
        PDF_file = path_pdf + file + '.pdf'
        outfile = path_txt + file + '.txt'
        try:
            pages = convert_from_path(PDF_file, 500, poppler_path=poppler_path)
            for i, page in enumerate(pages, 1):
                filename = f"page_{i}.jpg"
                page.save(filename, 'JPEG')
            with open(outfile, "a") as f:
                for i in range(1, len(pages) + 1):
                    filename = f"page_{i}.jpg"
                    text = pytesseract.image_to_string(Image.open(filename)).replace('-\n', '')
                    f.write(text)
            print(f"[{idx}/{len(diff_list)}] Converted: {file}")
        except Exception as e:
            print(f"[{idx}/{len(diff_list)}] Failed: {file} ({e})")

def clean_text_column(df, col='columna_unica'):
    df[col] = df[col].str.upper()
    df[col] = df[col].str.replace('[^a-zA-Z0-9]', ' ', regex=True)
    df[col] = df[col].str.strip()
    df[col] = df[col].str.replace('   ', ' ', regex=False)
    df[col] = df[col].str.replace('  ', ' ', regex=False)
    return df

def txt_to_bow(path_txt, path_data):
    txt_list = [join(path_txt, i) for i in listdir(path_txt)]
    txt_list = [w.replace(path_txt, '').replace('.txt', '') for w in txt_list]
    df_bow_append = pd.DataFrame()
    stop_words = set([
        "ALGÚN","ALGUNA","ALGUNAS","ALGUNO","ALGUNOS","AMBOS","AMPLEAMOS","ANTE","ANTES","AQUEL",
                      "AQUELLAS","AQUELLOS","AQUI","ARRIBA","ATRAS","BAJO","BASTANTE","BIEN","CADA","CIERTA",
                      "CIERTAS","CIERTO","CIERTOS","COMO","CON","CONSEGUIMOS","CONSEGUIR","CONSIGO","CONSIGUE",
                      "CONSIGUEN","CONSIGUES","CUAL","CUANDO","DENTRO","DESDE","DONDE","DOS","EL","ELLAS","ELLOS",
                      "EMPLEAIS","EMPLEAN","EMPLEAR","EMPLEAS","EMPLEO","EN","ENCIMA","ENTONCES","ENTRE","ERA",
                      "ERAMOS","ERAN","ERAS","ERES","ES","ESTA","ESTABA","ESTADO","ESTAIS","ESTAMOS","ESTAN","ESTOY",
                      "FIN","FUE","FUERON","FUI","FUIMOS","GUENO","HA","HACE","HACEIS","HACEMOS","HACEN","HACER","HACES","HAGO",
                      "INCLUSO","INTENTA","INTENTAIS","INTENTAMOS","INTENTAN","INTENTAR","INTENTAS","INTENTO","IR","LA","LARGO",
                      "LAS","LO","LOS","MIENTRAS","MIO","MODO","MUCHOS","MUY","NOS","NOSOTROS","OTRO","PARA","PERO","PODEIS",
                      "PODEMOS","PODER","PODRIA","PODRIAIS","PODRIAMOS","PODRIAN","PODRIAS","POR","POR QUÉ","PORQUE","PRIMERO",
                      "PUEDE","PUEDEN","PUEDO","QUIEN","SABE","SABEIS","SABEMOS","SABEN","SABER","SABES","SER","SI","SIENDO",
                      "SIN","SOBRE","SOIS","SOLAMENTE","SOLO","SOMOS","SOY","SU","SUS","TAMBIÉN","TENEIS","TENEMOS","TENER",
                      "TENGO","TIEMPO","TIENE","TIENEN","TODO","TRABAJA","TRABAJAIS","TRABAJAMOS","TRABAJAN","TRABAJAR","TRABAJAS","TRABAJO",
                      "TRAS","TUYO","ULTIMO","UN","UNA","UNAS","UNO","UNOS","USA","USAIS","USAMOS","USAN","USAR","USAS","USO","VA","VAIS",
                      "VALOR","VAMOS","VAN","VAYA","VERDAD","VERDADERA","VERDADERO","VOSOTRAS","VOSOTROS","VOY","YO","ÉL",
                      "ÉSTA","ÉSTAS","ÉSTE","ÉSTOS","ÚLTIMA","ÚLTIMAS","ÚLTIMO","ÚLTIMOS","A","AÑADIÓ","AÚN","ACTUALMENTE","ADELANTE",
                      "ADEMÁS","AFIRMÓ","AGREGÓ","AHÍ","AHORA","AL","ALGO","ALREDEDOR","ANTERIOR","APENAS","APROXIMADAMENTE","AQUÍ","ASÍ",
                      "ASEGURÓ","AUNQUE","AYER","BUEN","BUENA","BUENAS","BUENO","BUENOS","CÓMO","CASI","CERCA","CINCO","COMENTÓ","CONOCER",
                      "CONSIDERÓ","CONSIDERA","CONTRA","COSAS","CREO","CUALES","CUALQUIER","CUANTO","CUATRO","CUENTA","DA","DADO","DAN","DAR",
                      "DE","DEBE","DEBEN","DEBIDO","DECIR","DEJÓ","DEL","DEMÁS","DESPUÉS","DICE","DICEN","DICHO","DIERON","DIFERENTE","DIFERENTES",
                      "DIJERON","DIJO","DIO","DURANTE","E","EJEMPLO","ELLA","ELLO","EMBARGO","ENCUENTRA","ESA","ESAS","ESE","ESO","ESOS",
                      "ESTÁ","ESTÁN","ESTABAN","ESTAR","ESTARÁ","ESTAS","ESTE","ESTO","ESTOS","ESTUVO","EX","EXISTE","EXISTEN","EXPLICÓ",
                      "EXPRESÓ","FUERA","GRAN","GRANDES","HABÍA","HABÍAN","HABER","HABRÁ","HACERLO","HACIA","HACIENDO","HAN","HASTA","HAY","HAYA",
                      "HE","HECHO","HEMOS","HICIERON","HIZO","HOY","HUBO","IGUAL","INDICÓ","INFORMÓ","JUNTO","LADO","LE","LES","LLEGÓ",
                      "LLEVA","LLEVAR","LUEGO","LUGAR","MÁS","MANERA","MANIFESTÓ","MAYOR","ME","MEDIANTE","MEJOR","MENCIONÓ","MENOS",
                      "MI","MISMA","MISMAS","MISMO","MISMOS","MOMENTO","MUCHA","MUCHAS","MUCHO","NADA","NADIE","NI",
                      "NINGÚN","NINGUNA","NINGUNAS","NINGUNO","NINGUNOS","NO","NOSOTRAS","NUESTRA","NUESTRAS","NUESTRO","NUESTROS",
                      "NUEVA","NUEVAS","NUEVO","NUEVOS","NUNCA","O","OCHO","OTRA","OTRAS","OTROS","PARECE","PARTE","PARTIR","PASADA","PASADO",
                      "PESAR","POCA","POCAS","POCO","POCOS","PODRÁ","PODRÁN","PODRÍA","PODRÍAN","PONER","POSIBLE","PRÓXIMO","PRÓXIMOS",
                      "PRIMER","PRIMERA","PRIMEROS","PRINCIPALMENTE","PROPIA","PROPIAS","PROPIO","PROPIOS","PUDO","PUEDA","PUES","QUÉ","QUE",
                      "QUEDÓ","QUEREMOS","QUIÉN","QUIENES","QUIERE","REALIZÓ","REALIZADO","REALIZAR","RESPECTO","SÍ","SÓLO","SE","SEÑALÓ",
                      "SEA","SEAN","SEGÚN","SEGUNDA","SEGUNDO","SEIS","SERÁ","SERÁN","SERÍA","SIDO","SIEMPRE","SIETE","SIGUE","SIGUIENTE","SINO",
                      "SOLA","SOLAS","SOLOS","SON","TAL","TAMPOCO","TAN","TANTO","TENÍA","TENDRÁ","TENDRÁN","TENGA","TENIDO","TERCERA",
                      "TODA","TODAS","TODAVÍA","TODOS","TOTAL","TRATA","TRAVÉS","TRES","TUVO","USTED","VARIAS","VARIOS","VECES",
                      "VER","VEZ","Y","YA",
                      "DE"
    ])

    # Prepare contratos_text.csv (overwrite if exists)
    output_text_path = join(path_data, 'silver/contratos_text.csv')
    with open(output_text_path, 'w', encoding='utf-8-sig') as f:
        f.write('file_name|line_number|columna_unica\n')  # write header only once

    for idx, file in enumerate(txt_list, 1):
        outfile = path_txt + file + '.txt'
        try:
            df = pd.read_fwf(outfile, dtype=object, header=None)
            list_columns = df.columns
            df['file_name'] = file
            df['line_number'] = df.index + 1
            df['columna_unica'] = ''
            for i in list_columns:
                df['columna_unica'] = df['columna_unica'].fillna('') + ' ' + df[i].fillna('')
            df = clean_text_column(df, 'columna_unica')
            df = df[['file_name', 'line_number', 'columna_unica']]
            df = df.query('(columna_unica != "")')
            df.to_csv(output_text_path, mode='a', header=False, sep='|', index=False, encoding='utf-8-sig')
            
            df['cu_wo_sw'] = [
                ' '.join([item for item in x.split() if item not in stop_words])
                for x in df['columna_unica']
            ]
            df['line_text'] = range(1, len(df) + 1)
            df_bow = collections.Counter(
                [y for x in df.cu_wo_sw.values.flatten() for y in x.split()]
            )
            df_bow = pd.DataFrame.from_dict(df_bow, orient='index')
            df_bow.reset_index(level=0, inplace=True)
            df_bow.columns = ['bow', 'freq']
            df_bow['file_name'] = file
            df_bow['len'] = df_bow['bow'].str.len()
            df_bow_append = pd.concat([df_bow_append, df_bow], ignore_index=True)
            print(f"[{idx}/{len(txt_list)}] Parsed: {file}")
        except Exception as e:
            print(f"[{idx}/{len(txt_list)}] Error parsing {file}: {e}")
    df_bow_append.to_excel(join(path_data, 'gold/bow_contratos.xlsx'), index=False)


In [50]:
df = pd.read_excel(join(PATH_DATA, 'bronze/CONOSCE_CONTRATOS2025_0.xlsx'), dtype=str)
df = df.query('codigoentidad == "000047"')
contract_list = df['urlcontrato']

In [26]:
download_pdfs(contract_list, PATH_PDF)

PDFs to download: 24
[1/24] Downloaded: 85ce2e6b-4f25-4ab7-b789-9cb641262172.pdf
[2/24] Downloaded: 796855d7-663b-45d9-90fa-1826c372ee56.pdf
[3/24] Downloaded: beabb84f-5fa3-4ab5-ae24-c01f60a459a6.pdf
[4/24] Downloaded: ee038584-d734-4c54-a7db-83a879453532.pdf
[5/24] Downloaded: b7fc6c18-f5c4-453f-9ec5-a8078f7ea0ca.pdf
[6/24] Downloaded: 8541d3dc-1d7a-40d9-adc6-346af9faecb2.pdf
[7/24] Downloaded: d73ecf69-9b93-4b84-94fc-6a46235e689a.pdf
[8/24] Downloaded: bad4b9f0-f33f-4182-850e-ed2c4b76bdd4.pdf
[9/24] Downloaded: 5b59ae63-d336-49ef-927e-123e3452cc53.pdf
[10/24] Downloaded: 659e7ca5-54b4-40f4-9066-ecc8ade6743a.pdf
[11/24] Downloaded: 834c6dbf-2dbb-4469-bc06-94e73e500d40.pdf
[12/24] Downloaded: d9c25bee-4dba-4b6c-92f4-cc0836eaac64.pdf
[13/24] Downloaded: 3ba41be6-7653-4824-8c28-00da3711882b.pdf
[14/24] Downloaded: da97c5b1-b66a-4308-9796-5912c4e60cd4.pdf
[15/24] Downloaded: fbad6e70-934f-491d-8bb3-043d3c1cf800.pdf
[16/24] Downloaded: 93b42373-06c1-4cf7-a2fc-5759edfae8f4.pdf
[17/24] Down

In [47]:
pdf_to_txt(PATH_PDF, PATH_TXT, PATH_TMP, POPPLER_PATH)

[1/1] Failed: # (place your PDF files here) (Unable to get page count.
I/O Error: Couldn't open file '/workspace/pdf/# (place your PDF files here).pdf': No such file or directory.
)


In [49]:
txt_to_bow(PATH_TXT, PATH_DATA)

[1/25] Error parsing # (output TXT files): [Errno 2] No such file or directory: '/workspace/text/# (output TXT files).txt'
[2/25] Parsed: 02833e39-3d26-417a-a2d1-52f3b80df264
[3/25] Parsed: 24dddc44-fb64-40e4-bb2b-0d6ad01766ff
[4/25] Parsed: 3ba41be6-7653-4824-8c28-00da3711882b
[5/25] Parsed: 45e90837-55da-4a57-bb26-8af9c249a35e
[6/25] Parsed: 47585f1c-16ae-4bed-9687-ddaa556ad211
[7/25] Parsed: 48fdb45c-1f41-435e-a38b-7a42a46a0040
[8/25] Parsed: 5b59ae63-d336-49ef-927e-123e3452cc53
[9/25] Parsed: 659e7ca5-54b4-40f4-9066-ecc8ade6743a
[10/25] Parsed: 796855d7-663b-45d9-90fa-1826c372ee56
[11/25] Parsed: 834c6dbf-2dbb-4469-bc06-94e73e500d40
[12/25] Parsed: 8541d3dc-1d7a-40d9-adc6-346af9faecb2
[13/25] Parsed: 85ce2e6b-4f25-4ab7-b789-9cb641262172
[14/25] Parsed: 93b42373-06c1-4cf7-a2fc-5759edfae8f4
[15/25] Parsed: 974c6af2-9b77-4f3c-905f-58021bbbe66b
[16/25] Parsed: a3b87479-8e6a-49ed-935c-ed6067a54729
[17/25] Parsed: b7fc6c18-f5c4-453f-9ec5-a8078f7ea0ca
[18/25] Parsed: bad4b9f0-f33f-4182-85