In [5]:
import pandas as pd
import requests
from lxml import etree
from io import BytesIO

# Read the Atom feed from disk in binary mode, so the raw bytes (including
# the XML encoding declaration) are handed to the parser untouched.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = r"C:\Users\Rodrigo\Documents\TFM\Documentos\licitacionesPerfilesContratanteCompleto3_202401\licitacionesPerfilesContratanteCompleto3.atom"
with open(file_path, mode='rb') as atom_file:
    xml_content = atom_file.read()

In [6]:
# Parse the in-memory Atom bytes with lxml. `ns_clean=True` asks the parser
# to drop redundant namespace declarations while building the tree.
parser = etree.XMLParser(ns_clean=True)
xml_stream = BytesIO(xml_content)  # lxml's parse() wants a file-like object
tree = etree.parse(xml_stream, parser)
root = tree.getroot()

In [7]:
# Genericode code lists to download, keyed by the name under which the
# resulting DataFrame is stored in `dataframes_dict`.
# Every key must end in "_codes": the column names are derived from it below.
links_dict = {
    "contract_codes": "https://contrataciondelestado.es/codice/cl/2.08/ContractCode-2.08.gc",
    "service_contract_codes": "http://contrataciondelestado.es/codice/cl/1.04/ServiceContractCode-1.04.gc",
    "goods_contract_codes": "https://contrataciondelestado.es/codice/cl/1.04/GoodsContractCode-1.04.gc",
    "patrimonial_contract_codes": "https://contrataciondelestado.es/codice/cl/2.02/PatrimonialContractCode-2.02.gc",
    "country_codes": "http://contrataciondelestado.es/codice/cl/2.08/CountryIdentificationCode-2.08.gc",
    "country_subentity_codes": "http://contrataciondelestado.es/codice/cl/2.08/NUTS-2021.gc"
}

# One DataFrame per entry of `links_dict`.
dataframes_dict = {}

# Namespace for OASIS Genericode.
ns_tables = {'gc': 'http://docs.oasis-open.org/codelist/ns/genericode/1.0/'}

# Seconds to wait for each download; without a timeout a stalled server
# would hang the cell indefinitely.
REQUEST_TIMEOUT = 30

for key, url in links_dict.items():
    print(f"Descargando y parseando: {key} -> {url}")

    # Column names derived from the key, e.g. "contract_codes" ->
    # "contract_code" and "contract_name".
    code_col = key[:-1]
    name_col = key[:-6] + '_name'

    # Download the code list. On any failure store an EMPTY frame that still
    # has the expected columns, so later cells do not fail with a KeyError.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Error al descargar {url} ({exc})")
        dataframes_dict[key] = pd.DataFrame(columns=[code_col, name_col])
        continue
    if response.status_code != 200:
        print(f"Error al descargar {url} (status code: {response.status_code})")
        dataframes_dict[key] = pd.DataFrame(columns=[code_col, name_col])
        continue

    # Parse straight from the response bytes. A dedicated local name is used
    # on purpose: the global `xml_content` holds the Atom feed read earlier
    # and must not be overwritten by the code lists.
    table_root = etree.XML(response.content)

    # 'local-name()' is used instead of the 'gc:' prefix for SimpleCodeList
    # and Row (and for Value / SimpleValue below).
    rows = table_root.xpath('//gc:CodeList//*[local-name()="SimpleCodeList"]//*[local-name()="Row"]', namespaces=ns_tables)

    data = []
    for row in rows:
        code_list = row.xpath('./*[local-name()="Value"][@ColumnRef="code"]/*[local-name()="SimpleValue"]/text()')
        nombre_list = row.xpath('./*[local-name()="Value"][@ColumnRef="nombre"]/*[local-name()="SimpleValue"]/text()')

        # A row may lack either column; keep it with None in that position.
        data.append({
            code_col: code_list[0] if code_list else None,
            name_col: nombre_list[0] if nombre_list else None,
        })

    # Passing `columns` keeps the schema stable even when no rows were found.
    dataframes_dict[key] = pd.DataFrame(data, columns=[code_col, name_col])

Descargando y parseando: contract_codes -> https://contrataciondelestado.es/codice/cl/2.08/ContractCode-2.08.gc
Descargando y parseando: service_contract_codes -> http://contrataciondelestado.es/codice/cl/1.04/ServiceContractCode-1.04.gc
Descargando y parseando: goods_contract_codes -> https://contrataciondelestado.es/codice/cl/1.04/GoodsContractCode-1.04.gc
Descargando y parseando: patrimonial_contract_codes -> https://contrataciondelestado.es/codice/cl/2.02/PatrimonialContractCode-2.02.gc
Descargando y parseando: country_codes -> http://contrataciondelestado.es/codice/cl/2.08/CountryIdentificationCode-2.08.gc
Descargando y parseando: country_subentity_codes -> http://contrataciondelestado.es/codice/cl/2.08/NUTS-2021.gc


In [8]:
# Bind each downloaded code list to its own module-level name.
(
    df_contract_codes,
    df_service_contract_codes,
    df_goods_contract_codes,
    df_patrimonial_contract_codes,
    df_country_codes,
    df_country_subentity_codes,
) = (
    dataframes_dict[table_key]
    for table_key in (
        "contract_codes",
        "service_contract_codes",
        "goods_contract_codes",
        "patrimonial_contract_codes",
        "country_codes",
        "country_subentity_codes",
    )
)

In [9]:
# Build the table of sub-entities from the frame as downloaded, without
# mutating it. This keeps the cell idempotent: re-running it neither edits
# the frame stored in `dataframes_dict` nor assigns a column on a filtered
# slice (which triggers pandas' SettingWithCopyWarning).
_subentity_codes_raw = dataframes_dict["country_subentity_codes"]

df_country_subentity_codes = (
    _subentity_codes_raw
    # Drop the "pure" countries, i.e. codes of exactly two characters.
    .loc[_subentity_codes_raw["country_subentity_code"].str.len() > 2]
    # The first two characters of the code are stored as the country code.
    .assign(country_code=lambda frame: frame["country_subentity_code"].str[:2])
)

# df_country_subentity_codes now holds only sub-entities, with code, name
# and the derived country code.
df_country_subentity_codes

Unnamed: 0,country_subentity_code,country_subentity_name,country_code
1,AT1,Ostösterreich,AT
2,AT11,Burgenland,AT
3,AT12,Niederösterreich,AT
4,AT13,Wien,AT
5,AT2,Südösterreich,AT
...,...,...,...
625,TRC1,"Gaziantep, Adıyaman, Kilis",TR
626,TRC2,"Şanlıurfa, Diyarbakır",TR
627,TRC3,"Mardin, Batman, Şırnak, Siirt",TR
628,TRZ,Extra-Regio NUTS 1,TR


In [19]:
# XML namespaces used when querying the Atom feed.
ns = {
    'atom': 'http://www.w3.org/2005/Atom',
    'cac': 'urn:dgpe:names:draft:codice:schema:xsd:CommonAggregateComponents-2',
    'cbc': 'urn:dgpe:names:draft:codice:schema:xsd:CommonBasicComponents-2',
    'cac-place-ext': 'urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonAggregateComponents-2',
    'cbc-place-ext': 'urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonBasicComponents-2',
}

# Column layout of the output tables.
# NOTE: 'procurement_project_type_code' actually stores the contract type
# NAME (looked up in the code list); the column name is kept for
# compatibility with existing consumers of the table.
entry_columns = [
    'entry_id', 'link', 'summary', 'title', 'updated', 'contract_folder_id',
    'contract_folder_status_code', 'procurement_project_type_code',
    'procurement_project_subtype_name', 'budget_currency',
    'estimated_overall_contract_amount', 'total_amount', 'tax_exclusive_amount',
    'procurement_project_subentity_code', 'party_nif',
]
party_columns = [
    'party_dir3', 'party_nif', 'party_id_plataforma', 'party_name', 'website_uri',
    'type_code', 'activity_code', 'party_country_identification_code',
    'party_city_name', 'party_postal_zone', 'party_address_line',
    'telephone', 'telefax', 'email',
]
document_columns = ['contract_id', 'document_reference_id', 'document_uri', 'document_type']

# (element name, label stored in 'document_type', read every occurrence?)
# Only the first legal / technical reference is read; all additional ones are.
document_reference_kinds = (
    ('cac:LegalDocumentReference', 'Pliego Administrativo', False),
    ('cac:TechnicalDocumentReference', 'Pliego Técnico', False),
    ('cac:AdditionalDocumentReference', 'Documento adicional', True),
)


def _code_names(codes_df, code_col, name_col):
    """Return a {code: name} dict built from a code-list DataFrame.

    The first occurrence of a code wins. An empty dict is returned when the
    frame lacks the expected columns (e.g. the download failed), so lookups
    yield None instead of raising.
    """
    if code_col not in codes_df.columns or name_col not in codes_df.columns:
        return {}
    names = {}
    for code, name in zip(codes_df[code_col], codes_df[name_col]):
        if code is not None and code not in names:
            names[code] = name
    return names


def _find(parent, path):
    """Return the first element matching `path` under `parent`, or None.

    Accepts `parent=None` so lookups through optional elements can be chained.
    """
    return parent.find(path, ns) if parent is not None else None


def _text(parent, path):
    """Return the text of the first element matching `path`, or None if absent."""
    found = _find(parent, path)
    return found.text if found is not None else None


def _party_identifiers(party_elem):
    """Return {schemeName: id text} from the party's PartyIdentification elements.

    Only the first cbc:ID of each PartyIdentification is read; a later
    identification with the same scheme overrides an earlier one.
    """
    identifiers = {}
    for identification_elem in party_elem.findall('cac:PartyIdentification', ns):
        id_elem = identification_elem.find('cbc:ID', ns)
        if id_elem is not None:
            identifiers[id_elem.get('schemeName')] = id_elem.text
    return identifiers


def _parse_party(lcp_elem, party_elem, identifiers):
    """Build one row of the parties table from a LocatedContractingParty."""
    postal_address_elem = party_elem.find('cac:PostalAddress', ns)
    contact_elem = party_elem.find('cac:Contact', ns)
    return {
        'party_dir3': identifiers.get('DIR3'),
        'party_nif': identifiers.get('NIF'),
        'party_id_plataforma': identifiers.get('ID_PLATAFORMA'),
        'party_name': _text(_find(party_elem, 'cac:PartyName'), 'cbc:Name'),
        'website_uri': _text(party_elem, 'cbc:WebsiteURI'),
        'type_code': _text(lcp_elem, 'cbc:ContractingPartyTypeCode'),
        'activity_code': _text(lcp_elem, 'cbc:ActivityCode'),
        'party_country_identification_code': _text(_find(postal_address_elem, 'cac:Country'), 'cbc:IdentificationCode'),
        'party_city_name': _text(postal_address_elem, 'cbc:CityName'),
        'party_postal_zone': _text(postal_address_elem, 'cbc:PostalZone'),
        'party_address_line': _text(_find(postal_address_elem, 'cac:AddressLine'), 'cbc:Line'),
        'telephone': _text(contact_elem, 'cbc:Telephone'),
        'telefax': _text(contact_elem, 'cbc:Telefax'),
        'email': _text(contact_elem, 'cbc:ElectronicMail'),
    }


def _parse_procurement_project(cfs_elem):
    """Return the entry columns derived from cac:ProcurementProject (None when absent)."""
    project = {
        'procurement_project_type_code': None,
        'procurement_project_subtype_name': None,
        'procurement_project_subentity_code': None,
        'budget_currency': None,
        'estimated_overall_contract_amount': None,
        'total_amount': None,
        'tax_exclusive_amount': None,
    }
    pp_elem = cfs_elem.find('cac:ProcurementProject', ns)
    if pp_elem is None:
        return project

    # Codes missing from the code lists resolve to None instead of raising.
    project['procurement_project_type_code'] = contract_names.get(_text(pp_elem, 'cbc:TypeCode'))

    # The sub-type code list is selected by the element's listURI attribute;
    # an unknown listURI leaves the sub-type name as None.
    subtype_elem = pp_elem.find('cbc:SubTypeCode', ns)
    if subtype_elem is not None:
        subtype_names = subtype_names_by_list_uri.get(subtype_elem.get('listURI'), {})
        project['procurement_project_subtype_name'] = subtype_names.get(subtype_elem.text)

    budget_elem = pp_elem.find('cac:BudgetAmount', ns)
    if budget_elem is not None:
        estimated_elem = budget_elem.find('cbc:EstimatedOverallContractAmount', ns)
        if estimated_elem is not None:
            # The currency is read from the estimated amount's currencyID attribute.
            project['budget_currency'] = estimated_elem.get('currencyID')
            project['estimated_overall_contract_amount'] = estimated_elem.text
        project['total_amount'] = _text(budget_elem, 'cbc:TotalAmount')
        project['tax_exclusive_amount'] = _text(budget_elem, 'cbc:TaxExclusiveAmount')

    project['procurement_project_subentity_code'] = _text(
        _find(pp_elem, 'cac:RealizedLocation'), 'cbc:CountrySubentityCode'
    )
    return project


def _parse_documents(cfs_elem, contract_folder_id):
    """Return the document rows (legal, technical, additional) of one contract folder."""
    documents = []
    for tag, document_type, read_all in document_reference_kinds:
        references = cfs_elem.findall(tag, ns)
        if not read_all:
            references = references[:1]
        for reference_elem in references:
            documents.append({
                'contract_id': contract_folder_id,
                'document_reference_id': _text(reference_elem, 'cbc:ID'),
                # A missing Attachment / ExternalReference / URI yields None.
                'document_uri': _text(reference_elem, 'cac:Attachment/cac:ExternalReference/cbc:URI'),
                'document_type': document_type,
            })
    return documents


# Code -> name lookups, built once instead of filtering a DataFrame per entry.
contract_names = _code_names(df_contract_codes, 'contract_code', 'contract_name')
subtype_names_by_list_uri = {
    'http://contrataciondelestado.es/codice/cl/1.04/ServiceContractCode-1.04.gc':
        _code_names(df_service_contract_codes, 'service_contract_code', 'service_contract_name'),
    'http://contrataciondelestado.es/codice/cl/1.04/GoodsContractCode-1.04.gc':
        _code_names(df_goods_contract_codes, 'goods_contract_code', 'goods_contract_name'),
    'http://contrataciondelestado.es/codice/cl/2.02/PatrimonialContractCode-2.02.gc':
        _code_names(df_patrimonial_contract_codes, 'patrimonial_contract_code', 'patrimonial_contract_name'),
}

# Rows are accumulated in plain Python containers and converted to DataFrames
# once at the end; growing a DataFrame with pd.concat inside the loop is
# quadratic in the number of entries.
latest_entries = {}       # folder key -> entry row (most recent version only)
documents_by_folder = {}  # folder key -> document rows of that version
party_rows = []
seen_party_nifs = set()

# Parse each entry
for entry in root.findall('atom:entry', ns):
    cfs_elem = entry.find('cac-place-ext:ContractFolderStatus', ns)
    if cfs_elem is None:
        # Nothing is recorded for entries without a ContractFolderStatus.
        continue

    updated = _text(entry, 'atom:updated')
    contract_folder_id = _text(cfs_elem, 'cbc:ContractFolderID')

    # Entries without a folder id cannot be matched against each other, so
    # each one gets a unique key and is always kept.
    folder_key = contract_folder_id if contract_folder_id is not None else object()

    # Keep only the most recent version of each contract folder. Timestamps
    # are compared as strings.
    # NOTE(review): assumes every 'updated' value uses the same ISO format and UTC offset — confirm.
    previous_row = latest_entries.get(folder_key)
    if previous_row is not None and (previous_row['updated'] or '') > (updated or ''):
        continue

    link_elem = entry.find('atom:link', ns)

    # Contracting party: stored once per NIF. Parties without a NIF cannot
    # be deduplicated reliably and are always stored.
    party_nif = None
    lcp_elem = cfs_elem.find('cac-place-ext:LocatedContractingParty', ns)
    party_elem = _find(lcp_elem, 'cac:Party')
    if party_elem is not None:
        identifiers = _party_identifiers(party_elem)
        party_nif = identifiers.get('NIF')
        if party_nif is None or party_nif not in seen_party_nifs:
            if party_nif is not None:
                seen_party_nifs.add(party_nif)
            party_rows.append(_parse_party(lcp_elem, party_elem, identifiers))

    entry_row = {
        'entry_id': _text(entry, 'atom:id'),
        'link': link_elem.get('href') if link_elem is not None else None,
        'summary': _text(entry, 'atom:summary'),
        'title': _text(entry, 'atom:title'),
        'updated': updated,
        'contract_folder_id': contract_folder_id,
        'contract_folder_status_code': _text(cfs_elem, 'cbc-place-ext:ContractFolderStatusCode'),
        'party_nif': party_nif,
    }
    entry_row.update(_parse_procurement_project(cfs_elem))

    # Storing under the folder key REPLACES any older version of the same
    # folder together with its documents, so no stale rows are left behind.
    latest_entries[folder_key] = entry_row
    documents_by_folder[folder_key] = _parse_documents(cfs_elem, contract_folder_id)

# Build the DataFrames in one go.
entries_df = pd.DataFrame(list(latest_entries.values()), columns=entry_columns)
parties_df = pd.DataFrame(party_rows, columns=party_columns)
country_df = pd.DataFrame(columns=['identification_code', 'name'])  # declared but not populated here
documents_df = pd.DataFrame(
    [document for documents in documents_by_folder.values() for document in documents],
    columns=document_columns,
)


# Crear Base de Datos Postgres

Get the required packages and set the environment variables:

In [56]:
%%capture --no-stderr
# Install/upgrade the LangChain community, hub and LangGraph packages quietly.
# The %%capture magic above hides the cell's stdout; --no-stderr leaves stderr visible.
%pip install --upgrade --quiet langchain-community langchainhub langgraph

In [57]:
# Install/upgrade the langchain-openai package quietly (no version is pinned).
%pip install --upgrade --quiet langchain-openai

Note: you may need to restart the kernel to use updated packages.


Se asume que ya contamos con una instancia de PostgreSQL en funcionamiento.


In [1]:
# Install the psycopg2 PostgreSQL driver and SQLAlchemy; both are needed by
# the "postgresql+psycopg2" engine created further down (no versions pinned).
%pip install psycopg2-binary sqlalchemy

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Downloading psycopg2_binary-2.9.10-cp311-cp311-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ---------------------------------------- 1.2/1.2 MB ? eta 0:00:00
Installing collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10
Note: you may need to restart the kernel to use updated packages.


In [16]:
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Connection settings. Each value can be overridden through an environment
# variable so real credentials never have to be written into the notebook;
# the defaults are the original placeholder values.
usuario = os.environ.get('PLACE_RAG_DB_USER', 'postgres')
password = os.environ.get('PLACE_RAG_DB_PASSWORD', 'place_rag_password')
host = os.environ.get('PLACE_RAG_DB_HOST', 'localhost')     # or the IP/URL of your server
puerto = os.environ.get('PLACE_RAG_DB_PORT', '5432')        # PostgreSQL default port
base_datos = os.environ.get('PLACE_RAG_DB_NAME', 'place_rag_db')

# Build the connection URL. User and password are URL-escaped so characters
# such as '@', ':' or '/' in a password do not corrupt the URL; with the
# default values the resulting string is unchanged.
conexion_str = (
    f"postgresql+psycopg2://{quote_plus(usuario)}:{quote_plus(password)}"
    f"@{host}:{puerto}/{base_datos}"
)

# Create the engine, forcing the client encoding to UTF-8.
engine = create_engine(
    conexion_str,
    connect_args={"options": "-c client_encoding=UTF8"}
)

In [20]:
# Write the five DataFrames to PostgreSQL, one table each; a table that
# already exists is replaced. Insertion order of the dict is the write order.
tables_to_write = {
    "expedientes": entries_df,
    "documentos": documents_df,
    "entidades": parties_df,
    "paises": df_country_codes,
    "regiones": df_country_subentity_codes,
}
rows_written = {
    table_name: frame.to_sql(table_name, engine, if_exists="replace", index=False)
    for table_name, frame in tables_to_write.items()
}

# Show the value returned by the last write, as the cell did before.
rows_written["regiones"]

593