In [None]:
%%capture --no-stderr
%pip install --upgrade --quiet langchain-openai langchain-community langchainhub langgraph

In [6]:
%pip install sqlglot

Collecting sqlglot
  Downloading sqlglot-26.2.1-py3-none-any.whl.metadata (19 kB)
Downloading sqlglot-26.2.1-py3-none-any.whl (443 kB)
Installing collected packages: sqlglot
Successfully installed sqlglot-26.2.1
Note: you may need to restart the kernel to use updated packages.


In [35]:
%pip install protobuf

Collecting protobuf
  Using cached protobuf-5.29.3-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Using cached protobuf-5.29.3-cp310-abi3-win_amd64.whl (434 kB)
Installing collected packages: protobuf
Successfully installed protobuf-5.29.3
Note: you may need to restart the kernel to use updated packages.


In [40]:
%pip install sentencepiece

Collecting sentencepiece
  Using cached sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Using cached sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


## Conexión con base de datos

In [12]:
from langchain_community.utilities import SQLDatabase

import os
from urllib.parse import quote_plus

# Connection settings come from the standard libpq environment variables so
# real credentials never have to be written into the notebook. The fallbacks
# are the original local-development values, so the default URI is unchanged.
usuario = os.environ.get("PGUSER", "postgres")
password = os.environ.get("PGPASSWORD", "place_rag_password")
host = os.environ.get("PGHOST", "localhost")    # or the IP/URL of your server
puerto = os.environ.get("PGPORT", "5432")       # PostgreSQL default port
base_datos = os.environ.get("PGDATABASE", "place_rag_db")

# Build the connection URL. User and password are percent-encoded so that
# characters such as "@", ":" or "/" cannot break the URL structure.
uri = (
    f"postgresql+psycopg2://{quote_plus(usuario)}:{quote_plus(password)}"
    f"@{host}:{puerto}/{base_datos}"
)

db = SQLDatabase.from_uri(uri)

In [13]:
import os

# Hugging Face token taken from the HF_API_KEY environment variable;
# the value is None when the variable is not set.
api_key = os.getenv("HF_API_KEY")

## System Prompt

In [None]:
# Show the table description exposed by the connected database object; the
# same text is embedded in the system prompt further down.
print(db.table_info)

In [17]:
# System prompt (written in Spanish) telling the model to produce a
# syntactically correct PostgreSQL query, to use only columns present in the
# schema, and to restrict itself to the four listed tables. The f-string
# embeds db.table_info as the schema description.
system_prompt = f"""
Dada una pregunta de entrada, crea una consulta de postgresql sintácticamente correcta.
Usa solo los nombres de las columnas que puedes ver en la descripción del esquema.
No consultes columnas que no existen.
Utiliza únicamente las siguientes tablas: 'entidades', 'expedientes', 'paises', 'regiones'
Esquema de la base de datos:
{db.table_info}
"""

# Qwen2

In [None]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Hugging Face text-generation endpoint for the Qwen2 0.5B checkpoint:
# sampling is disabled and each reply is capped at 512 new tokens.
llm = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2-0.5B", task="text-generation",
    do_sample=False, max_new_tokens=512, repetition_penalty=1.03,
)

# Chat-style wrapper around the endpoint defined above.
chat_model = ChatHuggingFace(llm=llm)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
    ChatMessage
)
# Natural-language request (in Spanish) that the model should turn into SQL.
question = "Muéstrame todas las licitaciones de Navarra que excedan 277000€."

# Conversation: system prompt, the user's question, and a final assistant
# turn that already contains "SELECT *" — presumably so the model continues
# the query from there (TODO confirm the backend honours a trailing
# assistant message).
messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=question),
    ChatMessage(content="SELECT *", role="assistant"),
]
ai_msg = chat_model.invoke(messages)

# AceInstruct

In [33]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Hugging Face text-generation endpoint for nvidia/AceInstruct-1.5B, with the
# same generation settings as the other endpoint cell (no sampling, up to 512
# new tokens, repetition penalty 1.03).
llm = HuggingFaceEndpoint(
    repo_id="nvidia/AceInstruct-1.5B", task="text-generation",
    do_sample=False, max_new_tokens=512, repetition_penalty=1.03,
)

# Chat-style wrapper around the endpoint; rebinding chat_model replaces
# whichever model was assigned to that name before.
chat_model = ChatHuggingFace(llm=llm)

In [34]:
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
    ChatMessage
)
# Natural-language request (in Spanish) that the model should turn into SQL.
question = "Muéstrame todas las licitaciones de Navarra que excedan 277000€."

# System prompt, user question, then an assistant turn holding the opening
# "SELECT *" of the expected query.
messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=question),
    ChatMessage(content="SELECT *", role="assistant"),
]
ai_msg = chat_model.invoke(messages)

# OpenAI GPT-4o mini

In [21]:
from langchain_openai import ChatOpenAI
# ChatOpenAI picks up OPENAI_API_KEY from the environment by itself. The
# previous bare os.environ.get(...) call discarded its result and verified
# nothing, so check explicitly and fail early with a readable message.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("La variable de entorno OPENAI_API_KEY no está definida.")

# temperature=0 keeps generation as repeatable as the API allows.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [22]:
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
    ChatMessage
)
# Natural-language request (in Spanish) that the model should turn into SQL.
question = "Muéstrame todas las licitaciones de Navarra que excedan 277000€."

# System prompt, user question, then an assistant turn holding the opening
# "SELECT *" of the expected query.
messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=question),
    ChatMessage(content="SELECT *", role="assistant"),
]
# This cell calls the model bound to `llm` directly rather than `chat_model`.
ai_msg = llm.invoke(messages)

# Microsoft Phi 3

In [54]:
from huggingface_hub import InferenceClient

# Hugging Face inference client authenticated with the token held in api_key.
client = InferenceClient(api_key=api_key)

question = "Muéstrame todas las licitaciones de Navarra que excedan 277000€."
# No separate system turn is sent here: the system prompt and the question
# are merged into a single user message.
message = f"{system_prompt} {question}"
messages = [
    {"role": "user", "content": message},
    # Assistant turn containing the start of the query for the model to continue.
    {"role": "assistant", "content": "SELECT *"},
]

completion = client.chat.completions.create(
    model="microsoft/Phi-3-mini-4k-instruct",
    messages=messages,
    max_tokens=500,
    # Implements the original TODO ("add a stop when ';' is reached"): end
    # generation at the first statement terminator so nothing is produced
    # after the query.
    stop=[";"],
)

print(completion.choices[0].message)

ChatCompletionOutputMessage(role='assistant', content="\nFROM expedientes AS e\nJOIN regiones AS r ON e.party_nif = r.country_subentity_code\nJOIN paises AS p ON r.country_code = p.country_code\nJOIN documentos AS d ON e.contract_folder_id = d.contract_id\nWHERE r.country_subentity_name = 'Navarra' AND e.total_amount > 277000;", tool_calls=None)


# Embeddings

In [None]:
import ast
import re

def query_as_list(db, query):
    """Run ``query`` and return its distinct, non-empty values as cleaned strings.

    Args:
        db: Object whose ``run(query)`` method returns the result rows as the
            string form of a Python list of tuples, or an empty value when
            the query produced no rows.
        query: SQL text to execute.

    Returns:
        A list of unique strings in first-seen order. Falsy cells (NULL,
        empty string) are skipped, standalone digit runs are removed from
        each value, and values left empty by that clean-up are dropped.
    """
    raw = db.run(query)
    # An empty result cannot be parsed by ast.literal_eval, so short-circuit.
    if not raw:
        return []

    rows = ast.literal_eval(raw)
    cleaned = (
        # str() keeps non-text cells (e.g. numbers) from crashing re.sub.
        re.sub(r"\b\d+\b", "", str(cell)).strip()
        for row in rows
        for cell in row
        if cell
    )
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(value for value in cleaned if value))

# Candidate filter values pulled from the database. DISTINCT lets PostgreSQL
# de-duplicate before the rows are serialized and parsed on the client; the
# resulting set of values is the same as with a plain SELECT.
entidades = query_as_list(db, "SELECT DISTINCT name FROM entidades")
tipo_contrato = query_as_list(db, "SELECT DISTINCT procurement_project_type_code FROM expedientes")
subtipo_contrato = query_as_list(db, "SELECT DISTINCT procurement_project_subtype_name FROM expedientes")
estado_expediente = query_as_list(db, "SELECT DISTINCT contract_folder_status_code FROM expedientes")
paises = query_as_list(db, "SELECT DISTINCT country_name FROM paises")
regiones = query_as_list(db, "SELECT DISTINCT country_subentity_name FROM regiones")

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# Embedding model created with OpenAIEmbeddings' default arguments.
embeddings = OpenAIEmbeddings()

# In-memory vector store built on the embedding model above.
vector_store = InMemoryVectorStore(embeddings)

In [None]:
from langchain.agents.agent_toolkits import create_retriever_tool

# Index every candidate filter value collected from the database. The return
# value of add_texts is not needed, hence the throwaway name.
_ = vector_store.add_texts(
    entidades
    + tipo_contrato
    + subtipo_contrato
    + estado_expediente
    + paises
    + regiones
)

# Retriever configured with k=5 in its search arguments.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# Tool description shown to the model; split by sentence for readability.
description = (
    "Use to look up values to filter on. "
    "Input is an approximate spelling of the proper noun, output is valid proper nouns. "
    "Use the noun most similar to the search."
)
retriever_tool = create_retriever_tool(
    retriever, name="search_proper_nouns", description=description
)

## Medir precisión

### Ejecutar query

In [None]:
from langchain_community.tools.sql_database.tool import QuerySQLDatabaseTool


def execute_query(state: State):
    """Run the SQL text stored under ``state["query"]`` against ``db``.

    Returns:
        A dict with a single ``"result"`` key holding whatever the
        QuerySQLDatabaseTool invocation returned.
    """
    sql_tool = QuerySQLDatabaseTool(db=db)
    outcome = sql_tool.invoke(state["query"])
    return {"result": outcome}

In [None]:
import sqlglot
from sqlglot import parse_one, exp

def normalize_sql(query, dialect=None):
    """Parse a SQL query and return its canonical text as generated by sqlglot.

    Args:
        query: SQL text to normalize.
        dialect: Optional sqlglot dialect name used both to parse and to
            regenerate the query (for example ``"postgres"``). ``None`` keeps
            sqlglot's default dialect.

    Returns:
        The regenerated SQL string, or ``None`` when ``query`` is empty, is
        not a string, or cannot be parsed. A message is printed in those
        cases.
    """
    # Guard inputs that are not parse errors but can never yield a query
    # (e.g. a model that returned nothing).
    if not isinstance(query, str) or not query.strip():
        print("Error al parsear la consulta: la consulta está vacía o no es texto")
        return None

    try:
        # sqlglot expressions expose .sql(); there is no .to_sql(), which made
        # the previous version fail on every input and always return None.
        return parse_one(query, dialect=dialect).sql(dialect=dialect)
    except sqlglot.errors.SqlglotError as e:
        # Only sqlglot's own failures are treated as "unparseable", so
        # programming errors are no longer silently swallowed.
        print(f"Error al parsear la consulta: {e}")
        return None

def son_consultas_equivalentes(sql1, sql2):
    """Report whether two SQL strings normalize to the same text.

    Both queries are passed through ``normalize_sql``. If either one fails to
    normalize (``None``), the result is ``False``. Otherwise the normalized
    strings are compared after lowercasing, so letter case is ignored
    everywhere, including inside string literals.
    """
    normalized = [normalize_sql(sql1), normalize_sql(sql2)]

    if any(text is None for text in normalized):
        return False

    first, second = (text.lower() for text in normalized)
    return first == second

# Ejemplos de consultas
consulta1 = "SELECT a, b FROM tabla WHERE a > 10 ORDER BY b DESC"
consulta2 = "select b, a from tabla where a > 10 order by b desc"

equivalente = son_consultas_equivalentes(consulta1, consulta2)
print(f"¿Las consultas son equivalentes? {'Sí' if equivalente else 'No'}")
