In [15]:
import os
import json
import cohere
import re
import logging
from datetime import datetime

from langchain_openai import AzureOpenAIEmbeddings
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType
from azure.search.documents._generated.models import QueryCaptionResult

from openai import AzureOpenAI
import tiktoken

from dotenv import load_dotenv

load_dotenv()

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens that ``string`` produces under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence returned by the encoder.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

class AzureEmbeddings:
    """Factory/helper around ``AzureOpenAIEmbeddings`` configured from environment variables.

    Both methods are static, so they work on the class and on instances alike.
    """

    @staticmethod
    def get_embedding():
        """Build a new ``AzureOpenAIEmbeddings`` client.

        Reads ``OPENAI_AZURE_DEPLOYMENT``, ``OPENAI_API_KEY`` and
        ``OPEN_AI_AZURE_URL`` from the environment on every call; the API
        version is fixed to ``2023-08-01-preview``.
        """
        return AzureOpenAIEmbeddings(
            azure_deployment=os.getenv("OPENAI_AZURE_DEPLOYMENT"),
            openai_api_version="2023-08-01-preview",
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            azure_endpoint=os.getenv("OPEN_AI_AZURE_URL"),
        )

    @staticmethod
    def generate_embeddings(content: str):
        """Embed a single text.

        Args:
            content: Text to embed.

        Returns:
            The first (only) element of ``embed_documents([content])``.
        """
        # Reuse the one client factory instead of duplicating its configuration,
        # so the two entry points cannot drift apart.
        embeddings = AzureEmbeddings.get_embedding()
        return embeddings.embed_documents([content])[0]

# Chat-completions client, configured from the AZURE_OPENAI_* environment variables.
openai_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Helper used to embed the user question before the vector search.
embeddings_client = AzureEmbeddings()

# Search endpoint derived from the service name, plus a client bound to one index.
store_search_url: str = f"https://{os.getenv('AZURE_COGNITIVE_SEARCH_SERVICE_NAME')}.search.windows.net"
search_client = SearchClient(
    endpoint=store_search_url,
    index_name=os.getenv("AZURE_COGNITIVE_SEARCH_INDEX_NAME"),
    credential=AzureKeyCredential(os.getenv("AZURE_COGNITIVE_SEARCH_API_KEY")),
)

# Cohere client used for reranking search hits.
co = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))

In [16]:
# Path of the file that log() appends to; get_answer() reassigns it for every question.
log_file_name = ""


def ensure_directory(directory_name):
    """Create ``directory_name`` (and any missing parents) if it does not exist.

    Args:
        directory_name: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created (for example, the path
            exists but is a regular file).
    """
    # exist_ok=True removes the window between "check" and "create" in which
    # another process could create the directory and make makedirs() fail.
    os.makedirs(directory_name, exist_ok=True)


def log(message):
    """Append a timestamped entry to the file named by the global ``log_file_name``.

    Each entry is written as the current ``datetime.now()`` on one line followed
    by ``message`` on the next.

    Args:
        message: Text to record.

    Raises:
        OSError: If ``log_file_name`` is empty or the file cannot be opened.
    """
    # Create the directory the target file actually lives in (not a hard-coded
    # one), so a log path outside "logs/" works as well.
    parent_directory = os.path.dirname(log_file_name)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)
    # Explicit UTF-8: messages contain accented text and the platform default
    # encoding is not guaranteed to be able to represent it.
    with open(log_file_name, "a", encoding="utf-8") as file:
        file.write(f"{datetime.now()}\n{message}\n")

In [18]:
# Running totals updated by get_conversation_price() on every completion.
# NOTE(review): conversation_price is declared global there but never assigned — confirm whether it is still needed.
conversation_price = 0
conversation_ct = 0  # completion (output) tokens
conversation_pt = 0  # prompt (input) tokens
conversation_tt = 0  # total tokens


def get_conversation_price(new_completion):
    """Print the cost of one completion and of the whole conversation, in COP.

    Adds the completion's token usage to the module-level running totals
    (``conversation_ct``, ``conversation_pt``, ``conversation_tt``) and prints
    two lines: the accumulated price and the price of this message.

    Args:
        new_completion: Object exposing ``usage.completion_tokens``,
            ``usage.prompt_tokens`` and ``usage.total_tokens``.

    Returns:
        None. The figures are only printed.
    """
    global conversation_price, conversation_ct, conversation_pt, conversation_tt

    # gpt4o: USD price per completion token / per prompt token
    ct_price = 0.000015
    pt_price = 0.000005
    USD_price_in_COP = 4100

    usage = new_completion.usage
    ct, pt, tt = usage.completion_tokens, usage.prompt_tokens, usage.total_tokens

    conversation_ct += ct
    conversation_pt += pt
    conversation_tt += tt

    message_cop = ((ct * ct_price) + (pt * pt_price)) * USD_price_in_COP
    running_cop = (
        (conversation_ct * ct_price) + (conversation_pt * pt_price)
    ) * USD_price_in_COP

    print(
        f"üí∞ Conversation price: ${running_cop} COP (In: {conversation_pt}, Out: {conversation_ct}, Total token: {conversation_tt}) "
    )
    print(
        f"   This message: ${message_cop} COP (In: {pt}, Out: {ct}, Total token: {tt}) "
    )
    print("")

In [19]:
### Define the system prompt and the tools

# Conversation history sent to the chat completion call; get_answer() rebuilds it for every question.
messages = []

# Instructions for writing search queries; used below as the description of the `search_legal_info` tool.
query_prompt = "Generate 1 or multiple search queries in Spanish based on what the user asked you if you need more information to acurratally answer the user. Each query should contain the whole idea and sources if they are needed. If the user question adresses a when, how or what information, you should keep that in the query because that will affect what you will get. The query will be used in a hybrid search (semantic and full text) so keep it short but make sure to include all the main keywords and infromation needed to give the necesary context to the query. Try to always include at least one concept followed by one specific source. Don't include more than 1 source. You can include the various ways of refering the same source eg: 'homicidio art√≠culo 103 ley 599 del 2000 c√≥digo penal'.If you can't find the source out of what the user asked you, you can use the 'Supporting Information Sources' to identify sources that could help you generate a better query to get additional info and to answer the user."

# Tool definition (function-calling format) for issuing a document search.
# It is only referenced from commented-out `tools=` lines in generate_completion().
search_legal_info = {
    "type": "function",
    "function": {
        "name": "search_legal_info",
        "description": query_prompt,
        "parameters": {
            "type": "object",
            # NOTE(review): `strict` sits inside the JSON-schema `parameters` object as the
            # string "true"; the OpenAI function-calling docs place `strict` on the function
            # object as a boolean — confirm whether strict mode was intended.
            "strict": "true",
            "properties": {
                "search_query": {
                    "type": "string",
                    "description": "The text search query for supporting documents. eg: 'homicidio art√≠culo 103 ley 599 del 2000', 'cu√°ndo se consuma el hurto art√≠culo 239 ley 599 del 2000'",
                },
                "search_type": {
                    "enum": ["exact", "interpret"],
                    "type": "string",
                    "description": "If the query is for and exact citation or for a more concept interpretation",
                },
                "number_of_chunks_needed": {
                    "type": "integer",
                    "description": "A number between 3 and 10 depending on how many 500 tokens chunks of documents you want to retrieve out of this query",
                },
            },
            "required": ["search_query", "search_type", "number_of_chunks_needed"],
        },
    },
}
# Tool definition (function-calling format) for fetching the chunks that follow a given chunk id.
# call_tools() dispatches the tool name "get_next_chunk" to get_next_chunk_tool().
get_next_chunk = {
    "type": "function",
    "function": {
        "name": "get_next_chunk",
        "description": "Use this tool to get when an important sources comes incomplete or the content is cut off. This tool will help you retrieve the following section of that given source.",
        "parameters": {
            "type": "object",
            "properties": {
                "source_id": {
                    "type": "string",
                    "description": "the unique identifier of the chunk that is incomplete. eg:'20240719192458csjscpboletinjurisprudencial20181219pdf_chunk30'",
                },
            },
            "required": ["source_id"],
        },
    },
}

# System prompt (Spanish) for the assistant; wrapped below into the system message.
# NOTE(review): the literal appears to span a physical line break, and its instruction
# paragraphs occur twice — confirm both are intended. The f-prefix has no placeholders.
response_system_template = f'Eres Ariel, un asistente para la investigaci√≥n legal. \n\n    S√© lo m√°s detallado y preciso posible en tus respuestas. Cita el m√°ximo n√∫mero de fuentes posible, sin salirte del tema. En caso de encontrar informaci√≥n contradictoria, se√±√°lala y sugiere una posible causa. Expresa toda la informaci√≥n que encuentres en las fuentes proporcionadas. Solamente si la respuesta no est√° en las fuentes proporcionadas, responde ‚ÄúNo encuentro informaci√≥n con esos t√©rminos, ¬øpuedes reformular tu consulta?‚Äù. Al incluir t√≠tulos en tu respuesta, usa formato html (ej: titulos, <strong>).\n    \n    Cada fuente tiene un nombre seguido por dos puntos y la informaci√≥n real, siempre incluye el nombre de la fuente para cada hecho que uses en la respuesta. Todas las fuentes son PDFs. Utiliza corchetes para referenciar la fuente, por ejemplo [info1.pdf]. No combines fuentes, lista cada fuente por separado, por ejemplo [info1.txt] [info2.pdf]. El formato de algunas fuentes puede incluir acentos, puntos o guiones. Aseg√∫rate de capturar todo antes de ".pdf", por ejemplo: [SFC - T√≠tulos valores electr√≥nicos. Pagar√©s. Dep√≥sito centralizado de valores. Exigencia Concepto 2020086426-003 del 24 de junio de 2020 id2020086426.pdf]. No dividas los nombres de archivo por ninguna raz√≥n, ya que todos deben terminar en .pdf.\n\n    S√© lo m√°s detallado y preciso posible en tus respuestas. Cita el m√°ximo n√∫mero de fuentes posible, sin salirte del tema. En caso de encontrar informaci√≥n contradictoria, se√±√°lala y sugiere una posible causa. Expresa toda la informaci√≥n que encuentres en las fuentes proporcionadas. Solamente si la respuesta no est√° en las fuentes proporcionadas, responde ‚ÄúNo encuentro informaci√≥n con esos t√©rminos, ¬øpuedes reformular tu consulta?‚Äù. 
Al incluir t√≠tulos en tu respuesta, usa formato html (ej: titulos, <strong>).\n    \n    Cada fuente tiene un nombre seguido por dos puntos y la informaci√≥n real, siempre incluye el nombre de la fuente para cada hecho que uses en la respuesta. Todas las fuentes son PDFs. Utiliza corchetes para referenciar la fuente, por ejemplo [info1.pdf]. No combines fuentes, lista cada fuente por separado, por ejemplo [info1.txt] [info2.pdf]. El formato de algunas fuentes puede incluir acentos, puntos o guiones. Aseg√∫rate de capturar todo antes de ".pdf", por ejemplo: [SFC - T√≠tulos valores electr√≥nicos. Pagar√©s. Dep√≥sito centralizado de valores. Exigencia Concepto 2020086426-003 del 24 de junio de 2020 id2020086426.pdf]. No dividas los nombres de archivo por ninguna raz√≥n, ya que todos deben terminar en .pdf. Escribe la respuesta en formato HTML, pero sin incluir los tags "```html" al principio o final principio de tu respuesta.'

system_message = {"role": "system", "content": response_system_template}

In [20]:
def format_search_results(docs_list):
    """Flatten search hits into plain dictionaries.

    For each hit the ``@search.score`` and ``@search.reranker_score`` values are
    copied to ``score`` and ``rerank``, the caption texts are joined with
    ``" // "`` into ``captions`` (empty string when the hit has no captions),
    and the remaining document fields are copied through unchanged.

    Args:
        docs_list: Iterable of mapping-like search hits.

    Returns:
        A list with one dictionary per hit, in input order.
    """
    passthrough_fields = (
        "id",
        "title",
        "author",
        "keywords",
        "category",
        "page",
        "year",
        "has_copyright",
        "file_path",
        "external_id",
        "content",
    )

    documents = []
    for hit in docs_list:
        captions = hit["@search.captions"]
        if captions is None:
            captions_text = ""
        else:
            captions_text = " // ".join(caption.text for caption in captions)

        flattened = {
            "score": hit["@search.score"],
            "rerank": hit["@search.reranker_score"],
            "captions": captions_text,
        }
        for field in passthrough_fields:
            flattened[field] = hit[field]
        documents.append(flattened)
    return documents

def format_reranked_results(docs_list):
    """Flatten reranker results into plain dictionaries and print each one.

    Every result contributes its ``relevance_score`` (as ``relevance``) plus the
    attributes of its nested ``document``. Each flattened entry is printed as
    indented JSON preceded by a numbered separator line.

    Args:
        docs_list: Iterable of results exposing ``relevance_score`` and ``document``.

    Returns:
        A list with one dictionary per result, in input order.
    """
    document_fields = (
        "score",
        "rerank",
        "captions",
        "title",
        "author",
        "keywords",
        "category",
        "page",
        "year",
        "has_copyright",
        "file_path",
        "external_id",
        "content",
    )

    documents = []
    for position, result in enumerate(docs_list, start=1):
        source = result.document
        entry = {"id": source.id, "relevance": result.relevance_score}
        for field in document_fields:
            entry[field] = getattr(source, field)
        documents.append(entry)
        print(f"{position}. >>>>>>>>>>>>>>>>>>>>>")
        print(json.dumps(entry, indent=4))
    return documents

def filter_docs(doc_list):
    """Keep the documents that clear the score thresholds, one per distinct content.

    A document passes when ``relevance > 0.9 and rerank > 2``, or when
    ``score > 0.025``. ``relevance`` defaults to 1 when the key is absent;
    ``score`` / ``rerank`` values that are not real numbers (e.g. ``None``)
    count as 0.

    When several documents share the same ``content``, the first one that
    passes is kept and later copies are skipped. (Previously every copy of a
    duplicated content was discarded, losing the passage altogether.)

    Args:
        doc_list: List of dictionaries with ``content``, ``score`` and
            ``rerank`` keys and an optional ``relevance`` key.

    Returns:
        A new list holding the accepted documents in their original order.
    """

    def _as_number(value):
        # bool is a subclass of int; treat it like any other non-numeric value.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return 0
        return value

    filtered_docs = []
    accepted_contents = set()  # O(1) duplicate lookup instead of list.count()

    for doc in doc_list:
        content = doc["content"]
        if content in accepted_contents:
            continue

        score = _as_number(doc["score"])
        rerank = _as_number(doc["rerank"])
        relevance = doc.get("relevance", 1)

        # Parentheses spell out the precedence the original expression relied on.
        if (relevance > 0.9 and rerank > 2) or score > 0.025:
            accepted_contents.add(content)
            filtered_docs.append(doc)

    print(f">> üßπ Docs filtrados. De {len(doc_list)} pasaron {len(filtered_docs)} ...")
    return filtered_docs

def add_doc_to_context(docs_list):
    """Render documents as a numbered, labelled text block for the prompt.

    Each document becomes one "Fuente #N" section listing its id, title,
    author, year, keywords, category, page and content, followed by two blank
    lines. Numbering starts at 1.

    Args:
        docs_list: Iterable of document dictionaries.

    Returns:
        The concatenated sections, or an empty string when there are no documents.
    """
    sections = []
    for counter, document in enumerate(docs_list, start=1):
        sections.append(
            f"Fuente #{counter}\n"
            f"ID: {document['id']}\n"
            f"T√≠tulo: {document['title']}\n"
            f"Autor: {document['author']}\n"
            f"A√±o de publicaci√≥n: {document['year']}\n"
            f"√Årea Legal: {document['keywords']}\n"
            f"Tipo de Documento: {document['category']}\n"
            f"P√°gina: {document['page']}\n"
            f"Extracto: {document['content']}\n"
            f"\n\n"
        )
    return "".join(sections)
    

In [21]:
def search_for_chunks(text_query, vector_query, rerank_query):
    """Run the hybrid search, filter the hits, rerank them and filter again.

    Args:
        text_query: Text sent as ``search_text`` to the search client.
        vector_query: Embedding vector matched against ``content_vector``.
        rerank_query: Query text handed to the reranker.

    Returns:
        The reranked documents that pass ``filter_docs``; an empty list when
        no search hit survives the first filter.
    """
    print(">> üîé Buscando documentos ...")
    results_num = 50
    results = search_client.search(
        search_text=text_query,
        vector_queries=[
            {
                "vector": vector_query,
                "k": results_num,
                "fields": "content_vector",
                "kind": "vector",
                "exhaustive": True,
            }
        ],
        top=results_num,
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name="ariel-alberto",
        query_caption="extractive|highlight-false",
        scoring_profile="ariel-alberto",
    )

    results_formatted = format_search_results(results)
    results_filtered = filter_docs(results_formatted)

    # Nothing to rerank: skip the remote call instead of sending an empty
    # document list to the reranker.
    if not results_filtered:
        return []

    print(">> ‚≠êÔ∏è Aplicando reorganizaci√≥n sem√°ntica ...")
    reranked_docs = co.rerank(
        model="rerank-multilingual-v3.0",
        top_n=10,
        query=rerank_query,
        documents=results_filtered,
        return_documents=True,
        rank_fields=["content"],
    )
    reranked_docs_formatted = format_reranked_results(reranked_docs.results)
    return filter_docs(reranked_docs_formatted)


def get_next_chunks(id):
    """Fetch the two chunks that follow the chunk identified by ``id``.

    ``id`` must end in ``_chunk<number>``; the chunks numbered ``number + 1``
    and ``number + 2`` with the same prefix are requested from the search index.

    Args:
        id: Chunk identifier, e.g. ``"somefilepdf_chunk30"``.

    Returns:
        The formatted search results, or an empty list when ``id`` is not a
        string or does not match the expected pattern (so callers can always
        iterate over the result).
    """
    if not isinstance(id, str):
        return []

    match = re.search(r"^(.*_chunk)(\d+)$", id)
    if match is None:
        return []

    # The id can originate from a model-generated tool argument; double any
    # single quote so it cannot terminate the OData string literal.
    prefix = match.group(1).replace("'", "''")
    chunk_number = int(match.group(2))

    id_filter = " or ".join(
        f"id eq '{prefix}{chunk_number + offset}'" for offset in (1, 2)
    )
    next_chunks_result = search_client.search(filter=id_filter, top=2)
    return format_search_results(list(next_chunks_result))


def get_next_chunk_tool(source_id):
    """Tool handler: return the chunks following ``source_id`` as prompt text.

    Args:
        source_id: Identifier of the chunk whose continuation is wanted.

    Returns:
        A header line naming ``source_id`` followed by the rendered chunks
        (nothing after the header when no following chunk is available).
    """
    # get_next_chunks() may yield None for an id it cannot parse; treat that
    # as "no chunks" rather than failing while rendering.
    next_chunks = get_next_chunks(source_id) or []
    rendered_chunks = add_doc_to_context(next_chunks)
    return f"Continuaci√≥n de {source_id}:\n{rendered_chunks}"


def search_legal_info_tool(search_query=None, search_type=None, current_context_size=None):
    """Placeholder handler for the ``search_legal_info`` tool.

    The arguments are accepted but not used yet; a fixed acknowledgement is
    returned. All parameters are optional so that the handler can also be
    invoked without arguments, which is how ``call_tools`` calls it.

    Args:
        search_query: Search text requested by the model (unused).
        search_type: Requested search mode (unused).
        current_context_size: Unused.

    Returns:
        A constant acknowledgement string.
    """
    return "B√∫squda realizada..."

In [22]:
def generate_completion():
    """Request one chat completion for the current global ``messages``.

    The deployment name is read from ``AZURE_OPENAI_DEPLOYMENT``. The raw
    response is printed and passed to ``get_conversation_price`` before being
    returned.

    Returns:
        The response object returned by the chat completions client.
    """
    # Tool calling is currently switched off: neither `tools`
    # (get_next_chunk / search_legal_info) nor `tool_choice` is sent.
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYMENT"),
        "messages": messages,
        "temperature": 0.2,
        "n": 1,
    }
    response = openai_client.chat.completions.create(**request)
    print(response)
    get_conversation_price(response)
    return response

In [23]:
def append_message(new_message):
    """Append an assistant or tool message to the global ``messages`` list.

    Accepts either a dictionary or an object with ``role`` / ``content`` /
    ``tool_calls`` attributes.

    * assistant message with tool calls -> stored as ``{"role", "tool_calls"}``
    * assistant message with content    -> stored as ``{"role", "content"}``
    * tool message                      -> stored as given
    * anything else (other roles, or an assistant message with neither content
      nor tool calls) is not stored.

    Args:
        new_message: The message to record.

    Returns:
        The (mutated in place) global ``messages`` list — always, including
        when nothing was appended.
    """
    global messages
    conversation = messages

    if isinstance(new_message, dict):
        role = new_message.get("role")
        content = new_message.get("content")
        tool_calls = new_message.get("tool_calls")
    else:
        role = new_message.role
        content = new_message.content
        tool_calls = getattr(new_message, "tool_calls", None)

    def _format_tool_call(tool_call):
        # Already a plain dictionary: keep it untouched.
        if isinstance(tool_call, dict):
            return tool_call
        return {
            "id": tool_call.id,
            "function": {
                # Keep the JSON text produced by the model. Converting it with
                # str(json.loads(...)) yields a Python repr (single quotes),
                # which is no longer valid JSON when the history is replayed.
                "arguments": tool_call.function.arguments,
                "name": tool_call.function.name,
            },
            "type": "function",
        }

    if role == "assistant":
        if tool_calls:
            # Assistant asks for tools
            conversation.append(
                {
                    "role": "assistant",
                    "tool_calls": [_format_tool_call(call) for call in tool_calls],
                }
            )
        elif content:
            # Assistant talks
            conversation.append({"role": "assistant", "content": content})
    elif role == "tool":
        # Tool response
        conversation.append(new_message)

    return conversation

In [24]:
def call_tools(tool_calls):
    """Execute the requested tools, then ask the model to continue.

    For every tool call a ``tool`` message with the handler's output is
    appended to the conversation. A new completion is then generated; if it
    contains text, the text is printed and the function ends, otherwise the
    tool calls of that completion are processed the same way.

    Args:
        tool_calls: Iterable of tool calls exposing ``id``, ``function.name``
            and ``function.arguments`` (JSON text). ``None`` is treated as empty.

    Returns:
        None. The answer is printed and recorded through ``append_message``.
    """
    pending = tool_calls

    # Iterative rather than recursive, so a long chain of tool rounds cannot
    # exhaust the call stack.
    while True:
        for tool_call in pending or []:
            tool_name = tool_call.function.name
            tool_args = json.loads(tool_call.function.arguments)

            if tool_name == "get_next_chunk":
                print(f">> üîé Buscando los chunks siguientes de {tool_args.get('source_id')} ...")
                content = get_next_chunk_tool(tool_args.get("source_id"))
            elif tool_name == "search_legal_info":
                print(">> üîé Buscando informaci√≥n ...")
                # NOTE(review): the tool schema has no argument matching
                # `current_context_size`; None is passed until one is defined.
                content = search_legal_info_tool(
                    tool_args.get("search_query"),
                    tool_args.get("search_type"),
                    None,
                )
            else:
                # Always answer a tool call; otherwise `content` would be
                # unbound (or stale from the previous iteration).
                content = f"Unknown tool: {tool_name}"

            new_tool_message = {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": tool_name,
                "content": content,
            }
            append_message(new_message=new_tool_message)

        print(">> ü§ñ Generando respuesta ...")
        new_assistant_completion = generate_completion()
        new_assistant_message = new_assistant_completion.choices[0].message
        append_message(new_message=new_assistant_message)

        if new_assistant_message.content:
            print("üí¨ Assistant:" + new_assistant_message.content)
            return None

        pending = new_assistant_message.tool_calls
        if not pending:
            # Neither text nor tool calls: stop instead of iterating over None.
            print(
                "An error ocurred during completion generation - response: ",
                new_assistant_completion,
            )
            return None

In [25]:
def run_conversation(user_prompt=None):
    """Answer a single prompt, or start an interactive chat loop.

    Args:
        user_prompt: When truthy, it is answered once and the function ends.
            Otherwise questions are read from standard input until the user
            types ``exit`` (case-insensitive).
    """
    if user_prompt:
        get_answer(user_prompt)
        return

    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Exiting chat...")
            return
        get_answer(user_input)


def get_answer(user_input):
    """Answer one user question: retrieve documents, then ask the model.

    Steps: pick a log file for the question, embed the question, search and
    rerank documents, rebuild the global ``messages`` as
    ``[system, user, search results]``, generate a completion and either run
    the requested tools or print the answer.

    Args:
        user_input: The question typed by the user.
    """
    global messages, log_file_name

    current_time = datetime.now().strftime("%m-%d %H:%M")
    # The file name embeds the timestamp and raw user text; replace path
    # separators and characters that are invalid in file names on common
    # platforms so the log can always be created inside "logs/".
    raw_name = f"{current_time} - {user_input[:50]}"
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", raw_name)
    log_file_name = f"logs/{safe_name}.log"

    print("üôé‚Äç‚ôÇÔ∏è User: " + user_input)
    log(f"User input: {user_input}")
    new_user_message = {"role": "user", "content": user_input}
    user_input_vectors = embeddings_client.generate_embeddings(content=user_input)

    results = search_for_chunks(
        text_query=user_input,
        rerank_query=user_input,
        vector_query=user_input_vectors,
    )
    search_results_message = {
        "role": "assistant",
        "content": f"Extractos de documentos encontrados: \n{add_doc_to_context(results)}",
    }

    messages = [system_message, new_user_message, search_results_message]

    print(">> ü§ñ Generando respuesta ...")
    # generate_completion() already prints the raw response; it is not echoed again here.
    response = generate_completion()
    response_message = response.choices[0].message

    # append_message() mutates the global list in place. Its return value is
    # deliberately not assigned back, so `messages` can never be replaced by None.
    append_message(new_message=response_message)

    tool_calls = response_message.tool_calls
    if tool_calls:
        call_tools(tool_calls=tool_calls)
    elif response_message.content:
        print("üí¨ Assistant:", response_message.content)
    else:
        print("An error ocurred during completion generation - response: ", response)

In [26]:
# Reset the history and run a single question through the pipeline.
messages = []
run_conversation("¬øCu√°les son las causales gen√©ricas de atenuaci√≥n punitiva?")

üôé‚Äç‚ôÇÔ∏è User: ¬øCu√°les son las causales gen√©ricas de atenuaci√≥n punitiva?
>> üîé Buscando documentos ...
>> üßπ Docs filtrados. De 50 pasaron 15 ...
>> ‚≠êÔ∏è Aplicando reorganizaci√≥n sem√°ntica ...
1. >>>>>>>>>>>>>>>>>>>>>
{
    "id": "20240823184234csjsp1086804042001pdf_chunk46",
    "relevance": 0.99987465,
    "score": 0.026375405490398407,
    "rerank": 2.674172878265381,
    "captions": "Y en tercer lugar, las denominadas causales gen\u00e9ricas de atenuaci\u00f3n y agravaci\u00f3n punitivas, previstas respectivamente en los art\u00edculos 64 y 66 del C.P.3 La clasificaci\u00f3n no es caprichosa pues que emana del propio art\u00edculo 61 que, entre otros motivos, ordena, para dosificar la pena, tener en cuenta las circunstancias de atenuaci\u00f3n y agravaci\u00f3n en su...",
    "title": "CSJ - SP10868(04-04-2001)",
    "author": "Corte Suprema de Justicia",
    "keywords": "Penal",
    "category": "Jurisprudencia",
    "page": 38,
    "year": 2001,
    "has_copyrigh

KeyboardInterrupt: 

In [None]:
# messages = []
# run_conversation("¬øQu√© dice el art√≠culo 103 del c√≥digo penal?")

In [None]:
# messages = []
# run_conversation(
#     "Daniel Andr√©s F√∫quenes Barriga, en su condici√≥n de auxiliar de la justicia y secuestre, recaud√≥ una suma de dinero por concepto de arrendamiento, espec√≠ficamente $684,000 entre febrero y julio de 2015. Este dinero deb√≠a ser entregado a su due√±o o poseedor, pero F√∫quenes Barriga retuvo la suma para s√≠ mismo, incumpliendo con la obligaci√≥n de devolverla. Este acto de retenci√≥n y apropiaci√≥n del dinero, que se le hab√≠a confiado por un t√≠tulo no traslativo de dominio, ¬øQu√© delito cometi√≥?"
# )