In [2]:
import numpy as np
from langchain_core.tools import tool
import pandas as pd
import re
import openai
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [17]:
# Load variables from a local .env file into the process environment
# (presumably the API keys needed by the OpenAI and Tavily clients — confirm).
load_dotenv()

True

In [72]:
def get_open_ai_client():
    """Create and return a new ``openai.Client`` built with default arguments.

    No configuration is passed explicitly; presumably the client picks up its
    credentials from the environment (TODO confirm).
    """
    client = openai.Client()
    return client

def get_vector_chunks(
    open_ai_client: "openai.Client",
    path: str = './knowledge-base/swiss_faq.md',
    model: str = "text-embedding-3-small",
) -> tuple:
    """Split a markdown document into sections and embed each section.

    Args:
        open_ai_client: Client object exposing ``embeddings.create``.
        path: Markdown file to read. Defaults to
            ``./knowledge-base/swiss_faq.md``.
        model: Embedding model name forwarded to ``embeddings.create``.

    Returns:
        A ``(docs, vectors)`` tuple. ``docs`` is a list of
        ``{"page_content": <section text>}`` dicts and ``vectors`` is the list
        of embeddings returned for them, in the same order. Both lists are
        empty when the file contains no non-blank section, in which case the
        embeddings endpoint is not called.
    """
    # The context manager closes the file handle even if reading fails.
    # The encoding is pinned so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as document:
        content = document.read()

    # Zero-width split just before every "\n##" so each heading stays attached
    # to its own section.
    sections = re.split(r"(?=\n##)", content)

    # The split yields an empty leading piece when the file starts with "\n##"
    # (and a single "" for an empty file). Blank pieces carry no information
    # and are presumably rejected by the embeddings endpoint, so drop them.
    docs = [{"page_content": txt} for txt in sections if txt.strip()]

    if not docs:
        return ([], [])

    # One batched request for all sections.
    embeddings = open_ai_client.embeddings.create(
        model=model,
        input=[doc['page_content'] for doc in docs],
    )

    return (docs, [emb.embedding for emb in embeddings.data])

def get_relevant_docs(open_ai_client, vector_chunks, query, docs, k=5):
    """Return the ``k`` chunks whose embeddings score highest against ``query``.

    Args:
        open_ai_client: Client object exposing ``embeddings.create``.
        vector_chunks: Sequence of chunk embeddings, one per entry of ``docs``.
        query: Text to embed and compare against the chunks.
        docs: Chunk dicts, index-aligned with ``vector_chunks``.
        k: Maximum number of chunks to return (default 5). Clamped to the
            number of available chunks.

    Returns:
        A list of copies of the selected ``docs`` entries, each with an added
        ``"similarity"`` key, ordered from highest to lowest score. Empty when
        there are no chunks or ``k`` is not positive.
    """
    # Nothing to rank: skip the embedding request entirely.
    if len(vector_chunks) == 0 or k <= 0:
        return []

    # Create embedding for the query
    embed = open_ai_client.embeddings.create(
        model="text-embedding-3-small", input=[query]
    )
    query_vector = np.asarray(embed.data[0].embedding)

    # Similarity is a plain dot product; it equals cosine similarity only if
    # the embeddings are unit-length (presumably true for this model — confirm).
    scores = query_vector @ np.asarray(vector_chunks).T

    # argpartition raises ValueError when k exceeds the number of scores, so
    # never ask for more chunks than exist.
    k = min(k, scores.shape[0])

    # Partition to find the k best in linear time, then sort only those k.
    top_k_idx = np.argpartition(scores, -k)[-k:]
    top_k_idx_sorted = top_k_idx[np.argsort(-scores[top_k_idx])]

    return [
        {**docs[idx], "similarity": scores[idx]} for idx in top_k_idx_sorted
    ]

def answer_query(query: str, filtered_relevant_docs: list, open_ai_client=None) -> dict:
    """Answer a query using the supplied context passages.

    Args:
        query: The user's question.
        filtered_relevant_docs: Context passages as strings; they are joined
            with blank lines and sent to the chat model alongside the question.
        open_ai_client: Optional client exposing ``chat.completions.create``.
            When omitted, a new ``openai.Client()`` is created for the call.

    Returns:
        A dict with keys ``"success"`` (bool) and ``"content"`` (str). When no
        passages are supplied, ``success`` is ``False`` and no model call is
        made; otherwise ``content`` is the model's reply with surrounding
        whitespace stripped.
    """
    # An empty list and None are both falsy, so one check covers both cases.
    if not filtered_relevant_docs:
        return {
            "success": False,
            "content": "No relevant information found."
        }

    # Combine the content of the top documents
    content = "\n\n".join(filtered_relevant_docs)

    client = open_ai_client if open_ai_client is not None else openai.Client()
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Context: {content} \n\n Question: {query}"}
        ]
    )

    return {
        "success": True,
        "content": response.choices[0].message.content.strip()
    }

In [69]:
from langchain_tavily import TavilySearch
import json

def perform_web_search(query):
    """Run a Tavily web search for ``query`` and return its ``"answer"`` field.

    The search tool is invoked with a hand-built tool-call payload; the
    returned message's ``content`` is parsed as JSON and the value stored
    under ``"answer"`` is returned.
    """
    search_tool = TavilySearch(max_results=5, topic="general", include_answer=True)

    # Hand-written payload in the tool-call shape that ``invoke`` is given here.
    tool_call = {
        "args": {"query": query},
        "id": "1",
        "name": "tavily",
        "type": "tool_call",
    }

    tool_message = search_tool.invoke(tool_call)
    payload = json.loads(tool_message.content)
    return payload["answer"]

In [73]:
# One shared client for the embedding calls made in this cell.
open_ai_client = get_open_ai_client()

# Embed every section of the knowledge base once; `docs` and `vector_chunks`
# are index-aligned lists.
docs, vector_chunks = get_vector_chunks(open_ai_client)

# Alternative query kept for manual testing (presumably an off-topic question
# meant to trigger the web-search fallback — confirm).
# query = "How to make a Italian Pizza?"
query = "Should I reconfirm my flight?"

# Candidate chunks for the query, each annotated with a "similarity" score.
relevant_docs = get_relevant_docs(open_ai_client, vector_chunks, query, docs)

In [74]:
# Minimum similarity a retrieved chunk must exceed to be used as context.
score_threshold = 0.30

# Print the raw scores so the threshold can be checked against them.
similarity_scores = [entry["similarity"] for entry in relevant_docs]
print(similarity_scores)

# Keep only the text of the chunks that clear the threshold.
filtered_relevant_docs = [
    entry["page_content"]
    for entry in relevant_docs
    if entry["similarity"] > score_threshold
]

if not filtered_relevant_docs:
    # Nothing in the knowledge base is close enough: fall back to the web.
    print("No relevant documents found with sufficient similarity score.")
    print("Performing a Web Search")
    search_results = perform_web_search(query)
    print(search_results)
else:
    # Answer from the knowledge-base passages.
    result = answer_query(query, filtered_relevant_docs)
    print(result)

[np.float64(0.43296246682792716), np.float64(0.399930448665162), np.float64(0.35440156372272646), np.float64(0.335730996429779), np.float64(0.32014820404430455)]
{'success': True, 'content': 'No, reconfirmation of SWISS flights is not required.'}
