In [1]:
# Worker Agent QA: handles long context efficiently
import os
import openai
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [None]:
def find_similar_faq(user_query:str, faq_list:list)->list:
    '''Return a list of FAQ sorted from the most simialr to least similar to the user query by cosine similarity.
    '''

    _embed_client = openai.OpenAI(
        api_key=os.getenv("EMBEDDING_API_KEY"),
        base_url=os.getenv("EMBEDDING_BASE_URL"),
        max_retries=5)
    
    #embed user query
    user_query_embedding = _embed_client.embeddings.create(input=user_query, model=os.getenv('EMBEDDING_MODEL_NAME'))
    user_query_embedding = np.array(user_query_embedding.data[0].embedding)
    user_query_embedding = user_query_embedding.reshape(1, -1)

    cosi_list = []
    faq_embedding_list = _embed_client.embeddings.create(input=faq_list, model=os.getenv('EMBEDDING_MODEL_NAME'))
    for i, faq_embedding in enumerate(faq_embedding_list.data):
            faq_embedding = np.array(faq_embedding.embedding)
            faq_embedding = faq_embedding.reshape(1,-1)
            similarity_score = cosine_similarity(user_query_embedding, faq_embedding)[0][0]
            cosi_list.append({"faq":faq_list[i], "sim":similarity_score})

    sorted_faqs = sorted(cosi_list, key=lambda d: d["sim"], reverse=True)
    sorted_faqs_list = [i["faq"] for i in sorted_faqs]

    return sorted_faqs_list

In [23]:
find_similar_faq(user_query, faq_list)

Embedding(embedding=[-0.045440673828125, -0.037139892578125, -0.007572174072265625, 0.02801513671875, 0.0013322830200195312, 0.005214691162109375, -0.060943603515625, 0.03546142578125, -0.024261474609375, 0.0294952392578125, 0.020172119140625, 0.0290374755859375, 0.0293121337890625, 0.036834716796875, 0.0233001708984375, -0.06817626953125, 0.0177764892578125, 0.043304443359375, -0.0168914794921875, -0.04705810546875, -0.0280914306640625, -0.0340576171875, -0.00039005279541015625, -0.062225341796875, 0.01184844970703125, 0.0059051513671875, 0.01010894775390625, -0.00926971435546875, 0.010009765625, -0.08038330078125, 0.02398681640625, 0.0125732421875, -0.050079345703125, -0.0072784423828125, -0.01812744140625, -0.0145111083984375, -3.8623809814453125e-05, -0.0343017578125, -0.046783447265625, -0.024383544921875, -0.0020885467529296875, -0.0135498046875, 0.02789306640625, -0.008880615234375, -0.0223236083984375, -0.043212890625, -0.0175628662109375, -0.0230712890625, -0.02288818359375, 0

['Where are Toyota cars manufactured?',
 'Where is Japan?',
 'What is the engine power of Toyota RAV4']

In [None]:
_embed_client = openai.OpenAI(
        api_key=os.getenv("EMBEDDING_API_KEY"),
        base_url=os.getenv("EMBEDDING_BASE_URL"),
        max_retries=5)
user_query="what is toyota?"  
faq_list = ["Where are Toyota cars manufactured?", "What is the engine power of Toyota RAV4", "Where is Japan?"]  
#embed user query
user_query_embedding = _embed_client.embeddings.create(input=user_query, model=os.getenv('EMBEDDING_MODEL_NAME'))
user_query_embedding = np.array(user_query_embedding.data[0].embedding)
user_query_embedding = user_query_embedding.reshape(1, -1)

# faq_embedding_list = []
cosi_list = []
faq_embedding_list = _embed_client.embeddings.create(input=faq_list, model=os.getenv('EMBEDDING_MODEL_NAME'))
for i, faq_embedding in enumerate(faq_embedding_list.data):
        print(faq_embedding)
        faq_embedding = np.array(faq_embedding.embedding)
        faq_embedding = faq_embedding.reshape(1,-1)
        similarity_score = cosine_similarity(user_query_embedding, faq_embedding)[0][0]
        cosi_list.append({"faq":faq_list[i], "sim":similarity_score})

sorted_faqs = sorted(cosi_list, key=lambda d: d["sim"], reverse=True)
sorted_faqs_list = [i["faq"] for i in sorted_faqs]
    

Embedding(embedding=[-0.045440673828125, -0.037139892578125, -0.007572174072265625, 0.02801513671875, 0.0013322830200195312, 0.005214691162109375, -0.060943603515625, 0.03546142578125, -0.024261474609375, 0.0294952392578125, 0.020172119140625, 0.0290374755859375, 0.0293121337890625, 0.036834716796875, 0.0233001708984375, -0.06817626953125, 0.0177764892578125, 0.043304443359375, -0.0168914794921875, -0.04705810546875, -0.0280914306640625, -0.0340576171875, -0.00039005279541015625, -0.062225341796875, 0.01184844970703125, 0.0059051513671875, 0.01010894775390625, -0.00926971435546875, 0.010009765625, -0.08038330078125, 0.02398681640625, 0.0125732421875, -0.050079345703125, -0.0072784423828125, -0.01812744140625, -0.0145111083984375, -3.8623809814453125e-05, -0.0343017578125, -0.046783447265625, -0.024383544921875, -0.0020885467529296875, -0.0135498046875, 0.02789306640625, -0.008880615234375, -0.0223236083984375, -0.043212890625, -0.0175628662109375, -0.0230712890625, -0.02288818359375, 0

In [None]:

sorted_faqs_list

['Where are Toyota cars manufactured?',
 'Where is Japan?',
 'What is the engine power of Toyota RAV4']

In [1]:
from pydantic import BaseModel

In [4]:
faq_list = ["Where are Toyota cars manufactured?", "What is the engine power of Toyota RAV4", "Where is Japan?"]  

class FAQ(BaseModel):
    user_query: str
    similar_faqs: list[str]
    def __str__(self) -> str:
        """Return a string representation of the faq list"""
        return "\n".join(
            f"faq {step}\n"
            for step in self.similar_faqs
        )


In [22]:
"\n".join(f"{i}\n"for i in faq_list)

'Where are Toyota cars manufactured?\n\nWhat is the engine power of Toyota RAV4\n\nWhere is Japan?\n'

In [5]:
FAQ.similar_faqs=faq_list

In [20]:
# print(FAQ.similar_faqs)