In [1]:
import openai, random, os
from qdrant_client import QdrantClient
from qna import responding_openai
from datetime import datetime
from dotenv import load_dotenv

# Load environment variables (OPENAI_API_KEY, QDRANT_API_KEY, ...) from a local .env file.
# By default load_dotenv() does not override variables that are already set in the process.
load_dotenv()

# Initialize OpenAI API
openai.api_key = os.environ['OPENAI_API_KEY']

# Qdrant connection settings for the CONGDOAN collection.
# Non-secret settings get a default here but can be overridden from the environment / .env file.
os.environ.setdefault('QDRANT_URL', "https://bd26be9e-256b-4c84-85b3-2588bfdd284e.us-east-1-0.aws.cloud.qdrant.io:6333")
os.environ.setdefault('QDRANT_COLLECTION_NAME', 'context')

# SECURITY: the API key must come from the environment / .env file and must never be
# written into the notebook. Any key that has appeared in notebook source should be rotated.
if not os.environ.get('QDRANT_API_KEY'):
    raise RuntimeError(
        "QDRANT_API_KEY is not set. Add it to your environment or .env file before running this notebook."
    )

# Shared client used by the search helpers defined later in the notebook
qdrant_client = QdrantClient(url=os.environ['QDRANT_URL'], api_key=os.environ['QDRANT_API_KEY'])


### RAG (Conventional)

In [2]:
# Embed a text into a vector with the OpenAI embeddings API
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data`` list.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']
# Search the Qdrant collection for documents close to a query
def search_qdrant(query, top_k=3):
    """Embed ``query`` and search the Qdrant collection with the resulting vector.

    The collection name is read from the ``QDRANT_COLLECTION_NAME`` environment
    variable and the request goes through the module-level ``qdrant_client``.

    Args:
        query: Text to search for; it is embedded with ``get_embedding``.
        top_k: Value passed to Qdrant as ``limit``.

    Returns:
        A ``(search_result, score_dict)`` tuple: the raw value returned by
        ``qdrant_client.search`` and a dict mapping each hit's
        ``payload['page_content']`` to its ``score``. Hits that share the same
        ``page_content`` collapse into a single entry (the last one wins).
    """
    collection = os.environ['QDRANT_COLLECTION_NAME']
    query_vector = get_embedding(query)
    hits = qdrant_client.search(
        collection_name=collection,
        query_vector=query_vector,
        limit=top_k,
    )

    score_dict = {}
    for hit in hits:
        score_dict[hit.payload['page_content']] = hit.score
    return hits, score_dict

In [None]:
# The question to answer (used unchanged as the search query)
query = "Nêu các nhiệm vụ, quyền hạn của công đoàn cơ sở?"
# Retrieve the matching documents and their scores from Qdrant
unranked_search_result, unranked_score_dict = search_qdrant(query)
# Join the retrieved passages, in retrieval order, into one context string
retrieved_passages = [hit.payload['page_content'] for hit in unranked_search_result]
search_info = "\n\n".join(retrieved_passages)
# Generate response with OpenAI (responding_openai comes from the local `qna` module)
results, chatgpt_response_time = responding_openai(query, search_info)
print(results)

### RAG Fusion
RAG Fusion adds the following steps to the conventional pipeline:
- Generate several simpler sub-queries from the original query
- Run a semantic (vector) search for each sub-query
- Re-rank the combined results with RRF (Reciprocal Rank Fusion)

In [None]:
# Function to generate queries using OpenAI's ChatGPT
def generate_queries_chatgpt(original_query, num_queries=4):
    """Ask ChatGPT for several Vietnamese search queries related to ``original_query``.

    Args:
        original_query: The user's original question.
        num_queries: Number of queries requested in the prompt. This is only a
            request to the model; the number of lines returned is not enforced.

    Returns:
        A list of non-empty strings, one per non-blank line of the model's reply,
        with surrounding whitespace removed. Any list numbering the model adds
        (e.g. ``"1. "``) is left untouched.
    """
    system_role = "You are a helpful assistant that generates multiple search queries based on a single input query."
    user_prompt = f"Generate multiple search queries in Vietnamese related to: {original_query}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_role},
            {"role": "user", "content": user_prompt},
            {"role": "user", "content": f"OUTPUT ({num_queries} Vietnamese queries):"}
        ]
    )
    reply_text = response.choices[0]["message"]["content"]
    # The model often separates queries with blank lines; drop those so that no
    # empty string is later embedded and searched as if it were a query.
    generated_queries = [line.strip() for line in reply_text.splitlines() if line.strip()]
    return generated_queries

# Reciprocal Rank Fusion algorithm
def reciprocal_rank_fusion(search_results_dict, k=60):
    """Fuse several per-query rankings into one ranking with Reciprocal Rank Fusion.

    For every query, its documents are ordered by score (highest first) and each
    document receives ``1 / (rank + k)``, where ``rank`` is the 0-based position
    in that ordering. Contributions are summed per document across all queries.

    The per-query inputs and the final ranking are printed to stdout.

    Args:
        search_results_dict: Mapping ``query -> {document: score}``.
        k: Constant added to the rank in the denominator.

    Returns:
        A dict ``{document: fused_score}`` whose insertion order goes from the
        highest fused score to the lowest.
    """
    print("Initial individual search result ranks:")
    for query, doc_scores in search_results_dict.items():
        print(f"For query '{query}': {doc_scores}")

    fused_scores = {}
    for doc_scores in search_results_dict.values():
        ranked_docs = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)
        for rank, (doc, _score) in enumerate(ranked_docs):
            fused_scores[doc] = fused_scores.get(doc, 0) + 1 / (rank + k)

    reranked_results = dict(sorted(fused_scores.items(), key=lambda item: item[1], reverse=True))
    separator = "-----------------------------------"
    print(separator)
    print("Final reranked results:", reranked_results)
    print(separator)
    return reranked_results

# Placeholder that stands in for a real generative step
def generate_output(reranked_results, queries):
    """Return a summary string naming the queries and the re-ranked document keys.

    Args:
        reranked_results: Mapping whose keys are the re-ranked documents.
        queries: The queries to mention in the summary.

    Returns:
        A formatted string containing ``queries`` and the list of keys of
        ``reranked_results``.
    """
    ranked_docs = list(reranked_results.keys())
    return f"Final output based on {queries} and reranked documents: {ranked_docs}"

In [None]:
# Original query
original_query = "Nêu các nhiệm vụ, quyền hạn của công đoàn cơ sở?"
# Number of documents to retrieve per sub-query and to keep after re-ranking
top_k = 3
# Generate sub-questions from original query
generated_queries = generate_queries_chatgpt(original_query)
# Search Qdrant for each generated query.
# The loop variable is deliberately not named `query`, so it does not overwrite the
# `query` global defined by the conventional-RAG cell.
all_results = {}
for sub_query in generated_queries:
    _hits, score_dict = search_qdrant(sub_query, top_k=top_k)
    all_results[sub_query] = score_dict
# Rerank the results
reranked_results = reciprocal_rank_fusion(all_results)
# Convert reranked info to a string. Slicing (instead of indexing 0..top_k-1) keeps this
# working when fewer than top_k distinct documents were retrieved.
reranked_context = list(reranked_results.keys())
search_info_rerank = "\n\n".join(reranked_context[:top_k])
# Generate response with OpenAI
results, chatgpt_response_time = responding_openai(original_query, search_info_rerank)
print(results)