In [1]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader,PyPDFLoader, DirectoryLoader
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings

import re
def remove_metadata(text):
    """Return ``text`` with every ``metadata={...}`` fragment removed.

    A fragment runs from ``metadata={`` to the nearest following ``}``
    (newlines included), so a fragment that contains nested braces is only
    removed up to its first closing brace. Leading and trailing whitespace
    is trimmed from the result.
    """
    metadata_pattern = re.compile(r"metadata=\{.*?\}", re.DOTALL)
    return metadata_pattern.sub("", text).strip()
# Load the documents under rag_data/text, reading each file with TextLoader.
# To ingest PDFs instead, pass glob="**/*.pdf" (all PDF files in the directory
# and its subdirectories) together with loader_cls=PyPDFLoader.
loader = DirectoryLoader("rag_data/text", loader_cls=TextLoader)
documents = loader.load()

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking settings: 600-character chunks (reduced from 1000) with a
# 150-character overlap (reduced from 200; 25% of the chunk size).
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",  # priority 1: paragraph break
        "\n",  # priority 2: line break
        ".",  # priority 3: end of sentence
        "!",
        "?",
        " ",  # priority 4: word boundary
        "",  # fallback
    ],
    chunk_size=600,
    chunk_overlap=150,
    length_function=len,
)

# Split the loaded documents into small chunks.
texts = text_splitter.split_documents(documents)

# Load the sentence-transformers model that is wrapped below to embed the
# chunks and the queries.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm the repository is trusted.
model = SentenceTransformer('dangvantuan/vietnamese-document-embedding', trust_remote_code=True)

# Create a custom embeddings class
class VietnameseEmbeddings(Embeddings):
    """Adapter that exposes an encoder through LangChain's ``Embeddings`` base class.

    Both methods call the wrapped model's ``encode`` with
    ``normalize_embeddings=True`` and convert the result with ``.tolist()``.
    """

    def __init__(self, model):
        """Keep a reference to the encoder that produces the embeddings."""
        self.model = model

    def embed_documents(self, texts):
        """Embed a list of documents."""
        return self.model.encode(texts, normalize_embeddings=True).tolist()

    def embed_query(self, text):
        """Embed a query."""
        return self.model.encode(text, normalize_embeddings=True).tolist()

# Create embeddings instance
embeddings = VietnameseEmbeddings(model)

# Build the FAISS vector store from the chunks and save it to "faiss_index".
# NOTE(review): rag_pipeline converts the score returned by this store with
# `1 - score`. Depending on the installed LangChain version, any
# distance_strategy other than MAX_INNER_PRODUCT may still build an L2 index,
# in which case that conversion is not a true cosine similarity — confirm
# against the installed FAISS wrapper before re-tuning the threshold.
db = FAISS.from_documents(
    texts,
    embeddings,
    distance_strategy="COSINE"  # requested distance strategy (see note above)
)
db.save_local("faiss_index")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import ollama

def generate_response(prompt, model_name='llama3.1:8b-instruct-q4_K_M'):
    """Send ``prompt`` to an Ollama chat model and return the reply text.

    The request consists of a fixed system message followed by ``prompt`` as
    the user message.

    Args:
        prompt: Text sent as the user message.
        model_name: Name of the Ollama model to query. Defaults to the model
            this function previously had hard-coded, so existing callers are
            unaffected.

    Returns:
        The ``["message"]["content"]`` entry of the response returned by
        ``ollama.chat``.
    """
    messages = [
        {"role": "system", "content": "D·ª±a tr√™n th√¥ng tin sau ƒë√¢y, h√£y tr·∫£ l·ªùi c√¢u h·ªèi m·ªôt c√°ch ch√≠nh x√°c v√† ng·∫Øn g·ªçn. N·∫øu th√¥ng tin kh√¥ng ƒë·ªß ƒë·ªÉ tr·∫£ l·ªùi, h√£y n√≥i r√µ ƒëi·ªÅu ƒë√≥."},
        {"role": "user", "content": prompt}
    ]
    # Report the prompt size (in characters) before the model call.
    print(len(prompt))
    response = ollama.chat(
        model=model_name,
        messages=messages,
    )
    return response["message"]["content"]



In [5]:
import time
import sys
from pg_query import process_query

def rag_pipeline(query, k=3, similarity_threshold=0.5):
    """Answer ``query`` from the FAISS index, falling back to PostgreSQL.

    Steps:
      1. Fetch the ``k`` nearest chunks from ``db`` together with their scores.
      2. Keep the chunks whose similarity (``1 - score``) exceeds
         ``similarity_threshold`` and strip ``metadata={...}`` fragments from
         their text.
      3. If nothing is kept, return the result of ``process_query(query)``
         when it is usable, otherwise a fixed "no data" message.
      4. Otherwise build a prompt from the kept chunks and return the answer
         produced by ``generate_response``.

    Progress, timings and the full prompt are written to standard output.

    Args:
        query: The user's question.
        k: Number of chunks requested from the vector store (default 3).
        similarity_threshold: Minimum similarity, exclusive, a chunk needs in
            order to be used as context (default 0.5).

    Returns:
        The generated answer, the PostgreSQL result, or the fixed "no data"
        message.
    """
    no_data_message = "‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu ph√π h·ª£p ƒë·ªÉ tr·∫£ l·ªùi!"

    start_time = time.time()

    # Get documents with scores
    docs_and_scores = db.similarity_search_with_score(query, k=k)

    relevant_texts = []
    similarities = []

    for doc, score in docs_and_scores:
        # The score is treated as a distance (lower is better).
        # NOTE(review): if the index stores squared L2 distances of unit
        # vectors, `1 - score` equals 2*cos - 1 rather than the cosine itself;
        # the threshold was tuned against this formula, so it is kept as is —
        # confirm against the installed FAISS wrapper.
        similarity = 1 - score
        print(similarity)
        similarities.append(similarity)

        if similarity > similarity_threshold:
            # Clean a copy of the text instead of assigning back to
            # doc.page_content, so the Document objects handed out by the
            # vector store are left untouched between calls.
            relevant_texts.append(remove_metadata(doc.page_content))

    # Report the real maximum even when every similarity is negative;
    # 0 is only used when the search returned nothing.
    max_similarity = max(similarities, default=0)

    print(f"üîç FAISS search m·∫•t: {time.time() - start_time:.2f} gi√¢y")
    print(f"Found {len(relevant_texts)} relevant documents")
    print(f"Max similarity score: {max_similarity:.4f}")

    # If no documents meet the threshold, try PostgreSQL
    if not relevant_texts:
        print("No documents meet similarity threshold, querying PostgreSQL...")
        try:
            postgres_result = process_query(query)
            if postgres_result and "Kh√¥ng t√¨m th·∫•y k·∫øt qu·∫£" not in postgres_result:
                print("Found results in PostgreSQL")
                return postgres_result  # Return PostgreSQL results directly
            return no_data_message
        except Exception as e:
            # Best-effort fallback: report the error and answer with the
            # generic message instead of failing the whole pipeline.
            print(f"Error querying PostgreSQL: {str(e)}")
            return no_data_message

    # Create context from the cleaned chunk texts
    context = "\n\n".join(relevant_texts)

    # Prompt template; the indentation inside the literal is part of the
    # text that is sent to the model.
    prompt_template = """D·ª±a tr√™n th√¥ng tin sau ƒë√¢y, h√£y tr·∫£ l·ªùi c√¢u h·ªèi m·ªôt c√°ch ch√≠nh x√°c v√† ng·∫Øn g·ªçn.
    N·∫øu th√¥ng tin kh√¥ng ƒë·ªß ƒë·ªÉ tr·∫£ l·ªùi, h√£y n√≥i r√µ ƒëi·ªÅu ƒë√≥.

    Th√¥ng tin tham kh·∫£o:
    {context}

    C√¢u h·ªèi: {query}

    Tr·∫£ l·ªùi:"""

    prompt = prompt_template.format(context=context, query=query)

    print(f"Context length: {len(context)} characters")
    sys.stdout.write(prompt)

    start_time = time.time()
    response = generate_response(prompt)
    print(f"ü§ñ AI generate m·∫•t: {time.time() - start_time:.2f} gi√¢y")

    return response

# Example usage
# The commented-out assignments below are alternative sample queries;
# uncomment one of them (and comment out the active one) to try it.
# query = "Nh·ªØng ƒëi·ªÉm du l·ªãch n·ªïi b·∫≠t ·ªü ƒê√† N·∫µng?"
# query = "Festval ph·ªü 2025 di·ªÖn ra khi n√†o?"
# query = "Doanh thu c·ªßa Du l·ªãch Hu·∫ø trong qu√Ω I nƒÉm 2025?"
query = "Ph√π ƒëi√™u Kala N√∫i B√† l√† g√¨?"

# query = "Cho t√¥i bi·∫øt th√¥ng tin c·ªßa m·ªôt s·ªë h∆∞·ªõng d·∫´n vi√™n t·∫°i ƒëi·ªÉm du l·ªãch H·ªôi An"
# query = "Cho t√¥i bi·∫øt th√¥ng tin c·ªßa m·ªôt s·ªë h∆∞·ªõng d·∫´n vi√™n ·ªü c√≥ n∆°i c·∫•p th·∫ª ·ªü H·ªì Ch√≠ Minh"
# query = "Cho t√¥i bi·∫øt th√¥ng tin c·ªßa m·ªôt s·ªë n∆°i l∆∞u tr√∫ ·ªü H√† N·ªôi"


# Run the pipeline on the selected query and print the returned answer.
print(rag_pipeline(query))



0.6390332579612732
0.5552563369274139
0.32902228832244873
üîç FAISS search m·∫•t: 0.22 gi√¢y
Found 2 relevant documents
Max similarity score: 0.6390
Context length: 829 characters
D·ª±a tr√™n th√¥ng tin sau ƒë√¢y, h√£y tr·∫£ l·ªùi c√¢u h·ªèi m·ªôt c√°ch ch√≠nh x√°c v√† ng·∫Øn g·ªçn.
    N·∫øu th√¥ng tin kh√¥ng ƒë·ªß ƒë·ªÉ tr·∫£ l·ªùi, h√£y n√≥i r√µ ƒëi·ªÅu ƒë√≥.

    Th√¥ng tin tham kh·∫£o:
    Ph√π ƒëi√™u Kala N√∫i B√† l√† t√°c ph·∫©m ƒëi√™u kh·∫Øc ƒë√° Champa v·ªõi ch·∫•t li·ªáu ƒë√° t√∫p Riolit ƒêaxit, cao 60cm, ƒë·∫ø r·ªông 44cm, d√†y 17cm, tr·ªçng l∆∞·ª£ng 105,5kg.

Ph√π ƒëi√™u c√πng v·ªõi c√°c hi·ªán v·∫≠t kh√°c g·∫Øn li·ªÅn v·ªõi ki·∫øn tr√∫c ƒë·ªÅn th√°p n√†y gi√∫p cho c√°c nh√† khoa h·ªçc x√°c ƒë·ªãnh ƒë∆∞·ª£c c√°c gi√° tr·ªã cƒÉn b·∫£n c·ªßa di v·∫≠t n√†y nh∆∞ t√≠nh nguy√™n g·ªëc, ni√™n ƒë·∫°i c·ªßa di t√≠ch v√† di v·∫≠t c≈©ng nh∆∞ x√°c ƒë·ªãnh ƒë∆∞·ª£c quy m√¥, t·∫ßm v√≥c c·ªßa di t√≠ch N√∫i B√†, trong ph·ª©c h·ª£p c√°c di t√≠ch ChƒÉm ·ªü Ph√∫ Y√™n n√≥i ri√™ng v√† di t√≠ch 