# 1. Loading

## Import libraries and settings

In [None]:
from app.utils.llm.helpers import get_openai_api_key

# Get the OpenAI API key through the project helper, so the secret is never written in the notebook itself
openai_api_key = get_openai_api_key()

In [None]:
from llama_index.core import Settings
from app.utils.llm.helpers import init_llm_configurations
from app.settings import Constants

# Configure the LLM and the embedding model from the project-level constants.
# NOTE(review): presumably this populates the global llama_index `Settings` used by later cells — confirm in the helper.
init_llm_configurations(llm_model=Constants.LLM_MODEL, embedding_model=Constants.EMBEDDING_MODEL)

# Display the global settings object so the active configuration can be inspected
Settings

In [None]:
import nest_asyncio

# Apply nest_asyncio to allow asyncio in Jupyter Notebook.
# Must run before any cell that drives async llama_index code.
nest_asyncio.apply()

## Read the PDF file

### Behavior

Assume that the PDF file already exists.

- Use markitdown to convert pdf to markdown
- Create a Document object of LlamaIndex from the markdown file
- Create an Ingestion Pipeline (cache enabled) and ingest the Document to Node objects
- Save those nodes to the storage context including docstore, vectorstore, and index store

### Chunking method

- Start with the source document, which is written in Vietnamese
- Translate the document to English
- Use [semantic splitter](https://docs.llamaindex.ai/en/stable/examples/node_parsers/semantic_chunking/) to split the document into chunks
  - Threshold: 85
  - Buffer size: 3
  - Why?

References:
- https://youtu.be/8OJC21T2SL4?t=1933

Other methods:
- [Semantic Double Merging Chunking](https://docs.llamaindex.ai/en/stable/examples/node_parsers/semantic_double_merging_chunking/)

### Loading

In [None]:
from app.integrations.llama_index.ingestion_pipelines.readers import MarkitdownReader

# Location of the source PDF, relative to the notebook's working directory
filepath = "data/NQLD01.pdf"

# Reader that turns the PDF into LlamaIndex documents
markitdown_reader = MarkitdownReader()
documents = markitdown_reader.load_data(filepath)

# Show the metadata attached to the first loaded document
documents[0].metadata

### Translation from Vietnamese to English

In [None]:
from app.integrations.llama_index.ingestion_pipelines.translators import Translator

# Translator configured for Vietnamese -> English
translator = Translator.from_defaults(
    source_language="vietnamese",
    target_language="english",
)

# Translate every loaded document, with a progress bar
translated_documents = translator.get_translated_documents(
    documents,
    show_progress=True,
)

translated_documents

### Node splitting

In [None]:
from llama_index.core.node_parser import SemanticSplitterNodeParser

# Text splitter: semantic chunking driven by the configured embedding model.
# Threshold 85 and buffer size 3 are the values chosen in the "Chunking method" section.
semantic_splitter = SemanticSplitterNodeParser.from_defaults(
    buffer_size=3,
    breakpoint_percentile_threshold=85,
    embed_model=Settings.embed_model,
)

### Metadata extraction

In [None]:
"""
Metadata key: issue_date, is_outdated
Input from user input on frontend.
Purpose: To filter out outdated documents.
"""
# Issue date is stored exactly as typed (expected format: YYYY-MM-DD).
issue_data = input("Enter the issue date of the document (YYYY-MM-DD): ")

# bool() of any non-empty string is True, so answering "False" used to mark the
# document as outdated. Parse the answer explicitly; anything that is not an
# affirmative value (including an empty answer) means "not outdated".
is_outdated = input("Is the document outdated? (True/False): ").strip().lower() in {
    "true",
    "t",
    "yes",
    "y",
    "1",
}

# Attach both values to every translated document so they propagate to its nodes.
for document in translated_documents:
    document.metadata["issue_date"] = issue_data
    document.metadata["is_outdated"] = is_outdated

In [None]:
"""
Metadata key: excerpt_keywords
Extract keywords from the text
Purpose: Can be used to do topic/tag or keyword-based search (metadata filter).
"""
from llama_index.core.extractors import KeywordExtractor

# Keyword extractor backed by the configured LLM, limited to 3 keywords
keyword_extractor = KeywordExtractor(llm=Settings.llm, keywords=3)

In [None]:
"""
"""
from llama_index.core.extractors import SummaryExtractor

# Summary extractor backed by the configured LLM; requests summaries for the
# previous ("prev"), current ("self") and next ("next") node.
# NOTE(review): presumably used to give each chunk surrounding context at query time — confirm intended purpose.
summary_extractor = SummaryExtractor(summaries=["prev", "self", "next"], llm=Settings.llm)

### Ingestion pipeline

In [None]:
from llama_index.core.ingestion import IngestionPipeline

# Transformations, applied in order: chunking -> keywords -> summaries -> embeddings
# TODO: add a cleaner to cleanup llm's generated text, e.g. here's a summary of the document: ...
transformations = [
    semantic_splitter,
    keyword_extractor,
    summary_extractor,
    Settings.embed_model,
]

# Build the ingestion pipeline from that ordered list
pipeline = IngestionPipeline(transformations=transformations)

In [None]:
# Run the pipeline asynchronously over the translated documents.
# Top-level `await` is valid here only because this is a notebook cell.
nodes = await pipeline.arun(documents=translated_documents, show_progress=True)
nodes

### Check content

In [None]:
print(f"Total nodes: {len(nodes)}")
for node in nodes:
    # Header line carrying the node id, then the chunk text on the next line
    print(f"================== {node.id_} ========================", node.text, sep="\n")

### Check metadata

In [None]:
print(f"Total nodes: {len(nodes)}")
for node in nodes:
    # Header line carrying the node id, then the metadata dict on the next line
    print(f"================== {node.id_} ========================", node.metadata, sep="\n")

### Check embedding

In [None]:
print(f"Total nodes: {len(nodes)}")
for node in nodes:
    embedding = node.embedding
    print(f"================== {node.id_} ========================")
    print(f"Dimensions: {len(embedding)}")
    # Only the first 50 components, to keep the output readable
    print(embedding[:50])

# 2. Indexing

In [None]:
# Test the indexing
from llama_index.core import VectorStoreIndex
from llama_index.core.response.notebook_utils import display_source_node, display_response

# Build a vector index directly from the ingested nodes
index = VectorStoreIndex(nodes=nodes)

# Query engine that retrieves the 3 most similar nodes per question
query_engine = index.as_query_engine(similarity_top_k=3)

In [None]:
question = "Company name?"

# Ask the question, then render the answer with its source nodes and their metadata
response = query_engine.query(question)
display_response(
    response,
    show_source=True,
    show_metadata=True,
)

# 3. Storing

# 4. Querying

# 5. Evaluation

# Trash

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams

# Connect to Qdrant (running locally or in the cloud)
qdrant_client = QdrantClient(url="http://localhost:6333")
collection_name = "documents_collection"

# Check the connection and list the collections that already exist
collections = qdrant_client.get_collections()
print("Current collections:", collections)

# Create the collection only if it does not exist yet
if collection_name not in [collection.name for collection in collections.collections]:
    # Vector size 1536 with cosine distance (the size must match the embedding model).
    # `on_disk` is an option of VectorParams; create_collection() does not take it and
    # rejects unknown keyword arguments, so it is set here instead.
    vector_params = VectorParams(size=1536, distance="Cosine", on_disk=True)
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_params,
    )
    print(f"Collection '{collection_name}' đã được tạo.")
else:
    print(f"Collection '{collection_name}' đã tồn tại.")

In [None]:
import json
def save_to_qdrant(nodes, collection_name="documents_collection", embed_model=None):
    """Embed each node's text and upsert it into a Qdrant collection.

    Args:
        nodes: Iterable of nodes exposing `id_`, `text` and `metadata`.
        collection_name: Target collection; defaults to the one used in this notebook.
        embed_model: Embedding model to use; defaults to `Settings.embed_model`.

    A failure while upserting one node is printed and does not stop the loop.
    """
    # Imported locally so the function does not depend on another cell having run first.
    from qdrant_client.models import PointStruct

    # The previous version read an undefined global `embed_model`, which raised
    # NameError on a fresh kernel; fall back to the configured model instead.
    model = Settings.embed_model if embed_model is None else embed_model

    for node in nodes:
        # The node metadata becomes the point payload (copied into a plain dict)
        payload = dict(node.metadata)

        # Create the embedding from the node text through the public API
        embedding = model.get_text_embedding(node.text)

        try:
            qdrant_client.upsert(
                collection_name=collection_name,
                points=[
                    PointStruct(
                        id=node.id_,
                        vector=embedding,
                        payload=payload,
                    )
                ],
            )
            print("Dữ liệu đã được lưu vào Qdrant!")
        except Exception as e:
            print(f"Error saving data to Qdrant: {e}")

In [None]:
from qdrant_client import QdrantClient

# Check whether the collection exists
try:
    # Fetch the collection info; an exception here means the lookup failed
    collection_info = qdrant_client.get_collection(collection_name)
    print(f"Collection '{collection_name}' đã được kết nối thành công.")
except Exception as e:
    # Report the failure instead of raising, so the remaining cells can still be run
    print(f"Lỗi khi kết nối với collection: {str(e)}")


In [None]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import StorageContext

# Wrap the Qdrant collection as a LlamaIndex vector store
vector_store = QdrantVectorStore(
    collection_name="documents_collection",  # change the collection name here
    client=qdrant_client,
)

# Storage context backed by that vector store
storage_context = StorageContext.from_defaults(vector_store=vector_store)
print(storage_context)

# indexing & chunking & pipeline

In [None]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import QuestionsAnsweredExtractor, KeywordExtractor
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core import Settings
from qdrant_client import QdrantClient
import numpy as np
from datetime import datetime
import nest_asyncio

nest_asyncio.apply()

# Create the vector store backed by Qdrant
from llama_index.vector_stores.qdrant import QdrantVectorStore
vector_store = QdrantVectorStore(client=qdrant_client, collection_name="documents_collection")

# Configure the metadata extractors and the node parser
extractors = [
    QuestionsAnsweredExtractor(llm=Settings.llm, questions=1),
    KeywordExtractor(llm=Settings.llm, keywords=5),
]

splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=Settings.embed_model
)

# Transformations, applied in order. The embedding model has to be the last step:
# the pipeline only writes nodes that carry an embedding to `vector_store`, so
# without it the attached vector store silently received nothing.
transformations = [splitter, *extractors, Settings.embed_model]

# Initialize the ingestion pipeline, connected to the Qdrant vector store
pipeline = IngestionPipeline(
    transformations=transformations,
    vector_store=vector_store
)

# Run the pipeline over the loaded documents
nodes = pipeline.run(documents=documents, show_progress=True, batch_size=64)

In [None]:
# Pretty-print the nodes
import json
print("Nodes:")
# Only the first node is shown, to keep the output short
for node in nodes[:1]:
    print(node)

    # Its metadata, rendered as indented JSON
    print(json.dumps(node.metadata, indent=2))

In [None]:
from qdrant_client.models import PointStruct

In [None]:
import json
def save_to_qdrant(nodes, collection_name="documents_collection", embed_model=None):
    """Embed each node's text and upsert it into a Qdrant collection.

    Args:
        nodes: Iterable of nodes exposing `id_`, `text` and `metadata`.
        collection_name: Target collection; defaults to the one used in this notebook.
        embed_model: Embedding model to use; defaults to `Settings.embed_model`.

    A failure while upserting one node is printed and does not stop the loop.
    """
    # Imported locally so the function does not depend on another cell having run first.
    from qdrant_client.models import PointStruct

    # The previous version read an undefined global `embed_model`, which raised
    # NameError on a fresh kernel; fall back to the configured model instead.
    model = Settings.embed_model if embed_model is None else embed_model

    for node in nodes:
        # The node metadata becomes the point payload (copied into a plain dict)
        payload = dict(node.metadata)

        # Create the embedding from the node text through the public API
        embedding = model.get_text_embedding(node.text)

        try:
            qdrant_client.upsert(
                collection_name=collection_name,
                points=[
                    PointStruct(
                        id=node.id_,
                        vector=embedding,
                        payload=payload,
                    )
                ],
            )
            print("Dữ liệu đã được lưu vào Qdrant!")
        except Exception as e:
            print(f"Error saving data to Qdrant: {e}")
# Reset the target collection before the nodes are saved: drop it together with
# every point it holds, then create it again empty. This is the same sequence the
# deprecated `recreate_collection` performed.
# WARNING: destructive — all existing points in the collection are lost.
qdrant_client.delete_collection(collection_name="documents_collection")
qdrant_client.create_collection(
    collection_name="documents_collection",
    vectors_config=VectorParams(size=1536, distance="Cosine"),
)

In [None]:
save_to_qdrant(nodes)

In [None]:
# Collection-level information for the collection created earlier
collection_info = qdrant_client.get_collection(collection_name=collection_name)
print(collection_info)

# First page of at most 10 points, payloads included
vectors = qdrant_client.scroll(
    collection_name="documents_collection",
    with_payload=True,
    limit=10,
)

vectors

In [None]:
all_points = []
scroll_token = None  # None requests the first page

has_more = True
while has_more:
    # Fetch the next batch, with vectors and payloads, starting at the current offset
    response = qdrant_client.scroll(
        collection_name="documents_collection",
        with_vectors=True,
        with_payload=True,
        offset=scroll_token,
    )

    # The result is a pair: the points of this page and the offset of the next one
    page, scroll_token = response
    all_points.extend(page)

    # A missing next offset means the last page has been read
    has_more = scroll_token is not None

for point in all_points:
    print(f"ID: {point.id}, Vector: {point.vector}, Payload: {point.payload}")

In [None]:
from llama_index.core import Document
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext
# Convert the scrolled Qdrant points into LlamaIndex Documents
documents = [
    Document(
        # `id_` is the Document id field. The previous `id=` keyword is not a Document
        # field, so the point id was never applied to the Document. Qdrant ids may be
        # integers, hence the str().
        id_=str(point.id),
        # NOTE(review): the text is the stringified vector, not the chunk text, so the
        # BM25 retriever built on this docstore matches against numbers. The chunk text
        # is not part of the payload written by save_to_qdrant — confirm and store it.
        text=str(point.vector),
        # A point without payload comes back as None; metadata has to be a dict
        metadata=point.payload or {},
    )
    for point in all_points
]

# Add the documents to an in-memory docstore and combine it with the vector store
docstore = SimpleDocumentStore()
docstore.add_documents(documents)
storage_context = StorageContext.from_defaults(
    docstore=docstore,
    vector_store=vector_store,
)

# Retriever

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer

# BM25 keyword retriever over the docstore, with English stemming,
# returning only the single best match
bm25_retriever = BM25Retriever.from_defaults(
    docstore=docstore,
    language="english",
    stemmer=Stemmer.Stemmer("english"),
    similarity_top_k=1,
)
from llama_index.core.response.notebook_utils import display_source_node
# Smoke-test the retriever with a generic question and print every hit
retrieved_nodes = bm25_retriever.retrieve("What do you know?")
for node in retrieved_nodes:
    print(node)

In [None]:
from llama_index.core.postprocessor import SentenceEmbeddingOptimizer
from llama_index.core.postprocessor import EmbeddingRecencyPostprocessor
from llama_index.core.postprocessor import LLMRerank


# Sentence-level optimizer using the configured embedding model, cutoff 0.7
sentence_optimizer = SentenceEmbeddingOptimizer(
    embed_model=Settings.embed_model,
    threshold_cutoff=0.7,
)

# NOTE(review): the metadata key written earlier in this notebook is "issue_date",
# while this postprocessor reads "date" — confirm which key the nodes really carry.
recency_postprocessor = EmbeddingRecencyPostprocessor(date_key="date", similarity_cutoff=0.7)

# LLM-based reranker keeping the top 2 nodes
llm_reranker = LLMRerank(top_n=2)

# Applied in this order by the query engine
node_postprocessors = [sentence_optimizer, recency_postprocessor, llm_reranker]

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer
from llama_index.core.response_synthesizers.type import ResponseMode

# Response synthesizer in COMPACT mode, driven by the configured LLM
response_synthesizer = get_response_synthesizer(
    response_mode=ResponseMode.COMPACT,
    llm=Settings.llm,
)

# Query engine: BM25 retrieval -> node postprocessors -> response synthesis
query_engine = RetrieverQueryEngine(
    node_postprocessors=node_postprocessors,
    response_synthesizer=response_synthesizer,
    retriever=bm25_retriever,
)

In [None]:
from llama_index.core import PromptTemplate
from llama_index.core.llms import ChatMessage, MessageRole

# Prompt passed to the chat engine as its condense-question prompt.
# Template variables: {chat_history} (prior conversation) and {question} (new user question).
# The template text, including its whitespace, is sent to the LLM as-is.
# NOTE(review): the wording asks for a final answer in Vietnamese, while a condense prompt
# normally only rewrites the question for the query engine — confirm this is intended.
CUSTOM_PROMPT = PromptTemplate(
    """
    Based on the conversation history between the User and the Assistant, along with the User's new question, analyze and understand the question within the context of the conversation.
    Provide a relevant response in Vietnamese, using a professional tone like a Human Resource Specialist.  

    <Conversation History>
    {chat_history}

    <Current Question>
    {question}
    """
)

# Seed conversation: one user turn followed by one assistant turn
custom_chat_history = [
    ChatMessage(role=role, content=content)
    for role, content in (
        (
            MessageRole.USER,
            "Hello assistant, we are having a conversation about the company's regulations.",
        ),
        (
            MessageRole.ASSISTANT,
            "Great, would you like to know more information about the company's regulations?",
        ),
    )
]

In [None]:
from llama_index.core.chat_engine import CondenseQuestionChatEngine

# Chat engine that condenses history + new question with CUSTOM_PROMPT,
# then sends the result to the query engine
chat_engine = CondenseQuestionChatEngine.from_defaults(
    verbose=True,
    chat_history=custom_chat_history,
    condense_question_prompt=CUSTOM_PROMPT,
    query_engine=query_engine,
)

In [None]:
# Question in Vietnamese: "What forms of disciplinary action does the company have?"
query = "Các hình thức xử phạt của công ty?"
# Goes through the chat engine (which carries the seeded chat history)
# rather than calling query_engine.query(query) directly.
response = chat_engine.chat(query)