## import library

In [12]:
# %pip install v llama-index-vector-stores-qdrant
# %pip install --upgrade llama-index
# %pip install --upgrade fastembed
# %pip install --upgrade llama-index-storage-chat-store-redis
# %pip install --upgrade llama-index-core
# %pip install --upgrade redis
# %pip install --upgrade llama-index-storage-kvstore-redis
# %pip install "numpy<2.0"
# %pip install --upgrade pandas pyarrow llama-index
# %pip install "pybind11>=2.12"
# %pip install --upgrade llama-index-retrievers-bm25

In [13]:
import os
from dotenv import load_dotenv

# Tải các biến môi trường từ file .env
load_dotenv()

# Lấy giá trị khóa API từ biến môi trường
openai_api_key = os.getenv("OPENAI_API_KEY")

In [14]:
import os
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Initialize the LLM
llm = OpenAI(
    model="gpt-4o-mini"
)

# Initialize the LLM embeddings
embed_model = OpenAIEmbedding(
    model="text-embedding-3-small"
)
# Global settings
Settings.llm = llm
Settings.embed_model = embed_model

## read and parse

In [15]:
import os
import tempfile
import pdfplumber
from typing import List
from llama_index.core import SimpleDirectoryReader

def process_single_pdf(pdf_path: str) -> List:
    """
    Load and parse a specific PDF file using SimpleDirectoryReader.
    Args:
        pdf_path (str): Path to the PDF file to process.

    Returns:
        List: Documents from SimpleDirectoryReader.
    """
    # Check file existence
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    # Check file extension
    if not pdf_path.lower().endswith('.pdf'):
        raise ValueError(f"File {pdf_path} is not a PDF file")

    # Create temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            pdf_filename = os.path.basename(pdf_path)
            temp_text_file = os.path.join(temp_dir, f'{os.path.splitext(pdf_filename)[0]}.txt')

            # Load PDF and save to temporary text file
            with pdfplumber.open(pdf_path) as pdf:
                text_parts = []  # Use a list to gather text parts
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:  # Only add if there's text
                        text_parts.append(page_text)

            # Write all text at once
            with open(temp_text_file, 'w', encoding='utf-8') as f:
                f.write(''.join(text_parts))

            print(f"File processed: {pdf_filename}")

            # Use SimpleDirectoryReader to read text files
            documents = SimpleDirectoryReader(temp_dir).load_data()

        except pdfplumber.PDFException as e:
            raise Exception(f"PDF processing error for file {pdf_filename}: {str(e)}")
        except Exception as e:
            raise Exception(f"Error processing file {pdf_filename}: {str(e)}")

    return documents

input_dir = "TULD02.pdf"
documents = process_single_pdf(input_dir)

print(documents)

File processed: TULD02.pdf
[Document(id_='d5a59e39-e57e-4c76-8ed7-6c538bbe6ff6', embedding=None, metadata={'file_path': 'C:\\Users\\LUNE\\AppData\\Local\\Temp\\tmpaum5ax_9\\TULD02.txt', 'file_name': 'TULD02.txt', 'file_type': 'text/plain', 'file_size': 16468, 'creation_date': '2024-12-01', 'last_modified_date': '2024-12-01'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='CỘNG HÒA XÃ HỘI CHỦ NGHĨA VIỆT NAM\r\nĐộc lập - Tự do - Hạnh phúc\r\nTHỎA ƯỚC\r\nLAO ĐỘNG TẬP THỂ\r\nCăn cứ vào Bộ Luật lao động số 45/2019/QH14 ngày 20/11/2019;\r\nĐể đảm bảo quyền lợi và nghĩa vụ hợp pháp của mỗi bên trong quan hệ\r\nlao động, chúng tôi gồm có:\r\n1. Người sử dụng lao động:\r\nÔng: PHẠM NGỌC THUẬN - Tổng Gi

# define qdrant

In [16]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams

# Kết nối tới Qdrant (đang chạy cục bộ hoặc cloud)
qdrant_client = QdrantClient(url="http://localhost:6333")
collection_name = "documents_collection"

# Kiểm tra kết nối và danh sách các collections hiện có
collections = qdrant_client.get_collections()
print("Current collections:", collections)

# Kiểm tra xem collection đã tồn tại hay chưa
if collection_name not in [collection.name for collection in collections.collections]:
    # Tạo collection nếu chưa tồn tại
    vector_params = VectorParams(size=1536, distance="Cosine")  # Kích thước vector và khoảng cách cosine
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_params,
        on_disk = True
    )
    print(f"Collection '{collection_name}' đã được tạo.")
else:
    print(f"Collection '{collection_name}' đã tồn tại.")

Current collections: collections=[CollectionDescription(name='documents_collection')]
Collection 'documents_collection' đã tồn tại.


In [17]:
import json
def save_to_qdrant(nodes):
    
    for node in nodes:
        text = node.text

        # Get metadata to save as payload dict
        metadata = node.metadata
        payload = dict(metadata)
        
        # Create embdding from text
        embedding = embed_model._get_text_embedding(text)
    

        try:
            qdrant_client.upsert(
                collection_name="documents_collection",
                points=[
                    PointStruct(
                        id=node.id_,
                        vector=embedding,
                        payload=payload
                        )
                    ]
                )
            print("Dữ liệu đã được lưu vào Qdrant!")
        except Exception as e:
            print(f"Error saving data to Qdrant: {e}")

In [18]:
from qdrant_client import QdrantClient

# Kiểm tra xem collection có tồn tại không
try:
    # Lấy thông tin về collection
    collection_info = qdrant_client.get_collection(collection_name)
    print(f"Collection '{collection_name}' đã được kết nối thành công.")
except Exception as e:
    print(f"Lỗi khi kết nối với collection: {str(e)}")


Collection 'documents_collection' đã được kết nối thành công.


In [19]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import StorageContext

# Khởi tạo QdrantVectorStore với tên collection được cập nhật
vector_store = QdrantVectorStore(
    client=qdrant_client, 
    collection_name="documents_collection"  # Đổi tên collection tại đây
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
print(storage_context)

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x000002A5808A7B90>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x000002A5FE2F7A10>, vector_stores={'default': QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='documents_collection', url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=False, index_doc_id=True, fastembed_sparse_model=None, text_key='text'), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x000002A580D19CD0>, property_graph_store=None)


# indexing & chunking & pipeline

In [20]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import QuestionsAnsweredExtractor, KeywordExtractor
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core import Settings
from qdrant_client import QdrantClient
import numpy as np
from datetime import datetime
import nest_asyncio

nest_asyncio.apply()

# Tạo vector_store sử dụng Qdrant
from llama_index.vector_stores.qdrant import QdrantVectorStore
vector_store = QdrantVectorStore(client=qdrant_client, collection_name="documents_collection")

# Cấu hình các extractor và node parser
extractors = [
    QuestionsAnsweredExtractor(llm=Settings.llm, questions=2),
    KeywordExtractor(llm=Settings.llm, keywords=10),
]

splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=Settings.embed_model
)

# Các transformations
transformations = [splitter] + extractors

# Khởi tạo ingestion pipeline
pipeline = IngestionPipeline(
    transformations=transformations,
    vector_store=vector_store
)

# Chạy pipeline để xử lý documents
nodes = pipeline.run(documents=documents, show_progress=True, batch_size=64)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/122 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:06<00:00,  1.08it/s]
100%|██████████| 7/7 [00:02<00:00,  2.95it/s]


# Save to qdrant

In [21]:
# Beutifulize print nodes
import json
print("Nodes:")
for node in nodes:
    print(node)

    # metadata
    # print("Metadata:")
    print(json.dumps(node.metadata, indent=2))
    break

Nodes:
Node ID: 62259e0b-2e20-4ce9-aeaa-ed206a1fc5fd
Text: CỘNG HÒA XÃ HỘI CHỦ NGHĨA VIỆT NAM  Độc lập - Tự do - Hạnh phúc
THỎA ƯỚC  LAO ĐỘNG TẬP THỂ  Căn cứ vào Bộ Luật lao động số
45/2019/QH14 ngày 20/11/2019;  Để đảm bảo quyền lợi và nghĩa vụ hợp
pháp của mỗi bên trong quan hệ  lao động, chúng tôi gồm có:  1. Người
sử dụng lao động:  Ông: PHẠM NGỌC THUẬN - Tổng Giám đốc Tổng Công ty.
{
  "file_path": "C:\\Users\\LUNE\\AppData\\Local\\Temp\\tmpaum5ax_9\\TULD02.txt",
  "file_name": "TULD02.txt",
  "file_type": "text/plain",
  "file_size": 16468,
  "creation_date": "2024-12-01",
  "last_modified_date": "2024-12-01",
  "questions_this_excerpt_can_answer": "Based on the provided context, here are two specific questions that can be answered:\n\n1. **Who is the employer mentioned in the labor agreement?**\n   - Answer: \u00d4ng PH\u1ea0M NG\u1eccC THU\u1eacN - T\u1ed5ng Gi\u00e1m \u0111\u1ed1c T\u1ed5ng C\u00f4ng ty.\n\n2. **What is the legal basis for the collective labor agreement refere

In [22]:
from qdrant_client.models import PointStruct

In [28]:
import json
def save_to_qdrant(nodes):
    
    for node in nodes:
        text = node.text

        # Get metadata to save as payload dict
        metadata = node.metadata
        payload = dict(metadata)
        
        # Create embdding from text
        embedding = embed_model._get_text_embedding(text)
    

        try:
            qdrant_client.upsert(
                collection_name="documents_collection",
                points=[
                    PointStruct(
                        id=node.id_,
                        vector=embedding,
                        payload=payload
                        )
                    ]
                )
        except Exception as e:
            print(f"Error saving data to Qdrant: {e}")
# Lưu các nodes vào Qdrant
qdrant_client.recreate_collection(
    collection_name="documents_collection",
    vectors_config=VectorParams(size=1536, distance="Cosine"),
)

  qdrant_client.recreate_collection(


True

In [29]:
save_to_qdrant(nodes)

In [30]:
collection_info = qdrant_client.get_collection(collection_name=collection_name)
print(collection_info)

vectors = qdrant_client.scroll(
    collection_name="documents_collection",
    limit=10,
    with_payload=True,
)

vectors

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=7 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None)

([Record(id='1bb2d658-4950-4022-9fd5-8361aba84441', payload={'file_path': 'C:\\Users\\LUNE\\AppData\\Local\\Temp\\tmpaum5ax_9\\TULD02.txt', 'file_name': 'TULD02.txt', 'file_type': 'text/plain', 'file_size': 16468, 'creation_date': '2024-12-01', 'last_modified_date': '2024-12-01', 'questions_this_excerpt_can_answer': 'Based on the provided context regarding the collective labor agreement (Thỏa ước lao động tập thể) at Becamex IDC, here are two specific questions that can be answered from the text:\n\n1. **What is the duration of the collective labor agreement and what happens if new labor laws provide greater benefits during this period?**\n   - The collective labor agreement is effective for 03 years from the date of signing, and if new labor laws are enacted that provide greater benefits than those agreed upon in the contract, the provisions of the current law will apply.\n\n2. **Who are the representatives signing the collective labor agreement on behalf of the employees and the empl

In [26]:
all_points = []
scroll_token = None

while True:
    # Fetch points in batches
    response = qdrant_client.scroll(
        collection_name="documents_collection",
        with_vectors=True,  # Include vectors in the response
        with_payload=True,  # Include payloads in the response
        offset=scroll_token,  # Provide the scroll token for pagination
    )
    
    # Add retrieved points to the list
    all_points.extend(response[0])
    
    # Check if there's more data to fetch
    scroll_token = response[1]
    if scroll_token is None:  # No more data to fetch
        break
for point in all_points:
    print(f"ID: {point.id}, Vector: {point.vector}, Payload: {point.payload}")

ID: 1bb2d658-4950-4022-9fd5-8361aba84441, Vector: [-0.020973377, 0.050685663, 0.043888737, -0.014885272, 0.05655044, 0.0028984044, -0.025634129, 0.05352095, 0.005277329, -0.06637686, 0.013118071, 0.032489315, 0.03755788, -0.017701142, 0.00520936, 0.0034712881, -0.052511122, -0.03984942, 0.005092841, 0.014768753, -0.0050394367, 0.021051057, 0.015438736, 0.043966413, -0.010341041, -0.021167576, -0.033809863, -0.009865255, -0.03132413, -0.03402348, 0.054103546, -0.027828565, 0.010700307, 0.0043840185, 0.01508918, 0.036955867, 0.004012615, 0.0394416, 0.021128736, -0.04917092, 0.0024808787, -0.0035344025, -0.014962952, -0.038023956, 0.002340085, -0.0074232165, -0.0012022066, 0.019011978, -0.015186279, 0.04730662, 0.02369215, -0.01506976, 0.018128378, 0.03107167, -0.043849897, -0.009622508, 0.012506347, -0.022934778, 0.025168054, -0.008622388, -0.053171396, -0.016467985, 0.018206056, 0.0014953242, -0.035169248, -0.02843058, -0.014050221, 0.04163604, -0.00075433764, -0.027090613, -0.044665527

In [31]:
from llama_index.core import Document
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
# Chuyển đổi danh sách all_points thành danh sách Document
documents = [
    Document(
        id=point.id,  # ID của Document
        text=str(point.vector),  # Nội dung vector (hoặc chuyển vector thành chuỗi)
        metadata=point.payload  # Thêm metadata
    )
    for point in all_points
]

# Thêm vào docstore
docstore = SimpleDocumentStore()
docstore.add_documents(documents)
storage_context = StorageContext.from_defaults(
    docstore=docstore,
    vector_store=vector_store,
)