In [1]:
import os
import re
import unicodedata
from uuid import uuid4
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters  import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.evaluation.qa import QAEvalChain

In [4]:
# Load environment variables from a local .env file, if one is present.
load_dotenv()

# Name of the Qdrant collection used by the cells in this notebook.
QDRANT_COLLECTION_NAME = "customer_assistant_collection"
# Path of the source PDF, relative to the notebook's working directory.
DATASET_FILE = "../data/policy_manual.pdf"

In [5]:
def load_data(name_file):
    """Read a PDF with ``PyMuPDFLoader`` and return the loaded documents.

    Args:
        name_file: Path of the PDF file to read.

    Returns:
        The result of ``PyMuPDFLoader.load()`` for that file.
    """
    loader_options = {
        "file_path": name_file,
        "mode": "page",                # extract page by page
        "extract_images": True,        # also extract images
        "extract_tables": "markdown",  # extract tables in markdown format
    }
    return PyMuPDFLoader(**loader_options).load()

# Load the PDF configured in DATASET_FILE and display the extracted documents.
extracted_file = load_data(DATASET_FILE)
extracted_file

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


[Document(metadata={'producer': 'GPL Ghostscript 8.15', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': 'D:20160412111310', 'source': '../data/policy_manual.pdf', 'file_path': '../data/policy_manual.pdf', 'total_pages': 43, 'format': 'PDF 1.4', 'title': 'Microsoft Word - 03 Policy Manual - Apr 2016 - excl SAICA.docx', 'author': 'MD', 'subject': '', 'keywords': '', 'moddate': 'D:20160412111310', 'trapped': '', 'page': 0}, page_content='FINANCIAL LEADERSHIP THROUGH PROFESSIONAL EXCELLENCE \n__________________________________________________________________________________ \n \n__________________________________________________________________________________ \n \nMD ACCOUNTANTS & AUDITORS INC. \n  \n \n     - 1 - \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nCOMPANY POLICY MANUAL \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nAs per your Letter of Appointment and Employment Contracts, this Company Policy Manual is to be read and \nsigned prior to the commencement of your employ

In [6]:
# Compiled regexes used by clean_data(), which addresses them by position,
# so the order of this list must not change.
PATTERNS = [
    # [0] Page header: from the banner text up to and including the "- <n> -"
    #     page-number marker (DOTALL lets "." span newlines).
    re.compile(r'FINANCIAL LEADERSHIP THROUGH PROFESSIONAL EXCELLENCE.*?- \d+ -', re.DOTALL),
    # [1] Runs of underscores (one or more).
    re.compile(r'_+'),
    # [2] A newline, optional whitespace, then one or more newlines (blank-line gaps).
    re.compile(r'\n\s*\n+'),
    # [3] Two or more consecutive spaces.
    re.compile(r' {2,}'),
    # [4] HTML line-break tags: <br>, <br/>, <br />.
    re.compile(r'<br\s*/?>'),
    # [5] C0/C1 control characters; note this range includes "\n" and "\t".
    re.compile(r'[\u0000-\u001F\u007F-\u009F]'),
]

def clean_data(extracted_file, skip_indices=(1, 2)):
    """Normalise the text of each extracted page and drop unwanted pages.

    Cleaning steps, in order:
      1. Unicode NFKC normalisation.
      2. Remove the page header (``PATTERNS[0]``) and underscore rules (``PATTERNS[1]``).
      3. Turn blank-line gaps (``PATTERNS[2]``), ``<br>`` tags (``PATTERNS[4]``) and any
         remaining whitespace control characters (newline, tab, ...) into a single space.
      4. Delete the remaining control characters (``PATTERNS[5]``).
      5. Collapse runs of spaces (``PATTERNS[3]``).

    Whitespace control characters are replaced by a space *before* the control
    characters are stripped and the spaces are collapsed. Deleting a lone newline
    outright would glue the last word of one line to the first word of the next
    whenever the line has no trailing space.

    Args:
        extracted_file: Sequence of documents exposing ``page_content`` and ``metadata``.
        skip_indices: Positions (0-based, within ``extracted_file``) of documents to
            leave out of the result. Defaults to ``(1, 2)``, the table-of-contents pages.

    Returns:
        A list of new ``Document`` objects holding the cleaned text and the original
        metadata object of each kept page.
    """
    header_re = PATTERNS[0]
    underscore_re = PATTERNS[1]
    blank_gap_re = PATTERNS[2]
    multi_space_re = PATTERNS[3]
    br_tag_re = PATTERNS[4]
    control_re = PATTERNS[5]

    cleaned_docs = []
    for position, d in enumerate(extracted_file):
        if position in skip_indices:
            continue    # e.g. table-of-contents pages

        content = unicodedata.normalize('NFKC', d.page_content)    # Fullwidth norm.
        content = header_re.sub('', content)
        content = underscore_re.sub('', content)
        content = blank_gap_re.sub(' ', content)
        content = br_tag_re.sub(' ', content)
        # Line breaks and tabs separate words: keep a space in their place.
        content = re.sub(r'[\t\n\r\f\v]', ' ', content)
        content = control_re.sub('', content)
        # Collapse last so that no step above can leave double spaces behind.
        content = multi_space_re.sub(' ', content)

        cleaned_docs.append(Document(page_content=content, metadata=d.metadata))

    return cleaned_docs

# Clean the extracted pages and display the result.
cleaned_doccuments = clean_data(extracted_file=extracted_file)
cleaned_doccuments

[Document(metadata={'producer': 'GPL Ghostscript 8.15', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': 'D:20160412111310', 'source': '../data/policy_manual.pdf', 'file_path': '../data/policy_manual.pdf', 'total_pages': 43, 'format': 'PDF 1.4', 'title': 'Microsoft Word - 03 Policy Manual - Apr 2016 - excl SAICA.docx', 'author': 'MD', 'subject': '', 'keywords': '', 'moddate': 'D:20160412111310', 'trapped': '', 'page': 0}, page_content=' COMPANY POLICY MANUAL As per your Letter of Appointment and Employment Contracts, this Company Policy Manual is to be read and signed prior to the commencement of your employment with the Company. Should you have any queries, please do not hesitate to contact us prior to your start date. One copy of your Employment Contract, a copy of your Letter of Appointment and the signing page of this Company Policy Manual are to be handed to Cathy Haumann on your first day of employment. On commencement of your employment, you will always have access to th

In [7]:
def split_text(cleaned_doccuments, chunk_size, chunk_overlap):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        cleaned_doccuments: Documents to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents`` for the given documents.
    """
    splitter_config = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    chunker = RecursiveCharacterTextSplitter(**splitter_config)
    chunks = chunker.split_documents(cleaned_doccuments)
    return chunks

# Split the cleaned pages into chunks (chunk_size=500, chunk_overlap=200)
# and display the resulting chunk documents.
splitted_text = split_text(
    cleaned_doccuments=cleaned_doccuments, 
    chunk_size=500, 
    chunk_overlap=200
)
splitted_text

[Document(metadata={'producer': 'GPL Ghostscript 8.15', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': 'D:20160412111310', 'source': '../data/policy_manual.pdf', 'file_path': '../data/policy_manual.pdf', 'total_pages': 43, 'format': 'PDF 1.4', 'title': 'Microsoft Word - 03 Policy Manual - Apr 2016 - excl SAICA.docx', 'author': 'MD', 'subject': '', 'keywords': '', 'moddate': 'D:20160412111310', 'trapped': '', 'page': 0}, page_content='COMPANY POLICY MANUAL As per your Letter of Appointment and Employment Contracts, this Company Policy Manual is to be read and signed prior to the commencement of your employment with the Company. Should you have any queries, please do not hesitate to contact us prior to your start date. One copy of your Employment Contract, a copy of your Letter of Appointment and the signing page of this Company Policy Manual are to be handed to Cathy Haumann on your first day of employment. On commencement'),
 Document(metadata={'producer': 'GPL Ghostscript 8.

In [None]:
# import requests

# res = requests.get("http://localhost:6333/collections")
# res.json()

{'result': {'collections': [{'name': 'customer_assistant_collection'}]},
 'status': 'ok',
 'time': 0.004469022}

In [None]:
# Qdrant client pointed at a server on localhost:6333; the commented-out lines
# are alternative in-memory / on-disk client configurations.
db_client = QdrantClient(url="http://localhost:6333")
# db_client = QdrantClient(":memory:")
# db_client = QdrantClient(path="/tmp/langchain_qdrant")
# Embedding model used for both indexing and querying; the trailing numbers
# presumably are the embedding dimensions of each model — TODO confirm.
embed_client = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")       # 384
# embed_client = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")    # 768

In [None]:
# db_client.delete_collection(collection_name=QDRANT_COLLECTION_NAME)

True

In [None]:
def setup_vector_store(db_client, embed_client, collection_name, reset=False):
    """Return a ``QdrantVectorStore`` for ``collection_name``, creating the collection if needed.

    Args:
        db_client: Qdrant client used to inspect, delete and create the collection.
        embed_client: Embeddings object exposing ``embed_query``; also handed to the
            vector store as its embedding function.
        collection_name: Name of the Qdrant collection.
        reset: When True and the collection already exists, it is deleted and
            recreated empty. When False, an existing collection is reused as is.

    Returns:
        A ``QdrantVectorStore`` bound to ``db_client`` and ``collection_name``.
    """
    collection_present = db_client.collection_exists(collection_name)

    if collection_present and reset:
        db_client.delete_collection(collection_name)      # Handle duplicate
        collection_present = False

    if not collection_present:
        # Probe the embedding size only when a collection has to be created;
        # reusing an existing collection needs no embedding call.
        vector_size = len(embed_client.embed_query("sample text"))
        db_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
        )

    return QdrantVectorStore(
        client=db_client,
        collection_name=collection_name,
        embedding=embed_client,
    )

# Build the vector store for the configured collection (reset keeps its default, False).
vector_store = setup_vector_store(
    db_client=db_client, 
    embed_client=embed_client, 
    collection_name=QDRANT_COLLECTION_NAME
)

In [None]:
# results = vector_store.similarity_search(
#     "How many distribution centers does Nike have in the US?"
# )

# print(results[0])

page_content='casual shoes (ladies shoes must have a heel and not look like casual beach wear) vi) Tracksuits / casual pants with pockets / cargo pants / big baggy pants that then taper in towards the feet vii) Inappropriate jewellery including all toe / nose / tongue rings, ankle chains and earrings worn by men viii) Beach wear of any kind ix) Oversized / chunky / bulky jerseys You are required to dress in a manner appropriate for an office environment adhering to a strict ‘Business Casual’ dress code for' metadata={'producer': 'GPL Ghostscript 8.15', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': 'D:20160412111310', 'source': '../knowledge_base/policy_manual.pdf', 'file_path': '../knowledge_base/policy_manual.pdf', 'total_pages': 43, 'format': 'PDF 1.4', 'title': 'Microsoft Word - 03 Policy Manual - Apr 2016 - excl SAICA.docx', 'author': 'MD', 'subject': '', 'keywords': '', 'moddate': 'D:20160412111310', 'trapped': '', 'page': 25, '_id': 'b8bff70a-311d-4992-bc8d-25cf1999531

In [None]:
def add_data(vector_store, splitted_text):
    """Add chunk documents to the vector store under deterministic ids.

    Each id is a UUIDv5 derived from the chunk's ``source`` and ``page`` metadata,
    its position in ``splitted_text`` and its text. Running this function again
    with the same chunks therefore sends the same ids, so the store can overwrite
    the existing entries instead of accumulating duplicates (random ids would add
    a fresh copy of every chunk on each run).

    Args:
        vector_store: Object exposing ``add_documents(documents=..., ids=...)``.
        splitted_text: Sequence of chunk documents exposing ``page_content`` and
            ``metadata``.

    Returns:
        None.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from uuid import NAMESPACE_URL, uuid5

    if not splitted_text:
        return    # nothing to index

    ids = []
    for position, doc in enumerate(splitted_text):
        meta = getattr(doc, "metadata", None) or {}
        identity = f"{meta.get('source', '')}|{meta.get('page', '')}|{position}|{doc.page_content}"
        ids.append(str(uuid5(NAMESPACE_URL, identity)))

    vector_store.add_documents(
        documents=splitted_text,
        ids=ids
    )

# Index the chunk documents into the vector store.
add_data(vector_store, splitted_text)

In [22]:
# Sanity check: count the entries currently stored in the collection.
db_client.count(collection_name=QDRANT_COLLECTION_NAME)

CountResult(count=325)

In [28]:
# Stand-alone similarity retriever configured with k=5.
# NOTE(review): rag_chain() builds its own retriever from the vector store and
# does not reference this variable.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}
)

In [None]:
def rag_chain(vector_store):
    """Build the retrieval-augmented question-answering chain.

    The chain takes the user's question as a plain string, retrieves chunks from
    ``vector_store`` with a similarity score threshold of 0.5, joins their text
    into one context string, fills the system/human prompt and returns the
    model's reply as a string.

    Args:
        vector_store: Vector store exposing ``as_retriever``.

    Returns:
        A runnable chain; call ``chain.invoke(question)`` with the question text.
    """
    def _format_docs(docs):
        # Join the retrieved chunk texts into a single context string.
        return "\n\n".join(d.page_content for d in docs)

    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash", 
        temperature=0
    )

    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": 0.5}
    )

    system_prompt = (
        "You are a professional assistant named *MD Assistant*. "
        "Your responsibility is to give information about MD Company. "
        "Provide a concise and accurate answer based on the given context without unecessary explanation. "
        "If you don't know the answer, say you don't know. "
        "Context: {context}"
    )

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "{question}"),
    ])

    chain = (
        {
            # Piping a plain callable after a runnable turns it into a transforming
            # step, so the prompt receives the joined text. Wrapping the callable in
            # RunnablePassthrough(...) would only run it as a side effect and forward
            # the raw list of Document objects (metadata included) into the prompt.
            "context": retriever | _format_docs,
            "question": RunnablePassthrough(),
        }
        | prompt
        | llm
        | StrOutputParser()
    )

    return chain

# Build the RAG chain on top of the vector store.
chain = rag_chain(vector_store=vector_store)

In [None]:
# Interactive chat loop: type "exit" (any letter case) to stop.
while True:
    try:
        question = input("User: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the cell was interrupted: leave the loop without a traceback.
        break

    if not question:
        continue    # do not spend an LLM call on blank input
    if question.lower() == 'exit':
        break

    answer = chain.invoke(question)
    print(f"User: {question}\nAsisten: {answer}\n")

In [None]:
# questions = ["placeholder"]
# responses = []
# ground_truth = ["placeholder"]

# for question in questions:
#     response = chain.invoke({"input": question})
#     responses.append(response["answer"])

# eval_examples = [
#     {"query": q, "result": r, "answer": g}
#     for q, r, g in zip(questions, responses, ground_truth)
# ]

# QA_client = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)
# evaluator = QAEvalChain.from_llm(llm=QA_client)
# results = evaluator.batch(eval_examples)