# <a id='toc1_'></a>[RAG Sample Setup](#toc0_)

In [None]:
%pip install langchain langgraph "markitdown[all]" 'markitdown[pdf]' faiss-cpu langchain-openai langchain-experimental pydantic langchain_google_genai ragas langchain sentence_transformers docling-core docling mlflow pinecone --quiet

**Table of contents**<a id='toc0_'></a>    
- [RAG Sample Setup](#toc1_)    
    - [Phase 1: Ingestion](#toc1_1_1_)    
      - [Sample 1: Document + HybridChunker](#toc1_1_1_1_)    
      - [Sample 2: RecursiveCharacterTextSplitter](#toc1_1_1_2_)    
      - [Sample 3: Custom chunking (no import)](#toc1_1_1_3_)    
    - [Vectordb](#toc1_1_2_)    
      - [Chroma](#toc1_1_2_1_)    
      - [Faiss (Facebook AI Similarity Search)](#toc1_1_2_2_)    
      - [Weaviate](#toc1_1_2_3_)    
      - [Pinecone](#toc1_1_2_4_)    
    - [Phase 2: Model architecture](#toc1_1_3_)    
      - [Retriever](#toc1_1_3_1_)    
      - [Model pipeline](#toc1_1_3_2_)    
      - [LinearRAG](#toc1_1_3_3_)    
      - [AI Agent](#toc1_1_3_4_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
import os, sys

import pandas as pd
import numpy as np

from typing import Literal, TypedDict, List, Optional, Any

from pydantic import BaseModel, SecretStr

from datetime import datetime

from langchain_core.messages import HumanMessage, ToolMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import google.generativeai as genai #Use gg genai for gemini API
from langgraph.prebuilt import create_react_agent
from markitdown import MarkItDown

import mlflow

## RAG
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractOcrOptions # to the use of OCR in scanned pdfs
from langchain_docling.loader import DoclingLoader, ExportType
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langgraph.prebuilt import create_react_agent
from langchain_community.vectorstores.utils import filter_complex_metadata
import json

from datetime import datetime

#for module import
import sys, os
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)



In [2]:
class ModelConfig(BaseModel):
    openai_api_key: SecretStr
    googleai_api_key: SecretStr
    pinecone_api_key: SecretStr
    # openai_embedding_model: Literal['text-embedding-3-small', 'text-embedding-3-large', 'text-embedding-ada-002'] #pick text-embedding-3-small for matching with pinecone
    googleai_embedding_model: Literal['gemini-embedding-001', 'gemini-embedding-002', 
                                      'text-embedding-005', 'text-multilingual-embedding-002'] #pick gemini-embedding-001 or text-embedding-005 for matching with pinecone
    # openai_chat_model: Literal['gpt-4o-mini', 'gpt-4o', 'gpt-4.1-mini', 'gpt-4.1', 'none'] #none for using gemini model
    googleai_chat_model: Literal['gemini-2-turbo', 'gemini-2-pro', 'gemini-2.5-flash', 'gemini-2.5-pro'] #gemini models
    # embedding_as_judge: Literal['text-embedding-3-small', 'text-embedding-3-large', 'text-embedding-ada-002'] 
    embedding_as_judge: Literal['gemini-embedding-001', 'gemini-embedding-002', 
                                      'text-embedding-005', 'text-multilingual-embedding-002'] 
    # llm_as_judge: Literal['gpt-4o-mini', 'gpt-4o', 'gpt-4.1-mini', 'gpt-4.1']
    llm_as_judge: Literal['gemini-2-turbo', 'gemini-2-pro', 'gemini-2.5-flash', 'gemini-2.5-pro'] #gemini models
    
    corpus_type: Literal['gen_100', 'gen_hybrid']
    max_token: int = 1024
    use_ocr: bool = False
    chunker: Literal['hybrid', 'recursive', 'custom']
    mode: Literal['test', 'real'] #test for 1 sample dataset, real for full dataset
    model_setup: Literal['linear', 'agent']
    
config = ModelConfig.model_validate(
    dict(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        googleai_api_key = os.environ.get("GOOGLE_API_KEY"),
        pinecone_api_key=os.environ.get("PINECONE_API_KEY"),
        # openai_embedding_model='text-embedding-3-small',
        googleai_embedding_model='gemini-embedding-001',
        # openai_chat_model='none', #none for using gemini model
        googleai_chat_model='gemini-2.5-flash',
        embedding_as_judge='gemini-embedding-001',
        llm_as_judge='gemini-2-turbo',
        corpus_type='gen_100',
        max_token=1024,
        use_ocr=False,
        chunker='hybrid',
        mode='test',
        model_setup='agent'
    )
)

### <a id='toc1_1_1_'></a>[Phase 1: Ingestion](#toc0_)

In [3]:
import logging, warnings

# Suppress verbose logs
logging.getLogger("docling").setLevel(logging.WARNING)
logging.basicConfig(level=logging.ERROR)

# Suppress all warnings
warnings.filterwarnings("ignore")

In [15]:
run_name = f'{config.model_setup}_{config.corpus_type}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'

In [None]:
#Setup MlFlow

import mlflow

experiment_name = "RAG_Legal_Experiment"

# Create or get experiment
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    mlflow.create_experiment(experiment_name)

# Start a parent MLflow run
with mlflow.start_run(
    run_name=run_name,
    description="RAG Model Experiment with different configurations"
) as parent_run:
    
    run_id = parent_run.info.run_id  # Save the run id for future reference
    
    # Enable LangChain autologging
    mlflow.langchain.autolog()

    def config_to_dict(config: "ModelConfig") -> dict:
        """
        Convert a Pydantic ModelConfig to a flat dictionary suitable for MLflow logging.
        Unwraps SecretStr values automatically.
        """
        d = config.model_dump()
        for k, v in d.items():
            if isinstance(v, SecretStr):
                d[k] = v.get_secret_value()
        return d

    # Log all config parameters
    mlflow.log_params(config_to_dict(config))

In [None]:
## For Loading document corpus from source

from shared_functions.global_functions import list_file_from_source, load_file

document_links = list_file_from_source()

if config.mode == 'test':
    document = [document_links[0]]
    doc = load_file(document)
elif config.mode == 'real':
    document = document_links
    doc = [load_file(document[i]) for i in range(len(document))]
    
# Output is either 1 document (test) or list of documents (real) saved in the "doc" variable

# Note that some document parsing libraries may require input as path to document storage (instead of actual file content) 
# so the load_file may not be necessary and the if-else block can be marked as "# comment"

#### <a id='toc1_1_1_1_'></a>[Sample 1: Document + HybridChunker](#toc0_)

In [4]:
from docling.document_converter import DocumentConverter, PdfFormatOption

In [None]:
# Document Converter Sample

# The source can be a local file path or a URL
source = "https://arxiv.org/pdf/2408.09869"

#Raw conversion
converter = DocumentConverter()
result = converter.convert(source) #Note that the input of convert can be a link or a path

In [8]:
## Enhance PDF conversion with OCR (in case scanned pdf or semantic image)

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions

source = 'D:/Study/Education/Projects/Group_Project_RAG/source/document/scanned_legal.pdf'

pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    # ocr_options= TesseractOcrOptions(lang=['vie']) 
    ocr_options= EasyOcrOptions(lang=['vi']) 
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
result = converter.convert(source)

In [15]:
#for better interpretation of Document format
print(result.document.export_to_text())

#pretty bad rendering of Vietnamese text, so we may need to tackle another approach if needed



&lt;unknown&gt;

## VĂN PHÒNG CHÍNH PHỦ

Số: 8452/VPCP-CN

Công ty TNHH Hòa Bình

## Kính gửi:

## CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM Độc lập do Hạnh Tự phúc

Hà Nội, ngày 09 tháng 9 năm 2025

- Bộ trưởng Bộ Xây dựng;
- ty TNHH Hòa Bình. Công

Xét đề nghị của Công ty TNHH Hòa Bình (văn bản số 134/CV-HB ngày 03 9 năm 2025 và số 13O/CV-HB ngày 25 8 năm 2025) về việc kiến nghị áp dụng đổi mới khoa công nghệ trong việc xây dựng đường cao tốc theo giải pháp của Công ty Thưong binh nặng Hòa Bình đề xuất, Phó Thủ tuớng Chính phủ Trần Hồng Hà có ý kiến như sau: tháng tháng học

Bộ trưởng Bộ Xây dựng khẩn trương báo cáo Đồng chí Bí thu, Truởng Ban Chỉ đạo Trung ương về phát triền khoa học, nghệ, đổi mới tạo và chuyển đổi số theo chỉ đạo của Lãnh đạo Chính phủ tại văn bản số 8295/VPCP CN ngày 05 tháng 9 năm 2025 của Văn phòng Chính thời , nghiên cứu xử lý để xuất, kiến nghị của Công ty theo thẩm quyền; làm việc hoặc có văn bản hướng dẫn Công ty thực hiện theo quy định của pháp luật. Tổng công sá

In [None]:
#Try OCR with English sample

print(result.document.export_to_text())

#Which is reasonably better



(

## Dear Commissioner:

There is no higher priority for the U.S. Environmental Protection Agency than protecting public health and ensuring the safety of our nation 's drinking water. Under the Safe Drinking Water Act (SDWA), (Statev and other states have the primary responsibility for the implementation and enforcement of drinking water regulations; while the EPA is tasked with oversight of state efforts. Recent events in Flint; Michigan; and other U.S. cities; have led to important discussions about the safety of our nation's drinking water [am writing today to ask you to join in taking action to strengthen our safe drinking water programs; consistent with our shared recognition of the critical importance of safe drinking water for the health of all Americans. supplies.

First;, with most states having primacy under SDWA; we need to work together to ensure that states are taking action to demonstrate that the Lead and Copper Rule (LCR) is being properly implemented. To this end, th

In [5]:
#Docling Loader Sample + HybridChunker

documents = []
sample_path = 'D:/Study/Education/Projects/Group_Project_RAG/source/document/sample_legal.pdf'
loader = DoclingLoader(
                file_path= sample_path,
                export_type=ExportType.DOC_CHUNKS,
                chunker=HybridChunker(chunk_size = 500, chunk_overlap = 50)
            )
            
            # Load and process chunks
            
for chunk in loader.load(): #load will return the full batch of documents at once, so it's better to use lazy_load with large document corpus,
                            # which is an iterator that yields one document at a time
    
    # Add metadata - note that the metadata must be changed later based on our agreement about the information of the document
    chunk.metadata.update({
        'source': sample_path.split('/')[-1],
        'file_type': 'pdf',
        'processing_method': 'docling',
        'chunker': 'hybrid',
        'file_size': len(chunk.page_content),
    })
    documents.append(chunk)

print(f"Successfully processed {sample_path.split('/')[-1]} - {len(documents)} chunks")



Successfully processed sample_legal.pdf - 34 chunks


In [25]:
for doc in documents:
    print(doc.page_content)
    print('-'* 200)

THE MINISTRY OF FINANCE
No. 51/2021/TT-BTC
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
THE SOCIALIST REPUBLIC OF VIETNAM Independence - Freedom - Happiness
Hanoi, June 30, 2021
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Guiding the obligations of organizations and individuals in foreign investment activities on the Vietnamese securities market
Pursuant to the November 26, 2019 Law on Securities;
Pursuant to the Government's Decree No. 155/2020/ND -CP of December 31, 2020, detailing a number of articles of the Law on Securities;
Pursuant  to  the  Government's  Decree  No.  87/2017/ND -CP  of  July  26, 2017, defining the functions, tasks, powers and organizational structure of

Working under Langchain environment -> use DoclingLoader

Non Langchain environment -> use DoclingConverter

#### <a id='toc1_1_1_2_'></a>[Sample 2: RecursiveCharacterTextSplitter](#toc0_)

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from markitdown import MarkItDown

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
mk = MarkItDown()

source_path = 'D:/Study/Education/Projects/Group_Project_RAG/source/document/sample_legal.pdf'

#parse to text
markdown = mk.convert(source_path).markdown

#Convert to Document
doc_obj = Document(
    page_content = markdown,
    metadata = {'source': 'sample_legal.pdf'}
    )

#Require document parsing and Document Converter first
docs = splitter.split_documents([doc_obj])

count = 0
for doc in docs:
    count += 1

print(f"Successfully chunked the document {source_path.split('/')[-1]} to {count} chunks")

Successfully chunked the document sample_legal.pdf to 78 chunks


In [19]:
for doc in docs:
    print(doc.page_content)
    print('-'* 200)

THE MINISTRY OF
FINANCE

THE SOCIALIST REPUBLIC OF VIETNAM
Independence - Freedom - Happiness

No. 51/2021/TT-BTC

Hanoi, June 30, 2021

CIRCULAR
Guiding the obligations of organizations and individuals in
foreign investment activities on the Vietnamese securities
market

Pursuant to the November 26, 2019 Law on Securities;

Pursuant  to  the  Government’s  Decree  No.  155/2020/ND-CP  of  December

31, 2020, detailing a number of articles of the Law on Securities;
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Pursuant  to  the  Government’s  Decree  No.  87/2017/ND-CP  of  July  26,
2017,  defining  the  functions,  tasks,  powers  and  organizational  structure  of  the
Ministry of Finance;

At the proposal of the Chairperson of the State Securities Commission;

The Minister of Finance promulgates the Circular guiding the obligatio

#### <a id='toc1_1_1_3_'></a>[Sample 3: Custom chunking (no import)](#toc0_)

In [17]:
def chunk_by_words(path: str, n: int = 200) ->list[str]:
    """
    Split text into chunks of exactly n words.
    Last chunk may contain fewer than n words.
    
    Args:
        path (str): The link or path to the document
        n (int): Number of words per chunk.
    
    Returns:
        list[str]: List of text chunks.
    """
    
    text = mk.convert(path).markdown
    words = text.split()
    chunks = []
    for i in range(0, len(words), n):
        chunk = " ".join(words[i:i+n])
        chunks.append(chunk)
    return chunks

In [20]:
docs = chunk_by_words(source_path, n=200)

for doc in docs:
    print(doc)
    print('-'* 200)
    
print(f"Successfully chunked the document {source_path.split('/')[-1]} to {len(docs)} chunks")

THE MINISTRY OF FINANCE THE SOCIALIST REPUBLIC OF VIETNAM Independence - Freedom - Happiness No. 51/2021/TT-BTC Hanoi, June 30, 2021 CIRCULAR Guiding the obligations of organizations and individuals in foreign investment activities on the Vietnamese securities market Pursuant to the November 26, 2019 Law on Securities; Pursuant to the Government’s Decree No. 155/2020/ND-CP of December 31, 2020, detailing a number of articles of the Law on Securities; Pursuant to the Government’s Decree No. 87/2017/ND-CP of July 26, 2017, defining the functions, tasks, powers and organizational structure of the Ministry of Finance; At the proposal of the Chairperson of the State Securities Commission; The Minister of Finance promulgates the Circular guiding the obligations of organizations and individuals in foreign investment activities on the Vietnamese securities market. Article 1. Scope of regulation, subjects of application 1. This Circular guides the implementation of Clause 6, Article 138 of the 

### <a id='toc1_1_2_'></a>[Vectordb](#toc0_)

#### <a id='toc1_1_2_1_'></a>[Chroma](#toc0_)

In [5]:
docs = []

for document in documents:
        if document.page_content.strip():
            metadata = {
                'source': document.metadata['source'],
                'file_type': document.metadata['file_type'],
                'processing_method': document.metadata['processing_method'],
                'headings': document.metadata['dl_meta']['headings'],
                'file_size': document.metadata['file_size']
            }


docs.append(Document(
    page_content=document.page_content,
    metadata=metadata
))

In [6]:
import time #For rate limiter

# ✅ Initialize embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# ✅ Save docs to Chroma
chroma_store = Chroma(
    collection_name="jobs_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_db"
)

try:
    print("🚀 Adding documents to Chroma...")
    chroma_store.add_documents(docs)
except Exception as e:
    print('Waiting for rate reset')
    time.sleep(5)
print("🎉 Successfully saved documents to Chroma!")

2025-09-11 10:31:43,556 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


🚀 Adding documents to Chroma...
Waiting for rate reset
🎉 Successfully saved documents to Chroma!


In [None]:
chroma_store.similarity_search('What is in Article 1', k = 1)

#### <a id='toc1_1_2_2_'></a>[Faiss (Facebook AI Similarity Search)](#toc0_)

Not exactly a standard Vectordb but optimized for similarity search and clustering

In [None]:
from langchain_community.vectorstores import FAISS

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

faiss_index = FAISS.from_documents(documents, embeddings) #Note that documents are list of Document objects

faiss_index.save_local("faiss_index")

In [None]:
d = 128  # embedding size
index = faiss.IndexFlatIP(d)  # inner product search
id_map = []  # map each vector to doc id

for doc_id, emb in enumerate(doc_embeddings):
    index.add(emb.numpy())  # add all token embeddings
    id_map.extend([doc_id] * emb.shape[0])

print("Index size:", index.ntotal)


#### <a id='toc1_1_2_4_'></a>[Pinecone](#toc0_)

In [None]:
## Example Pinecone Usage

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

# pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
# index_name = "raglegal"
# if not pc.has_index(index_name):
#     pc.create_index_for_model(
#         name=index_name,
#         cloud="aws",
#         region="us-east-1",
#         embed={
#             "model":"text-embedding-3-small",
#             "field_map":{"text": "chunk_text"}
#         }
#     )

# Will be implemented after document chunking
vectorstore = PineconeVectorStore(
            index_name='raglegal',
            embedding=OpenAIEmbeddings(api_key=config.openai_api_key, model=config.openai_embedding_model),
            namespace='test',
            pinecone_api_key=config.pinecone_api_key.get_secret_value()
        )

doc = Document(
    page_content="This is the text content",
    metadata={"source": "myfile.txt"}
)

vectorstore.add_documents([doc]) #input must be Document type

@tool
def retrieve(query: str, k: int = 3) -> list[Document]:
    results = vectorstore.similarity_search(query, k=k)
    return results

### <a id='toc1_1_3_'></a>[Phase 2: Model architecture](#toc0_)

In [9]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, Annotated, Sequence
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, ToolMessage, SystemMessage
from operator import add as add_messages
from langchain_core.tools import tool

#### <a id='toc1_1_3_3_'></a>[LinearRAG](#toc0_)

#### <a id='toc1_1_3_4_'></a>[AI Agent](#toc0_)