# Environment Variable

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
env

{'COLORTERM': 'truecolor',
 'COMMAND_MODE': 'unix2003',
 'HOME': '/Users/picetrp',
 'LANG': 'en_GB.UTF-8',
 'LOGNAME': 'picetrp',
 'PATH': '/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/.venv/bin:/Users/picetrp/.langflow/uv:/Users/picetrp/Downloads/google-cloud-sdk/bin:/Users/picetrp/.local/bin:/Users/picetrp/.pyenv/shims:/Users/picetrp/.pyenv/bin:/opt/homebrew/opt/jpeg/bin:/Users/picetrp/opt/anaconda3/condabin:/Users/picetrp/.nvm/versions/node/v16.20.2/bin:/Users/picetrp/.rubies/ruby-3.2.2/bin:/opt/homebrew/bin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/Library/Apple/usr/bin:/Library/TeX/texbin:/Users/picetrp/.cargo/bin:/Users/picetrp/Library/Application Support/JetBrains/Toolbox/scripts:/Users/picetrp/development/flut

In [5]:
import os
print(os.getenv("PWD"))
print(os.getcwd())

/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag
/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/notebooks


In [6]:
from pydantic import Field, validator
from pydantic_settings import BaseSettings, SettingsConfigDict
import os

class Settings(BaseSettings):
    # * langsmith setting
    langsmith_tracing: str
    langsmith_endpoint: str
    langsmith_api_key: str
    langsmith_project: str

    # * openai settings
    openai_api_key: str
    google_api_key: str
    embedding_model: str = "all-MiniLM-L6-v2"
    
    # * qdrant settings
    qdrant_cloud_api_key: str
    qdrant_cloud_url: str = "https://cloud.qdrant.io"
    qdrant_collection_name: str = "demo_collection"

    # * huggingface token
    huggingface_token: str
    
    # * chunking settings
    chunk_size: int = 1000
    chunk_overlap: int = 200
    top_k_results: int = 5
    
    # * api settings
    api_host: str = "0.0.0.0"
    api_port: int = 8000
    
    # * file settings
    root_dir: str = "/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag"
    upload_dir: str = os.path.join(root_dir, "data/uploads")

    model_config = SettingsConfigDict(
        env_file= os.path.join(root_dir, ".env"),
        env_file_encoding="utf-8",
    )
    
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        os.makedirs(self.upload_dir, exist_ok=True)

settings = Settings()

# Document

In [11]:
root_path = settings.root_dir
data_dirpath = settings.upload_dir
data_dirpath

'/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads'

In [17]:
from typing import List, Set

def get_filtered_files_by_extension(folder_path: str,
                                    extensions: Set[str] = {".pdf", ".txt", ".docx", ".doc"}
                                   ) -> List[str]:
    extensions = {ext.lower() for ext in extensions}  # Ensure all extensions are lowercase

    try:
        all_files = os.listdir(folder_path)
    except FileNotFoundError:
        raise ValueError(f"Folder not found: {folder_path}")

    return [
        f for f in all_files
        if os.path.isfile(os.path.join(folder_path, f)) and os.path.splitext(f)[1].lower() in extensions
    ]

doc_files = get_filtered_files_by_extension(data_dirpath)
doc_files = [os.path.join(data_dirpath, filename) for filename in doc_files]
doc_files

['/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads/รายละเอียดในการสอบ_short_paper.pdf',
 '/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads/ข้อมูลที่ต้องเขียนในส่วนของวิธีดำเนินการวิจัยโดยสังเขป.doc',
 '/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads/Template-short-Paper-3-1.docx',
 '/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads/sit_kmutt_แนะนำการทำ_Short_Paper.txt',
 '/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads/แบบฟอร์ม_บ1_IRB.doc',
 '/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads/รายชื่อคณะกรรมการสอบการศึกษาค้นคว้าอิสระขั้นสุดท้าย.pdf',
 '/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads/การโอนลิขสิทธิ์การศึกษาโครงการเฉพาะเรื่อง-IT.docx',
 '/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads/การโอนลิขสิทธิ์การศึกษาโครงการเฉพาะเรื่อง-BIS.docx']

In [20]:
import os
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def preprocess_document(file_path):
    """
    Load a document based on its file extension and return LangChain Document(s).
    
    Supported:
    - .txt
    - .pdf
    - .docx
    """
    ext = os.path.splitext(file_path)[1].lower()
    
    if ext == ".txt":
        loader = TextLoader(file_path, encoding="utf-8")
    elif ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif ext == ".docx" or ext == ".doc":
        loader = UnstructuredWordDocumentLoader(file_path, mode="elements")
    else:
        raise ValueError(f"Unsupported file type: {ext}")
    
    return loader.load()

In [22]:
from tqdm.notebook import tqdm
from langchain_community.document_loaders import PyPDFLoader
all_docs = []
for doc_path in tqdm(doc_files):
    doc_content = preprocess_document(doc_path)
    all_docs.append(doc_content)

  0%|          | 0/8 [00:00<?, ?it/s]

In [23]:
all_docs[0]

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-05-18T12:50:12+07:00', 'author': 'WALAIPUN PORNWIROON', 'moddate': '2025-05-18T12:50:12+07:00', 'source': '/Users/picetrp/Documents/Learn/KMUTT/MS/projects/simple_rag/data/uploads/รายละเอียดในการสอบ_short_paper.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='ขั้นตอนที่ต้องปฏิบัติก่อนถึงวันสอบ Short Paper ขั้นสุดท้าย (เฉพาะนักศึกษารหัสนำหน้า 63XXXXXXXXX \nขึ้นไป) \n            1.นักศึกษาต้องแก้ไขตามคำแนะนำของอาจารย์ที่ปรึกษาและกรรมการภายในคณะฯ ให้เสร็จสิ้นเรียบร้อย      \nทั้งนี้ระบบจะดึงบทความ Short Paper ฉบับล่าสุดให้กับผู้ทรงคุณวุฒิภายนอกอัตโนมัติ \n2. ดำเนินการ Upload คลิป VDO บรรยายผลการศึกษา Short Paper ความยาวประมาณ ไม่เกิน 5 นาที      \nไฟล์ PowerPoint และบทความ Short Paper ฉบับสมบูรณ์ที่ผ่านจากกรรมการภายในคณะ 2 ท่านเรียบร้อยแล้ว ให้ \nUpload link ใน Google Drive เท่านั้น ไม่อนุญาต Upload link ใน YouTube โดยนำ link ที่จัด

# Qdrant

In [5]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [6]:
import os
from huggingface_hub import login
login(token=settings.huggingface_token)

In [7]:
encoder = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
from qdrant_client import QdrantClient

client = QdrantClient(url=settings.qdrant_cloud_url, api_key=settings.qdrant_cloud_api_key)

In [10]:
from qdrant_client.models import Distance, VectorParams

def create_collection(client: QdrantClient,
                      collection_name: str = "test_collection",
                      vector_size: int = 384, 
                      distance: Distance = Distance.DOT, 
                      **kwargs) -> None:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_size, 
            distance=distance
        ),
    )
    return 

def delete_collection(client: QdrantClient,
                      collection_name: str = "test_collection",
                      **kwargs) -> None:
    client.delete_collection(
        collection_name=collection_name,
    )

# create_collection(client, distance=Distance.COSINE)
# delete_collection(client, collection_name="testestest")

In [None]:
from qdrant_client.models import PointStruct, Filter

def upsert():
    vectors = self.embedder.embed(texts)
    points = [
        PointStruct(id=ids[i], vector=vectors[i], payload=metadatas[i])
        for i in range(len(ids))
    ]
    self.qdrant_client.upsert(collection_name=self.collection_name, points=points)