## Initialization (2025-12-19-09-14)

In [1]:
# ============================================================================
# IMPORTS
# ============================================================================
import os
from dotenv import load_dotenv

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [2]:
# ============================================================================
# ENV SETUP (fail-fast)
# ============================================================================
# Load settings from a local .env file before reading them from the environment.
load_dotenv()

# Both values are mandatory; stop here rather than inside the first API call.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.environ.get("OPENROUTER_BASE_URL")  # e.g. https://openrouter.ai/api/v1

# An unset variable (None) and an empty one ("") are both treated as missing.
if OPENROUTER_API_KEY in (None, ""):
    raise RuntimeError("Missing OPENROUTER_API_KEY in .env")
if OPENROUTER_BASE_URL in (None, ""):
    raise RuntimeError("Missing OPENROUTER_BASE_URL in .env")

In [None]:
# Connection settings shared by both OpenRouter-backed clients below.
_openrouter_kwargs = {
    "api_key": OPENROUTER_API_KEY,
    "base_url": OPENROUTER_BASE_URL,
}

# Embedding client: turns text into vectors, both when indexing the document
# chunks and when embedding a query for retrieval.
# NOTE(review): the chat model id below carries an "openai/" prefix but this one
# does not — confirm the configured endpoint accepts the un-prefixed name.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    **_openrouter_kwargs,
)

# Chat client: writes the final answer in the generation step of the RAG flow.
# The low temperature keeps answers focused and close to deterministic.
llm = ChatOpenAI(
    model="openai/gpt-4o",
    temperature=0.2,
    **_openrouter_kwargs,
)

## Indexing pipeline for RAG (2025-12-19-09-14)

"""
PyCharm (With LangChain) — Simple RAG from plain text (data.txt)
- LLM: OpenRouter (ChatOpenAI)
- Retriever: FAISS (in-memory)
"""

In [3]:
# ============================================================================
# LOAD -> SPLIT
# ============================================================================
# Read the source text as UTF-8; the loader result is passed straight to the splitter.
docs = TextLoader("data.txt", encoding="utf-8").load()

# Cut the text into chunks of up to 500 units with an 80-unit overlap between
# neighbours, so content that straddles a chunk boundary appears in both chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
)
splits = splitter.split_documents(docs)

# Fail fast, like the ENV SETUP cell: with no chunks there is nothing to index,
# and building the vector store from an empty list fails with a much less
# helpful error further down the notebook.
if not splits:
    raise RuntimeError("No text chunks produced from data.txt (is the file empty?)")

In [5]:
# ============================================================================
# EMBEDDINGS -> VECTORSTORE -> RETRIEVER
# ============================================================================

# Embed every chunk and build the FAISS index from the resulting vectors.
vs = FAISS.from_documents(documents=splits, embedding=embeddings)

# Expose the store as a retriever that returns the top 3 matches per query.
retriever = vs.as_retriever(search_kwargs=dict(k=3))

In [6]:
# Quick sanity check: print the retriever's string form (tags, backing vector store, search kwargs).
print(retriever)

tags=['FAISS', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000023FFE63BD70> search_kwargs={'k': 3}


In [7]:
# Confirm which retriever class was built and the search settings it carries.
print("Retriever type:", type(retriever))
print("Search kwargs:", retriever.search_kwargs)

Retriever type: <class 'langchain_core.vectorstores.base.VectorStoreRetriever'>
Search kwargs: {'k': 3}


## Generation pipeline for RAG (2025-12-19-09-14)

In [None]:
# ============================================================================
# RAG CHAIN
# ============================================================================
# Two-message prompt: the system message restricts the model to the supplied
# context; the human message carries the {question} and {context} placeholders.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. Answer using ONLY the provided context. "
            "If the answer is not in the context, say you don't know.",
        ),
        (
            "human",
            "Question: {question}\n\nContext:\n{context}",
        ),
    ]
)

def format_docs(docs_):
    """Concatenate the text of each document into one context string.

    Args:
        docs_: Iterable of objects exposing a ``page_content`` string
            (here, the documents returned by the retriever).

    Returns:
        The ``page_content`` values in their original order, separated by
        one blank line; an empty string when ``docs_`` is empty.
    """
    chunks = [doc.page_content for doc in docs_]
    return "\n\n".join(chunks)

# Sample query (Thai): asks what company ABC sells and through which sales channels.
question = "บริษัท ABC ขายอะไรบ้าง และมีช่องทางขายอะไร?"

# Retrieval runs once here, outside the chain, so the fetched chunks can be
# printed next to the answer later on.
context_docs = retriever.invoke(question)

# Generation pipeline: fill the prompt, call the model, return plain text.
chain = (
    prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the chain with the question and the retrieved chunks flattened into one
# context string.
answer = chain.invoke(
    dict(
        question=question,
        context=format_docs(context_docs),
    )
)

In [None]:
# Report the question, the context handed to the model, and the generated answer.
print("Q:", question)
print("\n--- Retrieved Context ---", format_docs(context_docs), sep="\n")
print("\n--- Answer ---", answer, sep="\n")