# 1. Imports and Configuration
Here we import all the required modules and set up the basic configuration.

In [None]:
import os
import shutil
import logging
import gradio as gr

import torch
import ollama
import numpy as np
import chromadb
from chromadb.config import Settings
import pymupdf as fitz

from pythainlp.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from transformers import MT5Tokenizer, MT5ForConditionalGeneration
from typing import List

# --- Configuration ---
# Directory scanned for input PDFs by extract_and_save_text_from_pdfs().
PDF_DIR = "./data/pdfs"  # <<< IMPORTANT: Place your PDF files in this directory
# On-disk location of the ChromaDB store; clear_and_init_db() deletes and recreates it.
TEMP_VECTOR = "./data/chromadb"
# Plain-text dump of the extracted PDF text: written by the extraction step,
# read back by load_text_to_chromadb().
DATA_TXT_PATH = "./data/data.txt"
# Model name passed to ollama.chat() by the chat interface.
MODEL_NAME = "university-assistant:latest"

# Create directories if they don't exist
# (os.makedirs also creates the intermediate "./data" directory.)
os.makedirs(PDF_DIR, exist_ok=True)

# --- Setup Logging ---
# Timestamped INFO-level logging, shared by every cell through `logger`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Global variable for summary ---
# (Currently disabled; no code in this notebook reads it.)
#document_summary = ""

# 2. Initialize Models and Database
This cell initializes the Chroma vector database and loads the required AI models.

In [2]:
# --- Initialize ChromaDB ---
# Set before any client is created. NOTE(review): presumably intended to switch
# off ChromaDB telemetry; the client below also passes anonymized_telemetry=False.
os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"

def clear_and_init_db():
    """Wipe the on-disk vector store and return a fresh ``pdf_data`` collection.

    Removes the ``TEMP_VECTOR`` directory when it exists, opens a new
    persistent ChromaDB client on that path with anonymized telemetry turned
    off, and gets or creates the ``pdf_data`` collection.

    Returns:
        The ``pdf_data`` collection handle obtained from the new client.
    """
    store_present = os.path.exists(TEMP_VECTOR)
    if store_present:
        # Drop everything persisted by earlier runs.
        shutil.rmtree(TEMP_VECTOR)

    db_client = chromadb.PersistentClient(
        path=TEMP_VECTOR,
        settings=Settings(anonymized_telemetry=False),
    )
    fresh_collection = db_client.get_or_create_collection(name="pdf_data")

    logger.info("Vector database cleared and re-initialized.")
    return fresh_collection

# Module-level handle to the active vector collection; read by the RAG and
# chat functions defined in later cells.
collection = clear_and_init_db()


# --- Initialize Models ---
# Use the current CUDA device when one is available, otherwise the CPU.
# (The previous "cuda:s" was not a valid device string and made torch.device()
# raise on any machine where CUDA is available.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Embedding model used for both document chunks and user questions.
logger.info("Loading SentenceTransformer embedding model...")
sentence_model = SentenceTransformer('intfloat/multilingual-e5-base', device=device)

# Summarization model and tokenizer. The model is not moved to `device` here;
# summarize_content() sends its inputs to the CPU to match.
logger.info("Loading MT5 summarization model...")
sum_tokenizer = MT5Tokenizer.from_pretrained('StelleX/mt5-base-thaisum-text-summarization')
sum_model = MT5ForConditionalGeneration.from_pretrained('StelleX/mt5-base-thaisum-text-summarization')

2025-10-07 09:49:44,828 - INFO - Vector database cleared and re-initialized.
2025-10-07 09:49:44,828 - INFO - Using device: cpu
2025-10-07 09:49:44,828 - INFO - Loading SentenceTransformer embedding model...
2025-10-07 09:49:44,828 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-base
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  12%|#2        | 136M/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

2025-10-07 10:00:09,967 - INFO - Loading MT5 summarization model...
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  10%|9         | 231M/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

# 3. Core PDF Processing and RAG Functions
These are the backend functions that handle everything from text extraction to answering questions.

In [None]:
# Cell [3] - Core PDF Processing and RAG Functions (Corrected)

def preprocess_thai_text(text: str) -> str:
    """Tokenize ``text`` and return the tokens joined with a single-space separator.

    Tokenization is done by ``word_tokenize`` with the ``newmm`` engine.
    """
    tokens = word_tokenize(text, engine="newmm")
    return " ".join(tokens)

def summarize_content(content: str) -> str:
    """Summarize ``content`` with the MT5 summarization model.

    Currently used to condense the retrieved context of each question.

    Args:
        content: Text to summarize. The tokenized input is truncated to
            1024 tokens.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    logger.info("Generating a specific summary for the query context...")
    encoded = sum_tokenizer(content, truncation=True, max_length=1024, return_tensors="pt")

    # Beam-search settings for generation; output is capped at 512 tokens.
    generation_options = {
        "num_beams": 15,
        "num_return_sequences": 1,
        "no_repeat_ngram_size": 1,
        "remove_invalid_values": True,
        "max_length": 512,
    }

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        generated = sum_model.generate(encoded['input_ids'].to('cpu'), **generation_options)

    return sum_tokenizer.decode(generated[0], skip_special_tokens=True)
def embed_text(text: str) -> np.ndarray:
    """Return the normalized embedding of ``text``.

    The text is first run through ``preprocess_thai_text`` and then encoded by
    ``sentence_model`` on ``device`` with ``normalize_embeddings=True``.
    """
    return sentence_model.encode(
        preprocess_thai_text(text),
        normalize_embeddings=True,
        device=device,
    )

def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing chunk
    is produced that is entirely contained in the previous chunk.

    Args:
        text: Input text; it is split on whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each a single-space-joined string.
        Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # A zero or negative stride would either make range() fail with an
        # unrelated message or silently produce no chunks at all.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, "
            f"chunk_size={chunk_size}"
        )

    words = text.split()
    stride = chunk_size - overlap
    chunks: List[str] = []
    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This chunk already covers the tail; a further chunk would only
            # repeat words from the overlap region.
            break
    return chunks

# --- ฟังก์ชัน RAG ที่แยกออกมา ---
def ask_question_rag_stream(question: str, llm_model: str):
    """Run the RAG pipeline for ``question`` and stream the LLM's reply.

    The question is embedded and used to fetch the 5 nearest chunks from the
    vector collection. The retrieved text is summarized, and a prompt made of
    the summary, the raw context and the question is sent to Ollama in
    streaming mode.

    Args:
        question: The user's question.
        llm_model: Name of the Ollama model to chat with.

    Yields:
        The streamed response chunks from ``ollama.chat``. If the Ollama call
        raises, a dict of the form ``{"message": {"content": <error text>}}``
        is yielded so consumers can read it like a normal chunk.
    """
    # 1. Retrieval: nearest stored chunks for the embedded question.
    query_vector = embed_text(question).tolist()
    hits = collection.query(query_embeddings=[query_vector], n_results=5)
    retrieved_chunks = hits["documents"][0]
    context = "\n\n".join(retrieved_chunks)

    # 2. Augmentation: condense the retrieved context and build the prompt.
    query_context_summary = summarize_content(context)

    prompt = f"""Based on the following context, please answer the question.
    Relevant Summary:
    {query_context_summary}
    ---
    Relevant Context from Documents:
    {context}
    ---
    Question: {question}
    Provide a clear and concise answer in Thai based only on the provided context and summary.
    """

    # 3. Generation: hand the response stream through to the caller.
    user_turn = {"role": "user", "content": prompt}
    try:
        yield from ollama.chat(model=llm_model, messages=[user_turn], stream=True)
    except Exception as e:
        yield {"message": {"content": f"Error communicating with Ollama: {e}"}}

# 4. Startup Function to Process All PDFs
This is the main function that runs once when the notebook cell is executed. It prepares all the documents.

In [None]:
def _extract_document_text(pdf_path: str, doc_name: str) -> str:
    """Return one PDF's text wrapped in document and per-page marker lines."""
    parts = [f"|==== Start Document: {doc_name} =====|"]
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            # Every page gets a header line, even when it has no text.
            parts.append(f"===== PAGE {page_num} =====")
            page_text = page.get_text("text").strip()
            if page_text:
                parts.append(page_text)
    finally:
        # Release the document handle even when a page fails to parse.
        doc.close()
    parts.append(f"||==== End Document: {doc_name} =====|")
    return "\n".join(parts)


def extract_and_save_text_from_pdfs(pdf_directory: str) -> str:
    """Startup step 1: extract text from every PDF and save it to a .txt file.

    Each document is wrapped in start/end marker lines and each page is
    preceded by a page-number header. Documents are processed in sorted
    file-name order and joined with a blank line between them. The result is
    written to ``DATA_TXT_PATH``, overwriting any previous content.

    A PDF that cannot be processed is logged and skipped; the remaining files
    are still extracted.

    Args:
        pdf_directory: Directory to scan for files with a ``.pdf`` extension
            (matched case-insensitively).

    Returns:
        A status message. It starts with ``"Error:"`` when the directory
        contains no PDF files; otherwise it reports how many documents were
        actually extracted.
    """
    # sorted() makes the output file reproducible; os.listdir order is arbitrary.
    pdf_files = sorted(
        name for name in os.listdir(pdf_directory) if name.lower().endswith(".pdf")
    )
    if not pdf_files:
        logger.error(f"No PDF files found in {pdf_directory}.")
        return "Error: No PDF files found to process."

    logger.info(f"Found {len(pdf_files)} PDF(s) to process.")

    # Formatted text of each successfully processed document.
    all_formatted_text = []
    for pdf_file in pdf_files:
        logger.info(f"Processing document: {pdf_file}")
        try:
            all_formatted_text.append(
                _extract_document_text(os.path.join(pdf_directory, pdf_file), pdf_file)
            )
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            logger.error(f"Failed to process {pdf_file}: {e}")

    failed_count = len(pdf_files) - len(all_formatted_text)
    if failed_count:
        logger.warning(f"{failed_count} PDF(s) could not be processed and were skipped.")

    # Separate documents with one blank line.
    full_text_content = "\n\n".join(all_formatted_text)

    # Make sure the output directory exists before writing.
    os.makedirs(os.path.dirname(DATA_TXT_PATH) or ".", exist_ok=True)
    with open(DATA_TXT_PATH, "w", encoding="utf-8") as f:
        f.write(full_text_content)

    # Report the number of documents actually extracted, not merely found.
    status_message = f"✅ Extracted text from {len(all_formatted_text)} PDF(s) with formatting and saved to '{DATA_TXT_PATH}'."
    logger.info(status_message)
    return status_message
# Run the extraction when this cell executes so DATA_TXT_PATH is written
# before the vector database is populated from it.
extract_and_save_text_from_pdfs(PDF_DIR)

2025-10-07 10:33:30,575 - INFO - Found 1 PDF(s) to process: ComputerScience_65.pdf
2025-10-07 10:33:30,915 - INFO - Generating summary...
2025-10-07 10:34:33,098 - INFO - Embedding and storing 10 text chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-07 10:34:46,459 - INFO - PDF processing complete. The system is ready.


# 5. Load Text and Populate ChromaDB

In [None]:

def load_text_to_chromadb(text_file_path: str) -> str:
    """Startup step 2: read the extracted .txt file and rebuild the vector database.

    The text is split with ``chunk_text``, the vector store is cleared, and
    every chunk is embedded and stored with an id derived from its position.

    Args:
        text_file_path: Path of the UTF-8 text file produced by the
            extraction step.

    Returns:
        A status message; it starts with ``"Error:"`` when the file is missing
        or yields no chunks.
    """
    # The module-level handle is rebound below so that the query and chat
    # functions use the freshly created collection.
    global collection

    try:
        with open(text_file_path, "r", encoding="utf-8") as f:
            full_text_content = f.read()
    except FileNotFoundError:
        logger.error(f"Text file not found at {text_file_path}. Please run the extraction step first.")
        return f"Error: '{text_file_path}' not found."

    # Chunking
    text_chunks = chunk_text(full_text_content)
    if not text_chunks:
        return "Error: No text chunks to process."

    # Clear the old store and keep the new collection handle. Discarding the
    # return value would leave `collection` bound to the store just deleted.
    collection = clear_and_init_db()
    logger.info(f"Embedding and storing {len(text_chunks)} text chunks into ChromaDB...")

    for index, chunk in enumerate(text_chunks):
        text_embedding = embed_text(chunk)
        # Position-based ids are unique and reproducible; str(hash(chunk))
        # changes between interpreter runs and collides for identical chunks.
        collection.add(
            documents=[chunk],
            embeddings=[text_embedding.tolist()],
            ids=[f"chunk-{index}"],
        )

    status_message = "✅ System is ready. Loaded data from .txt and populated the vector database."
    logger.info(status_message)
    return status_message
# Populate the vector database from the extracted text when this cell executes.
load_text_to_chromadb(DATA_TXT_PATH)

# 6. Gradio Chat Interface
This final cell defines the user interface and launches the application. It is now much simpler and contains only the chat component.

In [None]:
def chat_interface_fn(message, history):
    """Gradio chat callback: stream the RAG answer for ``message``.

    Args:
        message: The user's latest message.
        history: Conversation history supplied by Gradio. It is not read or
            modified here; Gradio maintains it itself.

    Yields:
        The answer accumulated so far, as a string, once per streamed chunk,
        so the UI can render the reply incrementally. When the vector
        collection is empty, a single error string is yielded instead.
    """
    if collection.count() == 0:
        yield "Error: No documents have been loaded. Please check the PDF directory and restart."
        return

    # Call the core RAG function to obtain the response stream.
    stream = ask_question_rag_stream(message, llm_model=MODEL_NAME)

    # Yield the growing reply text rather than the history list: the callback's
    # yielded value is the assistant's reply, and appending a dict to `history`
    # assumed one particular history format.
    answer_so_far = ""
    for chunk in stream:
        answer_so_far += chunk["message"]["content"]
        yield answer_so_far

# --- UI construction (unchanged) ---
# A Blocks container with the Soft theme that hosts a single ChatInterface
# wired to chat_interface_fn.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.ChatInterface(
        fn=chat_interface_fn,
        title="PDF Chatbot",
        description="Ask any question about the content of the pre-loaded documents."
    )

# Launch the app
# NOTE(review): share=True also publishes the app on a public gradio.live URL
# (visible in the cell output) — confirm that public exposure is intended.
demo.launch(debug=True, share=True)

  self.chatbot = Chatbot(
2025-10-07 10:54:37,262 - INFO - HTTP Request: GET http://127.0.0.1:7860/gradio_api/startup-events "HTTP/1.1 200 OK"
2025-10-07 10:54:37,294 - INFO - HTTP Request: HEAD http://127.0.0.1:7860/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7860


2025-10-07 10:55:05,366 - INFO - HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK"


* Running on public URL: https://190f7651662370814e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


2025-10-07 10:55:08,923 - INFO - HTTP Request: HEAD https://190f7651662370814e.gradio.live "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-07 10:56:31,118 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Keyboard interruption in main thread... closing server.


2025-10-07 10:57:44,193 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Killing tunnel 127.0.0.1:7860 <> https://190f7651662370814e.gradio.live


