<a href="https://colab.research.google.com/github/VOX304/SchoolChatbot/blob/main/RAG_SQTT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package setup

In [None]:
%pip install langchain \
langchain_community \
langchain_core \
langchain_google_genai \
python-dotenv \
pypdf

Collecting langchain_community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-2.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting pypdf
  Downloading pypdf-5.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.16 (from langchain

In [None]:
%pip install faiss-cpu



In [None]:
%pip install scikit-learn \
numpy

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from google.colab import userdata

# Copy the Gemini API key from Colab's secret store into the environment
# variable that the Google client libraries read.
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

# Embedding model used both to build the vector index and to embed queries.
embedding_model = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# PDFs to index. Adjust these paths to wherever the files were uploaded.
pdf_files = [
    '/content/20240918_466_QD-DHVD_Ban hanh Quy che dao tao Nam dai cuong cua VGU-VN.pdf',
    '/content/CSE2021_Info Session  Internship, Thesis and Graduation.pdf',
    '/content/CSE_Specific Examination Regulation_Annex A and B_VN.pdf',
    '/content/General Examination Regulation for Bachelor and Master programs_VN.pdf',
]

# PDF preprocessing & vector database

In [None]:
# Load every page of each configured PDF into one flat list of documents.
documents = [page for path in pdf_files for page in PyPDFLoader(path).load()]

# Split the pages into overlapping chunks (chunk_size=512, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=512)
chunks = text_splitter.split_documents(documents)

# Dump the chunk texts to a file so the extraction can be inspected by hand.
with open("extracted_content.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        f"Chunk {i+1}:\n{chunk.page_content}\n\n{'='*50}\n\n"
        for i, chunk in enumerate(chunks)
    )

print("📄 Extracted content saved to extracted_content.txt")

📄 Extracted content saved to extracted_content.txt


In [None]:
# Embed every chunk with the embedding model and index the vectors in FAISS.
vector_db = FAISS.from_documents(documents=chunks, embedding=embedding_model)

In [None]:
# Report the number of chunks produced by the splitter; the message assumes
# the FAISS index has already been built from them.
print(f"✅ Processed {len(chunks)} text chunks into FAISS vector database.")


✅ Processed 605 text chunks into FAISS vector database.


In [None]:
# Retrieval smoke test: fetch the three chunks most similar to a sample query.
query = "What is the requirement for graduation?"
retrieved_docs = vector_db.similarity_search(query=query, k=3)


In [None]:
# Print the text of the retrieved chunks, numbered from 1. The [:3] slice caps
# the preview at three entries even if `retrieved_docs` holds more.
for i, doc in enumerate(retrieved_docs[:3]):  # Show top 3
    print(f"\n📄 Document {i+1}:\n{doc.page_content}")


📄 Document 1:
GRADUATION
1. General Information
2. Graduation Timeline

📄 Document 2:
Vietnamese-German University Computer Science Program
General Information
1. Prerequisites: 
- Pass all modules (180 ECTS)
- Complete 04 German classes or submit an A2 German Certificate
2. Expected timeline:
- VGU conducts two graduation assessments annually: in April and October 
- Only one Graduation Ceremony: November

📄 Document 3:
General Information
1. Prerequisites: 
- Evidence of the internship registration with a signed training contract (IC)
- Successful completion of all modules of the first 5 semesters (150 ECTS)
2. Grading Policy: Bachelor Thesis (weighting 80%) and Colloquium 
(min. 30 min. and max. 60 min., weighting 20%)
3. Regulation: Thesis final reports submitted late will fail. Bachelor’s 
thesis with colloquium only be repeated once.
Vietnamese-German University
7
Computer Science Program

📄 Document 4:
Vietnamese-German University Computer Science Program
Graduation Timeline
1.

# LLM model

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used for answering and for question generation.
# The key is read from the environment variable set in the setup cell.
chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-thinking-exp-01-21",
    temperature=0.7,
    google_api_key=os.environ["GOOGLE_API_KEY"],
)
print("✅ Chat model loaded successfully.")

✅ Chat model loaded successfully.


# Augmented prompt

In [None]:
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

def augment_prompt(query, k=4, vector_store=None):
    """Build a retrieval-augmented prompt for ``query``.

    Runs a similarity search for ``query``, joins the text of the retrieved
    chunks into a context section, and places that context plus the question
    into the Vietnamese instruction template below.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve. Defaults to 4, the value that was
            previously hard-coded (the old comment incorrectly said 3).
        vector_store: Object exposing ``similarity_search(query, k=...)``.
            Defaults to the notebook-level ``vector_db``.

    Returns:
        A tuple ``(augmented_prompt, source_map)``. ``source_map`` maps each
        retrieved chunk's text to its ``(source, page)`` metadata, with
        ``"Unknown"`` standing in for a missing field.
    """
    store = vector_db if vector_store is None else vector_store
    results = store.similarity_search(query, k=k)

    # Keyed by chunk text: chunks with identical text collapse into a single
    # entry (the last one's metadata wins), so the context never repeats a
    # passage verbatim.
    source_map = {
        doc.page_content: (
            doc.metadata.get("source", "Unknown"),
            doc.metadata.get("page", "Unknown"),
        )
        for doc in results
    }

    # Context section: the retrieved chunk texts, one after another.
    source_knowledge = "\n".join(source_map.keys())

    augmented_prompt = f"""Bạn là tư vấn viên của trường Sĩ Quan Thông Tin. Dựa vào nội dung tài liệu, hãy trả lời câu hỏi một cách chính xác và thân thiện bằng tiếng Việt.
    Không thêm thông tin ngoài nội dung tài liệu. Nếu không tìm thấy câu trả lời trong tài liệu, chỉ cần nói rằng bạn không biết.

    Nội dung tài liệu:
    {source_knowledge}

    Câu hỏi:
    {query}"""

    return augmented_prompt, source_map



# Test: generating an answer from the knowledge base

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# User question
question = "Theo thông tin từ buổi Info Session CSE2021, sinh viên CSE cần đáp ứng những yêu cầu học tập chính nào ngoài khóa luận tốt nghiệp để đủ điều kiện tốt nghiệp?"

# Build the augmented prompt and keep the chunk -> (source, page) map so the
# answer can be attributed to the chunks it was generated from.
context, source_map = augment_prompt(question)

# Ask the Gemini chat model, passing the augmented prompt as a single human message.
prompt = HumanMessage(content=context)
res = chat_model.invoke([prompt])
response_text = res.content

# Embed the answer so it can be compared against each retrieved chunk.
response_embedding = embedding_model.embed_query(response_text)

# A chunk counts as a source when its embedding is close enough to the answer's.
relevant_sources = set()

for text, (source, page) in source_map.items():
    chunk_embedding = embedding_model.embed_query(text)
    similarity_score = cosine_similarity([response_embedding], [chunk_embedding])[0][0]

    if similarity_score >= 0.7:  # Threshold for relevance
        # The page is shifted by one for display, which presumes the loader's
        # page index is 0-based. augment_prompt substitutes the string
        # "Unknown" for missing metadata, which must not be incremented.
        page_label = page + 1 if isinstance(page, int) else page
        relevant_sources.add(f"{source} (Page {page_label})")

# Sorted so the printed source list is stable between runs.
formatted_response = f"Response: {response_text}\nSources: {sorted(relevant_sources) if relevant_sources else ['No sources matched']}"
print(formatted_response)

Response: Chào bạn, rất vui được hỗ trợ bạn về chương trình Sĩ Quan Thông Tin CSE2021.

Theo thông tin từ buổi Info Session CSE2021, để đủ điều kiện tốt nghiệp chương trình Khoa học Máy tính (CSE) ngoài khóa luận tốt nghiệp, sinh viên cần phải đạt được **150 TC từ các học phần bắt buộc** và **15 TC từ thực tập chuyên ngành**.

Hy vọng thông tin này hữu ích cho bạn! Nếu có bất kỳ câu hỏi nào khác, đừng ngần ngại hỏi nhé.
Sources: ['/content/CSE_Specific Examination Regulation_Annex A and B_VN.pdf (Page 5)']


# Generate questions and save them to CSV

In [None]:
import time
import csv
import os

csv_filename = "generated_questions.csv"

# Open in "w" mode (UTF-8 with BOM): with "a", every re-run of this cell
# appended another "Question" header row, which the answer-generation cell then
# read back as if it were a real question.
with open(csv_filename, mode="w", newline="", encoding="utf-8-sig") as file:
    writer = csv.writer(file)

    # Single-column header.
    writer.writerow(["Question"])

    for doc in pdf_files:
        # Document name without its directory, used to label the prompt.
        doc_name = os.path.basename(doc)

        # Take the first three chunks that actually belong to this PDF.
        # Slicing the global `chunks` list by document index picked chunks of
        # the first PDF for every document.
        doc_chunks = [chunk for chunk in chunks if chunk.metadata.get("source") == doc][:3]
        doc_text = "\n".join(chunk.page_content.strip() for chunk in doc_chunks).strip()

        if not doc_text:
            continue  # Skip if the document has no text

        time.sleep(2)  # Avoid hitting API rate limits

        # The excerpt is now part of the prompt; previously `doc_text` was
        # computed but never sent, so the model only saw the file name.
        prompt = HumanMessage(content=f"""Dựa vào nội dung tài liệu "{doc_name}", hãy tạo 5 câu hỏi đa dạng bằng tiếng Việt.
        Không đánh số thứ tự, không để lại khoảng trắng dư thừa, và mỗi câu hỏi phải có đủ ngữ cảnh để hiểu được tài liệu liên quan.

        Nội dung tài liệu:
        {doc_text}""")

        response = chat_model.invoke([prompt])
        # One question per non-empty line of the model's reply.
        questions = [q.strip() for q in response.content.strip().split("\n") if q.strip()]

        # Write each question as its own CSV row.
        for question in questions:
            writer.writerow([question])

print(f"✅ Questions saved to {csv_filename}")


✅ Questions saved to generated_questions.csv


# Generate answers

In [None]:
import pandas as pd
import time
from sklearn.metrics.pairwise import cosine_similarity

answer_csv = "generated_QA.csv"

# Load generated questions
df_questions = pd.read_csv("generated_questions.csv")
questions = df_questions["Question"].tolist()

answers = []
answer_sources = []

for question in questions:
    time.sleep(2)  # Avoid hitting API rate limits

    # Keep the source map that belongs to THIS question. It used to be
    # discarded, and the loop below then read whatever `source_map` an earlier
    # cell had left in the kernel, attributing answers to the wrong chunks
    # (or raising NameError on a fresh kernel).
    context, question_source_map = augment_prompt(question)
    prompt = HumanMessage(content=context)

    # Get answer from chatbot model
    answer_res = chat_model.invoke([prompt])
    answer = answer_res.content.strip()
    answers.append(answer)

    # A retrieved chunk counts as a source when its embedding is close enough
    # to the embedding of the answer.
    relevant_sources = set()
    response_embedding = embedding_model.embed_query(answer)

    for text, (source, page) in question_source_map.items():
        chunk_embedding = embedding_model.embed_query(text)
        similarity_score = cosine_similarity([response_embedding], [chunk_embedding])[0][0]

        if similarity_score >= 0.7:  # Threshold for relevance
            # augment_prompt substitutes the string "Unknown" for missing page
            # metadata; only real integer indices are shifted by one for display.
            page_label = page + 1 if isinstance(page, int) else page
            relevant_sources.add(f"{source} (Page {page_label})")

    # Sorted so the stored source list is stable between runs.
    source_list = "; ".join(sorted(relevant_sources)) if relevant_sources else "No sources matched"
    answer_sources.append(source_list)

# Save answers and sources to CSV
df_answers = pd.DataFrame({"Answer": answers, "Relevant Source (Answer)": answer_sources})
df_answers.to_csv(answer_csv, index=False, encoding="utf-8-sig")

print(f"✅ Answers saved to {answer_csv}")


✅ Answers saved to generated_QA.csv


In [None]:
!pip install paddlepaddle-gpu

Collecting paddlepaddle-gpu
  Downloading paddlepaddle_gpu-2.6.2-cp311-cp311-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting astor (from paddlepaddle-gpu)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle-gpu)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Downloading paddlepaddle_gpu-2.6.2-cp311-cp311-manylinux1_x86_64.whl (759.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m759.0/759.0 MB[0m [31m735.7 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: opt-einsum, astor, paddlepaddle-gpu
  Attempting uninstall: opt-einsum
    Found existing installation: opt_einsum 3.4.0
    Uninstalling opt_einsum-3.4.0:
      Successfully uninstalled o

In [None]:
!pip install pymupdf pdfplumber camelot-py[cv] paddleocr vietocr

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting paddleocr
  Downloading paddleocr-2.10.0-py3-none-any.whl.metadata (12 kB)
Collecting vietocr
  Downloading vietocr-0.3.13-py3-none-any.whl.metadata (4.1 kB)
Collecting camelot-py[cv]
  Downloading camelot_py-1.0.0-py3-none-any.whl.metadata (9.4 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
INFO: pi

In [None]:
# Path to a sample PDF under /content/sample_data. Presumably meant as the
# input for the extraction/OCR experiments below; confirm that the file has
# actually been uploaded to this location.
pdf_test_file = '/content/sample_data/20240918_466_QD-DHVD_Ban hanh Quy che dao tao Nam dai cuong cua VGU-VN.pdf'

In [None]:
import fitz  # PyMuPDF for text & images
import pdfplumber  # Extract tables
import camelot  # Advanced table extraction
import cv2
import torch
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR
from vietocr.tool.predictor import Predictor
from vietocr.tool.config import Cfg

# PaddleOCR engine configured for Vietnamese, with the angle classifier enabled.
paddle_ocr = PaddleOCR(lang="vi", use_angle_cls=True)

# VietOCR predictor built from the "vgg_transformer" preset; it runs on the GPU
# when torch reports one and on the CPU otherwise.
config = Cfg.load_config_from_name("vgg_transformer")
if torch.cuda.is_available():
    config['device'] = 'cuda'
else:
    config['device'] = 'cpu'
vietocr_model = Predictor(config)


[2025/03/14 08:57:39] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_

Downloading: "https://download.pytorch.org/models/vgg19_bn-c79401a0.pth" to /root/.cache/torch/hub/checkpoints/vgg19_bn-c79401a0.pth
100%|██████████| 548M/548M [00:04<00:00, 125MB/s]
18533it [00:09, 1900.49it/s]


In [None]:
# PDF extraction and OCR helper functions (the required libraries are installed in the cells above)

# Function to extract text from PDF
def extract_text(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the text of each page in page order, separated
        by ``"\\n"``.
    """
    # The context manager closes the document once the text has been read;
    # the handle was previously left open.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

# Function to extract tables using PDFPlumber
def extract_tables(pdf_path):
    """Collect the tables pdfplumber finds on every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A flat list containing each page's ``extract_tables()`` results, in
        page order.
    """
    with pdfplumber.open(pdf_path) as pdf:
        return [table for page in pdf.pages for table in page.extract_tables()]

# Function to extract images from PDF
def extract_images(pdf_path):
    """Return every image embedded in a PDF as a PIL image.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``PIL.Image`` objects, in page order and then in the order
        ``page.get_images(full=True)`` reports them.
    """
    # `io` is not imported in any visible cell, so the original body raised
    # NameError on the first embedded image. Imported locally so this
    # function works on its own.
    import io

    images = []
    # The context manager closes the document when extraction is finished.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first field of the image entry is its xref
                image_data = doc.extract_image(xref)["image"]
                # The raw bytes are wrapped in an in-memory buffer, so the
                # PIL image does not depend on the document staying open.
                images.append(Image.open(io.BytesIO(image_data)))
    return images

# Function to apply PaddleOCR on extracted images
def ocr_paddle(image, engine=None):
    """Run PaddleOCR on an image and return the recognized text as one string.

    Args:
        image: Image to recognize; converted with ``np.array`` before OCR.
        engine: Object exposing ``ocr(image, cls=True)``. Defaults to the
            notebook-level ``paddle_ocr`` instance.

    Returns:
        The recognized text fragments joined by single spaces, or ``""`` when
        nothing was recognized.
    """
    ocr_engine = paddle_ocr if engine is None else engine
    image_np = np.array(image)
    results = ocr_engine.ocr(image_np, cls=True)

    texts = []
    for line in results or []:
        # The result can hold None (or be None) when no text is detected;
        # iterating it unguarded raised TypeError.
        if not line:
            continue
        for res in line:
            # Each entry is read as (box, (text, score)); keep the text.
            if len(res) > 1:
                texts.append(res[1][0])
    return " ".join(texts)

# Function to apply VietOCR on extracted images
def ocr_vietocr(image):
    """Return the notebook-level VietOCR predictor's output for ``image``."""
    recognized = vietocr_model.predict(image)
    return recognized

# Main function to process PDF
def process_pdf(pdf_path):
    """Run text, table and image-OCR extraction on one PDF.

    Prints a progress line before each stage.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A dict with three keys: ``"text"`` (result of ``extract_text``),
        ``"tables"`` (result of ``extract_tables``) and ``"ocr_results"``
        (one ``ocr_vietocr`` result per extracted image).
    """
    print("Extracting text...")
    text = extract_text(pdf_path)

    print("Extracting tables...")
    tables = extract_tables(pdf_path)

    print("Extracting images...")
    images = extract_images(pdf_path)

    print("Running OCR on images...")
    ocr_results = list(map(ocr_vietocr, images))

    return {"text": text, "tables": tables, "ocr_results": ocr_results}

# Run the pipeline on the sample PDF held in `pdf_test_file` (defined in an
# earlier cell). The placeholder path "/content/sample_data/your_file.pdf"
# that was used here does not exist and raised FileNotFoundError.
pdf_file = pdf_test_file
results = process_pdf(pdf_file)

# Print results
print("Extracted Text:\n", results["text"])
print("\nExtracted Tables:\n", results["tables"])
print("\nOCR Results from Images:\n", results["ocr_results"])


Extracting text...


FileNotFoundError: no such file: '/content/sample_data/your_file.pdf'