In [None]:
!pip install -U langchain==0.2.11 openai==1.37.0 ragas==0.1.11 arxiv==2.1.3 pymupdf==1.24.9 chromadb==0.5.5 wandb==0.17.5 tiktoken==0.7.0 pypdf==4.3.1 sentence_transformers==2.7.0
!pip install rank_bm25

### 项目二、作业

请在完成项目二学习任务的前提下，完成以下任务：


#### 1. 设计RAG应用场景并收集语料数据：
开发基于私有知识库的垂直大语言模型应用能有效解决通用大语言模型在更新不及时、对特定领域知识不足的问题，RAG技术是当前构建垂直大语言模型的主流技术，请根据你对大语言模型、RAG技术的理解，设想一个典型应用场景，自行搜集相关语料数据资源，并统一保存在一个文件夹中。

应用场景选择： 应用场景的选择综合考虑应用价值和语料数据来源可获得性，本作业将作为本次培训项目汇报的选题之一，请认真考虑选择的应用场景，在进行项目汇报时需重点说明本项目的应用价值或商用价值。

语料数据集要求：语料数据集的来源可以是网页、论文、文章、公共数据集等，不同来源的语料建议统一转换成一种格式（如pdf或txt格式等），尽可能手工去除文档中杂乱无意义的文本或内容（如论文中的页面页脚、网页中的广告等）。来自同一篇独立的文档的语料，请用单独的1份文档保存，不要把不同文档的内容合并到一个文档中。搜集的文档数量不少于10份。

#### 2. 针对语料数据采用不同检索器和参数构建RAG系统

基于你搜集的语料数据，分别采用三种不同检索器（base_retriver、PDR、EM），使用项目二教程中所采用的模型和RAG流程获得检索得到的文档以及最终结果。

设计10个以上的提问，分别执行三种RAG流程，将检索得到的文档以及最终结果保存在“rag-result.csv”文件中。

该文件包含的列包括：


| 列名      | 描述 |
| ----------- | ----------- |
| index      | 序号       |
| question   | 提问内容        |
| ground_truth_answer   | 最贴切的回答，可以是你人工分析文案后手工撰写的回答，也可以是你综合三种RAG给出的结果选择最贴近的回答再补充你认为需要调整或补充的内容；        |
| base_retriever_chunks_size1000_overlap100_k2   | 使用base_retriever检索得到的文本块（chunks），设置参数chunk_size为1000，chunk_overlap为100，k为2；        |
| base_retriever_answer_size1000_overlap100_k2   | 使用base_retriever获得的最终回答，设置参数chunk_size为1000，chunk_overlap为100，k为2；        |
| PDR_chunks_psize1500_csize200   | 使用parent_document_retriever检索得到的文本块（chunks），设置父文本块chunk_size为1500，子文本块的chunk_size为200；        |
| PDR_answer_psize1500_csize200   | 使用parent_document_retriever获得的最终回答，设置父文本块chunk_size为1500，子文本块的chunk_size为200；       |
| ER_chunks_size1000_overlap100_k3_w75   | 使用ensemble_retriever检索得到的文本块（chunks），设置参数chunk_size为1000，chunk_overlap为100，k为3，bm25_retriever的权重是0.75；        |
| ER_answer_size1000_overlap100_k3_w75   | 使用ensemble_retriever获得的最终回答，设置参数chunk_size为1000，chunk_overlap为100，k为3，bm25_retriever的权重是0.75；      |


#### 3. 完成结果文件提交

完成本作业后，请找任课老师现场验收完成结果，再将数据集文件夹、代码文件（本Notebook文件）、rag-result.csv文件，共同打包命名为“项目二作业-(姓名)-(学号).tar”提交。

In [2]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever, BM25Retriever, EnsembleRetriever
from langchain.storage import InMemoryStore
from langchain.llms import ChatGLM
from langchain.chains import RetrievalQA
import pandas as pd

In [3]:
import os
from langchain.document_loaders import pdf, PyPDFLoader

def load_pdf_doucuments(pdf_folder_path: str) -> list:
    base_docs = []
    
    if not os.path.exists(pdf_folder_path):
        raise FileNotFoundError(f"The folder '{pdf_folder_path}' does not exist.")

    for file in os.listdir(pdf_folder_path):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, file)
            print(f"Processing: {pdf_path}")
            try:
                loader = PyPDFLoader(pdf_path)
                pages = loader.load()
                base_docs.extend(pages)
            except Exception as e:
                print(f"Error processing {pdf_path}: {str(e)}")

    return base_docs

In [5]:
from langchain.llms.base import LLM
from typing import Any, List, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class ChatGLM3_LLM(LLM):
    # 基于本地 ChatGLM3 自定义 LLM 类
    tokenizer : AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, model_path :str):
        # model_path: ChatGLM3 模型路径
        # 从本地初始化模型
        super().__init__()
        print("正在从本地加载模型...")
        # 从本地加载一个预训练的分词器（tokenizer）
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # 从本地加载一个预训练的生成式语言模型
        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).half().cuda()
        # 将模型设置为评估模式
        self.model = self.model.eval()
        print("完成本地模型的加载")

    def _call(self, prompt : str, stop: Optional[List[str]] = None,
                run_manager: Optional[CallbackManagerForLLMRun] = None,
                **kwargs: Any):
        # 重写调用函数
        response, history = self.model.chat(self.tokenizer, prompt , history=[])
        return response
        
    @property
    def _llm_type(self) -> str:
        return "ChatGLM3-6B"

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
pdf_folder_path = '3DGS'
base_docs = load_pdf_doucuments(pdf_folder_path)
len(base_docs)

Processing: 3DGS/2106.12372v2.pdf
Processing: 3DGS/2308.04079v1.pdf
Processing: 3DGS/2310.08529v3.pdf
Processing: 3DGS/2401.00834v2.pdf
Processing: 3DGS/2407.11343v1.pdf
Processing: 3DGS/3503250.pdf
Processing: 3DGS/3528223.3530127.pdf
Processing: 3DGS/Barron_Mip-NeRF_A_Multiscale_Representation_for_Anti-Aliasing_Neural_Radiance_Fields_ICCV_2021_paper.pdf
Processing: 3DGS/Jiang_GaussianShader_3D_Gaussian_Splatting_with_Shading_Functions_for_Reflective_Surfaces_CVPR_2024_paper.pdf
Processing: 3DGS/Wu_4D_Gaussian_Splatting_for_Real-Time_Dynamic_Scene_Rendering_CVPR_2024_paper.pdf


135

In [7]:
# 初始化语言模型
model_path = os.path.expandvars("$GEMINI_PRETRAIN2/chatglm3-6b")
primary_qa_llm = ChatGLM3_LLM(model_path)

正在从本地加载模型...


Loading checkpoint shards: 100%|██████████| 7/7 [01:52<00:00, 16.06s/it]


完成本地模型的加载


In [8]:
# 初始化嵌入模型
EMBEDDING_PATH = os.path.expandvars('$GEMINI_PRETRAIN/bge-m3')
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_PATH)

  warn_deprecated(


In [15]:
base_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1500)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

In [16]:
# 1. Base Retriever
base_docs_split = base_splitter.split_documents(base_docs)
base_vectorstore = Chroma.from_documents(base_docs_split, embeddings)
base_retriever = base_vectorstore.as_retriever(search_kwargs={"k": 2})

Exception occurred invoking consumer for subscription f16d3be1f7bb4110a5811abd0c788908to topic persistent://default/default/c1f96998-9aed-4825-bd5f-2b3d62b8bce8 'utf-8' codec can't encode characters in position 757-758: surrogates not allowed


In [17]:
# 2. Parent Document Retriever (PDR)
vectorstore = Chroma(collection_name="split_parents", embedding_function=embeddings)
store = InMemoryStore()
pdr = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
pdr.add_documents(base_docs)

Exception occurred invoking consumer for subscription acb3d08a818848a597eb684198ad66a5to topic persistent://default/default/a10dcdea-5a51-41ac-a3d3-af1aeba800de 'utf-8' codec can't encode characters in position 191-192: surrogates not allowed


In [18]:
# 3. Ensemble Retriever (ER)
bm25_retriever = BM25Retriever.from_documents(base_docs_split)
bm25_retriever.k = 3
chroma_retriever = base_vectorstore.as_retriever(search_kwargs={"k": 3})
er = EnsembleRetriever(
    retrievers=[bm25_retriever, chroma_retriever],
    weights=[0.75, 0.25]
)

In [19]:
# Create QA chains
template = """Please answer the following question based on the provided context information. If you think the question cannot be answered based on the provided information, please answer 'I don't know':

### Context Information
{context}

### Question
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [20]:
def create_qa_chain(retriever):
    return (
        {"context": itemgetter("question") | retriever,
         "question": itemgetter("question")
        }
        | RunnablePassthrough.assign(
            context=itemgetter("context")
          )
        | {
             "response": prompt | primary_qa_llm,
             "context": itemgetter("context"),
          }
    )

base_qa = create_qa_chain(base_retriever)
pdr_qa = create_qa_chain(pdr)
er_qa = create_qa_chain(er)

In [21]:
# Prepare the list of questions and ground truth answers
questions_and_answers = [
    {
        "question": "What is 3D Gaussian Splatting?",
        "ground_truth": "3D Gaussian Splatting is an emerging technique for 3D scene representation and rendering. It uses 3D Gaussian functions to represent points in a scene, each with its own position, scale, orientation, and color attributes. This method efficiently represents and renders complex 3D scenes, particularly excelling in handling scenes reconstructed from multi-view images."
    },
    {
        "question": "How does 3D Gaussian Splatting differ from traditional point cloud rendering methods?",
        "ground_truth": "3D Gaussian Splatting differs from traditional point cloud rendering in several ways: 1) Representation: It uses 3D Gaussian functions instead of simple points. 2) Rendering quality: It produces smoother, more continuous surfaces, reducing the 'holes' common in point cloud rendering. 3) Efficiency: It typically requires fewer points to achieve the same rendering quality, thus having an advantage in rendering speed. 4) Multi-scale representation: Gaussian functions can represent details at different scales, better capturing the multi-scale nature of scenes."
    },
    {
        "question": "What are the main advantages of 3D Gaussian Splatting?",
        "ground_truth": "The main advantages of 3D Gaussian Splatting include: 1) High-quality rendering: Produces smooth, continuous surfaces with fewer artifacts. 2) Rendering efficiency: Usually requires fewer points than traditional methods to represent scenes of similar quality. 3) Fast training: Trains faster compared to methods like NeRF. 4) Editability: Gaussian points can be directly edited, facilitating scene modification. 5) Multi-view consistency: Maintains good consistency across different viewpoints. 6) Adaptability: Can represent various complex geometric shapes and materials."
    },
    {
        "question": "How does 3D Gaussian Splatting handle occlusion?",
        "ground_truth": "3D Gaussian Splatting handles occlusion through: 1) Depth sorting: Gaussian points are sorted by depth during rendering, ensuring correct occlusion of points. 2) Alpha compositing: Uses alpha compositing techniques to blend overlapping Gaussian points, simulating semi-transparent effects. 3) Adaptive opacity: The opacity of Gaussian points can be dynamically adjusted based on viewpoint and occlusion conditions. 4) Multi-scale representation: Gaussian points at different scales help provide finer representation at occlusion boundaries."
    },
    {
        "question": "How are Gaussian point parameters optimized during the training process of 3D Gaussian Splatting?",
        "ground_truth": "In 3D Gaussian Splatting, Gaussian point parameters are optimized during training through: 1) Gradient descent: Using backpropagation and gradient descent to optimize the position, scale, orientation, and color of each Gaussian point. 2) Density adaptation: Dynamically adding or removing Gaussian points based on reconstruction error. 3) Hierarchical optimization: Optimizing large-scale structures first, then gradually optimizing details. 4) Regularization: Using regularization terms to prevent overfitting and maintain reasonable point distribution. 5) View consistency constraints: Ensuring consistent rendering results across different viewpoints."
    },
    {
        "question": "How does 3D Gaussian Splatting handle dynamic scenes?",
        "ground_truth": "3D Gaussian Splatting handles dynamic scenes through methods such as: 1) Time-dependent Gaussian points: Adding a time dimension to each Gaussian point, allowing its attributes to change over time. 2) Deformation fields: Using additional deformation fields to describe dynamic changes in the scene. 3) Keyframe interpolation: Interpolating Gaussian point attributes between keyframes. 4) Dynamic point clouds: Generating independent sets of Gaussian points for each time step. 5) Spatio-temporal consistency constraints: Incorporating temporal continuity constraints in the optimization process."
    },
    {
        "question": "What advantages does 3D Gaussian Splatting have over Neural Radiance Fields (NeRF)?",
        "ground_truth": "3D Gaussian Splatting has several advantages over NeRF: 1) Faster training: 3DGS typically takes minutes to hours, while NeRF can take days. 2) Faster rendering: 3DGS can achieve real-time rendering, while NeRF is usually slower. 3) Better memory efficiency: 3DGS uses explicit scene representation, requiring less memory. 4) Better editability: Gaussian points can be directly edited, while NeRF's implicit representation is harder to edit. 5) Better geometric representation: 3DGS can better capture sharp edges and details."
    },
    {
        "question": "What are the potential applications of 3D Gaussian Splatting in computer vision tasks?",
        "ground_truth": "Potential applications of 3D Gaussian Splatting in computer vision include: 1) 3D reconstruction: High-quality 3D model reconstruction from multi-view images. 2) Novel view synthesis: Generating images from arbitrary new viewpoints. 3) Augmented reality: Providing high-quality, real-time 3D scene rendering. 4) Virtual reality: Creating immersive 3D environments. 5) 3D object recognition and segmentation: Using 3DGS representations for 3D object analysis. 6) Motion capture: Representing and analyzing dynamic human actions. 7) Cultural heritage digitization: High-precision recording and display of artifacts and historical sites."
    },
    {
        "question": "What challenges does 3D Gaussian Splatting face when dealing with large-scale outdoor scenes?",
        "ground_truth": "Challenges faced by 3D Gaussian Splatting in large-scale outdoor scenes include: 1) Computational complexity: Large numbers of Gaussian points can lead to increased computational and memory demands. 2) Scale variations: Outdoor scenes have large scale differences between near and far objects, requiring multi-scale representation. 3) Lighting variations: Complex outdoor lighting conditions require consideration of global illumination effects. 4) Dynamic elements: Moving vehicles, pedestrians, etc., need special handling. 5) Sparse area representation: Efficient representation of large uniform areas like the sky. 6) Data acquisition: Requires a large number of high-quality multi-view images."
    },
    {
        "question": "How does 3D Gaussian Splatting handle specular reflections and transparent objects?",
        "ground_truth": "3D Gaussian Splatting handles specular reflections and transparent objects through: 1) View-dependent color: Allowing Gaussian point colors to vary with viewpoint. 2) Multi-layer representation: Using multiple layers of Gaussian points to model transparency. 3) Environment mapping: Incorporating environment maps to simulate reflection effects. 4) Ray tracing: Integrating simplified ray tracing for reflections and refractions. 5) Material parameters: Adding extra material parameters like refractive index and reflectivity to Gaussian points. 6) Post-processing: Applying screen-space reflection techniques after rendering."
    },
    {
        "question": "What are the characteristics of 3D Gaussian Splatting in representing and rendering thin structures like hair or grass?",
        "ground_truth": "3D Gaussian Splatting's characteristics in representing thin structures include: 1) Flat Gaussians: Using highly anisotropic Gaussian functions to represent thin structures. 2) Density adjustment: Increasing the density of Gaussian points in areas with thin structures. 3) Orientation optimization: Precisely optimizing the orientation of Gaussian points to align with thin structures. 4) Multi-scale representation: Combining Gaussian points at different scales to capture details. 5) Special rendering techniques: May require special alpha blending or shadowing techniques. 6) Procedural generation: Can be combined with procedural generation techniques for repetitive structures like grass."
    },
    {
        "question": "How can 3D Gaussian Splatting be combined with other 3D reconstruction or rendering techniques?",
        "ground_truth": "3D Gaussian Splatting can be combined with other techniques to enhance performance: 1) Integration with SLAM: For real-time 3D reconstruction and localization. 2) Combination with traditional geometric reconstruction: Using Gaussian points to refine results from traditional reconstruction. 3) Integration with neural rendering: Such as NeRF, for handling complex lighting effects. 4) Combination with point cloud processing techniques: For efficient rendering of large-scale point clouds. 5) Integration with deep learning: For improving parameter initialization and optimization. 6) Combination with physical simulation: For creating interactive dynamic 3D scenes."
    }
]

In [24]:
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Helper function to safely get chunks
def safe_get_chunks(retriever, question):
    try:
        chunks = retriever.invoke(question)
        if not chunks:
            logger.warning(f"No chunks returned for question: {question}")
            return []
        return chunks
    except Exception as e:
        logger.error(f"Error retrieving chunks for question '{question}': {str(e)}")
        return []

# Helper function to safely execute QA
def safe_qa_invoke(qa_chain, question):
    try:
        result = qa_chain.invoke({"question": question})
        return result.get("response", "No response generated")
    except Exception as e:
        logger.error(f"Error in QA chain for question '{question}': {str(e)}")
        return "Error in generating response"

# Execute queries and save results
results = []
for idx, qa in enumerate(questions_and_answers, 1):
    question = qa["question"]
    ground_truth = qa["ground_truth"]
    
    logger.info(f"Processing question {idx}: {question}")

    # For base_retriever
    base_chunks = safe_get_chunks(base_retriever, question)
    base_result = safe_qa_invoke(base_qa, question)
    
    # For PDR
    pdr_chunks = safe_get_chunks(pdr, question)
    pdr_result = safe_qa_invoke(pdr_qa, question)
    
    # For ER
    er_chunks = safe_get_chunks(er, question)
    er_result = safe_qa_invoke(er_qa, question)
    
    results.append({
        "index": idx,
        "question": question,
        "ground_truth_answer": ground_truth,
        "base_retriever_chunks_size1000_overlap100_k2": str(base_chunks),
        "base_retriever_answer_size1000_overlap100_k2": base_result,
        "PDR_chunks_psize1500_csize200": str(pdr_chunks),
        "PDR_answer_psize1500_csize200": pdr_result,
        "ER_chunks_size1000_overlap100_k3_w75": str(er_chunks),
        "ER_answer_size1000_overlap100_k3_w75": er_result,
    })

    logger.info(f"Completed processing question {idx}")

# Save results to CSV
df = pd.DataFrame(results)
df.to_csv("rag-result.csv", index=False)
logger.info("Results saved to rag-result.csv")

INFO:__main__:Processing question 1: What is 3D Gaussian Splatting?
INFO:__main__:Completed processing question 1
INFO:__main__:Processing question 2: How does 3D Gaussian Splatting differ from traditional point cloud rendering methods?
ERROR:__main__:Error retrieving chunks for question 'How does 3D Gaussian Splatting differ from traditional point cloud rendering methods?': 1 validation error for Document
page_content
  none is not an allowed value (type=type_error.none.not_allowed)
ERROR:__main__:Error in QA chain for question 'How does 3D Gaussian Splatting differ from traditional point cloud rendering methods?': 1 validation error for Document
page_content
  none is not an allowed value (type=type_error.none.not_allowed)
INFO:__main__:Completed processing question 2
INFO:__main__:Processing question 3: What are the main advantages of 3D Gaussian Splatting?
INFO:__main__:Completed processing question 3
INFO:__main__:Processing question 4: How does 3D Gaussian Splatting handle occlus