In [1]:
import torch
from modelscope import snapshot_download

# Download the embedding model (bge-small-zh-v1.5) into the current directory.
# It gets its own name so the returned path is not overwritten by the next download.
embedding_model_dir = snapshot_download('AI-ModelScope/bge-small-zh-v1.5', cache_dir='./')

# Download the Yuan2-2B-Mars LLM into the current directory.
# `model_dir` keeps the value it ended up with originally: the LLM directory.
model_dir = snapshot_download('IEITYuan/Yuan2-2B-Mars-hf', cache_dir='./')

Downloading [1_Pooling/config.json]: 100%|██████████| 190/190 [00:00<00:00, 422B/s]
Downloading [config.json]: 100%|██████████| 776/776 [00:00<00:00, 1.42kB/s]
Downloading [config_sentence_transformers.json]: 100%|██████████| 124/124 [00:00<00:00, 226B/s]
Downloading [configuration.json]: 100%|██████████| 47.0/47.0 [00:00<00:00, 87.1B/s]
Downloading [model.safetensors]: 100%|██████████| 91.4M/91.4M [00:00<00:00, 112MB/s] 
Downloading [modules.json]: 100%|██████████| 349/349 [00:00<00:00, 635B/s]
Downloading [pytorch_model.bin]: 100%|██████████| 91.4M/91.4M [00:00<00:00, 101MB/s] 
Downloading [README.md]: 100%|██████████| 27.5k/27.5k [00:00<00:00, 48.2kB/s]
Downloading [sentence_bert_config.json]: 100%|██████████| 52.0/52.0 [00:00<00:00, 101B/s]
Downloading [special_tokens_map.json]: 100%|██████████| 125/125 [00:00<00:00, 190B/s]
Downloading [tokenizer.json]: 100%|██████████| 429k/429k [00:00<00:00, 678kB/s]
Downloading [tokenizer_config.json]: 100%|██████████| 367/367 [00:00<00:00, 666

In [None]:
# Path of the Yuan2 LLM directory (relative to the notebook's working directory)
model_path = './IEITYuan/Yuan2-2B-Mars-hf'

# Path of the embedding model directory.
# NOTE(review): 'v1___5' is presumably the on-disk name the downloader gives
# 'bge-small-zh-v1.5' -- confirm against the actual cache directory.
embedding_model_path = './AI-ModelScope/bge-small-zh-v1___5'

In [None]:
import re
from typing import Any, List, Optional

import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings

# 定义源大模型类
class Yuan2_LLM(LLM):
    """LangChain ``LLM`` wrapper around a locally stored Yuan2 causal language model.

    The tokenizer and model are loaded once in ``__init__`` and the model is
    moved to the GPU with ``.cuda()``, so a CUDA device is required.
    """
    # Declared at class level with ``None`` defaults; both are assigned in __init__.
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, mode_path :str):
        """Load the tokenizer and model found at ``mode_path``.

        Args:
            mode_path: Directory of the pretrained Yuan2 checkpoint.
        """
        super().__init__()

        # Load the pretrained tokenizer and register the extra special tokens.
        print("Creat tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(mode_path, add_eos_token=False, add_bos_token=False, eos_token='<eod>')
        self.tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)

        # Load the model weights in bfloat16 and move them to the GPU.
        print("Creat model...")
        self.model = AutoModelForCausalLM.from_pretrained(mode_path, torch_dtype=torch.bfloat16, trust_remote_code=True).cuda()

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text sent to the model; it is stripped and ``<sep>`` is appended.
            stop: Optional stop sequences. The returned text is cut off at the
                first occurrence of any of them.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The text after the last ``<sep>`` and before the first ``<eod>`` in
            the decoded output, truncated at the earliest stop sequence if any.
        """
        prompt = prompt.strip()
        prompt += "<sep>"
        inputs = self.tokenizer(prompt, return_tensors="pt")["input_ids"].cuda()
        # Sampling is disabled; max_length=4096 is passed straight to generate().
        outputs = self.model.generate(inputs,do_sample=False,max_length=4096)
        output = self.tokenizer.decode(outputs[0])
        # The decoded text contains the prompt too: keep only what follows the
        # last <sep> and drop everything from the end-of-document marker on.
        response = output.split("<sep>")[-1].split("<eod>")[0]

        # Honor the caller's stop sequences (previously ignored).
        if stop:
            cut_points = [response.find(seq) for seq in stop if seq]
            cut_points = [pos for pos in cut_points if pos != -1]
            if cut_points:
                response = response[:min(cut_points)]

        return response

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM type."""
        return "Yuan2_LLM"

# 定义一个函数，用于获取llm和embeddings
@st.cache_resource
def get_models():
    """Create the Yuan2 LLM and the HuggingFace embedding model.

    Reads the module-level ``model_path`` and ``embedding_model_path``.
    The function is wrapped with ``st.cache_resource``.

    Returns:
        A ``(llm, embeddings)`` tuple.
    """
    yuan_llm = Yuan2_LLM(model_path)

    bge_embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model_path,
        model_kwargs=dict(device='cuda'),
        # Normalized embeddings: set True to compute cosine similarity.
        encode_kwargs=dict(normalize_embeddings=True),
    )
    return yuan_llm, bge_embeddings

# Prompt skeleton with two placeholders: {context} (retrieved passages) and
# {query} (the user's question). The instruction line tells the model to answer
# from the context only, without inventing or repeating, and to be concise.
template = (
    "使用以下上下文片段来回答最后的问题。不要试图编造答案。不要重复回答。尽量简明扼要地回答。\n"
    "{context}\n"
    "问题：{query}"
)

# 定义ChatBot类
class ChatBot:
    """Retrieval-augmented question answering over a vector store.

    For each query the bot fetches related passages from ``vectordb`` with
    max-marginal-relevance search, fills the module-level ``template`` with
    those passages and the question, and sends the resulting prompt to ``llm``.
    """

    def __init__(self, llm, embeddings, vectordb):
        """Store the collaborators and build the prompt.

        Args:
            llm: Language model; must provide ``invoke(prompt_text)``.
            embeddings: Embedding model; kept on the instance, not used by ``run``.
            vectordb: Vector store providing ``max_marginal_relevance_search``.
        """
        self.llm = llm
        self.embeddings = embeddings
        # Keep the store on the instance so run() does not rely on a global.
        self.vectordb = vectordb

        # Name the placeholders explicitly; they must match those in `template`.
        self.prompt = PromptTemplate(
            template=template,
            input_variables=["context", "query"],
        )

        # Text splitter kept as an attribute; run() does not use it.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=450,
            chunk_overlap=10,
            length_function=len
        )

    def run(self, query, k=5, fetch_k=20):
        """Answer ``query`` using passages retrieved from the vector store.

        Args:
            query: The user's question.
            k: Number of passages to place in the prompt.
            fetch_k: Number of candidates fetched before the MMR re-ranking.
                Raised to ``k`` when smaller, because MMR cannot return more
                passages than it fetched.

        Returns:
            Whatever ``llm.invoke`` returns for the filled-in prompt.
        """
        sim_docs = self.vectordb.max_marginal_relevance_search(
            query, k=k, fetch_k=max(fetch_k, k)
        )

        # Number the passages and put each on its own line.
        context = "\n".join(
            f"{i}.{sim_doc.page_content}" for i, sim_doc in enumerate(sim_docs, start=1)
        )

        prompt_text = self.prompt.format(context=context, query=query)

        # Generate the reply.
        return self.llm.invoke(prompt_text)


def main():
    """Build both vector stores and answer one sample question.

    Steps:
      1. Load the LLM and embedding model via ``get_models``.
      2. Index the rows of ``./my_file.csv`` into a Chroma store under
         ``./vector_db/choiceproblem``.
      3. Split the textbook PDF into overlapping chunks and index them into a
         Chroma store under ``./vector_db/pdf``.
      4. Ask a sample question against the PDF store and print the reply.
    """
    # Get the llm and the embeddings.
    llm, embeddings = get_models()
    persist_directory = "./vector_db"

    # Vectorize and store the CSV file.
    csvloader = CSVLoader(file_path="./my_file.csv", encoding="utf-8", csv_args={'delimiter': ',', 'quotechar': '"'})
    choice_docs = csvloader.load()
    choiceproblem_vectordb = Chroma.from_documents(
        documents=choice_docs,
        embedding=embeddings,
        persist_directory=persist_directory + "/choiceproblem"  # store is written under this directory
    )
    choiceproblem_vectordb.persist()

    # Load the textbook PDF once and split it into chunks.
    pdfloader = PyPDFLoader("./教材：中国近现代史纲要（2015年版）.pdf")
    pdf = pdfloader.load()
    CHUNK_SIZE = 200
    # Overlap length between adjacent chunks in the knowledge base.
    OVERLAP_SIZE = 70
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=OVERLAP_SIZE
    )
    split_pdf = text_splitter.split_documents(pdf)
    analysis_vectordb = Chroma.from_documents(
        documents=split_pdf,
        embedding=embeddings,
        persist_directory=persist_directory + "/pdf"  # store is written under this directory
    )
    # Was `vectordb.persist()`, a name that is never defined.
    analysis_vectordb.persist()

    query = "在新民主主义的经济纲领中，极具特色的一项内容是什么？"

    # Initialize the ChatBot on the textbook store.
    chatbot = ChatBot(llm, embeddings, analysis_vectordb)

    response = chatbot.run(query)
    # Print the generated reply.
    print(response)