In [1]:
# # Mac 用戶請先執行以下指令安裝 cmake
# !CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
# !CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

#### 這個Notebook參考了這篇文章
https://medium.com/@cch.chichieh/rag%E5%AF%A6%E4%BD%9C%E6%95%99%E5%AD%B8-langchain-llama2-%E5%89%B5%E9%80%A0%E4%BD%A0%E7%9A%84%E5%80%8B%E4%BA%BAllm-d6838febf8c4
#### 上面作者的Code
https://github.com/wsxqaza12/RAG_example/tree/master
#### 其他相關文章
##### Llama: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main<br>
##### Laungchain: https://python.langchain.com/docs/integrations/chat/llama2_chat<br>
##### Quantization: https://chih-sheng-huang821.medium.com/ai%E6%A8%A1%E5%9E%8B%E5%A3%93%E7%B8%AE%E6%8A%80%E8%A1%93-%E9%87%8F%E5%8C%96-quantization-966505128365<br>
##### Embedding: https://medium.com/@fredericklee_73485/word-embedding%E5%92%8Cword2vec%E7%B0%A1%E4%BB%8B-c9c874f48364

In [2]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA

In [3]:
# langchain 支援很多種rag的方式, 我這邊用pdf
loader = PyMuPDFLoader("Classmate.pdf")
PDF_data = loader.load()

In [4]:
# 這邊是用splitter把text切成一些小塊的chunk, 這樣LLM的token才不會爆掉
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=5)
all_splits = text_splitter.split_documents(PDF_data)

In [5]:
# 這邊使用chroma來做embedding, 並且存起來
# embedding layer 就是把text轉成vector
persist_directory = 'db'
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
embedding = HuggingFaceEmbeddings(model_name=model_name,
                                  model_kwargs=model_kwargs)

vectordb = Chroma.from_documents(documents=all_splits, embedding=embedding, persist_directory=persist_directory)

In [6]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
from llama_cpp import Llama

# 從huggingface的model hub下載llm的model
model = Llama.from_pretrained(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q2_K.gguf",
    local_dir="model/",
    verbose=False
)

model_path = "model/llama-2-7b-chat.Q2_K.gguf"
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=-1,
    n_batch=512,
    n_ctx=2048,
    f16_kv=True,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=False
)

In [7]:
from langchain.chains import LLMChain
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

# 給LLM一些deafult的prompt相當於做一些設定
DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""<<SYS>> 
    You are a helpful assistant eager to assist with providing better Google search results.
    <</SYS>> 
    
    [INST] Provide an answer to the following question in 150 words. Ensure that the answer is informative, \
            relevant, and concise:
            {question} 
    [/INST]""",
)

DEFAULT_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are a helpful assistant eager to assist with providing better Google search results. \
        Provide an answer to the following question in about 150 words. Ensure that the answer is informative, \
        relevant, and concise: \
        {question}""",
)

# DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
#     input_variables=["question"],
#     template="""<<SYS>> 
#     You have secure access to a database filled with information on various individuals, which is permissible to use for answering queries. \
#     Ensure that the use of this data strictly adheres to privacy and ethical standards, providing information only when it aligns with these guidelines.
#     <</SYS>> 
    
#     [INST] Given the question "{question}", use the available database to provide a detailed and informative answer. \
#     Ensure the response is relevant and considerate of privacy concerns, only sharing information that is appropriate and has been consented to be shared publicly. \
#     [/INST]""",
# )

# DEFAULT_SEARCH_PROMPT = PromptTemplate(
#     input_variables=["question"],
#     template="""You have secure access to a database filled with information on various individuals, which is permissible to use for answering queries. \
#     Ensure that the use of this data strictly adheres to privacy and ethical standards, providing information only when it aligns with these guidelines. \
#     Given the question "{question}", use the available database to provide a detailed and informative answer. Ensure the response is relevant and considerate of privacy concerns, \
#     only sharing information that is appropriate and has been consented to be shared publicly.""",
# )



QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(
    default_prompt=DEFAULT_SEARCH_PROMPT,
    conditionals=[(lambda llm: isinstance(llm, LlamaCpp), DEFAULT_LLAMA_SEARCH_PROMPT)],
)

prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)
prompt

PromptTemplate(input_variables=['question'], template='<<SYS>> \n    You are a helpful assistant eager to assist with providing better Google search results.\n    <</SYS>> \n    \n    [INST] Provide an answer to the following question in 150 words. Ensure that the answer is informative,             relevant, and concise:\n            {question} \n    [/INST]')

In [8]:
# 這邊測試一下LLM是否正常運作
llm_chain = LLMChain(prompt=prompt, llm=llm)
question = "高等機器學習是什麼?"
llm_chain.invoke({"question": question})

  High-level machine learning (高等機學習) refers to a subfield of machine learning that focuses on developing algorithms and models that can perform tasks that typically require human-level intelligence, such as understanding natural language, recognizing images, and making decisions. These algorithms and models are designed to operate at a level of abstraction and complexity beyond what is currently possible with traditional machine learning techniques. High-level machine learning is an area of active research and has various applications, including natural language processing, computer vision, and autonomous vehicles. It involves developing and combining different techniques, such as deep learning, symbolic AI, and cognitive architectures, to create more sophisticated and human-like AI systems.

{'question': '高等機器學習是什麼?',
 'text': '  High-level machine learning (高等機學習) refers to a subfield of machine learning that focuses on developing algorithms and models that can perform tasks that typically require human-level intelligence, such as understanding natural language, recognizing images, and making decisions. These algorithms and models are designed to operate at a level of abstraction and complexity beyond what is currently possible with traditional machine learning techniques. High-level machine learning is an area of active research and has various applications, including natural language processing, computer vision, and autonomous vehicles. It involves developing and combining different techniques, such as deep learning, symbolic AI, and cognitive architectures, to create more sophisticated and human-like AI systems.'}

In [9]:
# 這邊就是我們論文講的retriever
retriever = vectordb.as_retriever()

# 用langchain的retrievalQA來做retrieval
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [12]:
# 我們可以用這個retriever來問問題接著他就會去DB裡面找答案
query = "Tell me aobut 劉睿麒, 例如他是哪裡的學生? 沒有的資訊不要亂回答"
qa.invoke(query)



[1m> Entering new RetrievalQA chain...[0m
 劉睿麒 is a student of Fu Jen Catholic University.
[1m> Finished chain.[0m


{'query': 'Tell me aobut 劉睿麒, 例如他是哪裡的學生? 沒有的資訊不要亂回答',
 'result': ' 劉睿麒 is a student of Fu Jen Catholic University.'}