Commit

run_localGPT.py updated
- Moved all model loading code out of the run_localGPT.py file.
- Removed llama-cpp-python from the requirements.txt file. It needs to be installed separately to ensure it is built with GPU support.
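  (A hedged note for context: a GPU-enabled build of llama-cpp-python is typically obtained by compiling it with the appropriate backend, e.g. CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python for CUDA, or CMAKE_ARGS="-DLLAMA_METAL=on" on Apple Silicon; the exact flags depend on the llama-cpp-python version.)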
PromtEngineer committed Sep 16, 2023
1 parent 53ac922 commit 16f949e
Showing 3 changed files with 124 additions and 371 deletions.
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,7 +1,6 @@
# Natural Language Processing
langchain==0.0.267
chromadb==0.4.6
llama-cpp-python==0.1.78
pdfminer.six==20221105
InstructorEmbedding
sentence-transformers
243 changes: 124 additions & 119 deletions run_localGPT.py
@@ -1,30 +1,35 @@
import os
import logging

import click
import torch
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline, LlamaCpp
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
from langchain.callbacks.manager import CallbackManager
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

from prompt_template_utils import get_prompt_template

# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
GenerationConfig,
LlamaForCausalLM,
LlamaTokenizer,
pipeline,
)

from constants import EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME


def load_model(device_type, model_id, model_basename=None):
from transformers import (GenerationConfig,
pipeline,
)

from load_models import (load_quantized_model_gguf_ggml,
load_quantized_model_qptq,
load_full_model,
)

from constants import (EMBEDDING_MODEL_NAME,
PERSIST_DIRECTORY,
MODEL_ID,
MODEL_BASENAME,
MAX_NEW_TOKENS
)

def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
"""
Select a model for text generation using the HuggingFace library.
If you are running this for the first time, it will download a model for you.
@@ -46,64 +51,16 @@ def load_model(device_type, model_id, model_basename=None):
logging.info("This action can take a few minutes!")

if model_basename is not None:
if ".ggml" in model_basename:
logging.info("Using Llamacpp for GGML quantized models")
model_path = hf_hub_download(repo_id=model_id, filename=model_basename, resume_download=True)
max_ctx_size = 2048
kwargs = {
"model_path": model_path,
"n_ctx": max_ctx_size,
"max_tokens": max_ctx_size,
}
if device_type.lower() == "mps":
kwargs["n_gpu_layers"] = 1000
if device_type.lower() == "cuda":
kwargs["n_gpu_layers"] = 1000
kwargs["n_batch"] = max_ctx_size
return LlamaCpp(**kwargs)

if ".gguf" in model_basename:
llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
return llm
elif ".ggml" in model_basename.lower():
model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
# return llm
else:
# The code supports all huggingface models that end with GPTQ and have some variation
# of .no-act.order or .safetensors in their HF repo.
logging.info("Using AutoGPTQForCausalLM for quantized models")

if ".safetensors" in model_basename:
# Remove the ".safetensors" ending if present
model_basename = model_basename.replace(".safetensors", "")

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
logging.info("Tokenizer loaded")

model = AutoGPTQForCausalLM.from_quantized(
model_id,
model_basename=model_basename,
use_safetensors=True,
trust_remote_code=True,
device="cuda:0",
use_triton=False,
quantize_config=None,
)
elif (
device_type.lower() == "cuda"
): # The code supports all huggingface models that end with -HF or which have a .bin
# file in their HF repo.
logging.info("Using AutoModelForCausalLM for full models")
tokenizer = AutoTokenizer.from_pretrained(model_id)
logging.info("Tokenizer loaded")

model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
trust_remote_code=True,
# max_memory={0: "15GB"} # Uncomment this line when you encounter CUDA out of memory errors
)
model.tie_weights()
model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
else:
logging.info("Using LlamaTokenizer")
tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(model_id)
model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)

# Load configuration from the model to avoid warnings
generation_config = GenerationConfig.from_pretrained(model_id)
@@ -116,9 +73,9 @@ def load_model(device_type, model_id, model_basename=None):
"text-generation",
model=model,
tokenizer=tokenizer,
max_length=2048,
temperature=0,
top_p=0.95,
max_length=MAX_NEW_TOKENS,
temperature=0.2,
# top_p=0.95,
repetition_penalty=1.15,
generation_config=generation_config,
)
@@ -129,6 +86,67 @@ def load_model(device_type, model_id, model_basename=None):
return local_llm
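A minimal usage sketch of the refactored load_model (hedged: it assumes MODEL_ID and MODEL_BASENAME come from constants.py as imported above, and that the returned LangChain wrapper is callable with a prompt string, which holds for HuggingFacePipeline and LlamaCpp in langchain 0.0.267):

# Hedged example: load the LLM once and run a single prompt outside the QA chain.
llm = load_model("cuda", model_id=MODEL_ID, model_basename=MODEL_BASENAME, LOGGING=logging)
print(llm("Explain in one sentence what a vector store does."))  # returns a plain text completion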


def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
"""
Initializes and returns a retrieval-based Question Answering (QA) pipeline.
This function sets up a QA system that retrieves relevant information using embeddings
from the HuggingFace library. It then answers questions based on the retrieved information.
Parameters:
- device_type (str): Specifies the type of device where the model will run, e.g., 'cpu', 'cuda', etc.
- use_history (bool): Flag to determine whether to use chat history or not.
Returns:
- RetrievalQA: An initialized retrieval-based QA system.
Notes:
- The function uses embeddings from the HuggingFace library, either instruction-based or regular.
- The Chroma class is used to load a vector store containing pre-computed embeddings.
- The retriever fetches relevant documents or data based on a query.
- The prompt and memory, obtained from the `get_prompt_template` function, might be used in the QA system.
- The model is loaded onto the specified device using its ID and basename.
- The QA system retrieves relevant documents using the retriever and then answers questions based on those documents.
"""

embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME,
model_kwargs={"device": device_type})
# uncomment the following line if you used HuggingFaceEmbeddings in ingest.py
# embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# load the vectorstore
db = Chroma(persist_directory=PERSIST_DIRECTORY,
embedding_function=embeddings,)
retriever = db.as_retriever()

# get the prompt template and memory if set by the user.
prompt, memory = get_prompt_template(promptTemplate_type=promptTemplate_type,
history=use_history)

# load the llm pipeline
llm = load_model(device_type,
model_id=MODEL_ID,
model_basename=MODEL_BASENAME,
LOGGING=logging)

if use_history:
qa = RetrievalQA.from_chain_type(llm=llm,
chain_type="stuff", # try other chains types as well. refine, map_reduce, map_rerank
retriever=retriever,
return_source_documents=True,# verbose=True,
callbacks=callback_manager,
chain_type_kwargs={"prompt": prompt, "memory": memory},)
else:
qa = RetrievalQA.from_chain_type(llm=llm,
chain_type="stuff", # try other chains types as well. refine, map_reduce, map_rerank
retriever=retriever,
return_source_documents=True,# verbose=True,
callbacks=callback_manager,
chain_type_kwargs={"prompt": prompt,},)

return qa
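A hedged usage sketch for this helper (the query string is illustrative; because return_source_documents=True, the chain returns both the answer and the retrieved chunks):

# Hedged example: build the chain and answer one query.
qa = retrieval_qa_pipline("cuda", use_history=False, promptTemplate_type="llama")
res = qa("What topics does the ingested corpus cover?")
print(res["result"])                       # generated answer
for doc in res["source_documents"]:        # chunks retrieved from the Chroma store
    print(doc.metadata.get("source"))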


# choose the device type to run on, as well as whether to show source documents.
@click.command()
@click.option(
@@ -165,58 +183,45 @@ def load_model(device_type, model_id, model_basename=None):
is_flag=True,
help="Show sources along with answers (Default is False)",
)
def main(device_type, show_sources):
@click.option(
"--use_history",
"-h",
is_flag=True,
help="Use history (Default is False)",
)
def main(device_type, show_sources, use_history):
"""
This function implements the information retrieval task.
Implements the main information retrieval task for localGPT.
This function sets up the QA system by loading the necessary embeddings, vectorstore, and LLM model.
It then enters an interactive loop where the user can input queries and receive answers. Optionally,
the source documents used to derive the answers can also be displayed.
Parameters:
- device_type (str): Specifies the type of device where the model will run, e.g., 'cpu', 'mps', 'cuda', etc.
- show_sources (bool): Flag to determine whether to display the source documents used for answering.
- use_history (bool): Flag to determine whether to use chat history or not.
Notes:
- Logging information includes the device type, whether source documents are displayed, and the use of history.
- If the models directory does not exist, it creates a new one to store models.
- The user can exit the interactive loop by entering "exit".
- The source documents are displayed if the show_sources flag is set to True.
1. Loads an embedding model, which can be HuggingFaceInstructEmbeddings or HuggingFaceEmbeddings
2. Loads the existing vectorstore that was created by ingest.py
3. Loads the local LLM using the load_model function - you can now set different LLMs.
4. Sets up the question-answer retrieval chain.
5. Runs the interactive question-answering loop.
"""

logging.info(f"Running on: {device_type}")
logging.info(f"Display Source Documents set to: {show_sources}")
logging.info(f"Use history set to: {use_history}")

embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": device_type})

# uncomment the following line if you used HuggingFaceEmbeddings in ingest.py
# embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# load the vectorstore
db = Chroma(
persist_directory=PERSIST_DIRECTORY,
embedding_function=embeddings,
# if the models directory does not exist, create a new one to store models.
if not os.path.exists("./models"):
os.mkdir("models")

)
retriever = db.as_retriever()


template = """Use the following pieces of context to answer the question at the end. If you don't know the answer,\
just say that you don't know, don't try to make up an answer.
{context}
{history}
Question: {question}
Helpful Answer:"""

prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)
memory = ConversationBufferMemory(input_key="question", memory_key="history")

llm = load_model(device_type, model_id=MODEL_ID, model_basename=MODEL_BASENAME)

qa = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=retriever,
return_source_documents=True,
chain_type_kwargs={"prompt": prompt, "memory": memory},
)
qa = retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama")
# Interactive questions and answers
while True:

query = input("\nEnter a query: ")
if query == "exit":
break
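A hedged invocation sketch for the updated CLI (assuming the --device_type option retained from the earlier version of this file; --use_history/-h is new in this commit): python run_localGPT.py --device_type cuda --show_sources --use_history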
