In [None]:
# Question that the rest of the notebook decomposes and answers.
curr_question = 'what are the weights of iPhone 13 pro max and iPhone SE'

In [None]:
import json
import os
from pathlib import Path
from pprint import pprint
from langchain.llms  import LlamaCpp
from langchain.text_splitter import CharacterTextSplitter
#from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
#from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader
import json
#from langchain.schema import SystemMessage
import cv2
from langchain.embeddings import HuggingFaceEmbeddings
import numpy as np
import pdf2image
from langchain.vectorstores.utils import filter_complex_metadata
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.schema import Document
import asyncio
from langchain.chains import RetrievalQA
import nest_asyncio
nest_asyncio.apply()

In [None]:
import sys
#print the interpreter version string and the environment prefix of the running kernel
print("Python version:", sys.version)
print(sys.prefix)

In [None]:
# Path to the GGUF-quantized Llama 3 model file.
# Built with os.path.join so the separator matches the host OS; on Windows this
# yields exactly the previous value (".\\llms\\Meta-Llama-3-8B-Instruct.Q8_0.gguf").
MODEL_PATH = os.path.join(".", "llms", "Meta-Llama-3-8B-Instruct.Q8_0.gguf")
# Directory where the Chroma database is persisted (".\\data" on Windows).
PERSIST_DIR = os.path.join(".", "data")

In [None]:
#this class wraps a LlamaCpp model together with an optional prompt template; each object created is its own Llama 3 instance

class MyLlama:
    """One LlamaCpp (Llama 3) model instance plus an optional prompt template.

    Every object constructs its own ``LlamaCpp`` model, so creating several
    objects loads the model several times.

    Parameter notes (kept from the original author's comments):
      - model_path: where the model file is stored.
      - prompt: not passed to LlamaCpp; it is the ``str.format`` template that
        ``prompt_llama`` fills in before calling the model.
      - temperature: lower means the model is less creative and less wild.
      - n_ctx: number of tokens for the context size.
      - n_batch: author's note -- should be > 520 to use the GPU; with
        verbose=True you should see BLAS=1 when the GPU is used.
      - n_gpu_layers: author's note -- set as high as possible to keep the CPU
        out of the picture.
      - max_tokens: output token budget.
    """

    def __init__(self,model_path,prompt=None,temperature=10e-10, n_ctx=8192, n_batch=521, max_tokens=2000, n_gpu_layers=80):
        self.prompt = prompt
        self.llm = LlamaCpp(
            model_path=model_path,
            temperature=temperature,
            n_gpu_layers=n_gpu_layers,
            main_gpu=0,
            vocab_only=False,
            use_mmap=True,
            use_mlock=False,
            # Context Params
            seed=0xFFFFFFFF,
            n_ctx=n_ctx,
            n_batch=n_batch,
            n_threads=None,
            rope_freq_base=0.0,
            rope_freq_scale=0.0,
            f16_kv=True,
            logits_all=False,
            embedding=False,
            # Sampling Params
            last_n_tokens_size=64,
            # LoRA Params
            lora_base=None,
            lora_scale=1.0,
            lora_path=None,
            # Backend Params
            numa=False,
            # Chat Format Params
            # Misc
            verbose=False,
            max_tokens=max_tokens,
        )

    def prompt_llama(self, **kwargs):
        """Fill the stored prompt template with ``kwargs`` and send it to the model.

        The formatted prompt is printed before the call. Returns whatever the
        underlying llm returns for that prompt.

        Raises:
            ValueError: if no prompt template has been set (``prompt`` defaults
                to None in the constructor), instead of failing with an opaque
                AttributeError on ``None.format``.
        """
        if self.prompt is None:
            raise ValueError(
                "MyLlama has no prompt template; pass prompt=... to the constructor "
                "or call set_prompt() before prompt_llama()"
            )
        message = self.prompt.format(**kwargs)
        print(message)
        return self.llm(message)

    def set_prompt(self, prompt):
        """Replace the prompt template used by this particular instance."""
        self.prompt = prompt

    def get_llm(self):
        """Return the raw LlamaCpp object (used later by the retrieval chain)."""
        return self.llm


        

In [None]:
#helper functions: name a product from its spec text, split a question into sub-question/tool pairs, and look up the collections matching a question


def get_product_name(text):
    """Ask the LLM for the single product name that `text` describes.

    A fresh MyLlama instance is created on every call (so the model is loaded
    each time), prompted with `text`, and its answer is reduced to one line.

    Args:
        text: description of a product's specs (inserted into the prompt).

    Returns:
        The first non-blank line of the model's answer that precedes any
        triple-backtick code fence, with surrounding whitespace removed.
        Empty string if the model produced nothing before a fence.
    """
    prompt = """Given this text, only output one specific product name and nothing else; no code or reasoning.
    ---------------------------
    {text}
    ---------------------------
    Answer:
    """
    new_llm = MyLlama(prompt=prompt,model_path=MODEL_PATH)
    raw_answer = new_llm.prompt_llama(text = text)
    # Keep only what precedes the first code fence, and strip BEFORE taking the
    # first line: the previous order (split on newline, then lstrip) returned ''
    # whenever the answer started with a newline.
    before_fence = raw_answer.split('```')[0].strip()
    return before_fence.split('\n')[0].strip()


def generate_question_source_pairs(question,tools):
    """Ask the LLM to split `question` into [sub-question, tool] pairs.

    `tools` is inserted verbatim under the "## Tools" heading of the prompt and
    `question` under "##Question". A new MyLlama instance is created for the
    call. Returns the model's raw output; no parsing is done here.
    """
    template="""\
    You are an assistant that does not want to annoy the user, please do exactly as the user has asked below or you will be penalized!!!
    The user demands a single list containing list(s) to be returned, please follow this when you output your answer

    These are the guidelines you consider when completing your task:
    - You should output at most one Python list of question-tool pairs (one for each tool and no more). Sample output: [["What is A?", "tool_name"]].
    - You can keep the original question if it's straightforward
    - You should break down a generic, ambiguous question into concrete sub-questions about documents in the database (please output this in one list and do not make it too long)
    - Please keep the subquestion-tool pair lists within one list!
    - Each sub-question must ask about only ONE product!
    - Each tool name in the output must be part of the tools GIVEN!
    - Each output must follow the Sample output's format, no multiple tools, each pair must have only one sub-question and one tool name
    - You can generate multiple sub-questions for each tool
    - You don't need to use a tool if it's irrelevant
    - Don't generate too many sub-questions. Only generate necessary ones.
    Just the output and no code or text or symbol explanation afterwards please!

    ## Examples of Question and their respective Subquestions:

    Question: Compare Product A and Product B 
    Subquestions: [["Product overview of A", "product_a"], ["Product overview of B", "product_b"]]

    Question: Compare Product A and Product B in terms of scalability
    Subquestions: [["Scalability of A", "product_a"], ["Scalability of B", "product_b"]]

    Question: What is a feature of Product A?
    Subquestions: [["Feature of A", "product_a"]]

    ## Tools
    {tools}

    ##Question
    {question}

    ##Output
    Please output here and follow the instructions that the user has mentioned.
    """
    decomposer = MyLlama(prompt=template, model_path=MODEL_PATH)
    return decomposer.prompt_llama(question=question, tools=tools)
    



def generate_data_idxs(db, question):
    """Describe the data sources most similar to `question` as one string.

    Runs a similarity search (k=20) on `db` and, for every hit, emits
    "<collection_name>: <page_content> " (note the trailing space), all
    concatenated in the order the search returned them.
    """
    hits = db.similarity_search(question, k=20)
    entries = []
    for hit in hits:
        entries.append(f"{hit.metadata['collection_name']}: {hit.page_content} ")
    return "".join(entries)




In [None]:

# embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
embedding_model = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')

#What this cell does:
#  - loads every PDF in the docs folder and splits it into chunks
#  - stores the chunks in a Chroma collection named after the PDF file (without extension)
#  - fills two lookup tables for later cells:
#      collection_dict:  collection name -> product name (as answered by the LLM)
#      collection_to_db: collection name -> Chroma handle for that collection
#  - writes collection_dict to collection_dict.json

collection_dict = {}
collection_to_db = {}

DOCS_DIR = "docs"
#the splitter settings do not depend on the file, so build it once
text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size = 2000, chunk_overlap=300)

for file in sorted(os.listdir(DOCS_DIR)):
    #only PDFs can be handled by the PDF loader; skip anything else in the folder
    if not file.lower().endswith(".pdf"):
        continue
    #os.path.join gives ".\docs\<file>" on Windows (as before) and a valid path elsewhere
    pdf_path = os.path.join(".", DOCS_DIR, file)
    loader = UnstructuredPDFLoader(pdf_path)
    docs = loader.load()
    output_docs = text_splitter.split_documents(docs)
    output_docs = filter_complex_metadata(output_docs)
    #a PDF with no extracted chunks would make output_docs[0] below raise IndexError
    if not output_docs:
        print("Skipping (no text extracted): ", file)
        continue
    for document in output_docs:
        if "source" in document.metadata:
            document.metadata = {
                **document.metadata,
                "filename": os.path.basename(document.metadata["source"]),
            }
    #the product name is derived from the first chunk only
    product_name = get_product_name(output_docs[0].page_content)
    collection_name = Path(file).stem
    print("Collection name: ", collection_name)
    vectordb = Chroma(
        collection_name=collection_name,
        persist_directory=PERSIST_DIR,
        embedding_function=embedding_model,
    )
    #stable per-chunk ids: re-running this cell against the persisted database
    #then overwrites the same entries instead of appending duplicate chunks
    chunk_ids = [f"{collection_name}-{idx}" for idx in range(len(output_docs))]
    vectordb.add_documents(output_docs, ids=chunk_ids)
    collection_to_db[collection_name] = vectordb
    collection_dict[collection_name] = product_name


with open("collection_dict.json", "w") as f:
    json.dump(collection_dict,f)




In [None]:
from langchain.schema import Document

#loading the collection name -> product name mapping back from disk
with open("collection_dict.json", "r") as f:
    collection_dict = json.load(f)

#creating a vectordb that indexes into the proper collection
#(note: this rebinds `vectordb`, which the ingestion loop also used, to the "index" collection)
vectordb = Chroma(
    collection_name="index",
    persist_directory=PERSIST_DIR,
    embedding_function=embedding_model,
)

#this is how the mapping occurs: one document per collection, whose text is the
#product name and whose metadata carries the collection name
index_docs = [Document(page_content=value, metadata={"collection_name": key}) for key, value in collection_dict.items()]
#use the collection name as the document id so re-running this cell against the
#persisted database replaces the entries instead of adding duplicates;
#skip the call entirely when there is nothing to add
if index_docs:
    vectordb.add_documents(index_docs, ids=list(collection_dict.keys()))

In [None]:
#show the collection name -> product name mapping (collection_dict's keys and values)
print(collection_dict)

In [None]:
#database stuff

#alternative questions; none is used below unless it is assigned to curr_question
question = "Given these documents, give me the difference between the iPhone 11 Pro Max and the iPhone 13 Pro Max's features using only these documents and not prior knowledge."
question1 = "What are the dimensions of iPhone 11 Pro Max and iPhone SE?"
question2 = "What are the display sizes of the iPhone 11 Pro Max and iPhone SE?"
question3 = "What is the display of iPhone 11 Pro Max?"
question4 = "What are the display sizes of the iPhone SE and iPhone 13 Pro Max?"

# curr_question = question1

#tools_str holds the "<collection name>: <product name>" entries matching the question
tools_str = generate_data_idxs(vectordb, curr_question)
print("tools_str: ", tools_str)
#sub_pairs is the raw LLM text holding the [sub-question, tool] pairs
sub_pairs = generate_question_source_pairs(curr_question, tools_str)
print(sub_pairs)

In [None]:
#re-display the raw LLM output held in sub_pairs
print(sub_pairs)

In [None]:
#isolate the sub-question-tool pairs from text output (just pre-processing)
def _extract_pair_list(llm_output):
    """Find and parse the first JSON list of [sub-question, tool] pairs in `llm_output`.

    Every '[' in the text is tried as a possible start of the list, so text
    before the list (a preamble) and after it (an explanation) is ignored.
    A candidate is accepted only if it is a list whose items are all
    two-element lists, which is the shape the later cells index into.

    Returns:
        (pairs, json_text): the parsed list and the exact substring it came from.

    Raises:
        ValueError: if no such list can be found.
    """
    decoder = json.JSONDecoder()
    start = llm_output.find("[")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and reports
            # where it ended, so trailing text does not cause a failure.
            candidate, end = decoder.raw_decode(llm_output, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            candidate = None
        if isinstance(candidate, list) and all(
            isinstance(pair, list) and len(pair) == 2 for pair in candidate
        ):
            return candidate, llm_output[start:end]
        start = llm_output.find("[", start + 1)
    raise ValueError(f"no list of [sub-question, tool] pairs found in LLM output: {llm_output!r}")


list_of_subqs, original_str = _extract_pair_list(sub_pairs)
print(list_of_subqs)
print("type: ", type(list_of_subqs))

In [None]:
async def rag_it(question, db,llm):
    """Given a question return an answer using the rag pipeline.

    Builds a RetrievalQA "stuff" chain over `db`'s retriever with `llm`, asks
    the question with a "documents only" instruction appended, and returns the
    chain's 'result' entry.

    Note: the body contains no `await`, so although this is a coroutine it runs
    to completion without yielding once it is scheduled.
    """
    # Remove trailing whitespace/sentence punctuation, then append the
    # instruction after a space. The previous code always cut off the last
    # character (even when it was not punctuation) and glued the instruction
    # straight onto the question with no space in between.
    stem = question.rstrip("?.! \t\r\n")
    constrained_question = stem + " using only the documents given and no prior knowledge"
    rag_pipeline = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",retriever=db.as_retriever())
    return (rag_pipeline(constrained_question))['result']

async def really_answer_subqs(subquestions):
    """Answer every [sub-question, tool] pair and return the answers in order.

    A single llm (from a new MyLlama instance) is shared by all sub-questions.
    For each pair, element 1 selects the database from collection_to_db and
    element 0 is the question handed to rag_it; the resulting coroutines are
    awaited together with asyncio.gather.
    """
    llm = MyLlama(model_path=MODEL_PATH).get_llm()
    pending = [
        rag_it(pair[0], collection_to_db[pair[1]], llm)
        for pair in subquestions
    ]
    return await asyncio.gather(*pending)



def answer_subqs(subq_list):
    """Synchronous wrapper: run really_answer_subqs(subq_list) to completion.

    Uses the current event loop when one is available, otherwise creates and
    registers a new one. Returns the list of answers produced by
    really_answer_subqs.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # get_event_loop signals "no usable loop" with RuntimeError; catching
        # only that keeps KeyboardInterrupt/SystemExit from being swallowed,
        # which the previous bare `except:` did.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(really_answer_subqs(subquestions=subq_list))




In [None]:



#list of answers from answering subquestions: one entry per [sub-question, tool] pair in list_of_subqs
answer_list = answer_subqs(list_of_subqs)

In [None]:
#joins the sub-answers (each prefixed with a space) and prints the question with the combined answer
unformatted_str = ''.join(' ' + answer for answer in answer_list)
print("Question: ", curr_question)
print("Final Answer: ", unformatted_str.lstrip())

