In [None]:
# Record the interpreter version so the run can be tied to a specific environment.
import sys
print("Python version:", sys.version)

In [None]:
pip show llama-cpp-python


In [None]:
from llama_cpp.llama_cpp import _load_shared_library

def is_gpu_available_v3() -> bool:
    """Report whether the installed llama.cpp build supports GPU offload.

    Calls ``llama_supports_gpu_offload`` through the ``llama_cpp`` package's
    own binding instead of loading the shared library a second time with the
    private ``_load_shared_library`` helper. A function looked up on a freshly
    loaded ctypes handle has no declared return type, so ctypes reads the C
    ``bool`` result as a C ``int``; going through the package binding avoids
    depending on that.

    Returns:
        True if the backend reports GPU offload support, otherwise False.
    """
    # Local import: keeps this cell self-contained without editing the
    # notebook's import cells.
    import llama_cpp

    return bool(llama_cpp.llama_supports_gpu_offload())

In [None]:
# Display whether this llama.cpp build reports GPU offload support.
is_gpu_available_v3()

In [None]:
import torch

In [None]:
# Display whether PyTorch reports CUDA as available.
torch.cuda.is_available()

In [None]:
from langchain.embeddings import LlamaCppEmbeddings
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
#import pdfplumber
from langchain.text_splitter import CharacterTextSplitter
import os
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from llama_cpp import Llama
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms  import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

In [None]:
# First version: load every PDF under ./docs into one flat list of documents.
# The directory is built with os.path.join instead of a hard-coded ".\\docs\\"
# prefix, which only resolves as a path on Windows.
docs_dir = os.path.join(".", "docs")
document = []
# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# the resulting document list stable between runs.
for file in sorted(os.listdir(docs_dir)):
    print(file)
    # Case-insensitive match so files named *.PDF are loaded as well.
    if file.lower().endswith(".pdf"):
        pdf_path = os.path.join(docs_dir, file)
        loader = PyPDFLoader(pdf_path)
        # loader.load() yields several items per file, hence extend().
        document.extend(loader.load())


In [None]:
# #second version
# def extract_text_from_pdf(file_path):
#     with pdfplumber.open(file_path) as pdf:
#         text = ""
#         for page in pdf.pages:
#             text += page.extract_text()
        
#     return text

# def extract_many_files(folder_name):
#     for file in os.listdir(folder_name):
#         if file.endswith(".pdf"):
#             pdf_path = ".\\" + folder_name + "\\"+file
#             text = extract_text_from_pdf(pdf_path)
        

In [None]:
# Split the loaded documents into chunks for embedding.
# Settings tried so far (chunk_size, chunk_overlap): (500, 0) and (1000, 200).
#
# RecursiveCharacterTextSplitter (imported with the other LangChain names)
# falls back through progressively finer separators, so chunks stay within
# chunk_size. CharacterTextSplitter splits on one separator only and leaves
# any text lacking that separator as a single oversized chunk.
document_splitta = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
document_chunks = document_splitta.split_documents(document)
# Last expression: number of chunks produced.
len(document_chunks)

In [None]:
# Embedding model handed to the Chroma vector store below.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Embed the chunks and store them in a Chroma collection persisted under ./data.
# os.path.join keeps the location portable; a '.\\data' literal is only a
# relative path on Windows.
# NOTE(review): re-running this cell against an existing directory presumably
# adds the same chunks again — confirm, and clear the directory first if so.
vectordb = Chroma.from_documents(
    document_chunks,
    embedding=embeddings,
    persist_directory=os.path.join(".", "data"),
)
vectordb.persist()

In [None]:
# Location of the GGUF weights passed to the LLM constructor.
# NOTE(review): model_name / model_file name the base Meta-Llama-3-8B build,
# while model_path points at the Instruct build — confirm which is intended.
# Only model_path is read by the visible cells.
model_name = "QuantFactory/Meta-Llama-3-8B-GGUF"
model_file = "Meta-Llama-3-8B.Q8_0.gguf"
# Built with os.path.join so the path also resolves outside Windows
# (a ".\\llms\\..." literal is a single file name on POSIX systems).
model_path = os.path.join(".", "llms", "Meta-Llama-3-8B-Instruct.Q8_0.gguf")

In [None]:
# llm = Llama(
#     model_path=model_path,
#     n_ctx=4096,
#     n_gpu_layers=80,
#     n_batch=521
# )


In [None]:
# Build the LangChain LlamaCpp wrapper around the local GGUF model.
# NOTE(review): n_batch=521 looks like it may have been meant as 512, and
# max_tokens (5000) is larger than n_ctx (4096) — confirm both are intended.
llm = LlamaCpp(
    model_path=model_path,
    temperature=0.3,
    n_gpu_layers=80,
    vocab_only=False,
    use_mmap=True,
    use_mlock=False,
    # Context params
    seed=0xFFFFFFFF,
    n_ctx=4096,
    n_batch=521,
    n_threads=None,
    rope_freq_base=0.0,
    rope_freq_scale=0.0,
    f16_kv=True,
    logits_all=False,
    embedding=False,
    # Sampling params
    last_n_tokens_size=64,
    # LoRA params
    lora_base=None,
    lora_scale=1.0,
    lora_path=None,
    # Backend params
    numa=False,
    # Misc
    verbose=True,
    max_tokens=5000,
)

In [None]:
# Instruction appended to every question so the model is told to answer from
# the retrieved documents only.
_GROUNDING_SUFFIX = "using only the documents given and no prior knowledge"


def _ground_question(question):
    """Strip trailing punctuation from the question and append the grounding instruction."""
    stem = question.strip().rstrip("?.!").rstrip()
    # Join with a single space; skip the stem entirely if the question was empty.
    return " ".join(part for part in (stem, _GROUNDING_SUFFIX) if part)


def rag_it(question):
    """Answer a question with the retrieval-augmented (RAG) pipeline.

    The question is rewritten to end with an instruction to rely on the
    documents only. Trailing sentence punctuation is removed only when it is
    actually present, and the instruction is separated from the question by a
    space (slicing off the last character unconditionally would eat a letter
    from questions without a final "?" and glue the two texts together).

    Reads the notebook-level ``vectordb`` and ``llm`` objects.

    Args:
        question: The user's question as plain text.

    Returns:
        The value stored under ``'result'`` in the chain's output.
    """
    question = _ground_question(question)
    # Separate lookup used only for display.
    # NOTE(review): presumably matches what the chain's retriever fetches — confirm.
    docs = vectordb.similarity_search(question)
    rag_pipeline = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=vectordb.as_retriever()
    )
    print("Here are the docs that were picked up: ")
    for doc in docs:
        print(doc)
    return rag_pipeline(question)["result"]

In [None]:
# Check retrieval on its own: show what the vector store returns for this question.
question = "Given these documents, give me the difference between the iPhone 11 Pro Max and the iPhone 13 Pro Max's features."
docs = vectordb.similarity_search(question)
# Last expression: display the retrieved documents.
docs

In [None]:
# Build a RetrievalQA chain that answers with `llm`, using documents fetched
# through the vector store's retriever.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff",retriever=vectordb.as_retriever()
)

In [None]:
# Run the chain on the comparison question, with the "documents only"
# instruction written directly into the prompt. The result is indexed by
# 'result' when it is printed.
answer = rag_pipeline("Given these documents, give me the difference between the iPhone 11 Pro Max and the iPhone 13 Pro Max's features using only these documents and not prior knowledge.")


In [None]:
# Print only the generated answer text from the chain output.
print(answer['result'])

In [None]:
# Ask through rag_it, which prints the retrieved documents before returning the answer.
question = "What are the dimensions of iPhone 11 Pro Max and iPhone SE?"
answer2 =  rag_it(question)
print(answer2)

In [None]:
# Same flow for a two-device display-size comparison (iPhone 11 Pro Max vs iPhone SE).
question = "What are the display sizes of the iPhone 11 Pro Max and iPhone SE?"
answer =  rag_it(question)
print(answer)

In [None]:
# Single-device variant of the display question.
question = "What is the display of iPhone 11 Pro Max?"
answer =  rag_it(question)
print(answer)

In [None]:
# Display-size comparison for a different device pair (iPhone SE vs iPhone 13 Pro Max).
question = "What are the display sizes of the iPhone SE and iPhone 13 Pro Max?"
answer =  rag_it(question)
print(answer)

In [None]:
# memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
# prompt_template = PromptTemplate.from_template('Use these documents to answer questions that I ask about them')

In [None]:
# llama_model = LLMChain(llm=llm, prompt=prompt_template)
# pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectordb.as_retriever(search_kwargs={'k':6}),verbose=False,memory=memory)

In [None]:
# query="What is the main topic of the documents?"
# response=pdf_qa({"question": query})
# print(response["answer"])

In [None]:
# query="Given these documents, give me the difference between the iPhone 11 Pro max and the iPhone 13 Pro Max features."
# response=pdf_qa({"question": query})
# print(response["answer"])