In [10]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.ollama import Ollama

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
llm = Ollama(model="llama3.1")

In [20]:
documents = SimpleDirectoryReader('./documents').load_data()
documents

[Document(id_='6f000b14-c429-4cea-bb51-b3b9ef97567a', embedding=None, metadata={'page_label': '1', 'file_name': '2404.14047v1.pdf', 'file_path': 'd:\\Main_Projects\\Embeddings_Rag\\documents\\2404.14047v1.pdf', 'file_type': 'application/pdf', 'file_size': 266223, 'creation_date': '2024-08-08', 'last_modified_date': '2024-07-21'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='How Good Are Low-bit Quantized LLAMA3 Models?\nAn Empirical Study\nWei Huang∗\nThe University of Hong Kong\nweih@connect.hku.hkXudong Ma∗\nBeihang University\nmacaronlin@buaa.edu.cn\nHaotong Qin†\nETH Zurich\nhaotong.qin@pbl.ee.ethz.chXingyu Zheng\nBeihang University\nxingyuzheng@buaa.edu.cn\nChengtao Lv\nBeihang University\nlvchengtao@buaa.edu.cnHong Chen\nBeihang University\n1837

In [16]:
Settings.embed_model = embed_model
Settings.llm = llm

In [21]:
index = VectorStoreIndex.from_documents(documents)

In [24]:
query_engine = index.as_query_engine()

# Run a query
response = query_engine.query("what is llm quatization in 100 words?")
print(response)

LLM (Large Language Model) quantization refers to the process of reducing the bit width or precision of a large language model's weights and activations, while minimizing the loss of accuracy. This technique involves compressing the model to make it more computationally efficient and memory-friendly, without sacrificing its performance. Quantization can be achieved through various methods, including post-training quantization and LoRA-finetuning quantization. The goal is to enable LLMs to run at a lower computational cost, ultimately driving progress in generative artificial intelligence.


In [28]:
temp = str(response)
temp = temp.split(" ")

print(len(temp))

81


# Using Embeddings

In [None]:
import numpy as np
texts = [doc.text for doc in documents]

# Get embeddings for all documents
embeddings = embed_model.get_text_embedding_batch(texts)
len(embeddings[0])

In [36]:
embeddings_array = np.array(embeddings)

In [42]:
from sklearn.metrics.pairwise import cosine_similarity

def query_documents(query_text, top_k=3):
    # Get the embedding for the query
    query_embedding = embed_model.get_text_embedding(query_text)
    
    # Compute cosine similarities
    similarities = cosine_similarity([query_embedding], embeddings_array)[0]
    
    # Get indices of top-k most similar documents
    top_indices = similarities.argsort()[-top_k:][::-1]
    
    # Return top-k documents and their similarities
    results = []
    for idx in top_indices:
        results.append({
            'text': documents[idx].text,
            'similarity': similarities[idx],
        })
    
    return results

def generate_answer(query, context):
    # Prepare the prompt for Llama
    prompt = f"""Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query}
Answer: """

    # Generate the answer using Llama
    response = llm.complete(prompt)
    
    return response.text

def rag_pipeline(query):
    # Retrieve relevant documents
    relevant_docs = query_documents(query)
    
    # Prepare context by joining the text of relevant documents
    context = "\n\n".join([doc['text'] for doc in relevant_docs])
    
    # Generate answer using Llama
    answer = generate_answer(query, context)
    
    return answer

# Example usage
query = "Tell me the performance diffrence between non quantizd and quantized models?"
answer = rag_pipeline(query)

print(f"Query: {query}")
print(f"Answer: {answer}")

Query: Tell me the performance diffrence between non quantizd and quantized models?
Answer: According to the text, the performance difference between non-quantized and quantized models is significant. Specifically:

* On the WikiText2 dataset, the Perplexity (PPL) of the non-quantized LLAMA3 model is 6.1, while the PPL of some quantization methods (e.g., GPTQ4, AWQ4, QuIP4) ranges from 6.5 to 13.0, indicating a significant degradation in performance.
* On the PTB dataset, the non-quantized LLAMA3 model has a Perplexity (PPL) of 9.2, while some quantization methods (e.g., GPTQ4, AWQ4) have PPL values ranging from 10.4 to 19.2.
* On the zero-shot evaluation tasks (PIQA, Wino, ARC-e, ARC-c, and Hellaswag), the performance of the non-quantized LLAMA3 model is generally better than that of the quantization methods.

However, it's worth noting that some quantization methods (e.g., SmoothQuant) have relatively small performance differences compared to the non-quantized model.
