# Install Necessary Libraries

In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned; consider pinning them so the notebook stays reproducible.
# Core llama-index framework.
%pip install llama-index
# Hugging Face model stack; bitsandbytes backs the 4-bit quantized model load further down.
%pip install transformers accelerate bitsandbytes
# llama-index integrations: Hugging Face LLMs, Hugging Face embeddings, Chroma vector store.
%pip install llama-index-llms-huggingface
%pip install llama-index-embeddings-huggingface
%pip install llama-index-vector-stores-chroma

# Import Necessary Libraries and Set Up the Chroma Vector Store

In [31]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import Settings
from llama_index.core import ServiceContext
from llama_index.core import VectorStoreIndex
from llama_index.core.response.notebook_utils import display_response
# Open the on-disk Chroma database. Called without arguments, PersistentClient
# uses its default storage location (presumably ./chroma — confirm for the
# installed chromadb version).
chroma_client = chromadb.PersistentClient()

# get_or_create_collection keeps this cell re-runnable: because the client is
# persistent, "RAG_DB2" still exists after a kernel restart and a plain
# create_collection call would raise on the second run.
chroma_collection = chroma_client.get_or_create_collection("RAG_DB2")

# Wrap the collection for llama-index and expose it through a StorageContext
# so an index can be told to store its embeddings in Chroma.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

## Setup

### Data

In [2]:

# Load the source files from the relative "data" directory into llama-index documents.
data_reader = SimpleDirectoryReader("data")
documents = data_reader.load_data()

In [10]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; renders a token-entry widget in the notebook.
# Presumably needed so the model weights used below can be downloaded —
# TODO confirm whether the model repository actually requires authentication.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### LLM


In [11]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

# bitsandbytes settings: load weights in 4-bit NF4 with double quantization,
# and run the matrix multiplications in float16.
bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
quantization_config = BitsAndBytesConfig(**bnb_options)


# The model weights and the tokenizer are pulled from the same Hub repository.
MISTRAL_REPO = "mistralai/Mistral-7B-Instruct-v0.1"

# Template wrapped around every query before it is sent to the model.
instruct_prompt = PromptTemplate("<s>[INST] {query_str} [/INST] </s>\n")

# NOTE(review): transformers normally honours temperature/top_k/top_p only when
# sampling is enabled (do_sample=True) — confirm whether sampling was intended.
generation_options = {"temperature": 0.2, "top_k": 5, "top_p": 0.95}

llm = HuggingFaceLLM(
    model_name=MISTRAL_REPO,
    tokenizer_name=MISTRAL_REPO,
    query_wrapper_prompt=instruct_prompt,
    context_window=3900,
    # Each generation is capped at 256 new tokens, so long answers are cut off.
    max_new_tokens=256,
    # Load the weights with the 4-bit quantization settings defined above.
    model_kwargs={"quantization_config": quantization_config},
    generate_kwargs=generation_options,
    device_map="auto",
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [12]:

# Embedding model identifier. llama-index resolves a "local:<repo>" string to a
# Hugging Face embedding model that runs locally.
EMBED_MODEL_ID = "local:BAAI/bge-small-en-v1.5"
embed_model = EMBED_MODEL_ID

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
# Register the embedding model and the LLM on llama-index's global Settings
# object so components that are not given explicit models pick these up.
Settings.embed_model = embed_model
Settings.llm = llm



In [18]:
# Bundle the chunk size, LLM and embedding model used when building the index.
# NOTE(review): ServiceContext is deprecated in llama-index 0.10 in favour of
# Settings — consider migrating once the downstream cells no longer need it.
CHUNK_SIZE = 1024
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    llm=llm,
    chunk_size=CHUNK_SIZE,
)

  service_context=ServiceContext.from_defaults(


In [23]:
# Chunk, embed and index the loaded documents.
# storage_context routes the embeddings into the Chroma-backed vector store
# configured earlier in the notebook; without it the index falls back to the
# default in-memory store and the Chroma collection is never populated.
# NOTE(review): the collection is persistent, so re-running this cell
# presumably inserts the same documents again — clear the collection first if
# duplicates matter.
vector_index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
)

### Index Setup

## Basic Query Engine

### Compact (default)

In [24]:
# Ask the question using the "compact" response mode.
question = "What is Transformer?"
query_engine = vector_index.as_query_engine(
    response_mode="compact",
)
response = query_engine.query(question)

# Render the answer as formatted notebook output.
display_response(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


**`Final Response:`** The Transformer is a sequence-to-sequence model based on attention mechanisms. It was introduced in a paper by Vaswani et al. in 2017 and has since become a popular choice for sequence-level tasks such as machine translation and text summarization. The Transformer architecture consists of an encoder and a decoder, both of which use stacked self-attention and point-wise, fully connected layers. The encoder processes the input sequence and produces a fixed-length representation of the input, while the decoder generates the output sequence by conditioning on the input sequence and the previously generated output. The Transformer has been shown to outperform previous models on several benchmarks and has become a standard component in many deep learning pipelines.

### Refine

In [34]:
# Ask the same question using the "refine" response mode over the 5 most
# similar chunks; the captured log shows one generation call per chunk.
question = "What is Transformer?"
query_engine = vector_index.as_query_engine(
    response_mode="refine",
    similarity_top_k=5,
    verbose=True,
)
response = query_engine.query(question)

# Render the answer as formatted notebook output.
display_response(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


**`Final Response:`** The Transformer is a sequence-to-sequence model based on attention, introduced in a paper by Vaswani et al. in 2017. It is designed to be faster and more efficient than recurrent or convolutional-based models for sequence-to-sequence tasks, such as machine translation. The Transformer uses multi-headed self-attention to process input sequences, and it can be trained on parallel data to improve performance. The model has achieved state-of-the-art results on several machine translation tasks, and it has been applied to other tasks such as text summarization and question answering.

The Transformer follows an overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, as shown in Figure 1. The encoder is composed of a stack of N= 6 identical layers, each with two sub-layers: a multi-head self-attention mechanism and a simple, position-wise fully connected feed-forward network. The decoder is also composed of a stack of N= 6 identical layers, with an additional sub-layer that performs multi-head attention over the output of the encoder stack. Residual

### Tree Summarize

In [26]:
# Ask the same question using the "tree_summarize" response mode.
question = "What is Transformer?"
query_engine = vector_index.as_query_engine(
    response_mode="tree_summarize",
)
response = query_engine.query(question)

# Render the answer as formatted notebook output.
display_response(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


**`Final Response:`** The Transformer is a sequence-to-sequence model based on attention mechanisms. It was introduced in a paper by Vaswani et al. in 2017 and has since become a popular choice for sequence-level tasks such as machine translation and text summarization. The Transformer architecture consists of an encoder and a decoder, both of which use stacked self-attention and point-wise, fully connected layers. The encoder processes the input sequence and produces a fixed-length representation of the input, while the decoder generates the output sequence by conditioning on the input sequence and the previously generated output. The Transformer has been shown to outperform previous models on several benchmarks and has become a standard component of many state-of-the-art models.