In [None]:
# https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval/
# https://netraneupane.medium.com/retrieval-augmented-generation-rag-using-llamaindex-and-mistral-7b-228f93ba670f

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
%pip install llama-index-readers-file
%pip install llama-index-embeddings-huggingface
%pip install llama-index-llms-llama-cpp
%pip -q install sentence-transformers

In [None]:
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

In [8]:
embeded_text = embed_model.get_text_embedding("hello Mapsa Iran Tehran!")
len(embeded_text)

384

In [None]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python  --no-cache-dir

In [None]:
# remotely_run = HuggingFaceInferenceAPI(
#     model_name="HuggingFaceH4/zephyr-7b-alpha", token=HF_TOKEN
# )

In [10]:
from llama_index.llms.llama_cpp import LlamaCPP

model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

Downloading url https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf to path /tmp/llama_index/models/llama-2-13b-chat.Q4_0.gguf
total size (MB): 7365.83


7025it [02:17, 51.18it/s]                          
llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /tmp/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loade

In [None]:
 llm.complete("Hello! Can you tell me a poem about cats and dogs?")

In [None]:
from llama_index.core import SimpleDirectoryReader, Document

documents = SimpleDirectoryReader(
    input_files = ["./survey_on_llms.pdf"]
).load_data()

documents = Document(text = "\n\n".join([doc.text for doc in documents]))

In [None]:
import os
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext, load_index_from_storage


def get_build_index(documents,llm,embed_model="local:BAAI/bge-small-en-v1.5", sentence_window_size=3,save_dir="./vector_store/index"):

  node_parser = SentenceWindowNodeParser(
      window_size = sentence_window_size,
      window_metadata_key = "window",
      original_text_metadata_key = "original_text"
  )

  sentence_context = ServiceContext.from_defaults(
      llm = llm,
      embed_model= embed_model,
      node_parser = node_parser,
  )

  if not os.path.exists(save_dir):
        # create and load the index
        index = VectorStoreIndex.from_documents(
            [documents], service_context=sentence_context
        )
        index.storage_context.persist(persist_dir=save_dir)
  else:
      # load the existing index
      index = load_index_from_storage(
          StorageContext.from_defaults(persist_dir=save_dir),
          service_context=sentence_context,
      )

  return index

In [None]:
vector_index = get_build_index(documents=documents, llm=llm, embed_model="local:BAAI/bge-small-en", sentence_window_size=3, save_dir="./vector_store/index")

  sentence_context = ServiceContext.from_defaults(


In [None]:
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank

def get_query_engine(sentence_index, similarity_top_k=6, rerank_top_n=2):
  postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
  rerank = SentenceTransformerRerank(
      top_n=rerank_top_n, model="BAAI/bge-reranker-base"
  )
  engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
  )

  return engine

In [None]:
query_engine = get_query_engine(sentence_index=vector_index, similarity_top_k=6, rerank_top_n=2)

In [None]:
from llama_index.core.response.notebook_utils import display_response


while True:
  query=input()
  response = query_engine.query(query)
  display_response(response)
  print("\n")

Is Mistral-7B an open-source llm?



llama_print_timings:        load time =  372903.07 ms
llama_print_timings:      sample time =      31.53 ms /    51 runs   (    0.62 ms per token,  1617.30 tokens per second)
llama_print_timings: prompt eval time =  698922.51 ms /   951 tokens (  734.93 ms per token,     1.36 tokens per second)
llama_print_timings:        eval time = 1660101.17 ms /    50 runs   (33202.02 ms per token,     0.03 tokens per second)
llama_print_timings:       total time = 2359250.25 ms /  1001 tokens


**`Final Response:`** Based on the provided context information, Mistral-7B is not an open-source LLM. According to reference [196], it is a pre-trained language model for chatbots that is not open-source.



