In [None]:
# https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval/
# https://netraneupane.medium.com/retrieval-augmented-generation-rag-using-llamaindex-and-mistral-7b-228f93ba670f

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Install the LlamaIndex integration packages this notebook relies on.
# NOTE(review): presumably supplies the file parsers behind SimpleDirectoryReader — confirm.
%pip install llama-index-readers-file
# Matches the `llama_index.embeddings.huggingface` import (HuggingFaceEmbedding) used further down.
%pip install llama-index-embeddings-huggingface
# Matches the `llama_index.llms.huggingface` import (HuggingFaceInferenceAPI, HuggingFaceLLM) used further down.
%pip install llama-index-llms-huggingface
# llama.cpp integration, left disabled together with the commented-out LlamaCPP cell.
# %pip install llama-index-llms-llama-cpp

In [None]:
!pip install "transformers[torch]" "huggingface_hub[inference]"

In [None]:
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python  --no-cache-dir

In [18]:
# Embedding model (sentence transformers checkpoint) wrapped by LlamaIndex's
# Hugging Face embedding class.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en",
)

In [8]:
# Sanity check: embed one sample sentence and display the length of the
# returned embedding as the cell's output.
embeded_text = embed_model.get_text_embedding(
    "hello Mapsa Iran Tehran!"
)
len(embeded_text)

384

In [None]:
import os

# To provide the token from inside the notebook, uncomment and fill in:
# os.environ["HUGGING_FACE_TOKEN"] =  #YOUR_TOKEN

# Hugging Face access token read from the environment; None when the
# HUGGING_FACE_TOKEN variable is not set.
HF_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")

In [15]:
from llama_index.llms.huggingface import HuggingFaceInferenceAPI, HuggingFaceLLM

# Disabled alternative that builds the LLM through HuggingFaceLLM instead:
# locally_run = HuggingFaceLLM(model_name="HuggingFaceH4/zephyr-7b-beta")

# LLM handle created through the Hugging Face Inference API class, using the
# token held in HF_TOKEN.
remotely_run = HuggingFaceInferenceAPI(
    token=HF_TOKEN,
    model_name="HuggingFaceH4/zephyr-7b-beta",
)

In [16]:
# Smoke test: request a single completion; the returned response object is
# displayed as the cell's output.
remotely_run.complete(
    "Hello! Can you tell me a poem about cats and dogs?"
)

CompletionResponse(text="\n\nSure, here's a short poem about cats and dogs:\n\nCats and dogs, they say,\nShould never be friends,\nBut in my house they play,\nAnd share their love until the end.\n\nCats will climb and pounce,\nDogs will chase and bark,\nBut in my home they've found,\nA place where love is never dark.\n\nCats will purr and snuggle,\nDogs will wag and lick,\nIn my house they've learned to mingle,\nAnd love each other, thick and thick.\n\nSo let the world say what it will,\nIn my house, they'll always be,\nCats and dogs, side by side,\nIn perfect harmony.", additional_kwargs={}, raw=None, logprobs=None, delta=None)

In [None]:
# from llama_index.llms.llama_cpp import LlamaCPP

# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

# llm = LlamaCPP(
#     # You can pass in the URL to a GGUF model to download it automatically
#     model_url=model_url,
#     # optionally, you can set the path to a pre-downloaded model instead of model_url
#     model_path=None,
#     temperature=0.1,
#     max_new_tokens=256,
#     # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
#     context_window=3900,
#     # kwargs to pass to __call__()
#     generate_kwargs={},
#     # kwargs to pass to __init__()
#     # set to at least 1 to use GPU
#     model_kwargs={"n_gpu_layers": 1},
#     verbose=True,
# )

In [53]:
from llama_index.core import Document, SimpleDirectoryReader

# Load the contents of the data directory through SimpleDirectoryReader.
documents = SimpleDirectoryReader("/content/data/").load_data()


# documents = SimpleDirectoryReader(
#     input_files = ["/content/data/SurveyLLM.pdf"]
# ).load_data()

# documents = Document(text = "\n\n".join([doc.text for doc in documents]))

In [19]:
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter, SentenceWindowNodeParser

# Sentence-window parser: a window size of 3, with the window and the
# original text kept in node metadata under the keys named here.
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)

# Base parser: a sentence splitter with its default configuration.
text_splitter = SentenceSplitter()

# Register the LLM, the embedding model and the base splitter on the global
# LlamaIndex Settings object.
Settings.llm = remotely_run
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

# Parse the loaded documents twice: once with the sentence-window parser and
# once with the base sentence splitter.
nodes = node_parser.get_nodes_from_documents(documents)
base_nodes = text_splitter.get_nodes_from_documents(documents)

# One vector index per node set.
sentence_index = VectorStoreIndex(nodes)
base_index = VectorStoreIndex(base_nodes)

# Query engine over the sentence-window index. The post-processor's target
# key is "window", the same metadata key the sentence-window parser was
# configured with.
query_engine = sentence_index.as_query_engine(
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window"),
    ],
    similarity_top_k=2,
)

# Ask a sample question and print the engine's response.
window_response = query_engine.query("Is Mistral-7B an open-source llm??")
print(window_response)

In [35]:
# import os
# from llama_index.core.node_parser import SentenceWindowNodeParser
# from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext, load_index_from_storage


# def get_build_index(documents,llm,embed_model="local:BAAI/bge-small-en-v1.5", sentence_window_size=3,save_dir="./vector_store/index"):

#   node_parser = SentenceWindowNodeParser(
#       window_size = sentence_window_size,
#       window_metadata_key = "window",
#       original_text_metadata_key = "original_text"
#   )

#   sentence_context = ServiceContext.from_defaults(
#       llm = remotely_run,
#       embed_model= embed_model,
#       node_parser = node_parser,
#   )

#   if not os.path.exists(save_dir):
#         # create and load the index
#         index = VectorStoreIndex.from_documents(
#             [documents], service_context=sentence_context
#         )
#         index.storage_context.persist(persist_dir=save_dir)
#   else:
#       # load the existing index
#       index = load_index_from_storage(
#           StorageContext.from_defaults(persist_dir=save_dir),
#           service_context=sentence_context,
#       )

#   return index

In [36]:
# vector_index = get_build_index(documents=documents, llm=remotely_run, embed_model="local:BAAI/bge-small-en", sentence_window_size=3, save_dir="./vector_store/index")

  sentence_context = ServiceContext.from_defaults(


In [37]:
# from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank

# def get_query_engine(sentence_index, similarity_top_k=6, rerank_top_n=2):
#   postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
#   rerank = SentenceTransformerRerank(
#       top_n=rerank_top_n, model="BAAI/bge-reranker-base"
#   )
#   engine = sentence_index.as_query_engine(
#         similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
#   )

#   return engine

In [38]:
# query_engine = get_query_engine(sentence_index=vector_index, similarity_top_k=6, rerank_top_n=2)

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [39]:
# from llama_index.core.response.notebook_utils import display_response


# while True:
#   query=input()
#   response = query_engine.query(query)
#   display_response(response)
#   print("\n")

Is Mistral-7B an open-source llm?


**`Final Response:`** No, Mistral-7B is not an open-source LLM. It is a specific model mentioned in the references provided, but no information is given about its availability or licensing. The references provided are for two different papers, one titled "Mistral 7b" and the other titled "DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales." Without further context or information, it is unclear whether Mistral-7B is a publicly available LLM or not.



what is the SLMs?


**`Final Response:`** SLMs, also known as n-gram language models, are developed based on statistical learning methods that rose in the 1990s. The basic idea is to build the word prediction model based on the Markov assumption, such as predicting the next word based on the most recent context. SLMs have been widely applied to enhance task performance in information retrieval and natural language processing, but they often suffer from the curse of dimensionality due to the difficulty in accurately estimating high-order language models. Specially designed smoothing strategies such as back-off estimation and Good–Turing estimation have been introduced to alleviate the data sparsity problem.





KeyboardInterrupt: Interrupted by user