In [17]:
import torch

from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
llm = LlamaCPP(
    # URL of the GGUF model file; it is downloaded automatically when model_path is None.
    # Previously used model (Mistral-7B-Instruct v0.1), kept for reference:
    # model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf',
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf',
    # Optionally point at a pre-downloaded model file instead of using model_url.
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # The loader output reports llama.context_length = 32768 for this model;
    # the window is capped at 4096 tokens here, well below that limit.
    context_window=4096,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # n_gpu_layers=-1 asks llama.cpp to offload every layer to the GPU; use 0 for CPU-only.
    model_kwargs={"n_gpu_layers": -1},
    # These llama_utils helpers format prompts in the Llama-2 chat style.
    # NOTE(review): the model is Mistral-Instruct -- confirm this template is the intended one.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from C:\Users\iamaj\AppData\Local\llama_index\models\mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128

In [18]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import Document

# Read the PDF; the reader returns a list of Document objects.
pdf_reader = SimpleDirectoryReader(input_files=["./documents/python_tutorial.pdf"])
loaded_parts = pdf_reader.load_data()

# Merge every part into one Document so that sentence windows can be built
# over the whole text rather than per loaded part.
merged_text = "\n\n".join(part.text for part in loaded_parts)
documents = Document(text=merged_text)

In [19]:
import os
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core import (
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)


def get_build_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="./vector_store/index",
):
    """Build a sentence-window vector index, or reload it from ``save_dir``.

    Args:
        documents: A single Document, or a list/tuple of Documents, to index.
        llm: LLM attached to the service context used by the index.
        embed_model: Embedding model specifier passed to the service context.
        sentence_window_size: Value passed as ``window_size`` to the
            sentence-window node parser.
        save_dir: Directory the index is persisted to and reloaded from.

    Returns:
        The freshly built index, or the one loaded from ``save_dir``.

    Note:
        When ``save_dir`` already exists the persisted index is loaded as-is;
        ``documents`` and ``sentence_window_size`` are NOT re-applied. Delete
        the directory (or pass a different ``save_dir``) to force a rebuild.
    """
    # Accept both a single Document (the original calling convention) and a
    # collection of Documents.
    if isinstance(documents, (list, tuple)):
        docs = list(documents)
    else:
        docs = [documents]

    # Each node stores its surrounding window under "window" and the bare
    # sentence under "original_text" in the node metadata.
    node_parser = SentenceWindowNodeParser(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )

    # NOTE(review): ServiceContext is deprecated in recent llama_index releases
    # in favour of Settings; it is kept here so the index still carries the LLM
    # for the query engine.
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )

    if not os.path.exists(save_dir):
        # No persisted index yet: build it and write it to disk.
        index = VectorStoreIndex.from_documents(
            docs, service_context=sentence_context
        )
        index.storage_context.persist(persist_dir=save_dir)
    else:
        # Reuse the persisted index instead of re-embedding the documents.
        index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )

    return index

In [20]:
# Build the sentence-window index, or reload it if ./vector_store/index exists.
vector_index = get_build_index(
    documents=documents,
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="./vector_store/index",
)


  sentence_context = ServiceContext.from_defaults(
  from .autonotebook import tqdm as notebook_tqdm
config.json: 100%|██████████| 743/743 [00:00<00:00, 743kB/s]
model.safetensors: 100%|██████████| 133M/133M [00:41<00:00, 3.25MB/s] 
tokenizer_config.json: 100%|██████████| 366/366 [00:00<?, ?B/s] 
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 5.89MB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 3.02MB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<?, ?B/s] 


In [25]:
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.postprocessor import SentenceTransformerRerank


def get_query_engine(sentence_index, similarity_top_k=6, rerank_top_n=2):
    """Create a query engine over a sentence-window index.

    Args:
        sentence_index: Index to query; its nodes are expected to carry a
            "window" metadata entry (the key used by get_build_index).
        similarity_top_k: Number of nodes retrieved per query.
        rerank_top_n: Number of nodes kept after reranking.

    Returns:
        The query engine produced by ``sentence_index.as_query_engine``.
    """
    # Swap in the text stored under the "window" metadata key for each
    # retrieved node.
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

    # Rerank the retrieved nodes and keep only the best rerank_top_n.
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )

    # Order matters: the metadata replacement runs before the reranker.
    engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[postproc, rerank],
    )

    return engine

In [26]:
# Retrieve 6 candidate nodes per query and keep the top 2 after reranking.
query_engine = get_query_engine(
    sentence_index=vector_index,
    similarity_top_k=6,
    rerank_top_n=2,
)


config.json: 100%|██████████| 799/799 [00:00<?, ?B/s] 
model.safetensors: 100%|██████████| 1.11G/1.11G [08:33<00:00, 2.17MB/s]
tokenizer_config.json: 100%|██████████| 443/443 [00:00<?, ?B/s] 
sentencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:01<00:00, 2.56MB/s]
tokenizer.json: 100%|██████████| 17.1M/17.1M [00:11<00:00, 1.49MB/s]
special_tokens_map.json: 100%|██████████| 279/279 [00:00<?, ?B/s] 


In [32]:
# Smoke-test the pipeline with a broad question about the indexed PDF.
query = "what is this PDF about?"
response = query_engine.query(query)
print(response)

Llama.generate: prefix-match hit

llama_print_timings:        load time =   93227.21 ms
llama_print_timings:      sample time =      35.35 ms /   120 runs   (    0.29 ms per token,  3394.72 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   35030.69 ms /   120 runs   (  291.92 ms per token,     3.43 tokens per second)
llama_print_timings:       total time =   35470.29 ms /   121 tokens


 Based on the provided context information, this PDF appears to be a Python tutorial designed for beginners, students, and experienced professionals looking to enhance their skills in software development and data science using the Python programming language. The tutorial is based on the latest Python 3.11.2 version and emphasizes Python's beginner-friendly features such as being interpreted, interactive, object-oriented, high-level, dynamically-typed, and garbage-collected. The tutorial also highlights the importance of Python's interactive shell in getting familiar with the language and testing library functionality.
