In [6]:
# - You need to install both C++ Desktop development from windows and Windows 10 SDK 
#!pip install llama-index
#!pip install llama-index-embeddings-huggingface
#!pip install llama-index-llms-llama-cpp

In [1]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
   
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path="models/mistral-7b-instruct-v0.2.Q2_K.gguf",
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 0},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from models/mistral-7b-instruct-v0.2.Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:             

In [9]:
response = llm.complete("Hello! Can you tell me a poem about cats and dogs?")
print(response.text)


llama_print_timings:        load time =   10909.03 ms
llama_print_timings:      sample time =      46.11 ms /   195 runs   (    0.24 ms per token,  4229.29 tokens per second)
llama_print_timings: prompt eval time =   10908.80 ms /    76 tokens (  143.54 ms per token,     6.97 tokens per second)
llama_print_timings:        eval time =   47497.83 ms /   194 runs   (  244.83 ms per token,     4.08 tokens per second)
llama_print_timings:       total time =   58960.88 ms /   270 tokens


 Certainly! Here's a classic poem that compares cats and dogs:

In a house of my own, there's a cat and there's a dog,
Each one with a different way to beg for a bog.
The cat with his paw, and the dog with his eyes,
They both have their ways to ensnare the cries.

The cat with his grace, and the dog with his might,
They share the same home in the soft, quiet night.
The cat with his purr, and the dog with his bark,
They each have their charms to leave their mark.

The cat with his secrets, and the dog with his loyalty,
They both have their stories to tell, in their own way,
In the house of my own, where the cat and the dog play,
Their bond is a testament to the joy of each day.


In [10]:
response_iter = llm.stream_complete("Can you write me a poem about fast cars?")
for response in response_iter:
    print(response.delta, end="", flush=True)

Llama.generate: prefix-match hit


 In the realm where metal beasts come to play,
Where engines roar and rubber hits the way,
Lies the thrill of speed, a captivating display,
Of fast cars, in their grand and glorious array.

Engines purring, revving with might,
Beneath the sun's resplendent light,
They race against the clock, day and night,
In the dance of power, an exhilarating sight.

Sleek bodies glisten in the morning dew,
Reflections of the world, a breathtaking view,
Through the windshield, life's complexities subdue,
As the fast cars weave through the landscape, ever new.

Their roar, a testament to human might,
A symphony of sound, an unforgettable sight,
In the realm of fast cars, we find our light,
A thrilling escape from the mundane, day or night.


llama_print_timings:        load time =   10909.03 ms
llama_print_timings:      sample time =      50.09 ms /   212 runs   (    0.24 ms per token,  4232.72 tokens per second)
llama_print_timings: prompt eval time =    2058.85 ms /    14 tokens (  147.06 ms per token,     6.80 tokens per second)
llama_print_timings:        eval time =   51999.47 ms /   211 runs   (  246.44 ms per token,     4.06 tokens per second)
llama_print_timings:       total time =   54806.53 ms /   225 tokens


In [3]:
response_iter = llm.stream_complete("Explain nuclear fission in detail?")
for response in response_iter:
    print(response.delta, end="", flush=True)

 Nuclear fission is a process where the nucleus of an atom is split into two or more smaller nuclei, releasing energy in the form of heat and radiation. This process is initiated most commonly by neutron irradiation of an atomic nucleus, which can cause the nucleus to become unstable and split.

The most common type of nuclear fission is that of uranium-235 (U-235). When a neutron collides with the U-235 nucleus, it causes the nucleus to become unstable and undergo fission. The U-235 nucleus can split into two smaller nuclei, such as barium-141 and krypton-92, along with some neutrons and gamma radiation. The total mass of the resulting nuclei is slightly less than that of the original U-235 nucleus, resulting in the release of energy in the form of heat.

The released neutrons can then go on to cause further fissions in other U-235 nuclei, leading to a chain reaction. This is the fundamental principle behind the operation of nuclear power plants and atomic bombs.

The energy released 


llama_print_timings:        load time =   15865.81 ms
llama_print_timings:      sample time =      58.48 ms /   256 runs   (    0.23 ms per token,  4377.27 tokens per second)
llama_print_timings: prompt eval time =   15865.66 ms /    71 tokens (  223.46 ms per token,     4.48 tokens per second)
llama_print_timings:        eval time =   60267.06 ms /   255 runs   (  236.34 ms per token,     4.23 tokens per second)
llama_print_timings:       total time =   77031.65 ms /   326 tokens


In [7]:
response_iter = llm.stream_complete("I have 10 apples and 2 oranges, I ate 13 apples and 1 orange. I stole 2 apples from my friend. How many apples do I have? - Note this might be a trick questin")
for response in response_iter:
    print(response.delta, end="", flush=True)

Llama.generate: prefix-match hit


 Based on the information provided, you initially had 10 apples, you ate 13 apples (10 apples that you had + 3 apples that were not mentioned but can be inferred were eaten), and you stole 2 apples. Therefore, you currently have 12 apples (10 apples that you initially had + 2 apples that you stole).


llama_print_timings:        load time =   15865.81 ms
llama_print_timings:      sample time =      18.02 ms /    84 runs   (    0.21 ms per token,  4662.26 tokens per second)
llama_print_timings: prompt eval time =    1500.48 ms /    13 tokens (  115.42 ms per token,     8.66 tokens per second)
llama_print_timings:        eval time =   19015.41 ms /    83 runs   (  229.10 ms per token,     4.36 tokens per second)
llama_print_timings:       total time =   20788.91 ms /    96 tokens


<h1> Query engine

In [13]:
from llama_index.core import set_global_tokenizer
from transformers import AutoTokenizer

set_global_tokenizer(
    AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2").encode
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
# use Huggingface embeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
# load documents
documents = SimpleDirectoryReader(
    "./data"
).load_data()

In [17]:
# create vector store index
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

In [21]:
# set up query engine
query_engine = index.as_query_engine(llm=llm,streaming=True)

In [22]:
streaming_response = query_engine.query(
    "What did the author do growing up?",
)

In [23]:
for text in streaming_response.response_gen:
    streaming_response.print_response_stream()


Llama.generate: prefix-match hit


 author grew up working on writing and programming outside of school. They wrote short stories with little plot and tried programming on an IBM 1401 computer in 9th grade, but couldn't figure out what to do with it due to the lack of input options. They were impressed by the microcomputers that came later, which allowed for real-time interaction, and started programming on a TRS-80 microcomputer in about 1980. Despite their interest in programming, they didn't plan to study it in college and instead pursued philosophy. However, they found philosophy courses to be boring and eventually switched to study Artificial Intelligence (AI) instead. They also discovered a passion for painting and started taking art classes at Harvard while still in a PhD program in computer science. They applied to art schools and eventually attended Rhode Island School of Design (RISD) for a BFA program.


llama_print_timings:        load time =   16329.43 ms
llama_print_timings:      sample time =      41.75 ms /   191 runs   (    0.22 ms per token,  4574.74 tokens per second)
llama_print_timings: prompt eval time =  122394.67 ms /   912 tokens (  134.20 ms per token,     7.45 tokens per second)
llama_print_timings:        eval time =   48871.36 ms /   190 runs   (  257.22 ms per token,     3.89 tokens per second)
llama_print_timings:       total time =  171973.91 ms /  1102 tokens


In [24]:
streaming_response = query_engine.query(
    "What is this story about?",
)

In [25]:
for text in streaming_response.response_gen:
    streaming_response.print_response_stream()


Llama.generate: prefix-match hit


 story is about the author's experiences with programming and writing before and during college. The author shares their early attempts at writing short stories and programming on an IBM 1401 computer in junior high school. They also discuss their later experiences with microcomputers and their fascination with the ability to write programs directly into the computer. The author also shares their college experiences, where they studied philosophy but found it to be uninteresting and eventually switched to Artificial Intelligence (AI).


llama_print_timings:        load time =   16329.43 ms
llama_print_timings:      sample time =      21.95 ms /   100 runs   (    0.22 ms per token,  4556.22 tokens per second)
llama_print_timings: prompt eval time =  204567.49 ms /  1534 tokens (  133.36 ms per token,     7.50 tokens per second)
llama_print_timings:        eval time =   25646.00 ms /    99 runs   (  259.05 ms per token,     3.86 tokens per second)
llama_print_timings:       total time =  230567.40 ms /  1633 tokens


In [27]:
streaming_response = query_engine.query(
    "tell me an interesting fact about in the story?",
)
for text in streaming_response.response_gen:
    streaming_response.print_response_stream()
print(response)

Llama.generate: prefix-match hit


 author's first experiences with programming were on an IBM 1401 computer in junior high school, where they were unable to write programs that relied on input from punched cards due to lacking data stored on such cards. They were also unable to write programs that didn't require input, as they didn't have enough mathematical knowledge to calculate interesting things. Their first clear memory of programming was learning that programs could run indefinitely without terminating, causing an issue for the data center manager. With microcomputers, they were able to write programs in real-time and started programming seriously, but they initially planned to study philosophy in college until they discovered that the field was mostly dealing with edge cases that other fields had ignored.


llama_print_timings:        load time =   16329.43 ms
llama_print_timings:      sample time =      34.58 ms /   152 runs   (    0.23 ms per token,  4395.73 tokens per second)
llama_print_timings: prompt eval time =   79551.71 ms /   590 tokens (  134.83 ms per token,     7.42 tokens per second)
llama_print_timings:        eval time =   38146.37 ms /   151 runs   (  252.62 ms per token,     3.96 tokens per second)
llama_print_timings:       total time =  118224.92 ms /   741 tokens


None


In [28]:
streaming_response = query_engine.query(
    "What is the author's family - tell me about his parents, siblings if any? Also tell me his food preferences",
)
for text in streaming_response.response_gen:
    streaming_response.print_response_stream()
print(response)

Llama.generate: prefix-match hit


 author does not provide any specific information about his parents or siblings in the provided context. As for food preferences, there is no mention of them in the text.


llama_print_timings:        load time =   16329.43 ms
llama_print_timings:      sample time =       7.24 ms /    34 runs   (    0.21 ms per token,  4695.48 tokens per second)
llama_print_timings: prompt eval time =  248260.23 ms /  1900 tokens (  130.66 ms per token,     7.65 tokens per second)
llama_print_timings:        eval time =    8522.72 ms /    33 runs   (  258.26 ms per token,     3.87 tokens per second)
llama_print_timings:       total time =  256904.83 ms /  1933 tokens


None


In [29]:
streaming_response = query_engine.query(
    "Where did he work, what all programmming languages did he create or code?",
)
for text in streaming_response.response_gen:
    streaming_response.print_response_stream()
print(response)

Llama.generate: prefix-match hit


 individual in the text describes his experiences with programming starting from the IBM 1401 computer in the late 1960s using an early version of Fortran, to microcomputers like the TRS-80 in the late 1970s and early 1980s when he wrote simple games and a word processor using simple BASIC. In college, he was more interested in philosophy but switched to Artificial Intelligence (AI) due to boredom with philosophy courses. He mentions McCarthy's Lisp but does not create or code it himself. Instead, he attempts to create a new Lisp interpreter called Bel from March 2015 to October 2019 in Arc using a collection of hacks to make it work. He writes essays during his work on Bel but stops writing them to focus on Bel's development. He moves to England in 2016 with his family and finishes Bel in the fall of 2019. Bel is a spec rather than an implementation expressed as code. The individual continues writing essays after Bel's completion. Therefore, the programming languages he mentions in th


llama_print_timings:        load time =   16329.43 ms
llama_print_timings:      sample time =      55.32 ms /   256 runs   (    0.22 ms per token,  4627.37 tokens per second)
llama_print_timings: prompt eval time =  247702.42 ms /  1891 tokens (  130.99 ms per token,     7.63 tokens per second)
llama_print_timings:        eval time =   66465.54 ms /   255 runs   (  260.65 ms per token,     3.84 tokens per second)
llama_print_timings:       total time =  315121.41 ms /  2146 tokens


None


In [30]:
streaming_response = query_engine.query(
    "Give me the list of people he has mentioned in the document?",
)
for text in streaming_response.response_gen:
    streaming_response.print_response_stream()
print(response)

Llama.generate: prefix-match hit


 document mentions the following people:

1. John McCarthy
2. Rich Draves
3. The data center manager
4. My friend who built his own microcomputer (name unmentioned)
5. My father
6. Paul Graham (mentioned in the present tense in the author's reflection)
7. The readers of HN (mentioned in the past tense)
8. The boys (mentioned in the present tense)
9. The data processing staff at the junior high school (implied)
10. The designers of Fortran (implied)
11. The designers of the IBM 1401 (implied)
12. The designers of the TRS-80 and Apple II (implied)
13. The designers of other computers (implied)
14. The authors of the other fields of study (implied)


llama_print_timings:        load time =   16329.43 ms
llama_print_timings:      sample time =      44.49 ms /   197 runs   (    0.23 ms per token,  4427.96 tokens per second)
llama_print_timings: prompt eval time =  253107.58 ms /  1887 tokens (  134.13 ms per token,     7.46 tokens per second)
llama_print_timings:        eval time =   53417.42 ms /   196 runs   (  272.54 ms per token,     3.67 tokens per second)
llama_print_timings:       total time =  307262.96 ms /  2083 tokens


None
