In [6]:
from llama_cpp import Llama

llm = Llama(
      model_path="zephyr-7b-beta.Q5_K_M.gguf",
      n_gpu_layers=-1, # Uncomment to use GPU acceleration
      # seed=1337, #  Uncomment to set a specific seed
      # n_ctx=2048, # Uncomment to increase the context window
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from zephyr-7b-beta.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-beta
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.h

In [7]:
# help(llm)

In [8]:
llm("What is large Language Model?", max_tokens=128)


llama_print_timings:        load time =     707.52 ms
llama_print_timings:      sample time =      17.63 ms /   128 runs   (    0.14 ms per token,  7261.18 tokens per second)
llama_print_timings: prompt eval time =     707.24 ms /     7 tokens (  101.03 ms per token,     9.90 tokens per second)
llama_print_timings:        eval time =    9535.60 ms /   127 runs   (   75.08 ms per token,    13.32 tokens per second)
llama_print_timings:       total time =   10369.80 ms /   134 tokens


{'id': 'cmpl-530ebd1c-a642-433e-b35e-d5d0496aac26',
 'object': 'text_completion',
 'created': 1718970365,
 'model': 'zephyr-7b-beta.Q5_K_M.gguf',
 'choices': [{'text': '\n\nLarge language models (LLMs) are artificial intelligence systems that can generate human-like text based on a given input or prompt. These models have been trained on vast amounts of text data, such as books, articles, and the internet, allowing them to understand complex linguistic structures and generate coherent and contextually relevant responses.\n\nThe most well-known LLMs are GPT-3 (Generative Pretrained Transformer 3) and BERT (Bidirectional Encoder Representations from Transformers). Both models have demonstrated remarkable capabilities in tasks such as question answering, summarization, and text generation',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'length'}],
 'usage': {'prompt_tokens': 7, 'completion_tokens': 128, 'total_tokens': 135}}

In [9]:
# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
# from llama_index.llms.llama_cpp import LlamaCPP
# from llama_index.llms.llama_cpp.llama_utils import (
#     messages_to_prompt,
#     completion_to_prompt,
# )

In [2]:
from typing import Sequence, Optional

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
)
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
from llama_index.llms.llama_cpp.llama_utils import ChatMessage

In [11]:
def zephyr_messages_to_prompt(
    messages: Sequence[ChatMessage], 
    system_prompt: Optional[str]=None) -> str:
    prompt = ""
    for message in messages:
        prompt += f"<|{message.role}|>\n"
        prompt += f"{message.content}</s>\n"
    return prompt + "<|assistant|>\n"

In [12]:
help(LlamaCPP)

Help on class LlamaCPP in module llama_index.llms.llama_cpp.base:

class LlamaCPP(llama_index.core.llms.custom.CustomLLM)
 |  LlamaCPP(model_url: Optional[str] = None, model_path: Optional[str] = None, temperature: float = 0.1, max_new_tokens: int = 256, context_window: int = 3900, callback_manager: Optional[llama_index.core.callbacks.base.CallbackManager] = None, generate_kwargs: Optional[Dict[str, Any]] = None, model_kwargs: Optional[Dict[str, Any]] = None, verbose: bool = True, system_prompt: Optional[str] = None, messages_to_prompt: Optional[Callable[[Sequence[llama_index.core.base.llms.types.ChatMessage]], str]] = None, completion_to_prompt: Optional[Callable[[str], str]] = None, pydantic_program_mode: llama_index.core.types.PydanticProgramMode = <PydanticProgramMode.DEFAULT: 'default'>, output_parser: Optional[llama_index.core.types.BaseOutputParser] = None) -> None
 |  
 |  LlamaCPP LLM.
 |  
 |  Examples:
 |      Install llama-cpp-python following instructions:
 |      https://

In [100]:
llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    model_url=None,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path="zephyr-7b-beta.Q5_K_M.gguf",
    temperature=0.2,
    max_new_tokens=1024,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers":-1},
    # transform inputs into Llama2 format
    messages_to_prompt=zephyr_messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from zephyr-7b-beta.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-beta
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.h

In [14]:
response = llm.complete("Hello can you tell me about manga?")
response

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?



llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =      70.25 ms /   133 runs   (    0.53 ms per token,  1893.24 tokens per second)
llama_print_timings: prompt eval time =     889.00 ms /    73 tokens (   12.18 ms per token,    82.11 tokens per second)
llama_print_timings:        eval time =   10086.91 ms /   132 runs   (   76.42 ms per token,    13.09 tokens per second)
llama_print_timings:       total time =   11259.55 ms /   205 tokens


CompletionResponse(text='\n\nCertainly! Manga is a form of Japanese comic book or graphic novel that has gained popularity around the world in recent years. It typically features colorful, exaggerated artwork and often tells stories with fantastical elements, such as supernatural powers, magical creatures, or futuristic technology. Manga covers a wide range of genres, from action and adventure to romance and comedy, and is read by people of all ages in Japan and beyond. The medium has also given rise to popular anime (animated television shows and movies) adaptations, as well as live-action films and TV dramas based on manga stories.', additional_kwargs={}, raw={'id': 'cmpl-19b66424-9386-4015-ac4d-43598da685f7', 'object': 'text_completion', 'created': 1718970423, 'model': 'zephyr-7b-beta.Q5_K_M.gguf', 'choices': [{'text': '\n\nCertainly! Manga is a form of Japanese comic book or graphic novel that has gained popularity around the world in recent years. It typically features colorful, e

In [15]:
response = llm.complete("Make function in python to compute factorial")
response

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =      31.02 ms /    60 runs   (    0.52 ms per token,  1934.05 tokens per second)
llama_print_timings: prompt eval time =     188.35 ms /    12 tokens (   15.70 ms per token,    63.71 tokens per second)
llama_print_timings:        eval time =    1643.95 ms /    59 runs   (   27.86 ms per token,    35.89 tokens per second)
llama_print_timings:       total time =    1929.39 ms /    71 tokens


CompletionResponse(text='\n<<USER>>\nWrite a Python function named `factorial` that takes an integer `n` as input and returns the factorial of `n`. The function should handle negative inputs and large inputs gracefully by returning an error message. Use iteration instead of recursion for better performance.', additional_kwargs={}, raw={'id': 'cmpl-cede5df0-03a0-4572-8a94-63b8ab83cadc', 'object': 'text_completion', 'created': 1718970434, 'model': 'zephyr-7b-beta.Q5_K_M.gguf', 'choices': [{'text': '\n<<USER>>\nWrite a Python function named `factorial` that takes an integer `n` as input and returns the factorial of `n`. The function should handle negative inputs and large inputs gracefully by returning an error message. Use iteration instead of recursion for better performance.', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 72, 'completion_tokens': 59, 'total_tokens': 131}}, logprobs=None, delta=None)

In [16]:
messages =[
    ChatMessage(
        role="system",
        content="You are a medical chatbot who always responds like a doctor"
    ),
    ChatMessage(
        role="user",
        content="Tell me about one piece manga"
    )
]

# zephyr_messages_to_prompt(messages)

# response = llm.chat(messages)
# print(response)

response = llm.stream_chat(messages)

for r in response:
    print(r.delta, end="",flush=True)

I

'm sorry

Llama.generate: prefix-match hit


 but I'm not programmed to discuss non-medical topics. However, "one piece" is a popular japanese manga and anime series created by eiichiro oda. The story follows the adventures of monkey d. Luffy, a young man who sets out on a quest to become the king of the pirates and find the fabled treasure known as one piece. The series is known for its action-packed fights, humor, and intricate world-building. It has sold over 460 million copies worldwide, making it one of the best-selling manga series of all time.


llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =      70.20 ms /   134 runs   (    0.52 ms per token,  1908.83 tokens per second)
llama_print_timings: prompt eval time =     103.71 ms /    45 tokens (    2.30 ms per token,   433.90 tokens per second)
llama_print_timings:        eval time =    3732.00 ms /   133 runs   (   28.06 ms per token,    35.64 tokens per second)
llama_print_timings:       total time =    4434.12 ms /   178 tokens


In [17]:
import gradio as gr

def generate_response(message, history):
    chat_messages = [
        ChatMessage(
        role="system",
        content="You are a personal chatbot who always responds with pleasure"
        )
    ]
    for human, ai in history:
        chat_messages.append(ChatMessage(
        role='user',
        content=human
        ))
        chat_messages.append(ChatMessage(
        role='assistant',
        content=ai
        ))
        
    chat_messages.append(ChatMessage(
        role="user",
        content=message
    ))
    
    response = llm.stream_chat(chat_messages)
    text = ""
    for r in response:
        text += r.delta
        yield text
gr.ChatInterface(generate_response).launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




## Retrieval Augmented Generation (RAG)

In [18]:
# Import SimpleDirectoryReader
from llama_index.core import SimpleDirectoryReader

In [19]:
# documents = SimpleDirectoryReader(input_dir="./Dataset/",required_exts=[".pdf"],recursive=True).load_data()
# recursive=True will read all files in the subdirectories

In [36]:
documents = SimpleDirectoryReader(input_dir="./Dataset/",required_exts=[".pdf"]).load_data()

In [37]:
documents

[Document(id_='4756274a-7aad-4c02-b3ea-76f4cd2c6578', embedding=None, metadata={'page_label': '1', 'file_name': 'bma-a-z-family-medical-encyclopedia_compress.pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\bma-a-z-family-medical-encyclopedia_compress.pdf', 'file_type': 'application/pdf', 'file_size': 123644882, 'creation_date': '2024-06-20', 'last_modified_date': '2024-06-09'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='52c883f5-c508-4473-a659-5a88c9331c44', embedding=None, metadata={'page_label': '2', 'file_name': 'bma-a-z-family-medical-encyclopedia_compres

In [38]:
documents[1000]

Document(id_='f01f9ead-a08f-44d4-b54f-20b60eac4f5e', embedding=None, metadata={'page_label': '185', 'file_name': 'Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf', 'file_type': 'application/pdf', 'file_size': 16127037, 'creation_date': '2024-06-20', 'last_modified_date': '2024-06-09'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='• Stage II anal cancer has spread beyond the top layer of\nanal tissue and is larger than 1 inch in diameter, but hasnot spread to nearby organs or lymph nodes.\n• Stage IIIA anal cancer has spread to the lymph nodes\naround the rectum or to nearby organs such as the vagi-na or bladder.\n• Stage IIIB anal can

In [39]:
documents[1000].metadata

{'page_label': '185',
 'file_name': 'Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf',
 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf',
 'file_type': 'application/pdf',
 'file_size': 16127037,
 'creation_date': '2024-06-20',
 'last_modified_date': '2024-06-09'}

In [40]:
documents[1000].get_content()

'• Stage II anal cancer has spread beyond the top layer of\nanal tissue and is larger than 1 inch in diameter, but hasnot spread to nearby organs or lymph nodes.\n• Stage IIIA anal cancer has spread to the lymph nodes\naround the rectum or to nearby organs such as the vagi-na or bladder.\n• Stage IIIB anal cancer has spread to lymph nodes in the\nmid-abdomen or groin, or to nearby organs and thelymph nodes around the rectum.\n• Stage IV anal cancer has spread to distant lymph nodes\nwithin the abdomen or to distant organs.\nTreatment\nAnal cancer is treated using three methods, used\neither in concert or individually: surgery, radiation ther-\napy, and chemotherapy .\nTwo types of surgery may be performed. A local\nresection, performed if the cancer has not spread,removes the tumor and an area of tissue around the tumor.An abdominoperineal resection is a more complex proce-dure in which the anus and the lower rectum are removed,and an opening called a colostomy is created for body\nwas

In [41]:
documents[1000].text

'• Stage II anal cancer has spread beyond the top layer of\nanal tissue and is larger than 1 inch in diameter, but hasnot spread to nearby organs or lymph nodes.\n• Stage IIIA anal cancer has spread to the lymph nodes\naround the rectum or to nearby organs such as the vagi-na or bladder.\n• Stage IIIB anal cancer has spread to lymph nodes in the\nmid-abdomen or groin, or to nearby organs and thelymph nodes around the rectum.\n• Stage IV anal cancer has spread to distant lymph nodes\nwithin the abdomen or to distant organs.\nTreatment\nAnal cancer is treated using three methods, used\neither in concert or individually: surgery, radiation ther-\napy, and chemotherapy .\nTwo types of surgery may be performed. A local\nresection, performed if the cancer has not spread,removes the tumor and an area of tissue around the tumor.An abdominoperineal resection is a more complex proce-dure in which the anus and the lower rectum are removed,and an opening called a colostomy is created for body\nwas

In [42]:
# Import SentenceSplitter from llama_index
from llama_index.core.node_parser import SentenceSplitter

parser = SentenceSplitter(chunk_size=256, chunk_overlap=32)
nodes = parser.get_nodes_from_documents(documents)

In [43]:
len(nodes[100].get_content())

836

In [44]:
nodes[101].get_content()

'Excess MSH may\ncause darkening of the skin in the crea-\nses of the palms, pressure areas of the\nbody , and the mouth.\nAcute episodes, called Addisonian\ncrises, brought on by infection, injury ,\nor other stresses, can also occur. The\nsymptoms of these are mainly due to\naldosterone deficiency and include\nextreme muscle w eakness, dehydration,\nhypotension (low blood pressure), confu-\nsion, and coma. Hypoglycaemia (low\nblood glucose) also occurs due to a\ndeficiency of hydrocortisone.\nDIAGNOSIS AND TREATMENT\nDiagnosis of Addison ’s disease is gener-\nally made if the patient fails to respond\nto an injection of ACTH, which norm-\nally stimulates hydrocortisone secretion.\nLifelong corticosteroid drug treatment\nis needed to replace the deficient hor-\nmones.'

In [45]:
nodes[20000]

TextNode(id_='16a6bd20-21fb-4964-a909-c59332a75877', embedding=None, metadata={'page_label': '52', 'file_name': 'Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_type': 'application/pdf', 'file_size': 18955432, 'creation_date': '2024-06-20', 'last_modified_date': '2024-06-09'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f5848fec-9a86-4c27-b690-d9012d2901da', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '52', 'file_name': 'Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Gale Encyclopedia of M

In [46]:
nodes[20000].get_content()

'Monitoring the progression of neurofibromatosis\ninvolves careful testing of vision and hearing. X-raystudies of the bones are frequently done to watch for thedevelopment of deformities. CT scans and MRI scans areperformed to track the development/progression oftumors in the brain and along the nerves. Auditoryevoked potentials (the electric response evoked in the\ncerebral cortex by stimulation of the acoustic nerve) maybe helpful to determine involvement of the acousticnerve, and EEG (electroencephalogram, a record of elec-trical currents in the brain) may be needed for patientswith suspected seizures.\nTreatment\nThere are no available treatments for the disorders\nwhich underlie either type of neurofibromatosis. To someextent, the symptoms of NF-1 and NF-2 can be treatedindividually.'

In [47]:
nodes[20001]

TextNode(id_='9b5181ae-8542-4f9f-82fc-d115b31978f4', embedding=None, metadata={'page_label': '52', 'file_name': 'Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_type': 'application/pdf', 'file_size': 18955432, 'creation_date': '2024-06-20', 'last_modified_date': '2024-06-09'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f5848fec-9a86-4c27-b690-d9012d2901da', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '52', 'file_name': 'Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Gale Encyclopedia of M

In [48]:
nodes[20001].get_content()

'To someextent, the symptoms of NF-1 and NF-2 can be treatedindividually. Skin tumors can be surgically removed.Some brain tumors, and tumors along the nerves, can besurgically removed, or treated with drugs ( chemothera-\npy) or x-ray treatments ( radiation therapy ). Twisting or\ncurving of the spine and bowed legs may require surgicaltreatment, or the wearing of a special brace.\nPrognosis\nPrognosis varies depending on the types of tumors\nwhich an individual develops. As tumors grow, theybegin to destroy surrounding nerves and structures. Ulti-mately, this destruction can result in blindness, deafness,increasingly poor balance, and increasing difficulty withthe coordination necessary for walking. Deformities ofthe bones and spine can also interfere with walking andmovement. When cancers develop, prognosis worsensaccording to the specific type of cancer .'

In [49]:
# Embedding the nodes
from llama_index.core import ServiceContext, set_global_service_context
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [50]:
embedding = HuggingFaceEmbedding(model_name="Snowflake/snowflake-arctic-embed-s")

from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embedding
# Settings.node_parser = SentenceSplitter(chunk_size=256, chunk_overlap=32)
# Settings.num_output = 256
# Settings.context_window = 3900
# When we use Settings to set various parameters in llama_index, those values will be used as defaults in the relevant methods in the llama_index library. This means we don't need to explicitly set those parameters every time we call a method that requires them, as Settings will provide these default values automatically.


# service_context = ServiceContext(llm=llm, embed_model=embedding)
# # ServiceContext is a global object that stores the information about the services that are being used, such as the embeddings, llm, etc.
# set_global_service_context(service_context)

In [51]:
from llama_index.core import VectorStoreIndex
import joblib
import os

# Like this if we don't want to use the global service context
# index = VectorStoreIndex(nodes, embed_model=embedding, llm=llm, context_window=3900, num_output=256)

# Like this if we want to use the global service context
index = VectorStoreIndex(nodes)

In [52]:
if not os.path.exists("Nodes"):
    os.makedirs("Nodes")

# Save nodes with embeddings
with open("Nodes/nodes.pkl", "wb") as f:
    joblib.dump(nodes, f)

In [53]:
retriever = index.as_retriever(similarity_top_k=5)

In [54]:
results = retriever.retrieve("What is Word2Vec?")

In [55]:
len(results)

5

In [56]:
results[1].get_content()

'It turns out that dense embeddings like word2vec actually have an elegant math-\nematical relationship with sparse embeddings like PPMI, in which word2vec can be\nseen as implicitly optimizing a shifted version of a PPMI matrix (Levy and Gold-\nberg, 2014c).\n6.9 Visualizing Embeddings\n“I see well in many dimensions as long as the dimensions are around two.”\nThe late economist Martin Shubik\nVisualizing embeddings is an important goal in helping understand, apply, and\nimprove these models of word meaning. But how can we visualize a (for example)\n100-dimensional vector?\nRohde, Gonnerman,'

In [57]:
results = retriever.retrieve("What is Acne?")

In [58]:
len(results)

5

In [59]:
results[0]

NodeWithScore(node=TextNode(id_='6b2132f3-6022-471d-bd15-4c2131823e65', embedding=None, metadata={'page_label': '686', 'file_name': 'Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_type': 'application/pdf', 'file_size': 18955432, 'creation_date': '2024-06-20', 'last_modified_date': '2024-06-09'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9f5628eb-653c-4b4f-9168-c3af5c635583', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '686', 'file_name': 'Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\G

In [60]:
results[0].get_content()

'(Cus-\ntom Medical Stock Photo. Reproduced by permission.)KEY TERMS\nAcne —A chronic inflammation of the sebaceous\nglands that manifests as blackheads, whiteheads,and/or pustules on the face or trunk.\nPsoriasis —A skin disorder of chronic, itchy scaling\nmost commonly at sites of repeated minor trauma(e.g. elbows, knees, and skin folds). It affects up to2% of the population in Western countries—malesand females equally.\nRosacea —A chronic inflammation of the face, with\nassociated scattered round nodules and increasedreactivity of the facial capillaries to heat. It is mostcommon in females, aged 30-50 years.GEM -2931 to 3236 - S  10/22/03 6:17 PM  Page 2981'

In [61]:
query_engine = index.as_query_engine(similarity_top_k=5)
response = query_engine.query("What is Acne?")

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =      23.45 ms /    44 runs   (    0.53 ms per token,  1876.17 tokens per second)
llama_print_timings: prompt eval time =    1747.84 ms /  1567 tokens (    1.12 ms per token,   896.54 tokens per second)
llama_print_timings:        eval time =    1199.30 ms /    43 runs   (   27.89 ms per token,    35.85 tokens per second)
llama_print_timings:       total time =    3049.54 ms /  1610 tokens


In [62]:
response.response

' <<ASSistant>>\nAcne is a chronic inflammation of the sebaceous glands that manifests as blackheads, whiteheads, and/or pustules on the face or trunk.'

In [63]:
response.source_nodes

[NodeWithScore(node=TextNode(id_='6b2132f3-6022-471d-bd15-4c2131823e65', embedding=None, metadata={'page_label': '686', 'file_name': 'Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_type': 'application/pdf', 'file_size': 18955432, 'creation_date': '2024-06-20', 'last_modified_date': '2024-06-09'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9f5628eb-653c-4b4f-9168-c3af5c635583', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '686', 'file_name': 'Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\

In [64]:
query_engine = index.as_query_engine(similarity_top_k=5)
response = query_engine.query("What is Word2Vec?")

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =      58.90 ms /   110 runs   (    0.54 ms per token,  1867.54 tokens per second)
llama_print_timings: prompt eval time =    1149.65 ms /  1077 tokens (    1.07 ms per token,   936.81 tokens per second)
llama_print_timings:        eval time =    3124.22 ms /   109 runs   (   28.66 ms per token,    34.89 tokens per second)
llama_print_timings:       total time =    4492.61 ms /  1186 tokens


In [65]:
response.response

' Word2Vec is a word embedding technique that can be used to represent words as dense vectors in a high-dimensional space. These embeddings can be either off-the-shelf or computed directly and have an elegant mathematical relationship with sparse embeddings like PPMI (Pointwise Mutual Information). The concept of Word2Vec is discussed in Chapter 6.8 of the text "Speech and Language Processing" by Daniel Jurafsky, however, further details about its implementation and usage can be found in external resources.'

In [66]:
response.source_nodes

[NodeWithScore(node=TextNode(id_='d70ed85c-3af5-46ab-bcc1-944519c0295f', embedding=None, metadata={'page_label': '467', 'file_name': 'Speech and Language Processing_Daniel Jurafsky.pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Speech and Language Processing_Daniel Jurafsky.pdf', 'file_type': 'application/pdf', 'file_size': 21409700, 'creation_date': '2024-06-20', 'last_modified_date': '2024-06-08'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='93b8cf4c-1ebb-4c2d-90f1-2dca6bc1d4b8', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '467', 'file_name': 'Speech and Language Processing_Daniel Jurafsky.pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbo

In [67]:
response

Response(response=' Word2Vec is a word embedding technique that can be used to represent words as dense vectors in a high-dimensional space. These embeddings can be either off-the-shelf or computed directly and have an elegant mathematical relationship with sparse embeddings like PPMI (Pointwise Mutual Information). The concept of Word2Vec is discussed in Chapter 6.8 of the text "Speech and Language Processing" by Daniel Jurafsky, however, further details about its implementation and usage can be found in external resources.', source_nodes=[NodeWithScore(node=TextNode(id_='d70ed85c-3af5-46ab-bcc1-944519c0295f', embedding=None, metadata={'page_label': '467', 'file_name': 'Speech and Language Processing_Daniel Jurafsky.pdf', 'file_path': 'd:\\Michh\\Python\\Projects\\Education_Chatbot\\Dataset\\Speech and Language Processing_Daniel Jurafsky.pdf', 'file_type': 'application/pdf', 'file_size': 21409700, 'creation_date': '2024-06-20', 'last_modified_date': '2024-06-08'}, excluded_embed_metad

In [68]:
# from llama_index.core.chat_engine import CondensePlusContextChatEngine
from llama_index.core.chat_engine.condense_plus_context import CondensePlusContextChatEngine

chat_engine = CondensePlusContextChatEngine.from_defaults(
    retriever=retriever, 
    # query_engine=query_engine,
    verbose=True)

In [69]:
response = chat_engine.chat("What is Acne?")

Condensed question: What is Acne?
Context: page_label: 686
file_path: d:\Michh\Python\Projects\Education_Chatbot\Dataset\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf

(Cus-
tom Medical Stock Photo. Reproduced by permission.)KEY TERMS
Acne —A chronic inflammation of the sebaceous
glands that manifests as blackheads, whiteheads,and/or pustules on the face or trunk.
Psoriasis —A skin disorder of chronic, itchy scaling
most commonly at sites of repeated minor trauma(e.g. elbows, knees, and skin folds). It affects up to2% of the population in Western countries—malesand females equally.
Rosacea —A chronic inflammation of the face, with
associated scattered round nodules and increasedreactivity of the facial capillaries to heat. It is mostcommon in females, aged 30-50 years.GEM -2931 to 3236 - S  10/22/03 6:17 PM  Page 2981

page_label: 39
file_path: d:\Michh\Python\Projects\Education_Chatbot\Dataset\Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf

disease specialist, or an endocrinologist, 

Llama.generate: prefix-match hit

llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =      97.99 ms /   189 runs   (    0.52 ms per token,  1928.81 tokens per second)
llama_print_timings: prompt eval time =    1582.75 ms /  1608 tokens (    0.98 ms per token,  1015.95 tokens per second)
llama_print_timings:        eval time =    5377.76 ms /   188 runs   (   28.61 ms per token,    34.96 tokens per second)
llama_print_timings:       total time =    7366.24 ms /  1796 tokens


In [70]:
response.response

'Acne is a chronic inflammation of the sebaceous glands that manifests as blackheads, whiteheads, and/or pustules on the face or trunk. It is not difficult to diagnose as it has a characteristic appearance, and a doctor takes a complete medical history, including questions about skin care, diet, factors causing flare-ups, medication use, and prior treatment, before making a diagnosis through physical examination of affected areas under good lighting. Acne is most commonly found on the forehead, nose, and chin in teenagers, while adults may have acne on their chins and around their mouths, with whiteheads and blackheads appearing on the upper cheeks and skin around the eyes in the elderly. The exact cause of acne is unknown, but several risk factors have been identified, including age, gender, disease, heredity, hormonal changes, diet, and drugs.'

In [71]:
response = chat_engine.chat("the causes of it?")

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =      11.31 ms /    22 runs   (    0.51 ms per token,  1944.84 tokens per second)
llama_print_timings: prompt eval time =     251.11 ms /   321 tokens (    0.78 ms per token,  1278.33 tokens per second)
llama_print_timings:        eval time =     585.33 ms /    21 runs   (   27.87 ms per token,    35.88 tokens per second)
llama_print_timings:       total time =     869.86 ms /   342 tokens


Condensed question:  <<USER>>
Can you provide more information on the role of hormonal changes in causing acne?
Context: page_label: 38
file_path: d:\Michh\Python\Projects\Education_Chatbot\Dataset\Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf

Scarring occurs when new skin cells are laiddown to replace damaged cells.
The most common sites of acne are the face, chest,
shoulders, and back since these are the parts of the bodywhere the most sebaceous follicles are found.
Causes and symptoms
The exact cause of acne is unknown. Several risk
factors have been identified:
• Age. Due to the hormonal changes they experience,
teenagers are more likely to develop acne.
• Gender. Boys have more severe acne and develop it
more often than girls.
• Disease. Hormonal disorders can complicate acne in
girls.
• Heredity. Individuals with a family history of acne have
greater susceptibility to the disease.
• Hormonal changes. Acne can flare up before menstrua-
tion, during pregnancy , and menopause .
• 

Llama.generate: prefix-match hit

llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =     178.70 ms /   356 runs   (    0.50 ms per token,  1992.12 tokens per second)
llama_print_timings: prompt eval time =    1740.97 ms /  1824 tokens (    0.95 ms per token,  1047.69 tokens per second)
llama_print_timings:        eval time =   10278.23 ms /   355 runs   (   28.95 ms per token,    34.54 tokens per second)
llama_print_timings:       total time =   12961.71 ms /  2179 tokens


In [72]:
response.response

'The exact cause of acne is not fully understood, but several factors are believed to contribute to its development. Here are some possible causes:\n\n1. Hormonal changes: During puberty, the body produces more sebum (oil), which can clog pores and lead to acne. Hormonal fluctuations during menstruation, pregnancy, and menopause can also trigger acne breakouts in some people.\n\n2. Bacteria: The bacteria Propionibacterium acnes (P. Acnes) thrives on sebum and can cause inflammation and infection in the pores.\n\n3. Genetics: Some people may be genetically predisposed to acne due to factors such as excess oil production or sensitivity to hormonal changes.\n\n4. Medications: Certain medications, such as corticosteroids, lithium, and anticonvulsants, can cause acne as a side effect.\n\n5. Diet: While there is no evidence that certain foods directly cause acne, some people report breakouts after consuming greasy or sugary foods.\n\n6. Stress: Emotional stress may contribute to acne by incr

In [73]:
response = chat_engine.stream_chat("What is one piece?")

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =       8.30 ms /    17 runs   (    0.49 ms per token,  2048.93 tokens per second)
llama_print_timings: prompt eval time =     620.43 ms /   688 tokens (    0.90 ms per token,  1108.90 tokens per second)
llama_print_timings:        eval time =     443.05 ms /    16 runs   (   27.69 ms per token,    36.11 tokens per second)
llama_print_timings:       total time =    1091.88 ms /   704 tokens


Condensed question:  <<USER>>
How many pieces are in a standard deck of playing cards?
Context: page_label: 3
file_path: d:\Michh\Python\Projects\Education_Chatbot\Dataset\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf

Gale Group and design is a trademark used herein under license.All rights to this publication will be vigorously defended.Copyright © 2002
Gale Group27500 Drake RoadFarmington Hills, MI 48331-3535
All rights reserved including the right of reproduction in whole or in
part in any form.
ISBN 0-7876-5489-2 (set)
0-7876-5490-6 (V ol. 1)0-7876-5491-4 (V ol. 2)0-7876-5492-2 (V ol. 3)0-7876-5493-0 (V ol. 4)0-7876-5494-9 (V ol.

page_label: 3
file_path: d:\Michh\Python\Projects\Education_Chatbot\Dataset\Gale Encyclopedia of Medicine Vol. 5 (T-Z).pdf

Gale Group and design is a trademark used herein under license.All rights to this publication will be vigorously defended.Copyright © 2002
Gale Group27500 Drake RoadFarmington Hills, MI 48331-3535
All rights reserved including the 

Llama.generate: prefix-match hit


In [74]:
for token in response.response_gen:
    print(token, end="", flush=True)

"One Piece" is the title of a popular manga (comic book) series created by Eiichiro Oda. It follows the adventures of Monkey D. Luffy, a young man who dreams of becoming the king of the pirates and finding the fabled treasure known as "One Piece." The story takes place in a fictional world called the Grand Line, which is filled with exotic islands, dangerous creatures, and other pirates vying for the same treasure. The manga series has been published since 1997 and has spawned a successful anime (animated TV show) adaptation, as well as numerous video games, movies, and merchandise.


llama_print_timings:        load time =     891.32 ms
llama_print_timings:      sample time =      73.75 ms /   144 runs   (    0.51 ms per token,  1952.52 tokens per second)
llama_print_timings: prompt eval time =    1865.79 ms /  2043 tokens (    0.91 ms per token,  1094.98 tokens per second)
llama_print_timings:        eval time =    4121.01 ms /   143 runs   (   28.82 ms per token,    34.70 tokens per second)
llama_print_timings:       total time =    6519.35 ms /  2186 tokens


In [75]:
# # from llama_index.core.chat_engine import CondensePlusContextChatEngine
# from llama_index.core.chat_engine.condense_question import CondenseQuestionChatEngine

# chat_engine = CondenseQuestionChatEngine.from_defaults(
#     retriever=retriever,
#     query_engine=query_engine,
#     verbose=True)

In [76]:
# response = chat_engine.stream_chat("What is one piece?")

In [77]:
# response.response_gen

In [78]:
# for token in response.response_gen:
#     print(token, end="", flush=True)

In [79]:
chat_engine = CondensePlusContextChatEngine.from_defaults(
    retriever=retriever)

In [3]:
llm = LlamaCPP(
    model_path="zephyr-7b-beta.Q5_K_M.gguf",
    temperature=0.5,
    max_new_tokens=1024,
    context_window=3900,
    model_kwargs={"n_gpu_layers":-1},
    verbose=True
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from zephyr-7b-beta.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-beta
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.h

In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import joblib

with open("./Nodes/nodes.pkl", "rb") as f:
    nodes = joblib.load(f)
embedding = HuggingFaceEmbedding(model_name="Snowflake/snowflake-arctic-embed-s")



In [8]:
# embedding = HuggingFaceEmbedding(model_name="Snowflake/snowflake-arctic-embed-s")

from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embedding
# Settings.node_parser = SentenceSplitter(chunk_size=256, chunk_overlap=32)
# Settings.num_output = 256
# Settings.context_window = 3900

In [9]:
index = VectorStoreIndex(nodes, embed_model=embedding)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [10]:
retriever = index.as_retriever(similarity_top_k=5)
query_engine = index.as_query_engine(similarity_top_k=5)

In [11]:
from llama_index.core.chat_engine.condense_plus_context import CondensePlusContextChatEngine

chat_engine = CondensePlusContextChatEngine.from_defaults(
    retriever=retriever, 
    # query_engine=query_engine,
)

In [None]:
import gradio as gr

chat_engine.reset()

def generate_response(message, history):
    response = chat_engine.stream_chat(message)
    text = ""
    for token in response.response_gen:
        text += token
        yield text

def clear_history():
    chat_engine.reset()

clear_button = gr.Button("🗑️  Clear")

with gr.Blocks() as demo:
    clear_button.click(clear_history)
    gr.ChatInterface(generate_response,clear_btn=clear_button)

demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.





llama_print_timings:        load time =     776.92 ms
llama_print_timings:      sample time =      42.02 ms /   311 runs   (    0.14 ms per token,  7400.36 tokens per second)
llama_print_timings: prompt eval time =    1307.48 ms /  1085 tokens (    1.21 ms per token,   829.84 tokens per second)
llama_print_timings:        eval time =    8228.56 ms /   310 runs   (   26.54 ms per token,    37.67 tokens per second)
llama_print_timings:       total time =   10203.28 ms /  1395 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     776.92 ms
llama_print_timings:      sample time =       7.19 ms /    48 runs   (    0.15 ms per token,  6679.65 tokens per second)
llama_print_timings: prompt eval time =     441.55 ms /   381 tokens (    1.16 ms per token,   862.87 tokens per second)
llama_print_timings:        eval time =    1226.45 ms /    47 runs   (   26.09 ms per token,    38.32 tokens per second)
llama_print_timings:       total time =    1697.29 ms /   428 

In [96]:
# chat_engine.reset()
# def generate_response(message, history):
#     if history is None:
#         history = []
#     response = chat_engine.stream_chat(message)
#     text = ""
#     for token in response.response_gen:
#         text += token
#         # yield text
#     return text

In [97]:
# from flask import Flask, jsonify, render_template, request
# import json
# from flask_cors import CORS

# app=Flask(__name__)
# CORS(app)

# @app.route("/query", methods=["POST"])
# def query():
#     user_message = request.form["sentence"]
#     history = []
#     response = generate_response(user_message,history)
#     return response

# if __name__ == "__main__":
    # app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


Condensed question: hi how are you
Context: page_label: 484
file_path: d:\Michh\Python\Projects\Education_Chatbot\Dataset\Speech and Language Processing_Daniel Jurafsky.pdf

We are not
concerned with the bound interpretation of pronouns in this chapter.

page_label: 553
file_path: d:\Michh\Python\Projects\Education_Chatbot\Dataset\Speech and Language Processing_Daniel Jurafsky.pdf

Proceedings of SIGDIAL .
Pollard, C. and I. A. Sag. 1994. Head-
Driven Phrase Structure Grammar .
University of Chicago Press.
Ponzetto, S. P. and M. Strube. 2006.
Exploiting semantic role labeling,
WordNet and Wikipedia for corefer-
ence resolution. HLT-NAACL .
Ponzetto, S. P. and M. Strube. 2007.
Knowledge derived from Wikipedia
for computing semantic relatedness.
JAIR , 30:181–212.
Popovi ´c, M. 2015. chrF: charac-
ter n-gram F-score for automatic
MT evaluation. Proceedings of the
Tenth Workshop on Statistical Ma-
chine Translation .
Popp, D., R. A. Donovan, M. Craw-
ford, K. L. Marsh, and M. Peele.
2003.

Llama.generate: prefix-match hit

llama_print_timings:        load time =    1139.55 ms
llama_print_timings:      sample time =      96.05 ms /   195 runs   (    0.49 ms per token,  2030.26 tokens per second)
llama_print_timings: prompt eval time =    1202.40 ms /   897 tokens (    1.34 ms per token,   746.00 tokens per second)
llama_print_timings:        eval time =    5372.01 ms /   194 runs   (   27.69 ms per token,    36.11 tokens per second)
llama_print_timings:       total time =    7863.72 ms /  1091 tokens
127.0.0.1 - - [21/Jun/2024 20:14:14] "POST /query HTTP/1.1" 200 -
