In [1]:
from colorama import Fore, Style

def print_message(message_type, message):
    if message_type == "INFO":
        print(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} {message}")
    elif message_type == "ERROR":
        print(f"{Fore.RED}[ERROR]{Style.RESET_ALL} {message}")
    elif message_type == "SUCCESS":
        print(f"{Fore.GREEN}[SUCESS]{Style.RESET_ALL} {message}")
    else:
        print(f"{message}")

print_message("INFO", "This is an info")
print_message("ERROR", "This is an error")

[33m[INFO][0m This is an info
[31m[ERROR][0m This is an error


In [2]:
import os
import requests
from tqdm import tqdm

In [3]:
def download_file_with_progress(url, output_path):
   
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        total_size = int(response.headers.get('content-length', 0))
        with open(output_path, 'wb') as file, tqdm(
            desc=output_path,
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as progress_bar:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
                    progress_bar.update(len(chunk))
        print_message("SUCCESS", "File has been successfully downloaded.")
    else:
        print_message("ERROR", f"Something went wrong. Status code: {response.status_code}")

In [4]:
def insert_pdf_file(pdf_file_name: str) -> str:
    if pdf_file_name[-4:] != ".pdf":
        pdf_file_name += ".pdf"
    if not os.path.exists(pdf_file_name):
        print_message("INFO", "File doesn't exist, Insert Url here")
        url = input(">")

        download_file_with_progress(url, pdf_file_name)
    else:
        print_message("SUCCESS", "The file already exists")
        return pdf_file_name

In [5]:
pdf_file_name = "Pattern Recognition and Machine - Christopher M. Bishop"
pdf_file_name = insert_pdf_file(pdf_file_name=pdf_file_name)

[32m[SUCESS][0m The file already exists


In [6]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    cleaned_text = text.replace('\n', ' ').strip()
    return cleaned_text

def open_and_read_pdf(pdf_file_name: str) -> list[dict]:
    doc = fitz.open(pdf_file_name)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({
                "page_number": page_number + 1,
                "page_char_count": len(text),
                "page_word_count": len(text.split(' ')),
                "page_sentence_count_raw": len(text.split(". ")),
                "page_token_count": len(text) / 4,
                "text": text
        })
    return pages_and_texts

In [7]:
pages_and_texts = open_and_read_pdf(pdf_file_name=pdf_file_name)

0it [00:00, ?it/s]


KeyboardInterrupt



In [None]:
def store_and_embed_pdf_file(pdf_file_name: str):
    pdf_file_name = insert_pdf_file(pdf_file_name=pdf_file_name)
    pages_and_texts = open_and_read_pdf(pdf_file_name_pdf_file_name)
    

In [None]:
import random 

random.sample(pages_and_texts, k=3)

## Get some more info on the data of the book

In [None]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

In [None]:
df.describe().round(2)

## Splitting pages into sentences

In [None]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This is another sentence. I like this.")
assert len(list(doc.sents)) == 3

list(doc.sents)


In [None]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item['text']).sents)

    item['sentences'] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])

In [None]:
random.sample(pages_and_texts, k=1)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round()

### Making chunks from the text 

In [None]:
num_sentence_chunk_size = 10 

def split_list(input_list: list, slice_size: int =num_sentence_chunk_size) -> list[str]:
    return [input_list[i:i+slice_size + 1] for i in range(0, len(input_list), slice_size)]

In [None]:
tl = list(range(25))
split_list(tl)

In [None]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                        slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

In [None]:
random.sample(pages_and_texts, k=2)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

In [None]:
import re

pages_and_chunks = []
for i in tqdm(pages_and_texts):
    for sentence_chunk in i['sentence_chunks']:
        chunk_dict = {}
        chunk_dict["page_number"] = i["page_number"]
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4
        pages_and_chunks.append(chunk_dict)
    
len(pages_and_chunks)

In [None]:
pages_and_chunks

In [None]:
random.sample(pages_and_chunks, k=1)

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

## Filter out chunks with less than 20 tokens

In [None]:
min_token_length = 20
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f"Chunk token count: {row[1]['chunk_token_count']} | Text: {row[1]['sentence_chunk']}")

In [None]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")

In [None]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

## Embedding chunks

In [None]:
test_sentences = ["This is a test for the embedding model",
                 "this is a second sentence for the model",
                 "The sky is blue"]

In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                     device="cuda")


embeddings = embedding_model.encode(test_sentences,
                                    batch_size=32,
                                    convert_to_tensor=True)


In [None]:
sentences_and_embeddings = dict(zip(test_sentences, embeddings))

In [None]:
sentences_and_embeddings

In [None]:
%%time
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

# TODO
## Creating a chromadb client for storing embeddings

> **NOTE** maybe not
>
> [INFO] Time taken to get scores on 1765000 embeddings: 0.00286 seconds.

In [None]:
import chromadb
from chromadb.utils import embedding_functions
chroma_client = chromadb.Client()

In [None]:
collection = chroma_client.create_collection(name="embeddings")

In [None]:
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

In [None]:
# results = collection.query(
#     query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
#     n_results=2 # how many results to return
# )
# print(results)

In [None]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
random.sample(text_chunks, k=3)

## adding the embeddings to the chunks

In [None]:
pages_and_chunks_over_min_token_len[:4]

In [None]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [None]:
# Embed all the text in batches
from time import perf_counter as timer

start_time = timer()
text_chunks_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32,
                                               convert_to_tensor=True)
end_time = timer()
print(end_time - start_time)
text_chunks_embeddings

### Saving embeddings into a file

In [None]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = f"embeddings/{pdf_file_name}.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [None]:
# reading the csv 
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

In [None]:
text_chunks_and_embedding_df_load["sentence_chunk"].iloc[357]

# Rag - Search and Answer

### Similarity search

In [1]:
from colorama import Fore, Style

def print_message(message_type, message):
    if message_type == "INFO":
        print(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} {message}")
    elif message_type == "ERROR":
        print(f"{Fore.RED}[ERROR]{Style.RESET_ALL} {message}")
    elif message_type == "SUCCESS":
        print(f"{Fore.GREEN}[SUCESS]{Style.RESET_ALL} {message}")
    else:
        print(f"{message}")

print_message("INFO", "This is an info")
print_message("ERROR", "This is an error")

[33m[INFO][0m This is an info
[31m[ERROR][0m This is an error


In [2]:
import random

import torch
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("embeddings/1706.03762v7.pdf.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)

In [None]:
text_chunks_and_embedding_df

In [None]:
embeddings.shape

## semantic search pipeline


1. Define a query string.
2. Turn the query string into an embedding
3. Perform a dot product or cosine similarity function between the text embedding and the query embedding
4. Sort the results from k in descending order

In [6]:
query = "What is an encoder?"
print(f"Query: {query}")

# embed query
query_embedding = embedding_model.encode(query, convert_to_tensor=True)


# Get similarity scores with dot product (use cosine similarity if outputs are not normalized)

from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print_message("INFO", f"Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

# 4 get top-k results

top_results_dot_product = torch.topk(dot_scores, k=3)
top_results_dot_product

Query: What is an encoder?
[33m[INFO][0m Time taken to get scores on 41 embeddings: 0.00121 seconds.


torch.return_types.topk(
values=tensor([0.5157, 0.5099, 0.4239], device='cuda:0'),
indices=tensor([5, 4, 9], device='cuda:0'))

In [7]:
text_chunks_and_embedding_df["sentence_chunk"].iloc[5]

'Figure 1: The Transformer - model architecture. The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.3.1 Encoder and Decoder Stacks Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- wise fully connected feed-forward network. We employ a residual connection [11] around each of the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension dmodel = 512. Decoder: The decoder is also composed of a stack of N = 6 identical layers. 

In [8]:
dot = torch.dot(embeddings[309], query_embedding)
print(f"just dot prod {dot:.4f}")
dot = dot / (torch.sqrt(torch.sum(embeddings[309] ** 2)) *  torch.sqrt(torch.sum(query_embedding ** 2))) 
print(f"cosine similarity {dot:.4f}")


IndexError: index 309 is out of bounds for dimension 0 with size 41

In [None]:
 (torch.sqrt(torch.sum(embeddings[309] ** 2)) *  torch.sqrt(torch.sum(query_embedding ** 2))) 

## Testing $ \times 1000 $ embeddings

In [None]:
larger_embeddings = torch.rand(1000*embeddings.shape[0], 768).to(device)
print(f'Embeddings shape {larger_embeddings.shape}')

In [None]:
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
end_time = timer()
# print(dot_scores.shape)

print_message("INFO", f"Time taken to get scores on {len(larger_embeddings)} embeddings: {end_time - start_time:.5f} seconds.")


top_results_dot_product = torch.topk(dot_scores, k=100)
top_results_dot_product

### Implementing a Re-Ranker

- Re-rank the top k=100 results
- Select the top=5 results

In [None]:
top_results_dot_product[1]
top_k_chunks = [text_chunks_and_embedding_df["sentence_chunk"].iloc[int(i)] for i in top_results_dot_product[1]]
# top_k_chunks = [i for i in top_results_dot_product[1]]

In [None]:
top_k_chunks[:5]

In [None]:
from sentence_transformers import CrossEncoder

# Load the model, here we use our base sized model
model = CrossEncoder("mixedbread-ai/mxbai-rerank-base-v1")

In [None]:
results = model.rank(query, top_k_chunks, return_documents=True, top_k=5)

In [None]:
results

### Functionizing the semantic pipeline

In [1]:
def retrieve_relevant_resources(query: str,
                              embeddings: torch.tensor,
                              model: SentenceTransformer=embedding_model,
                              n_resources_to_return: int=5,
                              print_time: bool=True):
    """
    Embeds the query with a model and returns the top k scores and indices from the embeddings.
    """
    
    query_embedding = model.encode(query, convert_to_tensor=True)

    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()
    
    if print_time:
        print_message("INFO", f"Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores,
                                k=n_resources_to_return)
    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = retrieve_relevant_resources_torch(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicie
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

NameError: name 'embedding_model' is not defined

In [None]:
retrieve_relevant_resources(query="k-means", embeddings=embeddings)

## Connecting to an LLM

In [10]:
# pip install bitsandbytes accelerate
from transformers.utils import is_flash_attn_2_available
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TextStreamer

quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                        bnb_4bit_compute_dtype=torch.float16)
model_id = "meta-llama/Llama-3.2-3B-Instruct"
## Flash attention gpu

if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"
print_message("INFO", f"Using attention implementation: {attn_implementation}")


tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    attn_implementation=attn_implementation
)

[33m[INFO][0m Using attention implementation: sdpa


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e

#### Getting numbers of parameters of the LLM

In [12]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

print(get_model_num_params(model) / 1000000000)

1.80346368


In [13]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(model)

{'model_mem_bytes': 2197648640, 'model_mem_mb': 2095.84, 'model_mem_gb': 2.05}

In [None]:
input_text = "What is ridge regression"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
chat = [
    { "role": "user", "content": input_text },
]

In [None]:
question = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

question = tokenizer(question, return_tensors="pt").to(device)

streamer = TextStreamer(tokenizer, skip_prompt=True)

In [None]:
question["input_ids"].shape

In [None]:
_ = model.generate(**question, streamer=streamer,
                            pad_token_id=tokenizer.eos_token_id,
                            temperature=0.1,
                            max_length=2048,
                            do_sample=True,
                            top_p=0.5,
                            repetition_penalty=1.25)


## Augment the prompt with context

In [122]:
COMPLETE_SYSTEM_PROMPT = """You are an advanced AI assistant with access to specific document context. You must strictly adhere to these guidelines:

FOUNDATIONAL RULES:
1. You can ONLY provide information that is explicitly present in the given context
2. You must NEVER use external knowledge, even if you have it
3. You must NEVER make assumptions or inferences beyond the context
4. Every response must include relevant quotes from the context
5. You must indicate confidence level in your responses

REQUIRED RESPONSE FORMAT:
```
Confidence: [High|Medium|Low]
Relevant Quotes: [Include word-for-word quotes from the content chunks that support your answer]
Answer: [Your response]
```

RESPONSE GUIDELINES:

FOR QUESTIONS WITH AVAILABLE INFORMATION:
- Begin with "Based on the provided context..."
- Include direct quotes using `quotes`
- Cite specific sections or page numbers when available
- Structure complex answers with clear headings
- End with any important caveats or limitations


FOR QUESTIONS WITH NO INFORMATION:
Response Template:
"I apologize, but I cannot find information about [query] in the provided context. I can only provide information that is explicitly present in these documents.


ERROR HANDLING:
- If context is corrupted: "I'm unable to properly process the provided context. Please ensure the document text is correctly formatted."
- If context is empty: "No context has been provided. I require specific document context to provide accurate answers."

PROHIBITED BEHAVIORS:
1. Never speculate or extrapolate beyond the context
2. Never combine external knowledge with context
3. Never make assumptions about missing information
4. Never provide personal opinions or interpretations
5. Never claim uncertainty about factual content that is clearly present in context

EXAMPLE INTERACTIONS:

Good Response:
User: "What are the key findings of the study?"
Assistant: "Confidence: High
Relevant Quotes: ['The study found a 23% increase in efficiency', 'Cost reduction of $1.2M annually']
Answer: Based on the provided context, the study identified two key findings. First, there was a significant efficiency improvement of 23%. Second, the implementation resulted in annual cost savings of $1.2 million.
Source Sections: [Section 4.2, Page 12]"

Partial Information:
User: "What were the project costs and timeline?"
Assistant: "Confidence: Medium
Relevant Quotes: ['Project costs totaled $500,000']
Answer: Based on the provided context, I can only speak to the project costs, which were $500,000. The timeline information is not present in the available documents. Would you like to know more about the cost breakdown, which is detailed in the context?"


CONTEXT:    
{context}

QUERY:
{query}

IMPLEMENTATION NOTES:
1. Always verify context relevance before processing
2. Maintain consistent formatting in responses
3. Always inlcude the relevant quote section

Anwser:
"""

# FOR QUESTIONS WITH PARTIAL INFORMATION:
# - Clearly state what aspects you can and cannot address
# - Explain which parts are missing from the context
# - Provide the available partial information with appropriate caveats
# - Suggest how the user might refine their question


In [128]:
COMPLETE_SYSTEM_PROMPT = """
You are a highly knowledgeable assistant with expertise in extracting and synthesizing information. Use the provided context to answer the question as accurately and comprehensively as possible. Your response should be based strictly on the context given, without introducing external information or assumptions.

If the context is insufficient to answer the question, respond clearly with: "The provided context does not contain enough information to answer this question."

### Instructions:s
1. Base your answer entirely on the provided context.
2. Do not include information not explicitly mentioned in the context.
3. If the question asks for an explanation, summarize relevant parts of the context in your answer.
4. Maintain a professional, concise, and accurate tone.

REQUIRED RESPONSE FORMAT:
```
Confidence: [High|Medium|Low]
Relevant Quotes: [Include word-for-word quotes from the retrived document chunks that support your answer]
Answer: [Your response]
```



EXAMPLE INTERACTIONS:

Good Response:
User: "What are the key findings of the study?"
Assistant: "Confidence: High
Relevant Quotes: ['The study found a 23% increase in efficiency', 'Cost reduction of $1.2M annually']
Answer: Based on the provided context, the study identified two key findings. First, there was a significant efficiency improvement of 23%. Second, the implementation resulted in annual cost savings of $1.2 million."

Partial Information:
User: "What were the project costs and timeline?"
Assistant: "Confidence: Medium
Relevant Quotes: ['Project costs totaled $500,000']
Answer: Based on the provided context, I can only speak to the project costs, which were $500,000. The timeline information is not present in the available documents. Would you like to know more about the cost breakdown, which is detailed in the context?"

IMPLEMENTATION NOTES:
1. Always verify context relevance before processing
2. Always aintain consistent formatting in responses
3. Always inlcude the relevant quote section



### Context:
{context}

### Query:
{query}



### Answer:
"""

In [123]:
def prompt_formatter(query: str,
                    context_items: list[dict]) -> str:
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    
    base_prompt = COMPLETE_SYSTEM_PROMPT.format(context=context, query=query)

    #prompt template for instruction tune model

    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                           tokenize=False,
                                           add_generation_prompt=True)
    
    return prompt

In [132]:
# query = "What is multi-head attention?"
# query = "Expain the architecture of a transformer"
# query = "How can I bake a cake?"
query = "How many layers does a decoder have?"
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings,
                                              n_resources_to_return=5
                                            )

context_items = [pages_and_chunks[i] for i in indices]

prompt = prompt_formatter(query,context_items)


question = tokenizer(prompt, return_tensors="pt").to(device)

streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(**question, streamer=streamer,
                            pad_token_id=tokenizer.eos_token_id,
                            temperature=0.4,
                            max_length=4096,
                            do_sample=True,
                            top_p=1,
                            repetition_penalty=1.25)


[33m[INFO][0m Time taken to get scores on 41 embeddings: 0.00023 seconds.
Confidence: High
Relevant Quote: 
Answer: According to the provided text, the decoder consists of a stack of N = 6 identical layers, just like the encoder. Therefore, it has six layers as well.<|eot_id|>


In [133]:
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 04 Dec 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a highly knowledgeable assistant with expertise in extracting and synthesizing information. Use the provided context to answer the question as accurately and comprehensively as possible. Your response should be based strictly on the context given, without introducing external information or assumptions.

If the context is insufficient to answer the question, respond clearly with: "The provided context does not contain enough information to answer this question."

### Instructions:s
1. Base your answer entirely on the provided context.
2. Do not include information not explicitly mentioned in the context.
3. If the question asks for an explanation, summarize relevant parts of the context in your answer.
4. Maintain a professional, concise, and accurate tone.

REQUIRED RESPONSE FORMAT:
```
Confidence: [High