#### Load CSV.

Load the CSV containing the text chunks and their embeddings.

Process the CSV into the right format.

Create the embedding model.

Create a search pipeline for the query and the embeddings.

In [42]:
import random
import torch
import pandas as pd
import numpy as np
import helpers
import importlib
importlib.reload(helpers)
from helpers import import_chunks_with_embeddings, get_chunks_embeddings_as_tensor

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
# `device` is reused by later cells (e.g. when building the embedding model).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Relative path: it is resolved against the kernel's working directory.
csv_path = "./data/filtered_all_chapters_embeddings_df.csv"



# `import_chunks_with_embeddings` lives in helpers.py (not shown here).
# NOTE(review): presumably it parses the CSV into per-chunk records that carry
# both the text and its embedding - confirm against helpers.py.
chunks_with_embeddings = import_chunks_with_embeddings(csv_path)

# Collect the embeddings into one tensor and place it on `device` so that the
# similarity search runs on the same device as the embedding model.
embeddings = get_chunks_embeddings_as_tensor(chunks_with_embeddings).to(device)
# Bare expression: the notebook displays the tensor shape
# (the recorded run shows torch.Size([2898, 768])).
embeddings.shape

torch.Size([2898, 768])

In [43]:
# Create the model
# This model embeds the user query at search time; it is placed on the same
# `device` that holds the `embeddings` tensor.
from sentence_transformers import SentenceTransformer, util

# NOTE(review): the stored chunk embeddings are 768-dimensional (see the shape
# printed by the loading cell). Presumably they were generated with this same
# checkpoint - confirm, because query and chunk vectors are only comparable
# when they come from the same model.
embedding_model = SentenceTransformer('all-mpnet-base-v2', device=device)

##### Search Pipeline


In [48]:
from helpers import retrieve_relevant_resources, print_top_results_and_scores

query = "What is a STUB?"

# Step 1: run the retrieval helper for the query against the chunk embeddings.
retrieval_output = retrieve_relevant_resources(
    query=query,
    embeddings=embeddings,
    embedding_model=embedding_model,
)

# Step 2: hand everything the retrieval helper returned (unpacked positionally)
# to the display helper, followed by the chunk records.
print_top_results_and_scores(*retrieval_output, chunks_with_embeddings)

Time taken to compute dot scores on (2898): 3.735499922186136e-05 seconds
Score: 0.5530380010604858
Chapter: Object-based architectural style
Text:
The server-side stub is often referred to as a skeleton as it provides the bare
means for letting the server middleware access the user-defined objects. In
practice, it often contains incomplete code in the form of a language-specific
class that needs to be further specialized by the developer.


Score: 0.5495573282241821
Chapter: 4.2.2 Parameter passing
Text:
The function of the client stub is to take its parameters, pack them into a
message, and send them to the server stub. While this sounds straightforward, it
is not quite as simple as it at first appears.


Score: 0.47215867042541504
Chapter: Note 4.8 (Advanced: Implementing stubs as global references revisited)
To provide a more in-depth insight in the working of sockets, let us look at a
more elaborate example, namely the use of stubs as global references.
Text:
To use a stub as a gl

### LLM

In [56]:
import torch
# Report the total memory of CUDA device 0.
# The query is guarded because `torch.cuda.get_device_properties(0)` raises on
# machines without CUDA, while the rest of the notebook supports a CPU fallback.
if torch.cuda.is_available():
    # `total_memory` is the device's full capacity in bytes, not the amount
    # currently free.
    gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
    gpu_memory_gb = round(gpu_memory_bytes / (2**30))
    print(f"Available GPU memory: {gpu_memory_gb} GB")
else:
    # Keep both names defined (as integers) so cells that read them still run.
    gpu_memory_bytes = 0
    gpu_memory_gb = 0
    print("No CUDA device detected; GPU memory reported as 0 GB")

Available GPU memory: 24 GB


In [59]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig

# 4-bit quantization configuration.
# Flip `use_quantization_config` to True to load the model in 4-bit (useful when
# GPU memory is too small for float16 weights). It defaults to False so the
# model loads unquantized in float16.
use_quantization_config = False
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Attention implementation, either 'sdpa' or 'flash_attention_2'.
# Flash Attention 2 is only selected when the package is available AND a CUDA
# device with compute capability >= 8 is present. The explicit CUDA check keeps
# `get_device_capability(0)` from being called on a machine without a GPU.
if (is_flash_attn_2_available()
        and torch.cuda.is_available()
        and torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"
print(f"Using {attn_implementation} attention")

# Model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Instantiate tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16,
                                                 # Only pass the config when quantization is requested.
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False,
                                                 attn_implementation=attn_implementation)

# Move the model to the GPU only when that is valid:
# - bitsandbytes-quantized models are placed on the device at load time and
#   reject `.to(...)`, so skip the move when quantization is enabled;
# - without CUDA the model stays on the CPU instead of raising.
if not use_quantization_config and torch.cuda.is_available():
    llm_model = llm_model.to("cuda")


Using sdpa attention


loading file tokenizer.json from cache at /home/buddy/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/buddy/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/buddy/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file config.json from cache at /home/buddy/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298/config.json
Model config LlamaConfig {
  "_name_or_p

In [60]:
# Bare expression: the notebook renders the model's module tree (its repr).
llm_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

In [61]:
def get_model_num_params(model: torch.nn.Module):
    """
    Count the individual elements across all parameter tensors of a model.

    Args:
        model: Module whose `parameters()` are walked.

    Returns:
        The summed `numel()` of every parameter tensor (0 if there are none).
    """
    total_elements = 0
    for tensor in model.parameters():
        total_elements += tensor.numel()
    return total_elements

# Display the total number of parameter elements in the loaded LLM.
get_model_num_params(llm_model)

8030261248

In [62]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Measure the memory occupied by a PyTorch model's parameters and buffers.

    Each tensor contributes `nelement() * element_size()` bytes, so the result
    reflects the dtypes the tensors currently have.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822

    Args:
        model: Module whose `parameters()` and `buffers()` are measured.

    Returns:
        A dict with the exact byte count under "model_mem_bytes" and the same
        quantity in megabytes / gigabytes (binary units, rounded to two
        decimals) under "model_mem_mb" / "model_mem_gb".
    """
    def _byte_total(tensors):
        # Storage needed for a collection of tensors, in bytes.
        return sum(t.nelement() * t.element_size() for t in tensors)

    total_bytes = _byte_total(model.parameters()) + _byte_total(model.buffers())

    # Binary units: 1 MB = 1024**2 bytes, 1 GB = 1024**3 bytes.
    as_megabytes = total_bytes / (1024**2)
    as_gigabytes = total_bytes / (1024**3)

    report = {}
    report["model_mem_bytes"] = total_bytes
    report["model_mem_mb"] = round(as_megabytes, 2)
    report["model_mem_gb"] = round(as_gigabytes, 2)
    return report

# Display the parameter + buffer memory footprint of the loaded LLM.
get_model_mem_size(llm_model)

{'model_mem_bytes': 16194748416,
 'model_mem_mb': 15444.52,
 'model_mem_gb': 15.08}