#### Load CSV.

Load the CSV containing the text chunks and their embeddings.

Process the CSV into the right format.

Create the embedding model.

Create a search pipeline for the query and the embeddings.

In [1]:
import random
import torch
import pandas as pd
import numpy as np
import helpers
import importlib
importlib.reload(helpers)
from helpers import import_chunks_with_embeddings, get_chunks_embeddings_as_tensor

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# CSV file with the chunks and their embeddings.
csv_path = "./data/filtered_all_chapters_embeddings_df.csv"
chunks_with_embeddings = import_chunks_with_embeddings(csv_path)

# Collect the embeddings via the helper, then move the result to the chosen device.
embeddings = get_chunks_embeddings_as_tensor(chunks_with_embeddings)
embeddings = embeddings.to(device)
embeddings.shape

torch.Size([2898, 768])

In [2]:
# Build the sentence-embedding model that is later handed to the search pipeline.
from sentence_transformers import SentenceTransformer, util

embedding_model_name = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(
    embedding_model_name,
    device=device,  # same device that was selected when the embeddings were loaded
)



##### Search Pipeline


In [3]:
from helpers import retrieve_relevant_resources, print_top_results_and_scores

query = "What is a STUB?"

# Step 1: run the retrieval and keep whatever the helper returns, untouched.
retrieval_output = retrieve_relevant_resources(
    query=query,
    embeddings=embeddings,
    embedding_model=embedding_model,
)

# Step 2: forward the retrieval output positionally, followed by the chunk
# records, so the helper can print the matching entries.
print_top_results_and_scores(*retrieval_output, chunks_with_embeddings)

Time taken to compute dot scores on (2898): 8.900801185518503e-05 seconds
Score: 0.5530380010604858
Chapter: Object-based architectural style
Text:
The server-side stub is often referred to as a skeleton as it provides the bare
means for letting the server middleware access the user-defined objects. In
practice, it often contains incomplete code in the form of a language-specific
class that needs to be further specialized by the developer.


Score: 0.5495573282241821
Chapter: 4.2.2 Parameter passing
Text:
The function of the client stub is to take its parameters, pack them into a
message, and send them to the server stub. While this sounds straightforward, it
is not quite as simple as it at first appears.


Score: 0.47215867042541504
Chapter: Note 4.8 (Advanced: Implementing stubs as global references revisited)
To provide a more in-depth insight in the working of sockets, let us look at a
more elaborate example, namely the use of stubs as global references.
Text:
To use a stub as a gl

### LLM

In [4]:
import torch

# Report the memory of GPU 0 in whole GiB.
# `total_memory` is the card's total capacity, not the amount currently free.
# The guard keeps this cell runnable on machines without CUDA: querying device
# properties there raises instead of returning a value.
if torch.cuda.is_available():
    gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
    gpu_memory_gb = round(gpu_memory_bytes / (2**30))
    print(f"Available GPU memory: {gpu_memory_gb} GB")
else:
    # Keep both names defined (as integers) so later cells can still read them.
    gpu_memory_bytes = 0
    gpu_memory_gb = 0
    print("No CUDA device detected; reporting 0 GB of GPU memory.")

Available GPU memory: 24 GB


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig

# Switch for 4-bit loading. False keeps the existing behaviour (plain float16
# weights); set it to True to actually apply `quantization_config` below.
use_quantization_config = False

# 4-bit quantization configuration (only used when the switch above is True)
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Attention implementation, either 'sdpa' or 'flash_attention_2'.
# flash_attention_2 is only selected when the package is installed AND GPU 0
# reports a compute capability major version of at least 8.
if is_flash_attn_2_available() and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"
print(f"Using {attn_implementation} attention")

# Model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Instantiate tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,
    # Previously the config was built but never handed to the loader.
    quantization_config=quantization_config if use_quantization_config else None,
    low_cpu_mem_usage=False,
    attn_implementation=attn_implementation,
)

# NOTE(review): quantized bitsandbytes models are expected to be placed on the
# GPU by the loader and to reject a manual `.to(...)` - confirm against the
# installed transformers version. The explicit move is therefore only done for
# the non-quantized model.
if not use_quantization_config:
    llm_model = llm_model.to("cuda")


Using sdpa attention


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
!nvidia-smi

Thu May  2 21:10:08 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.67                 Driver Version: 550.67         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:01:00.0 Off |                  Off |
|  0%   42C    P2             70W /  450W |   16409MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
def get_model_num_params(model: torch.nn.Module):
    """
    Count the scalar parameters of a PyTorch model.

    Adds up `numel()` over every tensor yielded by `model.parameters()`.
    """
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total

# Show the parameter count of the loaded LLM (`llm_model`).
get_model_num_params(llm_model)

8030261248

In [8]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Report the memory taken by a PyTorch model's parameters and buffers.

    Each tensor contributes (number of elements) * (bytes per element).
    The result is a dict with the exact byte total under "model_mem_bytes"
    and the same total in megabytes ("model_mem_mb") and gigabytes
    ("model_mem_gb"), both rounded to two decimals.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    def _total_bytes(tensors):
        # Sum of storage sizes, in bytes, over an iterable of tensors.
        return sum(t.nelement() * t.element_size() for t in tensors)

    # Parameters first, then buffers.
    size_in_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())

    bytes_per_mb = 1024**2
    bytes_per_gb = 1024**3

    report = {}
    report["model_mem_bytes"] = size_in_bytes
    report["model_mem_mb"] = round(size_in_bytes / bytes_per_mb, 2)
    report["model_mem_gb"] = round(size_in_bytes / bytes_per_gb, 2)
    return report

# Show the parameter + buffer memory footprint of the loaded LLM (`llm_model`).
get_model_mem_size(llm_model)

{'model_mem_bytes': 16194748416,
 'model_mem_mb': 15444.52,
 'model_mem_gb': 15.08}

### Generate text with Llama 3 8B

In [11]:
# Bind the loaded LLM to the shorter name `model` (same object, no copy).
model = llm_model

In [14]:
input_text = "What is the meaning of process and thread?"
print(f"Input text: {input_text}")

# Prompt template: a system message that sets the assistant's persona,
# followed by the user's question.
message_template = [
    { "role": "system", "content": "You are Study-Buddy. An educational chatbot that will aid students in their studies." },
    { "role": "user", "content": input_text }
]

# NOTE: because tokenize=False, the result is the formatted prompt *string*,
# not token ids, despite the variable's name. The name is kept because a later
# cell reads `input_ids`.
input_ids = tokenizer.apply_chat_template(
    message_template,
    tokenize=False, # keep as raw text
    add_generation_prompt=True # request the generation prompt after the user turn
)

print(f"Prompt formatted:\n{input_ids}")


Input text: What is the meaning of process and thread?


In [16]:
# `input_ids` comes from apply_chat_template(..., tokenize=False), so this
# prints the formatted prompt text rather than token ids.
print(f"Prompt formatted:\n{input_ids}")

Prompt formatted:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are Study-Buddy. An educatinal that will aid students in their studies.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the meaning of process and thread?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [1]:
# NOTE(review): this cell needs `llm_model` to exist in the kernel. Its
# execution count (In [1]) indicates it was run first after a kernel restart,
# before the model-loading cell, which is why it raised NameError. Run the
# cell that creates `llm_model` before this one.
model = llm_model
model

NameError: name 'llm_model' is not defined