# Retrieval-Augmented Generation (RAG)

This notebook utilizes the power of Phi-3, a small language model (SLM) developed by Microsoft, to implement RAG - Retrival-Augmented Generation.

According to Google: 'RAG (Retrieval-Augmented Generation) is an AI framework that combines the strengths of traditional information retrieval systems (such as search and databases) with the capabilities of generative large language models (LLMs). By combining your data and world knowledge with LLM language skills, grounded generation is more accurate, up-to-date, and relevant to your specific needs.' 

One of the videos that I found helpful in implementing RAG with Phi-3 was: https://www.youtube.com/watch?v=_4hFNzY1iMA&t=555s

--Note--

If using this notebook, make sure you are either using a GPU and have at least 25GB of memory available during run time.

In [46]:
%pip install llama-index-embeddings-huggingface llama-index-llms-huggingface
import warnings
warnings.filterwarnings('ignore')
import huggingface_hub, torch
import transformers
import trl
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, RagTokenizer, RagRetriever, RagSequenceForGeneration
from trl import SFTTrainer
import peft
import bitsandbytes
import sys, logging
from peft import LoraConfig
from trl import SFTTrainer
import llama_index
from llama_index.llms.openai import OpenAI

Python(16309) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [47]:
import os
# os.environ

In [48]:
###################
# Hyper-parameters
###################
training_config = {
    "bf16": False,
    "do_eval": False,
    "learning_rate": 5.0e-06,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": -1,
    "output_dir": "../Save_Models/phi3_fine_tuning/checkpoint.dir",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 2,
    "per_device_train_batch_size": 2,
    "remove_unused_columns": True,
    "save_steps": 60,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
    # "deepspeed": "/content/drive/MyDrive/Phi-3FineTuning/deepspeed.json",
    }

peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}

if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)

[INFO|training_args.py:2162] 2025-01-16 13:21:50,549 >> PyTorch: setting up devices
[INFO|training_args.py:1841] 2025-01-16 13:21:50,611 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [49]:
import datasets
logger = logging.getLogger(__name__)

###############
# Setup logging
###############
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")

2025-01-16 13:21:50 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_s

In [50]:
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    attn_implementation="eager",  # loading the model with flash-attenstion support
    torch_dtype=torch.bfloat16,
    device_map=None
)

In [51]:
model = AutoModelForCausalLM.from_pretrained('../Save_Models/phi3_tuned', **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained('../Save_Models/phi3_tuned')

[INFO|configuration_utils.py:694] 2025-01-16 13:21:50,897 >> loading configuration file ../Save_Models/phi3_tuned/config.json
[INFO|configuration_utils.py:694] 2025-01-16 13:21:51,128 >> loading configuration file ../Save_Models/phi3_tuned/config.json
[INFO|configuration_utils.py:768] 2025-01-16 13:21:51,132 >> Model config Phi3Config {
  "_name_or_path": "../Save_Models/phi3_tuned",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads"

In [52]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, SummaryIndex, SimpleDirectoryReader, Settings, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

In [53]:
def messages_to_prompt(messages):
  prompt = ''
  system_found = False
  for message in messages:
    if message.role == 'system':
      prompt += f'<|system|>\n{message.content}<|end|>\n'
      system_found = True
    elif message.role == 'user':
      prompt += f'<|user|>\n{message.content}<|end|>\n'
    elif message.role == 'assistant':
      prompt += f'<|assistant|>\n{message.content}<|end|>\n'
    else:
      prompt += f'<|user|>\n{message.content}<|end|>\n'

  prompt += '<|assistant|>\n'

  if not system_found:
    prompt = (
        '<|system|>\nYou are a helpful AI assistant that answers questions to the best of your knowledge base.<|end|>' + prompt
    )
  
  return prompt

llm = HuggingFaceLLM(
    # disk_offload,
    model_name="microsoft/Phi-3-mini-4k-instruct",
    tokenizer=tokenizer,
    model_kwargs={'trust_remote_code':True},
    generate_kwargs={'do_sample': True, 'temperature': 0.1},
    query_wrapper_prompt=(
        '<|system|>\n'
        'You are a helpful AI assistant that answers questions to the best of your knowledge base.<|end|>\n'
        '<|user|>\n'
        '{query_str}<|end|>\n'
        '<|assistant|>\n'
    ),
    messages_to_prompt=messages_to_prompt,
    is_chat_model=True
)

[INFO|configuration_utils.py:696] 2025-01-16 13:21:53,400 >> loading configuration file config.json from cache at /Users/smittyboop/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:696] 2025-01-16 13:21:53,473 >> loading configuration file config.json from cache at /Users/smittyboop/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:768] 2025-01-16 13:21:53,474 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop"



In [54]:
Settings.llm = llm

Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

[INFO|configuration_utils.py:696] 2025-01-16 13:22:28,796 >> loading configuration file config.json from cache at /Users/smittyboop/Library/Caches/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json
[INFO|configuration_utils.py:768] 2025-01-16 13:22:28,798 >> Model config BertConfig {
  "_name_or_path": "BAAI/bge-small-en-v1.5",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.48.0.dev0",
  "type_vocab_size"

In [55]:
documents = SimpleDirectoryReader("../Data/RAG Data").load_data()
documents[0]

Document(id_='a00ef8f8-c667-4d92-9c17-993912411b35', embedding=None, metadata={'file_path': '/Users/smittyboop/Desktop/Potential Talents Project/PhiuSFBv0NkHjLd3/Notebooks/../Data/RAG Data/potential-talents - Aspiring human resources - seeking human resources.csv', 'file_name': 'potential-talents - Aspiring human resources - seeking human resources.csv', 'file_type': 'text/csv', 'file_size': 9314, 'creation_date': '2025-01-15', 'last_modified_date': '2025-01-15'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="1, 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional, Houston, Texas, 85, n

In [60]:
vector_index = VectorStoreIndex.from_documents(documents, chunk_size_limit=128)

summary_index = SummaryIndex.from_documents(documents)

In [61]:
query_engine = vector_index.as_query_engine(response_mode='compact')

response = query_engine.query("aspiring human resources")

RuntimeError: MPS backend out of memory (MPS allocated: 8.63 GB, other allocations: 8.78 MB, max allowed: 9.07 GB). Tried to allocate 543.47 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [31]:
from llama_index.core.response.notebook_utils import display_response
display_response(response)

NameError: name 'response' is not defined