<a href="https://colab.research.google.com/github/Uday-ashes-uday/RAG-GEMMA-nutrify/blob/main/RAG_gemma2_it.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install transformers bitsandbytes accelerate huggingface_hub gradio

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradi

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.1


# A pipeline to combine everything

In [None]:
import random
import numpy as np
import pandas as pd
import torch
from time import perf_counter as timer
from sentence_transformers import util,SentenceTransformer

# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Load the pre-computed text chunks and their embeddings ---

import pandas as pd

data = pd.read_csv("/content/drive/MyDrive/RAG_gemma/text_chunks_and_embedding.csv")

# The CSV stores each embedding as a bracketed, space-separated string;
# strip the brackets and parse it back into a NumPy array.
data["embedding"] = data["embedding"].apply(
    lambda raw: np.fromstring(raw.strip("[]"), sep=" ")
)

# Stack the per-row arrays into a single float32 tensor and move it to `device`.
embeddings_co = torch.tensor(
    np.stack(data["embedding"].to_list(), axis=0),
    dtype=torch.float32,
).to(device)

# Sentence-embedding model used to encode queries; it is loaded on the CPU.
emb_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")


# Wrap the dot-product similarity search in a reusable function


def retrieve_relevant_resources(query:str,
                                embedding:torch.Tensor,
                                model:SentenceTransformer,
                                n_resources:int,
                                print_timer:bool=True):
  """Return the scores and indices of the stored embeddings most similar to a query.

  Args:
    query: Text to search for.
    embedding: Tensor of stored embeddings to score the query against.
    model: Sentence-transformer used to embed `query`.
    n_resources: Number of top matches to return. Capped at the number of
      scores available so that asking for more than exist does not raise.
    print_timer: When True, print how long scoring and top-k selection took.

  Returns:
    A `(scores, indices)` pair as produced by `torch.topk` over the
    dot-product scores between the query and `embedding`.
  """

  # Place the query on the same device as the embeddings it is compared with,
  # rather than relying on a module-level `device` variable.
  query_emb=model.encode(query,convert_to_tensor=True).to(embedding.device)

  start_timer=timer()

  dot_scores=util.dot_score(query_emb,embedding)

  # torch.topk raises if k exceeds the size of the dimension it searches.
  k=min(n_resources,dot_scores.shape[-1])
  scores,indices=torch.topk(dot_scores,k=k)

  end_timer=timer()

  if print_timer:
    print(f"[INFO] this results in time taken {end_timer-start_timer:.5f}")

  return scores,indices


# Libraries for the web UI, the LLM and Hugging Face Hub access.
import gradio as gr
import torch
import transformers
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available

# Read the Hugging Face token from the Colab secret named 'HF_KEY' and log in to the Hub.
hf_token = userdata.get('HF_KEY')
login(token=hf_token, add_to_git_credential=True)

# Prefer the GPU when torch can see one; otherwise run on the CPU.
device="cuda" if torch.cuda.is_available() else "cpu"


# 4-bit quantisation settings. NOTE: this config is not passed to
# from_pretrained below, so the model is loaded unquantised in float16.
quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                       bnb_4bit_compute_dtype=torch.float16)


# get_device_capability returns a (major, minor) tuple, so it must be indexed
# (not called) to read the major version. Flash Attention 2 is selected only
# when the package is available and the major compute capability is >= 8.
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0]>=8):
    attn_implementation="flash_attention_2"
else:
    attn_implementation="sdpa"



model_id="google/gemma-2-2b-it"

#instantiate tokenizer (same checkpoint as the model, so the two cannot drift apart)

tokenizer = AutoTokenizer.from_pretrained(model_id)

#instantiate the model

llm_model=AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                               torch_dtype=torch.float16,
                                               low_cpu_mem_usage=False,
                                               attn_implementation=attn_implementation)

llm_model.to(device)



def ask_me_any(query:str,
        chunks_and_embs=data,
        model=llm_model,
        temperature:float=0.7,
        max_new_tokens:int=256,
        format_answer_text=True,
        return_answer_only=True,
        emb_model=emb_model):
  """Answer a query with the LLM, grounded in the most relevant text chunks.

  The query is embedded and scored against `embeddings_co`; the five best
  matching chunks are inserted into a chat prompt and the model generates an
  answer from it.

  Args:
    query: The user's question.
    chunks_and_embs: DataFrame with a 'sentence_chunk' column. Its row order
      must match the row order of `embeddings_co`, which is built from the
      default DataFrame.
    model: Causal language model used for generation.
    temperature: Sampling temperature passed to `generate`.
    max_new_tokens: Maximum number of tokens to generate.
    format_answer_text: When True, replace '*' with a space in the answer.
    return_answer_only: When True, return only the answer string; otherwise
      return `(answer, context_items)`.
    emb_model: Sentence-transformer used to embed the query.

  Returns:
    The generated answer, or `(answer, context_items)` where `context_items`
    is the list of retrieved chunk texts, when `return_answer_only` is False.
  """

  _,indices=retrieve_relevant_resources(query=query,
                                        embedding=embeddings_co,
                                        model=emb_model,
                                        n_resources=5)

  # topk indices are row positions in the embedding tensor, so look the
  # chunks up by position (iloc) rather than by index label (loc).
  context_items=[chunks_and_embs.iloc[ind.item()]['sentence_chunk'] for ind in indices[0]]

  # Give every context item its own bullet, not just the first one.
  context="\n".join(f"- {item}" for item in context_items)

  dialogue_prompt=[
                    {"role":"user",
                     "content":f"""Based on the following context items, Please answer the following query,
                     context items:{context}, query:{query} Answer: """}
                   ]


  tp=tokenizer.apply_chat_template(conversation=dialogue_prompt,
                              tokenize=False,
                              add_generation_prompt=True)

  # The rendered template already carries its special tokens; adding them
  # again here would duplicate the beginning-of-sequence token.
  input_ids=tokenizer(tp,return_tensors="pt",add_special_tokens=False).to(model.device)

  outputs=model.generate(**input_ids,
                         temperature=temperature,
                         do_sample=True,
                         max_new_tokens=max_new_tokens)

  # Decode only the newly generated tokens. Slicing off the prompt by length
  # is exact, unlike string replacement, and skip_special_tokens drops the
  # special-token markers without touching ordinary words in the answer.
  prompt_len=input_ids["input_ids"].shape[1]
  answer=tokenizer.decode(outputs[0][prompt_len:],skip_special_tokens=True)

  if format_answer_text:
    answer=answer.replace("*"," ")

  if return_answer_only:
    return answer

  return answer,context_items



def end_to_end(query:str):
  """Answer `query` by calling `ask_me_any` with its default settings."""
  return ask_me_any(query)

# Build a Gradio interface that sends the user's query to end_to_end and shows the answer.
iface = gr.Interface(
    fn=end_to_end,
    # Two-line free-text box for the question.
    inputs=gr.Textbox(placeholder="Enter your query here...", lines=2),
    # Taller box for the generated answer.
    outputs=gr.Textbox(label="Answer", lines=15),
)
iface.launch()




Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://17486c8ff1cde387a5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


