**Gemma RAG LLM setup**

This notebook should be run in Google Colab or on a similar platform where a high-performance GPU is available. In Google Colab, the A100 GPU works best.

**Loading packages, libraries and secrets into notebook**

In [None]:
# Mount Google Drive at /content/drive (this cell only works inside Google Colab)
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# Installing the required packages
!pip install pandas==2.1.4 numpy==1.23.5 pymongo gradio langchain_mongodb sentence_transformers tensorflow==2.15
!pip install -U transformers
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# install below if using GPU
!pip install accelerate

In [None]:
# Importing the required functions and modules
import gradio as gr
from gradio.themes.base import Base
from sentence_transformers import SentenceTransformer # https://huggingface.co/thenlper/gte-large
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoConfig
import torch
import gc

In [None]:
# Importing the required functions and modules
import gradio as gr
from gradio.themes.base import Base
from sentence_transformers import SentenceTransformer # https://huggingface.co/thenlper/gte-large
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoConfig
import torch
import gc

***Accessing secrets***

In [None]:
# Accessing the Hugging Face token (HF_TOKEN).
# In Google Colab the token is read from the notebook's secrets. On any other
# platform `google.colab` cannot be imported, so the token is read from the
# HF_TOKEN environment variable instead (call `load_dotenv()` first if the
# variable is kept in a .env file).
import os

try:
    from google.colab import userdata
except ImportError:
    # Not running in Colab: fall back to the environment variable (None if unset)
    HF_Token = os.getenv("HF_TOKEN")
else:
    HF_Token = userdata.get('HF_TOKEN')

***Loading the Tokenizer and LLM-Model***

The 7-billion-parameter version of the Gemma model has been selected for better performance; however, a 2-billion-parameter version also exists, which requires less processing power. To use the 2-billion-parameter version, the "7b" in the code below can be swapped for "2b".

In [None]:
# Hugging Face model id, defined once so that swapping "7b" for "2b" only
# needs to be done in a single place.
MODEL_ID = "google/gemma-7b-it"

# The access token loaded earlier in the notebook is passed explicitly, so the
# download does not depend on an implicit login. A value of None falls back to
# the library's default behaviour.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_Token)
# CPU Enabled uncomment below 👇🏽
# model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=HF_Token)
# GPU Enabled use below 👇🏽
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", token=HF_Token)

In [None]:
# Sample SQL statement for trying out `process_query` by hand
query = "SELECT professional_id ,  last_name ,  cell_number FROM Professionals WHERE state  =  'Indiana' UNION SELECT T1.professional_id ,  T1.last_name ,  T1.cell_number FROM Professionals AS T1 JOIN Treatments AS T2 ON T1.professional_id  =  T2.professional_id GROUP BY T1.professional_id HAVING count(*)  >  2"

# The requested response length is the word count of the SQL statement times this factor
LENGTH_MULTIPLIER = 3
# Upper bound on the number of tokens generated per request
MAX_NEW_TOKENS = 1000

# Kept for backward compatibility: target length for the sample `query` above.
# `process_query` derives the length from the statement it actually receives.
output_length = len(query.split()) * LENGTH_MULTIPLIER


def _build_prompt(query, length):
    """Assemble the instruction prompt for one SQL statement and its target word count."""
    return (
        "Instructions: Generate a natural language Translation stating what the Query wants to achieve "
        "followed by an Explanation stating how the Query is composed and how it works. "
        "Go through it step by step and formulate the Translation and Explanation in simple and concise language. "
        "Keep the word count in line with the Length number.\n\n"
        f"Length: {length}\n"
        f"Query: {query}\n\n"
        "Response:\n"
    )


def process_query(query):
    """Translate and explain a SQL statement in natural language using the loaded model.

    Args:
        query: The SQL statement to explain.

    Returns:
        The text generated by the model for this statement, stripped of
        surrounding whitespace, or a short hint when `query` is empty.
    """
    # Nothing to explain: answer immediately without running the model
    if query is None or not query.strip():
        return "Please enter a SQL statement."
    query = query.strip()

    # The target length depends on the statement that was passed in,
    # not on the module-level sample query
    target_length = len(query.split()) * LENGTH_MULTIPLIER
    prompt = _build_prompt(query, target_length)

    # Put the inputs on the device the model was loaded on (GPU or CPU)
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(**encoded, max_new_tokens=MAX_NEW_TOKENS)

    # The generated sequence starts with the prompt tokens; decode only what
    # follows them, so text inside the query can never be mistaken for the answer
    prompt_length = encoded["input_ids"].shape[-1]
    answer = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True).strip()

    # Drop the references first, then release the memory they held
    del encoded, generated
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return answer

***Chat interface setup***

Change the type of the cell below to Python when running only this script. The cell is kept in Markdown format for testing.

In [None]:
# Build the Gradio web UI: an input box for the SQL statement, a submit button
# and an output box for the generated text
with gr.Blocks(title="Question Answering App using Vector Search + RAG", theme=Base()) as demo:
    gr.Markdown(
        """
        # Question Answering App using Atlas Vector Search + RAG Architecture
        """
    )
    textbox = gr.Textbox(label="Enter your SQL statement:")
    with gr.Row():
        button = gr.Button("Submit", variant="primary")
    with gr.Column():
        output = gr.Textbox(
            label="Natural language translation and explanation:",
            lines=1,
            max_lines=30,
        )

    # Clicking Submit passes the textbox content to process_query and shows
    # its return value in the output box
    button.click(fn=process_query, inputs=textbox, outputs=output)

# Launch the app
demo.launch()