**Gemma RAG LLM setup**

In [None]:
# Installing the required packages
%pip3 install langchain pymongo gradio requests langchain_community langchain_core langchain_mongodb sentence_transformers transformers python-dotenv

In [None]:
# Importing the required libraries
from pymongo import MongoClient
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain.document_loaders import DirectoryLoader
from langchain.chains import RetrievalQA
import gradio as gr
from gradio.themes.base import Base
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from sentence_transformers import SentenceTransformer # https://huggingface.co/thenlper/gte-large
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from dotenv import load_dotenv

***Accessing secrets***

In [None]:
# Google Colab: uncomment the two lines below to pull the Hugging Face token
# from Colab's secret store; the MongoDB URI is taken from the process environment.
#from google.colab import userdata
#HF_Token = userdata.get('HF_Token')
MONGO_URI = os.environ.get("MONGO_URI")

# Local environment: call load_dotenv() first (python-dotenv; presumably reads a
# local .env file into the environment), then read both secrets from the environment.
# Either value is None when the variable is not set.
load_dotenv()
HF_Token = os.environ.get("HF_Token")
MONGO_URI = os.environ.get("MONGO_URI")

***Setting up MongoDB Atlas, the embedding model and the vector store***

# --- MongoDB handles -------------------------------------------------------
# Database, collection and search-index names used by the vector store below.
dbName = "MTGemma"
collectionName = "MTGemma"
index_name = "vector_index"

# Connect with the URI loaded in the secrets cell and resolve the collection.
client = MongoClient(MONGO_URI)
collection = client.get_database(dbName).get_collection(collectionName)

# --- Embedding model -------------------------------------------------------
# SentenceTransformer checkpoint used to embed both documents and queries.
embedding_model = SentenceTransformer("thenlper/gte-large")

class CustomEmbeddingFunction:
    """Adapter exposing `embed_documents` / `embed_query` on top of a model with `encode`.

    The wrapped model's `encode` result must offer `.tolist()` (e.g. a NumPy
    array); both methods return that plain-list form.
    """

    def __init__(self, model):
        # Keep a reference to the underlying encoder; it is used as-is.
        self.model = model

    def embed_documents(self, texts):
        """Encode a list of texts and return the vectors as nested lists."""
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        """Encode one query string and return its vector as a list."""
        return self.model.encode(text).tolist()

# Wrap the SentenceTransformer model so it exposes embed_documents / embed_query
embedding_function = CustomEmbeddingFunction(embedding_model)

# Vector store setup.
# MongoDBAtlasVectorSearch is built from the collection handle alone; it has no
# `client` or `database` parameters (the collection already carries both), so
# those keyword arguments are not passed.
vector_store = MongoDBAtlasVectorSearch(
    collection=collection,
    embedding=embedding_function,
    index_name=index_name,
    text_key="Query",  # document field whose text was embedded
)

***Loading the Tokenizer and LLM-Model***

In [None]:
# Gemma checkpoints are gated on the Hugging Face Hub, so the token loaded in the
# secrets cell is passed explicitly. When HF_Token is None, transformers falls
# back to any locally cached login, which matches the previous behaviour.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it", token=HF_Token)
# CPU Enabled uncomment below 👇🏽
# model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it", token=HF_Token)
# GPU Enabled use below 👇🏽
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it", device_map="auto", token=HF_Token)

***Chain setup***

In [None]:
# Example SQL statement used to exercise the pipeline (adjacent string literals
# are concatenated into one string).
question = (
    "SELECT T2.name ,  T2.capacity "
    "FROM concert AS T1 JOIN stadium AS T2 ON T1.stadium_id  =  T2.stadium_id "
    "WHERE T1.year  >=  2014 "
    "GROUP BY T2.stadium_id "
    "ORDER BY count(*) DESC LIMIT 1?"
)

# Retriever over the vector store, asking for 4 results per query.
retriever = vector_store.as_retriever(search_kwargs=dict(k=4))

def logging_retriever_function(retriever, question):
    """Run `retriever.invoke(question)`, print every hit, and return the hits.

    Args:
        retriever: Object exposing `invoke(question)` that returns an iterable of documents.
        question: Query passed to the retriever unchanged.

    Returns:
        Whatever `retriever.invoke` returned, unmodified.
    """
    hits = retriever.invoke(question)
    # One header line followed by one line per retrieved document.
    print("Retrieved Documents:", *hits, sep="\n")
    return hits

def get_source_information(question):
    """Retrieve documents for `question` and return them as one newline-joined string.

    Uses the module-level `retriever` via `logging_retriever_function`, so the
    retrieved documents are also printed. Each document is converted with `str()`.
    """
    return "\n".join(map(str, logging_retriever_function(retriever, question)))

# Retrieve (and print) the documents for the example question and keep their joined text.
information_summary = get_source_information(question=question)

def generate_response(question, context=None, max_new_tokens=1000):
    """Ask the LLM to translate and explain the SQL statement in `question`.

    Args:
        question: SQL statement to translate/explain.
        context: Optional pre-retrieved context string. When omitted, context is
            retrieved for *this* question via `get_source_information`, so every
            call (including calls from the Gradio UI) gets matching examples
            rather than the context of an earlier, unrelated question.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text only (prompt removed), stripped of surrounding whitespace.
    """
    if context is None:
        context = get_source_information(question)

    combined_information = (
          "Instructions: Provide a natural language *translation* stating what the SQL statement achieves and *explanation* of the SQL code provided in the Question string. The Context string gives you examples of similar queries, you can use them to support your translation and explanation. Go through it step by step and if you can't answer the question, reply I don't know.\n\n"
          f"Question: {question}\n\n"
          f"Context: {context}\n\n"
          f"Response:\n"
    )
    # Move the inputs to the device the model lives on instead of assuming "cuda",
    # so the CPU-only setup and `device_map="auto"` both work.
    inputs = tokenizer(combined_information, return_tensors="pt").to(model.device)
    generated = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens. Slice them off rather
    # than splitting the decoded text on "Response:", which would cut at the wrong
    # place whenever the question or the retrieved context contains that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    new_tokens = generated[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Smoke test: run the pipeline once on the example question and show the answer.
result = generate_response(question=question)
print(result)

***Chat interface setup***

In [None]:
# Callback used by the Gradio UI
def chain_invoke(question):
    """Return the model's answer for `question` by delegating to `generate_response`."""
    return generate_response(question)

# Gradio front end: one input box for the SQL statement, a Submit button,
# and an output box for the generated translation/explanation.
with gr.Blocks(theme=Base(), title="Question Answering App using Vector Search + RAG") as demo:
    gr.Markdown(
        """
        # Question Answering App using Atlas Vector Search + RAG Architecture
        """)
    textbox = gr.Textbox(label="Enter your SQL statement:")
    with gr.Row():
        button = gr.Button("Submit", variant="primary")
    with gr.Column():
        output = gr.Textbox(lines=1, max_lines=30, label="Natural language translation and explanation:")

    # Clicking Submit sends the textbox content to chain_invoke and shows its
    # return value in the output box.
    button.click(fn=chain_invoke, inputs=textbox, outputs=output)

demo.launch()