<center><h1><b>Databricks Inferencing Serving Endpoints</b></h1></center>

### ```Generic Setup```

#### **Imports**

In [26]:
import os
import re
import chromadb
from openai import OpenAI
from pypdf import PdfReader
from langchain_databricks import ChatDatabricks
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)

#### **Envs**

In [27]:
# Read the connection settings from the process environment.
# os.getenv returns None for any variable that is not set.
OPENAI_API_KEY, DATABRICKS_ENDPOINT, DB_NAME = (
    os.getenv(var_name)
    for var_name in ("OPENAI_API_KEY", "DATABRICKS_ENDPOINT", "DB_NAME")
)

#### **OpenAI Initialization**

In [28]:
# OpenAI SDK client whose base URL is the Databricks serving endpoint read from
# the environment, so every request goes to that endpoint.
client = OpenAI(api_key=OPENAI_API_KEY, base_url=DATABRICKS_ENDPOINT)

#### **DB Initialization**

In [29]:
def db_init_embedd():
    """Return the Chroma collection that stores the document chunks.

    The collection is named by the module-level ``DB_NAME`` and is fetched if
    it already exists, otherwise created. It is attached to a
    ``SentenceTransformerEmbeddingFunction`` built with default arguments.

    Returns:
        The collection object returned by ``get_or_create_collection``.
    """
    sentence_embedder = SentenceTransformerEmbeddingFunction()

    # chromadb.Client() keeps the data in memory only. To keep it on disk
    # instead, chromadb.PersistentClient(path=...) is the alternative.
    return chromadb.Client().get_or_create_collection(
        DB_NAME,
        embedding_function=sentence_embedder,
    )

In [30]:
# Build the collection once; the ingestion and query cells below reuse this object.
chroma_collection = db_init_embedd()

In [31]:
# PDF files to ingest (relative paths, resolved against the current working directory).
file_paths = ["database/demo.pdf"]

#### **Chunking Text**

In [None]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Args:
        text: Raw text, e.g. the text extracted from one PDF page. ``None`` and
            the empty string are accepted.

    Returns:
        str: The normalised text, or ``""`` when ``text`` is ``None``, empty,
        or consists only of whitespace.
    """
    # Return "" for a missing or empty page text instead of raising
    # AttributeError on None.
    if not text:
        return ""
    # For str patterns, `\s` matches Unicode whitespace, which includes the
    # non-breaking space (\xa0); one substitution therefore handles spaces,
    # tabs, newlines and NBSPs without a separate replace step.
    return re.sub(r"\s+", " ", text).strip()

def embeddings_creation(file_paths, chunk_size=1000, tokens_per_chunk=256):
    """Read PDFs, clean their page text and split it into chunks.

    Args:
        file_paths: Iterable of PDF paths, each opened with ``PdfReader``.
        chunk_size: Maximum chunk length, in characters, for the first
            (character-based) split. Defaults to 1000.
        tokens_per_chunk: Token budget per chunk for the second (token-based)
            split. Defaults to 256.

    Returns:
        tuple: ``(ids, token_split_texts)``. ``token_split_texts`` is the list
        of final chunks in document order; ``ids`` holds each chunk's position
        as a string (``"0"``, ``"1"``, ...).

    Note:
        ``ids`` always start at ``"0"``, so two separate calls produce
        overlapping ids.
    """
    pdf_texts = []
    for file_path in file_paths:
        reader = PdfReader(file_path)
        for page in reader.pages:
            # Extract each page's text exactly once; the result serves both the
            # emptiness check and the cleaning step.
            raw_text = page.extract_text()
            if not raw_text:
                continue
            cleaned = clean_text(raw_text)
            # A whitespace-only page cleans down to "" and is dropped.
            if cleaned:
                pdf_texts.append(cleaned)

    character_splitter = RecursiveCharacterTextSplitter(
        # Separators are tried in this order: paragraph break, newline,
        # sentence end, space, and finally any character boundary.
        separators=["\n\n", "\n", ". ", " ", ""],
        # Upper bound, in characters, for a chunk produced by this splitter.
        chunk_size=chunk_size,
        chunk_overlap=0,
    )

    # Pages are joined with a blank line so page boundaries remain a preferred
    # split point (clean_text has already removed newlines inside each page).
    character_split_texts = character_splitter.split_text("\n\n".join(pdf_texts))

    token_splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=0, tokens_per_chunk=tokens_per_chunk
    )

    # Re-split every character-based chunk by token count.
    token_split_texts = []
    for text in character_split_texts:
        token_split_texts.extend(token_splitter.split_text(text))

    ids = [str(i) for i in range(len(token_split_texts))]

    return ids, token_split_texts

In [34]:
# Run the extract -> clean -> split pipeline over every path in file_paths.
ids, token_split_texts = embeddings_creation(file_paths)

#### **Inserting Chunked Data**

In [None]:
def storing_embeddings_db(chroma_collection, ids, token_split_texts):
    """Add the text chunks to the collection under the given ids.

    Args:
        chroma_collection: Object exposing ``add(ids=..., documents=...)``.
        ids: Identifiers, one per chunk.
        token_split_texts: The chunk texts to store.

    Returns:
        str: A fixed confirmation message.
    """
    payload = {"ids": ids, "documents": token_split_texts}
    chroma_collection.add(**payload)

    status_message = "Stored Embeddings in Vector DB"
    return status_message

In [35]:
store_data_to_db = storing_embeddings_db(chroma_collection, ids, token_split_texts)

### ```Inference: LangChain vs OpenAI```

#### **OpenAI**

In [39]:
# ------------------------------- Llama3.1 Databricks --------------------------------
def rag(client, chroma_collection, query, model="llama3-1", n_results=5, max_tokens=512):
    """Answer ``query`` using chunks retrieved from ``chroma_collection``.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (here, the
            OpenAI SDK client).
        chroma_collection: Object exposing
            ``query(query_texts=..., n_results=...)`` whose result carries the
            matched texts under ``["documents"][0]``.
        query: The user's question.
        model: Model / serving-endpoint name passed to the chat completion
            call. Defaults to ``"llama3-1"``.
        n_results: Number of chunks requested from the collection. Defaults to 5.
        max_tokens: Completion token limit passed to the chat completion call.
            Defaults to 512.

    Returns:
        The ``content`` of the first choice's message.
    """
    # The collection receives the raw query text and returns one list of
    # documents per query; a single query was sent, so take index 0.
    results = chroma_collection.query(query_texts=[query], n_results=n_results)
    retrieved_documents = results["documents"][0]

    information = "\n\n".join(retrieved_documents)

    system_prompt = (
        "You are a helpful assistant. Your users are asking questions about information contained in reports or files. "
        "You will be shown the user's question, and the relevant information from the files or reports. "
        "Answer the user's question using only this information."
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Query: {query} , Information: {information}"},
        ],
        model=model,
        max_tokens=max_tokens,
    )

    return chat_completion.choices[0].message.content

In [40]:
# Question passed to rag() in the next cell.
query = "what are some countries that are listed in this document?"

In [41]:
# Retrieve context from the collection, ask the served model, and show its answer.
result = rag(client, chroma_collection, query)
print(result)



According to the document, the countries listed as neighbors of Great Britain are:

1. Denmark (to the north)
2. Germany (to the east)
3. Switzerland (to the south)
4. Austria (to the south)
5. France (to the west)


#### **LangChain**

In [42]:
query = "what are some countries that are listed in this document?"

# The collection receives the raw query text and returns one list of documents
# per query; a single query was sent, so take index 0.
results = chroma_collection.query(query_texts=[query], n_results=5)
retrieved_documents = results["documents"][0]

information = "\n\n".join(retrieved_documents)

# Build the prompt from adjacent string literals so that only the intended text
# reaches the model: no `"prompt":f"` prefix, no stray quote characters and no
# source-code indentation inside the prompt.
template = (
    "You are a helpful expert research assistant. Your users are asking questions about information contained in reports or files. "
    "You will be shown the user's question, and the relevant information from the files or reports. "
    "Answer the user's question using only this information.\n\n"
    f"Question: {query}\n\n"
    f"Information: {information}"
)



In [43]:
# LangChain chat model bound to the "llama3-1" endpoint.
# NOTE(review): temperature is set to 0.5 here, while rag() sends no
# temperature at all; confirm this difference is intended for the comparison.
chat_model = ChatDatabricks(
    endpoint="llama3-1",
    temperature=0.5,
    max_tokens=512,
)

In [44]:
# Send the assembled prompt string to the chat model.
chat_model_output = chat_model.invoke(template)

In [45]:
# The reply is a message object (an AIMessage); its text is held in `.content`.
content = chat_model_output.content

# Show the answer text.
print(content)

The countries listed in this document are:

1. Denmark
2. Germany
3. Switzerland
4. Austria
5. France
