# Embeddings Experimentation

Experiment with different embeddings technologies and techniques.

# Setup Notebook

## Imports

In [14]:
# Import Standard Libraries
import chromadb
import os
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
from dotenv import load_dotenv
from IPython.display import Markdown
from google.api_core import retry

In [3]:
# Import Standard Libraries
from gensim.test.utils import common_texts
from gensim.models.Doc2Vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile

ModuleNotFoundError: No module named 'gensim.models.Doc2Vec'

In [1]:
# Import Standard Libraries
from tensorflow.keras.preprocessing.text import Tokenizer

ModuleNotFoundError: No module named 'tensorflow'

## Load Environment Variables

In [7]:
# Load environment variables
load_dotenv('./../../.env')

True

In [11]:
# Read environment variables
google_api_key = os.getenv('GOOGLE_API_KEY')

## Configure Authentication

In [12]:
# Set the Google AI Studio API Key for genai SDK
genai.configure(api_key=google_api_key)

# Read Data

In [16]:
# Define few documents
document_1 = "Operating the Climate Control System  Your Googlecar has a climate control system that allows you to adjust the temperature and airflow in the car. To operate the climate control system, use the buttons and knobs located on the center console.  Temperature: The temperature knob controls the temperature inside the car. Turn the knob clockwise to increase the temperature or counterclockwise to decrease the temperature. Airflow: The airflow knob controls the amount of airflow inside the car. Turn the knob clockwise to increase the airflow or counterclockwise to decrease the airflow. Fan speed: The fan speed knob controls the speed of the fan. Turn the knob clockwise to increase the fan speed or counterclockwise to decrease the fan speed. Mode: The mode button allows you to select the desired mode. The available modes are: Auto: The car will automatically adjust the temperature and airflow to maintain a comfortable level. Cool: The car will blow cool air into the car. Heat: The car will blow warm air into the car. Defrost: The car will blow warm air onto the windshield to defrost it."
document_2 = 'Your Googlecar has a large touchscreen display that provides access to a variety of features, including navigation, entertainment, and climate control. To use the touchscreen display, simply touch the desired icon.  For example, you can touch the "Navigation" icon to get directions to your destination or touch the "Music" icon to play your favorite songs.'
document_3 = "Shifting Gears Your Googlecar has an automatic transmission. To shift gears, simply move the shift lever to the desired position.  Park: This position is used when you are parked. The wheels are locked and the car cannot move. Reverse: This position is used to back up. Neutral: This position is used when you are stopped at a light or in traffic. The car is not in gear and will not move unless you press the gas pedal. Drive: This position is used to drive forward. Low: This position is used for driving in snow or other slippery conditions."

documents = [document_1, document_2, document_3]

In [22]:
# Define some text to embed
texts = [
    'The quick brown fox jumps over the lazy dog.',
    'The quick rbown fox jumps over the lazy dog.',
    'teh fast fox jumps over the slow woofer.',
    'a quick brown fox jmps over lazy dog.',
    'brown fox jumping over dog',
    'fox > dog',
    'The five boxing wizards jump quickly.',
    'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus et hendrerit massa. Sed pulvinar, nisi a lobortis sagittis, neque risus gravida dolor, in porta dui odio vel purus.',
]

# Tokeniser

## Simple Usage

In [2]:
# Define the data
data = [
"The earth is spherical.",
"The earth is a planet.",
"I like to eat at a restaurant."]

# Filter the punctuations, tokenize the words and index them to integers
tokenizer = Tokenizer(num_words=15, filters="!\"#$%&()*+,-./:;<=>?[\\]^_'{|}~\t\n", lower=True, split=' ')

# Fit the tokeniser
tokenizer.fit_on_texts(data)

# Translate each sentence into its word-level IDs, and then one-hot encode those IDs
ID_sequences = tokenizer.texts_to_sequences(data)
binary_sequences = tokenizer.sequences_to_matrix(ID_sequences)

print("ID dictionary:\n", tokenizer.word_index)
print("\nID sequences:\n", ID_sequences)
print("\n One-hot encoded sequences:\n", binary_sequences )

NameError: name 'Tokenizer' is not defined

# Text Embeddings

## Gemini

In [27]:
# Use Gemini
response = genai.embed_content(model='models/text-embedding-004',
                               content=texts,
                               task_type='semantic_similarity')

print(response['embedding'][0])

[-0.0922682, 0.012113783, -0.0063794684, 0.0369048, 0.02203019, 0.048805293, 0.0133289965, 0.021541007, 0.02705553, -0.0043925485, -0.011962657, 0.069833845, 0.010891116, 0.06149802, 0.049922608, 0.022119658, 0.0178632, 0.050096616, 0.0029887455, -0.008278692, 0.005999019, -0.0040636063, 0.015202278, -0.021813663, -0.02274539, -0.032121718, -0.00033075613, -0.022031343, 0.028694874, -0.04976425, 0.025347114, 0.07245508, 0.003248612, -0.002201165, 0.059787363, -0.0057624904, -0.02634191, -0.003782781, 0.048398733, 0.0066004205, -0.05704449, -0.022870814, -0.058460187, 0.013212032, 0.01834201, -0.076267265, -0.02015255, 0.014425502, 0.0024263572, -0.011807308, 0.056245767, -0.0031448188, 0.0013501083, -0.009426735, -0.020425193, 0.0015862642, 0.0039356016, 0.0071027516, -0.032657158, -0.0345763, -0.009775383, -0.032960888, 0.006124403, -0.003723441, 0.046425253, -0.03982145, -0.046581678, -0.027215622, 0.03325059, 0.013590055, -0.026209503, 0.050803315, -0.06645644, 0.041311942, -0.04812

# Document Embeddings

## Gensim

In [None]:
#train model on a sequence of documents tagged with their IDs
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=8, window=3, min_count=1, workers=6)

# persist model to disk, and load it to infer on new documents
model_file = get_tmpfile("Doc2Vec_v1")
model.save(model_file)
model = Doc2Vec.load(model_file)
model.infer_vector(["human", "interface"])

# Embedding Database

## RAG with ChromaDB

In [13]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Function that generates embeddings through Gemini
    """
    # Specify whether to generate embeddings for documents, or queries
    document_mode = True

    def __call__(self, input: Documents) -> Embeddings:
        
        # Switch between embedding documents or query
        if self.document_mode:
            embedding_task = "retrieval_document"
        else:
            embedding_task = "retrieval_query"

        retry_policy = {"retry": retry.Retry(predicate=retry.if_transient_error)}

        response = genai.embed_content(
            model="models/text-embedding-004",
            content=input,
            task_type=embedding_task,
            request_options=retry_policy,
        )
        return response["embedding"]

In [17]:
# Create a ChromaDB and populate it with the embeddings from documents
DB_NAME = "googlecardb"
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

chroma_client = chromadb.Client()
db = chroma_client.get_or_create_collection(name=DB_NAME, embedding_function=embed_fn)

db.add(documents=documents, ids=[str(i) for i in range(len(documents))])

In [18]:
db.count()

3

In [19]:
# Use the query method to search in the ChromaDB
# Switch to query mode when generating embeddings.
embed_fn.document_mode = False

# Search the Chroma DB using the specified query.
query = "How do you use the touchscreen to play music?"

result = db.query(query_texts=[query], n_results=1)
[[passage]] = result["documents"]

Markdown(passage)

Your Googlecar has a large touchscreen display that provides access to a variety of features, including navigation, entertainment, and climate control. To use the touchscreen display, simply touch the desired icon.  For example, you can touch the "Navigation" icon to get directions to your destination or touch the "Music" icon to play your favorite songs.

Now create a RAG system through the document found to enrich the pompt

In [20]:
# Parse the strings
passage_oneline = passage.replace("\n", " ")
query_oneline = query.replace("\n", " ")

# This prompt is where you can specify any guidance on tone, or what topics the model should stick to, or avoid.
prompt = f"""You are a helpful and informative bot that answers questions using text from the reference passage included below. 
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. 
However, you are talking to a non-technical audience, so be sure to break down complicated concepts and 
strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: {query_oneline}
PASSAGE: {passage_oneline}
"""
print(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below. 
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. 
However, you are talking to a non-technical audience, so be sure to break down complicated concepts and 
strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: How do you use the touchscreen to play music?
PASSAGE: Your Googlecar has a large touchscreen display that provides access to a variety of features, including navigation, entertainment, and climate control. To use the touchscreen display, simply touch the desired icon.  For example, you can touch the "Navigation" icon to get directions to your destination or touch the "Music" icon to play your favorite songs.



In [21]:
# Prompt the LLM
model = genai.GenerativeModel("gemini-1.5-flash-latest")
answer = model.generate_content(prompt)
Markdown(answer.text)

You can easily play music on your Googlecar by tapping the "Music" icon on the touchscreen display! 
