# Embedding Models, Vector Store and Augmented Generation

In [3]:
!python3 -m pip install chromadb gensim --quiet

## Working with **Embeddings**

In [4]:
import gensim.downloader as api

# Pretrained GloVe word vectors, loaded from the gensim-data repository.
# "glove-wiki-gigaword-50" is just the variant chosen for this demo; other
# dimensions/models are available as well.
glove_model = api.load("glove-wiki-gigaword-50")

# Look up the embedding of a single word and check its dimensionality.
word = "python"
word_vector = glove_model[word]
print(word_vector.shape)

(50,)


In [5]:
# Show the raw embedding values looked up for `word` ('python').
word_vector

array([ 0.5897  , -0.55043 , -1.0106  ,  0.41226 ,  0.57348 ,  0.23464 ,
       -0.35773 , -1.78    ,  0.10745 ,  0.74913 ,  0.45013 ,  1.0351  ,
        0.48348 ,  0.47954 ,  0.51908 , -0.15053 ,  0.32474 ,  1.0789  ,
       -0.90894 ,  0.42943 , -0.56388 ,  0.69961 ,  0.13501 ,  0.16557 ,
       -0.063592,  0.35435 ,  0.42819 ,  0.1536  , -0.47018 , -1.0935  ,
        1.361   , -0.80821 , -0.674   ,  1.2606  ,  0.29554 ,  1.0835  ,
        0.2444  , -1.1877  , -0.60203 , -0.068315,  0.66256 ,  0.45336 ,
       -1.0178  ,  0.68267 , -0.20788 , -0.73393 ,  1.2597  ,  0.15425 ,
       -0.93256 , -0.15025 ], dtype=float32)

In [8]:
# Embeddings for two related words ("king", "ruler") and one unrelated word
# ("table"); they feed the cosine-similarity comparisons that follow.
v1, v2, v3 = (glove_model[token] for token in ("king", "ruler", "table"))

In [9]:
# Inspect the raw embedding vector stored for "king".
v1

array([ 0.50451 ,  0.68607 , -0.59517 , -0.022801,  0.60046 , -0.13498 ,
       -0.08813 ,  0.47377 , -0.61798 , -0.31012 , -0.076666,  1.493   ,
       -0.034189, -0.98173 ,  0.68229 ,  0.81722 , -0.51874 , -0.31503 ,
       -0.55809 ,  0.66421 ,  0.1961  , -0.13495 , -0.11476 , -0.30344 ,
        0.41177 , -2.223   , -1.0756  , -1.0783  , -0.34354 ,  0.33505 ,
        1.9927  , -0.04234 , -0.64319 ,  0.71125 ,  0.49159 ,  0.16754 ,
        0.34344 , -0.25663 , -0.8523  ,  0.1661  ,  0.40102 ,  1.1685  ,
       -1.0137  , -0.21585 , -0.15155 ,  0.78321 , -0.91241 , -1.6106  ,
       -0.64426 , -0.51042 ], dtype=float32)

In [10]:
import numpy as np


def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two
    Euclidean norms.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# "king" vs "ruler": semantically related words.
similarity = cosine_similarity(v1, v2)
print(similarity)


0.74342537


In [11]:
# "king" vs "table": cosine similarity of two unrelated words, for contrast
# with the king/ruler score.
norm_product = np.linalg.norm(v1) * np.linalg.norm(v3)
similarity = np.dot(v1, v3) / norm_product
print(similarity)

0.2848273


In [12]:
# Nearest neighbours of "stocks" in the embedding space, as (word, score) pairs.
glove_model.most_similar("stocks")

[('stock', 0.8653818368911743),
 ('markets', 0.8522835969924927),
 ('prices', 0.8431004285812378),
 ('market', 0.8400351405143738),
 ('traders', 0.8257467150688171),
 ('trading', 0.8112872838973999),
 ('investors', 0.8083530068397522),
 ('indexes', 0.7902355194091797),
 ('dealers', 0.7884277701377869),
 ('shares', 0.7868536114692688)]

In [13]:
# Nearest neighbours of "trading" in the embedding space, as (word, score) pairs.
glove_model.most_similar("trading")

[('stock', 0.9012669920921326),
 ('exchange', 0.898104190826416),
 ('futures', 0.8487032651901245),
 ('trades', 0.8236047029495239),
 ('traded', 0.8166490793228149),
 ('stocks', 0.8112873435020447),
 ('market', 0.8051413893699646),
 ('prices', 0.7966799139976501),
 ('closing', 0.7950035929679871),
 ('closed', 0.7914804220199585)]

In [14]:
# Nearest neighbours of "amazing": the neighbours are adjectives with a similar
# meaning, showing the embeddings capture more than topical co-occurrence.
glove_model.most_similar("amazing")

[('incredible', 0.9189565181732178),
 ('fantastic', 0.8799790143966675),
 ('awesome', 0.8620665669441223),
 ('wonderful', 0.8537988662719727),
 ('terrific', 0.8482187390327454),
 ('marvelous', 0.8439217805862427),
 ('astonishing', 0.8103041052818298),
 ('remarkable', 0.8091045022010803),
 ('exciting', 0.79411780834198),
 ('unbelievable', 0.7916541695594788)]

In [None]:
# Shape of the model's full embedding matrix.
# NOTE(review): presumably (vocabulary size, 50) — this cell has no recorded
# output, so confirm by running it.
glove_model.vectors.shape

### Google Gemini Embedding

In [16]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Embedding client backed by Google's hosted Gemini embedding model.
# NOTE(review): no API key is configured anywhere in this notebook, so the
# credentials are presumably picked up from the environment — confirm before
# running on a fresh kernel.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
)

# Embed a single sentence and report the dimensionality of the result.
sample_text = "Python is an amazing programming language."
vector = embeddings.embed_query(sample_text)
len(vector)

3072

## Vector Database: **ChromaDB**

In [20]:
import chromadb
# Create a Chroma client with default settings (no storage path or server
# address is configured here).
chroma_client = chromadb.Client()

In [21]:
# Use get_or_create_collection so the cell is idempotent: create_collection
# fails when "my_collection" already exists, which is the case as soon as
# this cell is re-run against the same client. On a fresh client the result
# is the same newly created collection.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [22]:
# Insert two toy records. The 3-dimensional embeddings are written by hand
# (not produced by an embedding model) so the later distance query is easy to
# follow.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "Python is a programming language",
        "langchain is an framework",
    ],
    embeddings=[
        [1.2, 2.3, 4.5],
        [6.7, 8.2, 9.2],
    ],
    metadatas=[
        {"source": "my_source"},
        {"source": "my_source"},
    ],
)

In [23]:
# Fetch a stored record by id. As the output shows, documents and metadatas
# are returned while the embeddings field is left as None.
collection.get(ids=["id1"])

{'ids': ['id1'],
 'embeddings': None,
 'documents': ['Python is a programming language'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'source': 'my_source'}]}

In [24]:
# Nearest-neighbour lookup for a hand-written query vector, keeping only the
# single closest record. The reported distance (~10.58) equals the squared
# Euclidean distance between the query and the embedding stored for "id2".
results = collection.query(
    n_results=1,
    query_embeddings=[[7.1, 9.1, 6.1]],
)
results

{'ids': [['id2']],
 'embeddings': None,
 'documents': [['langchain is an framework']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'source': 'my_source'}]],
 'distances': [[10.580000877380371]]}

## Context Based Generation

In [None]:
from langchain.prompts import PromptTemplate

# Sample context and question used to demonstrate context-grounded answering.
context = """
NASA, the National Aeronautics and Space Administration, is an independent agency of the U.S. 
federal government responsible for the civilian space program, as well as aeronautics and aerospace 
research. NASA was established in 1958 and has led numerous space missions, including the Apollo 
moon-landing missions, Mars rover explorations, and the development of the International Space Station.
"""
question = "What are some of NASA's major achievements?"

# Prompt template that embeds the context and the question, then restricts the
# model to the supplied context. The instruction text is spelled out in full
# ("do not use", "information") and the stray spaces after the newlines are
# removed, so the model receives a clean, unambiguous instruction.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "Given the following context:\n{context}\n\n"
        "Answer the question:\n{question}\n\n"
        "Only use information from the provided context; do not use any other "
        "knowledge or information to answer the question."
    ),
)

In [None]:

from langchain.chat_models import init_chat_model

# Fill the template with the sample context and question.
prompt = prompt_template.format(question=question, context=context)

# Chat model that will answer the prompt, served through the Google GenAI
# provider.
model_name = "gemini-2.0-flash"
llm = init_chat_model(model_name, model_provider="google_genai")

In [None]:
# Send the filled-in prompt to the chat model and print only the text of the
# reply.
response = llm.invoke(prompt)
answer_text = response.content
print(answer_text)

Some of NASA's major achievements include the Apollo moon-landing missions, Mars rover explorations, and the development of the International Space Station.
