# Embedding and storing documents in a vector database

### Embedding using an API

In [1]:
import getpass
import os

# Never hard-code credentials in a notebook: use the key from the environment
# when present, otherwise prompt for it so it is not saved with the file.
# NOTE(review): a key that was previously committed here should be treated as
# leaked and rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

from langchain_google_genai import GoogleGenerativeAIEmbeddings

# API-backed embedding model (a later cell rebinds `embeddings` to a local model).
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample corpus: one free-text grammar note per English tense. Each entry is
# later wrapped in a Document and embedded as a whole.
english_grammars = [
    """
    Present Simple
    Topic: Present Simple
    Content:
    The Present Simple tense is used to describe habits, routines, general truths, and repeated actions. It is formed with the base form of the verb for all subjects except third person singular, which adds -s or -es.
    Examples:
    I walk to school every day.
    She plays the piano very well.
    Water boils at 100 degrees Celsius.
    Notes:
    Often used with frequency adverbs: always, usually, sometimes, never.
    Questions and negatives use do/does:
    Do you like coffee?
    He doesn’t watch TV in the morning
    """,
    """
    Past Simple
    Topic: Past Simple
    Content:
    The Past Simple tense is used to describe actions that were completed in the past at a specific time. It is formed by adding -ed to regular verbs, while irregular verbs have unique past forms.
    Examples:
    I visited my grandparents last weekend.
    She wrote a letter to her friend yesterday.
    They went to the beach two days ago.
    Notes:
    Time expressions often used: yesterday, last week, in 2010, two days ago.
    Questions and negatives use did:
    Did you see that movie?
    He didn’t finish his homework on time.
    """,
    """
    Future Simple
    Topic: Future Simple
    Content:
    The Future Simple tense is used to describe actions that will happen in the future. It is formed with "will" followed by the base form of the verb.
    Examples:
    I will travel to Japan next year.
    She will start her new job tomorrow.
    They will meet us at the restaurant later.
    Notes:
    Often used with time expressions: tomorrow, next week, in the future.
    """,
    """
    Present Continuous
    Topic: Present Continuous
    Content:
    The Present Continuous tense is used to describe actions that are happening right now or around the current time. It is formed with the verb "to be" (am/is/are) followed by the -ing form of the verb.
    Examples:
    I am reading a book at the moment.
    She is studying for her exams this week.
    They are playing soccer in the park right now.
    Notes:
    Often used with time expressions: now, at the moment, currently.
    """
]

### Custom embedding

In [3]:
import numpy as np
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer

class HFEmbedding(Embeddings):
    """LangChain ``Embeddings`` adapter around a ``SentenceTransformer`` model.

    Args:
        model_name: Model name or path handed to ``SentenceTransformer``.
    """

    def __init__(self, model_name="hiieu/halong_embedding"):
        self.model = SentenceTransformer(model_name)

    def embed_query(self, text: str) -> np.ndarray:
        """Embed a single string.

        Returns:
            1-D array of shape ``(embedding_dim,)``.
        """
        # encode() is given a one-element batch; squeeze drops the batch axis.
        return self.model.encode([text]).squeeze(0)

    def embed_documents(self, texts: list[str]) -> np.ndarray:
        """Embed several strings in one batched model call.

        Returns:
            2-D array of shape ``(n_docs, embedding_dim)``; ``n_docs`` is 0
            for empty input.
        """
        texts = list(texts)
        if not texts:
            # Stacking zero vectors would raise, so return an empty matrix.
            # NOTE(review): float32 is assumed to match encode()'s output dtype.
            dim = self.model.get_sentence_embedding_dimension() or 0
            return np.empty((0, dim), dtype=np.float32)
        # One batched encode() call instead of one model call per document.
        return np.asarray(self.model.encode(texts))

    def __call__(self, texts):
        """Dispatch to ``embed_query`` for a str, ``embed_documents`` for a list.

        Raises:
            ValueError: If ``texts`` is neither a string nor a list.
        """
        if isinstance(texts, str):
            return self.embed_query(texts)
        if isinstance(texts, list):
            return self.embed_documents(texts)
        raise ValueError("Input should be a string or a list of strings.")


# Rebind the notebook-wide `embeddings` to the local sentence-transformers model.
embeddings = HFEmbedding()

In [4]:
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings  # or any other embeddings class
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from langchain_qdrant import QdrantVectorStore
import os

# To index with the Google embeddings instead of the current `embeddings`
# object, supply GOOGLE_API_KEY through the environment (never paste the key
# into the notebook) and uncomment:
# embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Sample documents: one Document per grammar note, tagged with a source label.
docs = [
    Document(page_content=english_grammars[0], metadata={"source": "wiki"}),
    Document(page_content=english_grammars[1], metadata={"source": "wiki"}),
    Document(page_content=english_grammars[2], metadata={"source": "wiki"}),
    Document(page_content=english_grammars[3], metadata={"source": "history_book"}),
]

# Connect to the Qdrant instance expected on localhost:6333 (e.g. Docker).
client = QdrantClient(host="localhost", port=6333)

collection_name = "my_docs"

# Start from a clean slate: drop the collection if a previous run created it.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)
    print(f"✅ Collection '{collection_name}' deleted")
else:
    print(f"⚠️ Collection '{collection_name}' does not exist")

# The collection's vector size must match the embedding model, so measure it
# by embedding a throwaway string.
vector_size = len(embeddings.embed_query("sample text"))

# Create the collection if it does not exist.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
    )

# Create the vector store on top of the collection.
vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

# Add documents
vector_store.add_documents(docs)
print(f"Added {len(docs)} documents to Qdrant collection '{collection_name}'")

# Test query: print the 3 nearest documents with their metadata.
query = "Which city is the capital of Vietnam?"
results = vector_store.similarity_search(query, k=3)
print("🔎 Search results:")
for r in results:
    print("-", r.page_content, "| metadata:", r.metadata)


✅ Collection 'my_docs' deleted
Added 4 documents to Qdrant collection 'my_docs'
🔎 Search results:
- 
    Future Simple
    Topic: Future Simple
    Content:
    The Future Simple tense is used to describe actions that will happen in the future. It is formed with "will" followed by the base form of the verb.
    Examples:
    I will travel to Japan next year.
    She will start her new job tomorrow.
    They will meet us at the restaurant later.
    Notes:
    Often used with time expressions: tomorrow, next week, in the future.
     | metadata: {'source': 'wiki', '_id': '04175923-c169-43b0-952a-ce1bf6fe1a47', '_collection_name': 'my_docs'}
- 
    Present Simple
    Topic: Present Simple
    Content:
    The Present Simple tense is used to describe habits, routines, general truths, and repeated actions. It is formed with the base form of the verb for all subjects except third person singular, which adds -s or -es.
    Examples:
    I walk to school every day.
    She plays the pian

In [None]:
# doc = Document(page_content="The past simple", metadata={"source": "wiki"})
# vector_store.add_documents([doc])

In [None]:
# Query the vector store and print the 3 nearest documents with their metadata.
query = "what is the past simple?"
results = vector_store.similarity_search(query, k=3)
print("🔎 Search results:")
for r in results:
    print("-", r.page_content, "| metadata:", r.metadata)


# Chat

In [5]:
import getpass
import os

# Use the key from the environment when present, otherwise prompt for it, so
# the credential is never stored in the notebook.
# NOTE(review): a key that was previously committed here should be treated as
# leaked and rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

from langchain.chat_models import init_chat_model

# Chat model used by the generate step.
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")

In [53]:
from typing_extensions import List, TypedDict
from langgraph.graph import START, StateGraph
from langchain_core.documents import Document
from langchain.prompts import PromptTemplate
# Define the state shared by the application's graph steps.
class State(TypedDict):
    """State dictionary passed between the graph steps."""
    question: str  # user question; read by retrieve() and generate()
    context: List[Document]  # documents written by retrieve(), read by generate()
    answer: str  # model reply text written by generate()


# Define application steps
def retrieve(state: State):
    """Search the vector store for documents similar to the question.

    Returns a partial state update holding the hits under ``"context"``.
    """
    question = state["question"]
    hits = vector_store.similarity_search(question)
    return {"context": hits}


def generate(state: State):
    """Answer the question from the retrieved context using the chat model.

    Joins the page contents of ``state["context"]`` with blank lines, fills
    the module-level ``prompt`` template, invokes ``llm`` and returns a
    partial state update holding the reply text under ``"answer"``.
    """
    context_text = "\n\n".join([doc.page_content for doc in state["context"]])
    filled_prompt = prompt.format(question=state["question"], context=context_text)
    reply = llm.invoke(filled_prompt)
    # The full response object is printed as well as returned.
    print(reply)
    return {"answer": reply.content}


# Plain str.format template; {context} and {question} are filled in by generate().
prompt = '''
You are an AI assistant providing helpful advice. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say you don't know, don't try to make up an answer.
Context: {context}
Question: {question}
'''

# Wire the steps into a linear graph (START -> retrieve -> generate) and compile it.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [52]:
# Run the compiled graph on one question and print the generated answer.
response = graph.invoke({"question": "what is the past simple?"})
print(response["answer"])


    Past Simple
    Topic: Past Simple
    Content:
    The Past Simple tense is used to describe actions that were completed in the past at a specific time. It is formed by adding -ed to regular verbs, while irregular verbs have unique past forms.
    Examples:
    I visited my grandparents last weekend.
    She wrote a letter to her friend yesterday.
    They went to the beach two days ago.
    Notes:
    Time expressions often used: yesterday, last week, in 2010, two days ago.
    Questions and negatives use did:
    Did you see that movie?
    He didn’t finish his homework on time.
    


    Future Simple
    Topic: Future Simple
    Content:
    The Future Simple tense is used to describe actions that will happen in the future. It is formed with "will" followed by the base form of the verb.
    Examples:
    I will travel to Japan next year.
    She will start her new job tomorrow.
    They will meet us at the restaurant later.
    Notes:
    Often used with time expressions: 