## Simple GenAI app
___

Work flow:
1. Load Data -> Docs
2. Docs -> Chunks
3. Chunks -> Vectors
4. Vecotrs -> Vector Embedding
5. Vector Embedding -> Vector Store DB


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")
os.environ["LANGCHAIN_TRACKING_V2"] = "true"

### Data Ingestion

- From the website we need to scrape the data

In [3]:
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://www.geeksforgeeks.org/python/python-data-types/")
loader

USER_AGENT environment variable not set, consider setting it to identify your requests.


<langchain_community.document_loaders.web_base.WebBaseLoader at 0x23969a25310>

In [4]:
docs = loader.load()
docs

[Document(metadata={'source': 'https://www.geeksforgeeks.org/python/python-data-types/', 'title': 'Python Data Types - GeeksforGeeks', 'description': 'Your All-in-One Learning Portal: GeeksforGeeks is a comprehensive educational platform that empowers learners across domains-spanning computer science and programming, school education, upskilling, commerce, software tools, competitive exams, and more.', 'language': 'en-US'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPython Data Types - GeeksforGeeks\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n\n\n\n\n\n\n\n\n\nCoursesDSA to DevelopmentGATE 2026 PrepGet 3 IBM CertificationsFor Working ProfessionalsInterview 101: DSA & System DesignData Science Training ProgramJAVA Backend Development (Live)Data Analytics TrainingDevOps Engineering (LIVE)Data Structures & Algorithms in PythonFor StudentsPlacement Preparation with DSADa

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_spliter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
documents = text_spliter.split_documents(docs)

In [6]:
len(documents)

25

In [7]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [8]:
from langchain_community.vectorstores import FAISS
vectore_store_db = FAISS.from_documents(documents, embedding_model)

In [9]:
vectore_store_db

<langchain_community.vectorstores.faiss.FAISS at 0x2396c133560>

#### Query from a vector store DB

In [10]:
query = "Data type"
response = vectore_store_db.similarity_search(query)
response

[Document(id='8d5681c9-b70a-47b0-8bc6-7bc38b15dd66', metadata={'source': 'https://www.geeksforgeeks.org/python/python-data-types/', 'title': 'Python Data Types - GeeksforGeeks', 'description': 'Your All-in-One Learning Portal: GeeksforGeeks is a comprehensive educational platform that empowers learners across domains-spanning computer science and programming, school education, upskilling, commerce, software tools, competitive exams, and more.', 'language': 'en-US'}, page_content='Python Data Types\n\n\n\nLast Updated : \n17 May, 2025\n\n\n\n\n\n\n\nSummarize\n\n\n\n\n\n\nComments\n\n\n\n\n\n\n\nImprove\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSuggest changes\n\n\n\n\n\n\n\n\nShare\n\n\n \n\n\nLike Article\n\n\n\nLike\n\n\n\n\n\n\n\n\n\n\nReport'),
 Document(id='1590016a-0ab3-4e9f-8258-81139ec77322', metadata={'source': 'https://www.geeksforgeeks.org/python/python-data-types/', 'title': 'Python Data Types - GeeksforGeeks', 'description': 'Your All-in-One Learning Portal: Gee

In [11]:
for i in response:
    print(i.page_content.replace("\t", " ").replace("\n", " ").replace("  ", ""))
    print()

Python Data TypesLast Updated :17 May, 2025Summarize CommentsImprove Suggest changes Share Like ArticleLike Report

Python Data Types - GeeksforGeeksSkip to content

#int, float, string, list and set x = 50 x = 60.5 x = "Hello World" x = ["geeks", "for", "geeks"] x = ("geeks", "for", "geeks")

Python CoursePython TutorialInterview QuestionsPython QuizPython GlossaryPython ProjectsPractice PythonData Science With PythonPython Web DevDSA with PythonPython OOPs Sign In ▲ Open In App



#### Documents Chain

In [12]:
from langchain_google_genai import ChatGoogleGenerativeAI

# 1. Initialize the LLM
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
print(llm)

model='models/gemini-2.0-flash' google_api_key=SecretStr('**********') client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x000002396C133E30> default_metadata=() model_kwargs={}


In [22]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain



# 2. Define the prompt with BOTH context and question
prompt = ChatPromptTemplate.from_template("""
You are an assistant answering questions based only on the provided context.

<context>
{context}
</context>

Question: {input}
Answer:
""")

# 3. Create the document chain
documents_chain = create_stuff_documents_chain(llm, prompt)

In [23]:
from langchain_core.documents import Document

# 4. Invoke it with input
response = documents_chain.invoke({
    "input": "How old is tiger?",
    "context": [Document(page_content="Hi! My name is Tiger. I am 2908 years old")]
})

print(response)


Tiger is 2908 years old.


In [25]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document

documents_chain = create_stuff_documents_chain(llm, prompt)

context_doc = [Document(page_content="""The tiger (Panthera tigris) is a large cat and a member of the genus Panthera native to Asia.
    It has a powerful, muscular body with a large head and paws, a long tail and orange fur with black, mostly vertical stripes.
    It is traditionally classified into nine recent subspecies, though some recognise only two subspecies, mainland Asian tigers and the island tigers of the Sunda Islands.
    Throughout the tiger's range, it inhabits mainly forests, from coniferous and temperate broadleaf and mixed forests in the Russian Far East and Northeast China to tropical and subtropical moist broadleaf forests on the Indian subcontinent and Southeast Asia.
    The tiger is an apex predator and preys mainly on ungulates, which it takes by ambush.
    It lives a mostly solitary life and occupies home ranges, defending these from individuals of the same sex.
    The range of a male tiger overlaps with that of multiple females with whom he mates.
    Females give birth to usually two or three cubs that stay with their mother for about two years.
    When becoming independent, they leave their mother's home range and establish their own.
    """)]


# 4. Invoke it with input
response = documents_chain.invoke({
    "input": "What is Tiger?",
    "context": context_doc
})

print(response)

The tiger (Panthera tigris) is a large cat and a member of the genus Panthera native to Asia.


#### Reriver Chain

In [31]:
retriever = vectore_store_db.as_retriever()

from langchain.chains import create_retrieval_chain
retriever_chain = create_retrieval_chain(retriever, documents_chain)

In [32]:
retriever_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002396C133560>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, template='\nYou are an assistant answering questions based only on the provided context.\n\n<context>\n{context}\n</context>\n\nQ

In [35]:
# Get the response from the LLM
response = retriever_chain.invoke({"input" : "Data Type"})
response['answer']

'int, float, string, list and set'

In [36]:
response

{'input': 'Data Type',
 'context': [Document(id='8d5681c9-b70a-47b0-8bc6-7bc38b15dd66', metadata={'source': 'https://www.geeksforgeeks.org/python/python-data-types/', 'title': 'Python Data Types - GeeksforGeeks', 'description': 'Your All-in-One Learning Portal: GeeksforGeeks is a comprehensive educational platform that empowers learners across domains-spanning computer science and programming, school education, upskilling, commerce, software tools, competitive exams, and more.', 'language': 'en-US'}, page_content='Python Data Types\n\n\n\nLast Updated : \n17 May, 2025\n\n\n\n\n\n\n\nSummarize\n\n\n\n\n\n\nComments\n\n\n\n\n\n\n\nImprove\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSuggest changes\n\n\n\n\n\n\n\n\nShare\n\n\n \n\n\nLike Article\n\n\n\nLike\n\n\n\n\n\n\n\n\n\n\nReport'),
  Document(id='1590016a-0ab3-4e9f-8258-81139ec77322', metadata={'source': 'https://www.geeksforgeeks.org/python/python-data-types/', 'title': 'Python Data Types - GeeksforGeeks', 'description': '