In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Fail fast when the key is missing. A blank value (e.g. "GOOGLE_API_KEY="
# in .env) is treated as missing too: it would pass an `is None` check and
# the problem would only show up later, when the first model call is made.
if not os.getenv("GOOGLE_API_KEY", "").strip():
    raise ValueError("GOOGLE_API_KEY is not set in the environment variables.")
print("GOOGLE_API_KEY is set.")

GOOGLE_API_KEY is set.


In [2]:

from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model reused by the later cells. No key is passed explicitly here
# (presumably it is read from the environment -- confirm against the library
# docs); to pass one by hand add: google_api_key=os.getenv("GEMINI_API_KEY")
llm_options = {
    "model": "gemini-3-flash-preview",
    "temperature": 0,
}
llm = ChatGoogleGenerativeAI(**llm_options)

# Quick smoke test, left disabled:
# response = llm.invoke("what is rag application explain to me.")
# print(response.content)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Baseline: ask the model a plain question with no retrieved context.
llm.invoke(input="what is langchain used for?")

AIMessage(content=[{'type': 'text', 'text': 'At its core, **LangChain** is an open-source framework designed to help developers build applications powered by Large Language Models (LLMs) like GPT-4, Claude, or Llama.\n\nWhile LLMs are powerful on their own, they have limitations: they are "stateless" (they don\'t remember past conversations naturally), they have a cutoff date for their knowledge, and they can\'t interact with your private data or external tools out of the box. **LangChain acts as the "glue"** that connects these models to other sources of data and computation.\n\nHere is a breakdown of what LangChain is used for:\n\n---\n\n### 1. Retrieval Augmented Generation (RAG)\nThis is the most common use case. LangChain allows you to connect an LLM to your own data (PDFs, databases, emails, or websites).\n*   **How it works:** When a user asks a question, LangChain searches your documents for the relevant information and feeds that specific info to the LLM to generate an accurat

## **RAG IMPLEMENTATION WITH PDF FILE**


### **Extracting text from the PDF**

In [3]:
from pathlib import Path

from langchain_community.document_loaders import PyMuPDFLoader

# Build the path from its parts so the separator matches the host OS; a
# literal "Docs\\docker_basic_guide.pdf" only resolves on Windows.
pdf_path = Path("Docs") / "docker_basic_guide.pdf"

# The loader is given a plain string so this works regardless of whether the
# installed loader version accepts Path objects.
loader = PyMuPDFLoader(str(pdf_path))
docs = loader.load()
docs

[Document(metadata={'producer': 'Adobe PDF Library 21.7.131', 'creator': 'Acrobat PDFMaker 21 for Word', 'creationdate': '2021-10-29T12:50:41+02:00', 'source': 'Docs\\docker_basic_guide.pdf', 'file_path': 'Docs\\docker_basic_guide.pdf', 'total_pages': 18, 'format': 'PDF 1.6', 'title': '', 'author': 'ALERRT', 'subject': '', 'keywords': '', 'moddate': '2021-10-29T12:50:45+02:00', 'trapped': '', 'modDate': "D:20211029125045+02'00'", 'creationDate': "D:20211029125041+02'00'", 'page': 0}, page_content='1 \n \n \n \nDocker: GETTING STARTED \nA hands-on step-by-step basic guide to Docker essentials. \n \n \n \n \n \nDeveloped by \n \nThe H3ABionet Pipelines and Computing Work Package, \nComputing Infrastructure project team \n \n \nPrepared for the greater  \n \nH3ABioNet and H3Africa Consortium communities'),
 Document(metadata={'producer': 'Adobe PDF Library 21.7.131', 'creator': 'Acrobat PDFMaker 21 for Word', 'creationdate': '2021-10-29T12:50:41+02:00', 'source': 'Docs\\docker_basic_guide

## **Add YOUR custom metadata to ALL pages (the chunks created later inherit it)**

In [5]:
# Extra fields merged into every loaded document's metadata. Keys that already
# exist (such as "author") are overwritten by dict.update.
custom_metadata = {
    "category": "docker",
    "author": "Rohan Dabas",
    "version": "1.0",
    "summary": "Docker basics guide",
}

for page_doc in docs:
    page_doc.metadata.update(custom_metadata)

# Step 2: Chunking the document into smaller pieces

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings: chunk_size=500 with a chunk_overlap of 50 between
# neighbouring chunks.
splitter_options = {"chunk_size": 500, "chunk_overlap": 50}
splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into smaller Document objects and display them.
chunks = splitter.split_documents(docs)
chunks

[Document(metadata={'producer': 'Adobe PDF Library 21.7.131', 'creator': 'Acrobat PDFMaker 21 for Word', 'creationdate': '2021-10-29T12:50:41+02:00', 'source': 'Docs\\docker_basic_guide.pdf', 'file_path': 'Docs\\docker_basic_guide.pdf', 'total_pages': 18, 'format': 'PDF 1.6', 'title': '', 'author': 'Rohan Dabas', 'subject': '', 'keywords': '', 'moddate': '2021-10-29T12:50:45+02:00', 'trapped': '', 'modDate': "D:20211029125045+02'00'", 'creationDate': "D:20211029125041+02'00'", 'page': 0, 'category': 'docker', 'version': '1.0', 'summary': 'Docker basics guide'}, page_content='1 \n \n \n \nDocker: GETTING STARTED \nA hands-on step-by-step basic guide to Docker essentials. \n \n \n \n \n \nDeveloped by \n \nThe H3ABionet Pipelines and Computing Work Package, \nComputing Infrastructure project team \n \n \nPrepared for the greater  \n \nH3ABioNet and H3Africa Consortium communities'),
 Document(metadata={'producer': 'Adobe PDF Library 21.7.131', 'creator': 'Acrobat PDFMaker 21 for Word', '

## Creating embeddings for the chunks

In [2]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding client handed to the vector store cells below.
embedding_model_name = "models/gemini-embedding-001"
embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)

# Optional check of how many vectors come back and their length, left disabled:
# chunk_embeddings = embeddings.embed_documents([doc.page_content for doc in chunks])
# len(chunk_embeddings), len(chunk_embeddings[0])

  from .autonotebook import tqdm as notebook_tqdm


# Create and store embeddings in a local vector store

In [None]:
from langchain_chroma import Chroma

# Directory where Chroma keeps the collection on disk.
persist_directory = "./vectorstore/"

# Open (or create) the persisted collection first and only embed + insert the
# chunks when it is still empty. Calling Chroma.from_documents on every run
# would add a fresh copy of every chunk to the same directory, so repeated
# runs would fill the search results with duplicates (and re-pay for the
# embedding calls).
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)
if not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(documents=chunks)
# To re-index after changing the PDF or the chunking, delete the directory
# and run this cell again.

NameError: name 'chunks' is not defined

## **Re-using the vector database**

In [7]:
# Re-open the collection stored under ./vectorstore/ instead of rebuilding it.
# The same `embeddings` object is passed in as the store's embedding function.
vectorstore_persisted = Chroma(
    embedding_function=embeddings,
    persist_directory="./vectorstore/",
)

### **Step 5: Semantic search**

In [1]:
# Ask the vector store for the top 3 matches (k=3) for the question.
vectorstore.similarity_search(query="what is docker", k=3)

NameError: name 'vectorstore' is not defined

### **Talk to the LLM**

In [11]:
# Keep the top 3 matches (k=3) so the next cell can pass them to the model.
context = vectorstore.similarity_search(query="what is docker", k=3)

In [12]:
# Ask the question with the retrieved documents interpolated into the prompt.
response = llm.invoke(f"what is docker? u can answer using the following context: {context}")

# response.content may be a plain string or a list of content blocks
# (strings or dicts) depending on the model. Handle both rather than indexing
# [0]['text'], which breaks on a string or an empty list and drops any text
# blocks after the first.
content = response.content
if isinstance(content, str):
    answer_text = content
else:
    answer_text = "".join(
        block if isinstance(block, str) else block.get("text", "")
        for block in content
    )
print(answer_text)

Based on the provided documents, Docker can be defined in the following ways:

*   **Official Definition:** Docker is an open platform designed for developers and system administrators of distributed applications.
*   **Technical Definition:** It is an open-source project that automates the deployment of software applications inside containers. It achieves this by providing a layer of abstraction and automation for OS-level virtualization on Linux.
*   **Simplified Explanation:** Docker is a tool that allows users to deploy applications in a "sandbox" called a **container**. This container runs on a host operating system (specifically Linux).
*   **Key Functionality:** It allows developers to package an application along with all of its necessary dependencies into a single, standardized unit for software development.
*   **Comparison to Virtual Machines:** While containers provide isolation similar to virtual machines (VMs), they are more efficient, requiring only a fraction of the com

### **Using the persisted vector store**


In [9]:
vectorstore_persisted.similarity_search("what is docker", k=3)

[Document(id='4518e65f-5c26-4cce-966b-684f69e5bc40', metadata={'subject': '', 'trapped': '', 'source': 'Docs\\docker_basic_guide.pdf', 'title': '', 'category': 'docker', 'modDate': "D:20211029125045+02'00'", 'producer': 'Adobe PDF Library 21.7.131', 'format': 'PDF 1.6', 'author': 'Rohan Dabas', 'total_pages': 18, 'keywords': '', 'creator': 'Acrobat PDFMaker 21 for Word', 'summary': 'Docker basics guide', 'page': 6, 'moddate': '2021-10-29T12:50:45+02:00', 'version': '1.0', 'file_path': 'Docs\\docker_basic_guide.pdf', 'creationDate': "D:20211029125041+02'00'", 'creationdate': '2021-10-29T12:50:41+02:00'}, page_content='what Docker is, where it fits in, and how it can benefit you. \nSo, what exactly is Docker? Here’s how Docker themselves describe it: \n“Docker is an open platform for developers and sysadmins of distributed applications.” \nWikipedia defines Docker as: \n“an open-source project that automates the deployment of software applications \ninside containers by providing an addi

In [None]:
# Retrieve the top 3 matches from the persisted store and hand them to the
# model together with the question.
context = vectorstore_persisted.similarity_search("what is containerization", k=3)
response = llm.invoke(f"what is containerization? u can answer using the following context: {context}")

# response.content may be a plain string or a list of content blocks
# (strings or dicts) depending on the model. Handle both rather than indexing
# [0]['text'], which breaks on a string or an empty list and drops any text
# blocks after the first.
content = response.content
if isinstance(content, str):
    answer_text = content
else:
    answer_text = "".join(
        block if isinstance(block, str) else block.get("text", "")
        for block in content
    )
print(answer_text)

NameError: name 'vectorstore' is not defined