In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

if os.environ['GOOGLE_API_KEY']:
    print("API Key found.")
else:
    raise EnvironmentError("API Key not found in environment variables.")

API Key found.


In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI

model = ChatGoogleGenerativeAI(
    model="gemini-3-flash-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

# RAG Implementation with our PDF data

## Step 1 - Extracting text from the pdf data

In [12]:
from langchain_community.document_loaders.pdf import PyPDFLoader

pdf_path = '../DOCS/POSH_Policy.pdf'

loader = PyPDFLoader(pdf_path)

docs = loader.load()
docs

[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2024-10-01T12:14:56+05:30', 'author': 'Ranjan Tiwari', 'moddate': '2024-10-01T12:14:56+05:30', 'title': 'Microsoft PowerPoint - POSH POLICY PPT', 'source': '../DOCS/POSH_Policy.pdf', 'total_pages': 25, 'page': 0, 'page_label': '1'}, page_content='www.vlcc.com\nVLCC HEALTH CARE LTDPREVENTION OF SEXUAL HARRASSMENT IN WORKLPLACE  POLICY'),
 Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2024-10-01T12:14:56+05:30', 'author': 'Ranjan Tiwari', 'moddate': '2024-10-01T12:14:56+05:30', 'title': 'Microsoft PowerPoint - POSH POLICY PPT', 'source': '../DOCS/POSH_Policy.pdf', 'total_pages': 25, 'page': 1, 'page_label': '2'}, page_content='www.vlccwellness.com\nVLCC Health Care Limited(including its subsidiaries) iscommitted to creating a healthy working environment that enablesemployees to work without fear of prejudice, gender bias and in aharassment free 

In [14]:
docs[0].metadata

{'producer': 'Microsoft: Print To PDF',
 'creator': 'PyPDF',
 'creationdate': '2024-10-01T12:14:56+05:30',
 'author': 'Ranjan Tiwari',
 'moddate': '2024-10-01T12:14:56+05:30',
 'title': 'Microsoft PowerPoint - POSH POLICY PPT',
 'source': '../DOCS/POSH_Policy.pdf',
 'total_pages': 25,
 'page': 0,
 'page_label': '1'}

### Creating our own metadata

In [15]:
for i in docs:
    i.metadata = {
                    "source" : "VLCC",
                    "producer" : "HR"
                  }

In [16]:
docs[0].metadata

{'source': 'VLCC', 'producer': 'HR'}

## Step 2 - Splitting the documents into chunks

In [17]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

chunks=splitter.split_documents(docs)

chunks

[Document(metadata={'source': 'VLCC', 'producer': 'HR'}, page_content='www.vlcc.com\nVLCC HEALTH CARE LTDPREVENTION OF SEXUAL HARRASSMENT IN WORKLPLACE  POLICY'),
 Document(metadata={'source': 'VLCC', 'producer': 'HR'}, page_content='www.vlccwellness.com\nVLCC Health Care Limited(including its subsidiaries) iscommitted to creating a healthy working environment that enablesemployees to work without fear of prejudice, gender bias and in aharassment free workplace to all employees without regard to race,caste, religion, colour, ancestry, marital status, gender, age, nationality,ethnic origin or disability.The Company also believes that all employees of the Company have theright to be treated with dignity.Sexual harassment at the work place or other than work place ifinvolving an employee or employees is a grave offence and is therefore,punishable.There is Zero Tolerance for Sexual Harassment'),
 Document(metadata={'source': 'VLCC', 'producer': 'HR'}, page_content='www.vlccwellness.com\nWh

In [18]:
chunks[0].metadata

{'source': 'VLCC', 'producer': 'HR'}

In [19]:
print(len(chunks))

26


## Step 3 - Create embedding for these chunks

In [20]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

## Step 4 - Store embeddings in the EXISTING Local vector database/store

In [22]:
from langchain_community.vectorstores import Chroma

# now we do not need to create a new vector store, we can use the existing one and just add new documents to it, by creating an instance of it

vector_store = Chroma(
    persist_directory='../Vector',                           # Local vector store directory
    embedding_function=embeddings
)

In [23]:
vector_store.add_documents(chunks)

['7dd466de-604f-469d-b9f5-3199d6e00bec',
 '9b821d1c-2003-4e13-803e-81f2d5956651',
 '597f7a6c-bd7c-4c58-9a5c-3c1dda99825a',
 'a4081037-ada6-4952-90f4-2d5d84e6b2f1',
 '7208b6cf-a6cf-43b0-8e69-f924b421642d',
 'd5c4d5fe-7535-4347-9d6f-917f114a758b',
 '31875890-950c-472e-abcd-3fbf69be39bc',
 'ba2d443d-679f-4e23-a806-2e6eec9fb63c',
 '1ae10c34-cbf8-498b-a6cc-ed412fa405e9',
 'b51dcb4f-5080-415b-9386-d54c5e9badb7',
 'fab7034a-f93e-43fe-91f5-fabe63d5f8da',
 'f8fc777d-c3be-4a23-8b4c-f9f0a2b4c8e4',
 'c46542be-f50c-4caf-a933-0aaff1aee726',
 'f3b4ae64-bfdc-4e45-b89b-5e857100ac57',
 '91c9047f-f791-42ce-8bfe-136d129ad762',
 '807a2c97-eebd-4069-a96a-e7a31aa53c06',
 '931bbc26-7c1c-4b62-97e4-2a1ddb9a45b6',
 'f5518f41-1077-4c5f-99ef-f4c1901dc74b',
 '8206c796-27a0-48ab-a9a4-964f97a0225f',
 '7bf149bf-7b75-4d6a-82ca-dc1a096b0904',
 '83825dc4-cfbc-443c-ab7f-2a3abf321347',
 '8fc3d169-f93f-49cc-994a-f18c0d0d8d53',
 '9f1d0204-c762-405d-9927-5baed9cd3008',
 '271823ab-6f41-4f02-9d49-77c3a05fde16',
 'adf51360-5186-

## Step 5 - Reuse the local vector database
(kernal restarted)

In [1]:
output1 = vector_store.similarity_search("What is the POSH policy?", k=3)
output1

NameError: name 'vector_store' is not defined

In [2]:
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

vector_store_persist = Chroma(
    persist_directory='../Vector',
    embedding_function=embeddings
)

  vector_store_persist = Chroma(


In [4]:
output1 = vector_store_persist.similarity_search("What is the POSH policy?", k=3)
output1

[Document(metadata={'producer': 'HR', 'source': 'VLCC'}, page_content='www.vlccwellness.com\nWhat is POSH?The Sexual Harassment of Women at Workplace (Prevention, Prohibition and Redressal)Act, 2013, commonly referred to as the ‘POSH Act’ is an Indian law with the objective ofmaking workplaces safer for women by preventing, prohibiting and redressing acts ofsexual harassment against them in the workplace.'),
 Document(metadata={'source': 'VLCC', 'producer': 'HR'}, page_content='25\n•I Declare that I have read and understood the contents of the POSH policy.  Yes No'),
 Document(metadata={'producer': 'HR', 'source': 'VLCC'}, page_content='6\nApplicability and Workplace \nWorkplace is any place visited by the employees arising out of orduring the course of employment including transportation provided bythe employer for undertaking such journey. It also constitutes:•Workplace of an external client•Business Trips, Company guest house, Hotel stay during official travel•Official meetings outs

## Talk to LLM