In [7]:
import os
from dotenv import load_dotenv

# LangChain core
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Gemini
from langchain_google_genai import (
    GoogleGenerativeAIEmbeddings,
    ChatGoogleGenerativeAI
)
# Hugging Face sentence-transformers (local embedding model)
from sentence_transformers import SentenceTransformer

# Vector DB
from langchain_community.vectorstores import Chroma

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# The API key must come from the environment (.env file or an exported shell
# variable). It is deliberately NOT assigned here: a literal key in the notebook
# would override the value loaded above and leak the credential to anyone who
# can read the file or its version history.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [8]:
# Location of the plain-text source document that the rest of the notebook indexes.
Data_Path = "/workspace/assignments/blogs.txt"

# Read the whole file into memory as a single UTF-8 string.
with open(Data_Path, mode="r", encoding="utf-8") as source_file:
    raw_text = source_file.read()

# Report the size of the loaded text as a quick sanity check.
char_count = len(raw_text)
print(f"Document length: {char_count} characters")

Document length: 11202 characters


In [9]:
# Splitter settings: each chunk is capped at 800 characters and consecutive
# chunks share up to 100 characters of overlap.
splitter_settings = {"chunk_size": 800, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break the raw document into a list of chunk strings.
chunks = text_splitter.split_text(raw_text)

# Show how many chunks were produced, then preview the first 500 characters
# of the first chunk.
chunk_total = len(chunks)
print(f"Total chunks created: {chunk_total}")
print("\nSample chunk:\n")
first_chunk = chunks[0]
print(first_chunk[:500])

Total chunks created: 21

Sample chunk:

Jeffrey Edward Epstein[a] (January 20, 1953 – August 10, 2019) was an American financier and convicted child sex offender.[6][7] He began his professional career as a teacher at the Dalton School. After his dismissal from the school in 1976, he entered the banking and finance sector, working at Bear Stearns in various roles, before starting his own firm. Epstein cultivated an elite social circle[8] and procured underage girls who were subjected to repeated rape and sexual violence, by him and hi


In [10]:
# Build one LangChain Document per chunk string; the chunk text becomes
# the Document's page_content.
docs = []
for chunk_text in chunks:
    docs.append(Document(page_content=chunk_text))

doc_total = len(docs)
print(f"Total Document objects created: {doc_total}")


Total Document objects created: 21


In [13]:
from langchain.embeddings.base import Embeddings
import numpy as np

# Identifier of the sentence-transformers checkpoint used for embeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Instantiate the SentenceTransformer model from that identifier.
hf_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Adapter exposing a sentence-transformers style model via LangChain's Embeddings base class
class HuggingFaceEmbeddings(Embeddings):
    """Embeddings implementation that delegates to a wrapped model's ``encode`` method."""

    def __init__(self, model):
        """Keep a reference to ``model``; it must provide ``encode(texts, convert_to_numpy=True)``."""
        self.model = model

    def embed_documents(self, texts):
        """Encode ``texts`` in a single ``encode`` call and return the result via ``.tolist()``."""
        encoded_batch = self.model.encode(texts, convert_to_numpy=True)
        return encoded_batch.tolist()

    def embed_query(self, text):
        """Encode one string (as a one-item batch) and return its first row via ``.tolist()``."""
        encoded_batch = self.model.encode([text], convert_to_numpy=True)
        query_vector = encoded_batch[0]
        return query_vector.tolist()

# Wrap the loaded model so it can be passed wherever LangChain expects an Embeddings object.
embedding_model = HuggingFaceEmbeddings(model=hf_model)

# Smoke test: embed a short string and report how many values the vector holds.
test_vector = embedding_model.embed_query("Test embedding")
vector_length = len(test_vector)
print(f"Embedding vector length: {vector_length}")


Embedding vector length: 384


In [15]:
# Initialize Chroma vector database
#
# Deterministic, position-based ids make this cell safe to re-run: because the
# collection lives in a persistent directory, calling from_documents without
# ids stores a fresh copy of every chunk on each run, and top-k retrieval then
# returns duplicates of the same chunk. With stable ids a re-run overwrites the
# existing entries instead of adding new ones.
# NOTE: copies stored by earlier runs (under auto-generated ids) are not removed
# by this; delete ./chroma_data once to start from a clean collection.
chunk_ids = [f"chunk-{position}" for position in range(len(docs))]

vector_db = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    ids=chunk_ids,
    persist_directory="./chroma_data",  # Persistent storage
)

# Persist to disk
vector_db.persist()
print("Vector database created and persisted.")


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Vector database created and persisted.


In [16]:

# User query
query = "did epistein also rape childs?"

# Step 1: Retrieve the 3 most similar chunks from the vector store.
# NOTE(review): the collection above is created without an explicit distance
# setting, so ranking uses Chroma's default metric (L2 per the Chroma docs),
# not a configured cosine similarity -- confirm if cosine is required.
retrieved_docs = vector_db.similarity_search(query, k=3)

print(f"Retrieved {len(retrieved_docs)} relevant chunks.\n")

# Step 2: Combine retrieved chunks into a single context string,
# separated by blank lines.
context = "\n\n".join([doc.page_content for doc in retrieved_docs])

# Step 3: Initialize Gemini LLM (low temperature to keep answers close to the context)

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2
)

# Step 4: Construct RAG prompt -- the model is told to answer only from the
# retrieved context and to say "I don't know." otherwise.
prompt = f"""
You are an AI assistant.

Answer the question using ONLY the context below.
If the answer is not present in the context, say:
"I don't know."

Context:
{context}

Question:
{query}
"""

# Step 5: Generate response
response = llm.invoke(prompt)

# Step 6: Print final answer (header emoji restored from its mis-encoded form)
print("🧠 Final Answer:\n")
print(response.content)


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Retrieved 3 relevant chunks.

🧠 Final Answer:

Yes, Epstein procured underage girls who were subjected to repeated rape and sexual violence, by him and his associates. He was also investigated after a parent reported he had sexually abused her 14-year-old daughter, and federal officials identified 36 girls, some as young as 14 years old, whom he had allegedly sexually abused.
