In [1]:
%pip install -U \
langchain \
langchain-core \
langchain-community \
chromadb \
sentence-transformers \
pypdf


Collecting langchain
  Downloading langchain-1.2.9-py3-none-any.whl.metadata (5.7 kB)
Collecting langchain-core
  Downloading langchain_core-1.2.9-py3-none-any.whl.metadata (4.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting chromadb
  Downloading chromadb-1.5.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pypdf
  Downloading pypdf-6.7.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)

In [5]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma



# -------------------------
# 1. Create Documents
# -------------------------
raw_docs = [
    Document(
        page_content="Virat Kohli is one of the most successful and consistent batsmen in IPL history. Known for his aggressive batting style and fitness, he has led the Royal Challengers Bangalore in multiple seasons.",
        metadata={"player": "Virat Kohli", "team": "RCB"}
    ),
    Document(
        page_content="Rohit Sharma is the most successful captain in IPL history, leading Mumbai Indians to five titles. He's known for his calm demeanor and ability to play big innings under pressure.",
        metadata={"player": "Rohit Sharma", "team": "MI"}
    ),
    Document(
        page_content="MS Dhoni, famously known as Captain Cool, has led Chennai Super Kings to multiple IPL titles. His finishing skills, wicketkeeping, and leadership are legendary.",
        metadata={"player": "MS Dhoni", "team": "CSK"}
    ),
    Document(
        page_content="Jasprit Bumrah is considered one of the best fast bowlers in T20 cricket. Playing for Mumbai Indians, he is known for his yorkers and death-over expertise.",
        metadata={"player": "Jasprit Bumrah", "team": "MI"}
    ),
    Document(
        page_content="Ravindra Jadeja is a dynamic all-rounder who contributes with both bat and ball. Representing Chennai Super Kings, his quick fielding and match-winning performances make him a key player.",
        metadata={"player": "Ravindra Jadeja", "team": "CSK"}
    )
]


# -------------------------
# 2. Split Documents
# -------------------------
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=30
)

documents = text_splitter.split_documents(raw_docs)


# -------------------------
# 3. Load HF Embeddings
# -------------------------
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


# -------------------------
# 4. Create / Load Chroma DB
# -------------------------
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

vectorstore.persist()


# -------------------------
# 5. Similarity Search
# -------------------------
print("\n=== Similarity Search ===")
results = vectorstore.similarity_search(
    query="best IPL captain",
    k=3
)

for i, doc in enumerate(results, 1):
    print(f"\nResult {i}")
    print(doc.page_content)
    print("Metadata:", doc.metadata)


# -------------------------
# 6. Similarity Search with Score
# -------------------------
print("\n=== Similarity Search With Score ===")
results_with_score = vectorstore.similarity_search_with_score(
    query="best finisher in IPL",
    k=3
)

for doc, score in results_with_score:
    print(f"\nScore: {score}")
    print(doc.page_content)


# -------------------------
# 7. Metadata Filtered Search
# -------------------------
print("\n=== Filtered Search (Team: MI) ===")
filtered_results = vectorstore.similarity_search(
    query="best bowler",
    k=2,
    filter={"team": "MI"}
)

for doc in filtered_results:
    print("\n", doc.page_content)


# -------------------------
# 8. Retriever (RAG Ready)
# -------------------------
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

print("\n=== Retriever Output ===")
retrieved_docs = retriever.invoke("best all rounder")


for doc in retrieved_docs:
    print("\n", doc.page_content)


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



=== Similarity Search ===

Result 1
Rohit Sharma is the most successful captain in IPL history, leading Mumbai Indians to five titles. He's known for his calm demeanor and ability to play big innings under pressure.
Metadata: {'player': 'Rohit Sharma', 'team': 'MI'}

Result 2
Rohit Sharma is the most successful captain in IPL history, leading Mumbai Indians to five titles. He's known for his calm demeanor and ability to play big innings under pressure.
Metadata: {'team': 'MI', 'player': 'Rohit Sharma'}

Result 3
MS Dhoni, famously known as Captain Cool, has led Chennai Super Kings to multiple IPL titles. His finishing skills, wicketkeeping, and leadership are legendary.
Metadata: {'player': 'MS Dhoni', 'team': 'CSK'}

=== Similarity Search With Score ===

Score: 0.964094340801239
Rohit Sharma is the most successful captain in IPL history, leading Mumbai Indians to five titles. He's known for his calm demeanor and ability to play big innings under pressure.

Score: 0.964094340801239
Ro