In [1]:
# pip install langchain-chroma langchain-google-genai langchain-core

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
# from dotenv import load_dotenv

# load_dotenv()



  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Manually structured Document objects for 10 IPL players
doc1 = Document(
    page_content="Virat Kohli: The highest run-getter in IPL history and former captain of RCB.",
    metadata={"team": "RCB", "role": "Batsman", "runs": 8004}
)

doc2 = Document(
    page_content="MS Dhoni: Legendary captain of CSK known for finishing games and 5 IPL titles.",
    metadata={"team": "CSK", "role": "Wicket-keeper Batsman", "titles": 5}
)

doc3 = Document(
    page_content="Rohit Sharma: Led Mumbai Indians to 5 titles and is a prolific top-order batsman.",
    metadata={"team": "MI", "role": "Batsman", "titles": 5}
)

doc4 = Document(
    page_content="David Warner: The most successful overseas batsman with the most fifties in IPL.",
    metadata={"team": "DC", "role": "Batsman", "runs": 6565}
)

doc5 = Document(
    page_content="Suresh Raina: Known as 'Mr. IPL' for his consistent performance over a decade for CSK.",
    metadata={"team": "CSK", "role": "Batsman", "status": "Retired"}
)

doc6 = Document(
    page_content="Lasith Malinga: All-time great pace bowler for MI, famous for his slinging action and yorkers.",
    metadata={"team": "MI", "role": "Bowler", "wickets": 170}
)

doc7 = Document(
    page_content="AB de Villiers: Known as 'Mr. 360' for his ability to hit the ball anywhere for RCB.",
    metadata={"team": "RCB", "role": "Batsman", "strike_rate": 151.68}
)

doc8 = Document(
    page_content="Chris Gayle: The 'Universe Boss' who holds the record for the highest individual score (175*).",
    metadata={"team": "PBKS", "role": "Batsman", "sixes": 357}
)

doc9 = Document(
    page_content="Rashid Khan: A world-class leg-spinner known for his economical bowling and match-winning spells.",
    metadata={"team": "GT", "role": "Bowler", "economy": 6.73}
)

doc10 = Document(
    page_content="Jasprit Bumrah: The spearhead of the MI bowling attack, elite in both powerplay and death overs.",
    metadata={"team": "MI", "role": "Bowler", "best_bowling": "5/10"}
)

# Putting them into a list for use in Chroma
docs = [doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc8, doc9, doc10]




In [3]:
vector_store = Chroma(
    embedding_function=GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001"),
    persist_directory='chroma_db',
    collection_name='sample'
)



In [4]:
# add documents
vector_store.add_documents(docs)



['6288c341-add8-40a1-9432-38910424962a',
 'a56783d0-5442-4ee0-ba94-5ee4f2a76e8b',
 '8616c706-ffda-4b93-b894-069d1ba1ad01',
 'b71cb23e-fa06-48ba-9055-a9eca7c31efd',
 'ece7e546-81cd-4bbf-b709-488de69fde3b',
 '944d6cb6-5f74-4df2-b58b-31af833dd403',
 '18cb473d-1461-47b4-b930-826379cfdc22',
 'cd2cc206-55b5-41ea-9464-9a1538dab6a0',
 '8a34ce7b-786f-4cd4-9be1-37567230abdd',
 'b114ec79-8fee-40f5-9d85-09141b8def74']

In [5]:
# view documents
vector_store.get(include=['embeddings','documents','metadatas'])



{'ids': ['6288c341-add8-40a1-9432-38910424962a',
  'a56783d0-5442-4ee0-ba94-5ee4f2a76e8b',
  '8616c706-ffda-4b93-b894-069d1ba1ad01',
  'b71cb23e-fa06-48ba-9055-a9eca7c31efd',
  'ece7e546-81cd-4bbf-b709-488de69fde3b',
  '944d6cb6-5f74-4df2-b58b-31af833dd403',
  '18cb473d-1461-47b4-b930-826379cfdc22',
  'cd2cc206-55b5-41ea-9464-9a1538dab6a0',
  '8a34ce7b-786f-4cd4-9be1-37567230abdd',
  'b114ec79-8fee-40f5-9d85-09141b8def74'],
 'embeddings': array([[-0.02065258,  0.00852258,  0.01331688, ...,  0.01190765,
         -0.01131722,  0.0010501 ],
        [-0.0113489 , -0.00728605,  0.01510515, ...,  0.01118816,
         -0.01606124,  0.00544042],
        [-0.02618742,  0.01815327,  0.02020825, ...,  0.01269849,
         -0.01892932,  0.00213546],
        ...,
        [-0.01000837, -0.00884169,  0.0259126 , ..., -0.00807419,
         -0.00289412,  0.00386677],
        [-0.00869135,  0.00395621,  0.01191945, ...,  0.00813984,
         -0.00679723,  0.00487308],
        [-0.00289141,  0.00903986, 

In [6]:
# search documents
vector_store.similarity_search(
    query='who among these are a bowler?',
    k=2
)



[Document(id='944d6cb6-5f74-4df2-b58b-31af833dd403', metadata={'team': 'MI', 'role': 'Bowler', 'wickets': 170}, page_content='Lasith Malinga: All-time great pace bowler for MI, famous for his slinging action and yorkers.'),
 Document(id='8a34ce7b-786f-4cd4-9be1-37567230abdd', metadata={'role': 'Bowler', 'team': 'GT', 'economy': 6.73}, page_content='Rashid Khan: A world-class leg-spinner known for his economical bowling and match-winning spells.')]

In [7]:
# search with similarity score
vector_store.similarity_search_with_score(
    query='who among these are a bowler?',
    k=2
)



[(Document(id='944d6cb6-5f74-4df2-b58b-31af833dd403', metadata={'wickets': 170, 'role': 'Bowler', 'team': 'MI'}, page_content='Lasith Malinga: All-time great pace bowler for MI, famous for his slinging action and yorkers.'),
  0.5916815996170044),
 (Document(id='8a34ce7b-786f-4cd4-9be1-37567230abdd', metadata={'economy': 6.73, 'team': 'GT', 'role': 'Bowler'}, page_content='Rashid Khan: A world-class leg-spinner known for his economical bowling and match-winning spells.'),
  0.6071895956993103)]

In [8]:
# meta-data filtering
vector_store.similarity_search_with_score(
    query='who among these are a bowler?',
    filter={'team': "CSK"}
)

[(Document(id='a56783d0-5442-4ee0-ba94-5ee4f2a76e8b', metadata={'role': 'Wicket-keeper Batsman', 'team': 'CSK', 'titles': 5}, page_content='MS Dhoni: Legendary captain of CSK known for finishing games and 5 IPL titles.'),
  0.6908049583435059),
 (Document(id='ece7e546-81cd-4bbf-b709-488de69fde3b', metadata={'team': 'CSK', 'status': 'Retired', 'role': 'Batsman'}, page_content="Suresh Raina: Known as 'Mr. IPL' for his consistent performance over a decade for CSK."),
  0.7099800705909729)]