In [2]:
#%pip install -U langchain-community chromadb transformers sentence-transformers

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [3]:
# Sample document (larger for better testing)
document = """
Elon Musk is the CEO of Tesla. Tesla's mission is to accelerate the world's transition to sustainable energy.
Tesla manufactures electric vehicles (EVs), battery energy storage, solar panels, and related products. SpaceX, another company led by Musk,
aims to make space travel accessible to humanity by developing reusable rockets.
Musk has also co-founded Neuralink, which focuses on connecting the human brain to computers using advanced neural interfaces.
Recently, Musk has expressed his concerns about artificial intelligence, advocating for regulation to prevent potential harm.
Musk believes that governments should play an active role in ensuring the safe development of AI technologies.
His innovative ventures span automotive, space exploration, energy, AI, and neuroscience, making him a pivotal figure in modern technology.
"""

In [4]:
# Small chunk size
small_splitter = RecursiveCharacterTextSplitter(chunk_size=20, chunk_overlap=2)
small_chunks = small_splitter.split_text(document)
print("Smaller Chunks -")
print(small_chunks)
# Large chunk size
large_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=10)
large_chunks = large_splitter.split_text(document)
print("Larger Chunks -")
print(large_chunks)


Smaller Chunks -
['Elon Musk is the', 'CEO of Tesla.', "Tesla's mission is", 'to accelerate the', "world's transition", 'to sustainable', 'energy.', 'Tesla manufactures', 'electric vehicles', '(EVs), battery', 'energy storage,', 'solar panels, and', 'related products.', 'SpaceX, another', 'company led by', 'Musk,', 'aims to make space', 'travel accessible', 'to humanity by', 'developing reusable', 'rockets.', 'Musk has also', 'co-founded', 'Neuralink, which', 'focuses on', 'connecting the', 'human brain to', 'computers using', 'advanced neural', 'interfaces.', 'Recently, Musk has', 'expressed his', 'concerns about', 'artificial', 'intelligence,', 'advocating for', 'regulation to', 'prevent potential', 'harm.', 'Musk believes that', 'governments should', 'play an active role', 'in ensuring the', 'safe development of', 'AI technologies.', 'His innovative', 'ventures span', 'automotive, space', 'exploration,', 'energy, AI, and', 'neuroscience,', 'making him a', 'a pivotal figure in', 'mod

In [6]:

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [7]:

# Create vector stores
small_vector_store = Chroma.from_texts(small_chunks, embedding_model)
large_vector_store = Chroma.from_texts(large_chunks, embedding_model)


In [8]:

# Semantic query
query = "What are Musk's views on artificial intelligence and government regulations?"



In [9]:
# Perform search
small_results = small_vector_store.similarity_search(query, k=1)
large_results = large_vector_store.similarity_search(query, k=1)



In [10]:
# Display results
print("-----------------------------------")
print("Results")
print("-----------------------------------")
print("Small Chunk Results:", small_results)
print("Large Chunk Results:", large_results)

-----------------------------------
Results
-----------------------------------
Small Chunk Results: [Document(metadata={}, page_content='Recently, Musk has expressed his concerns about artificial intelligence, advocating for regulation to prevent potential harm.')]
Large Chunk Results: [Document(metadata={}, page_content='Recently, Musk has expressed his concerns about artificial intelligence, advocating for regulation to prevent potential harm.')]
