In [10]:
from langchain_community.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

In [11]:
# Name of the sentence-transformers model, and a directly loaded instance of it.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(model_name)

# Input JSON file and the directory in which the Chroma database is stored.
file_path = 'data.json'
persist_directory = "chroma_db"

In [12]:
# LangChain embedding wrapper built from `model_name`; it is handed to Chroma
# both when the store is created and when it is re-opened in later cells.
embedding = SentenceTransformerEmbeddings(model_name=model_name)

In [13]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the title and tags of a JSON record into the document metadata.

    Args:
        record: One JSON record. Its optional ``"metadata"`` mapping may
            carry a ``"title"`` and a ``"tags"`` collection.
        metadata: Metadata dict to enrich; it is modified in place.

    Returns:
        The same ``metadata`` dict with ``"title"`` (lower-cased string,
        ``""`` when absent), ``"tags"`` (comma-joined string, ``""`` when
        absent) and, if a ``"source"`` key is present, that value reduced to
        its last ``/``-separated component.
    """
    # A record without a "metadata" mapping (or with a null one) must not
    # abort the whole load, so fall back to an empty mapping.
    record_meta = record.get("metadata") or {}

    title = record_meta.get("title")
    metadata["title"] = "" if title is None else str(title).lower()

    tags = record_meta.get("tags")
    if tags is None:
        tags = []
    elif isinstance(tags, str):
        # A bare string is a single tag; iterating it would join its characters.
        tags = [tags]
    metadata["tags"] = ",".join(str(tag) for tag in tags)

    if "source" in metadata:
        # Keep only the trailing path component (the file name).
        metadata["source"] = metadata["source"].split("/")[-1]

    return metadata

In [14]:
# Build a loader over `file_path`: the jq expression '.[]' picks the records,
# each record's "content" field becomes the page content, and metadata_func
# post-processes the metadata of every record.
loader = JSONLoader(file_path=file_path, jq_schema='.[]', content_key="content", metadata_func=metadata_func)
textDocs = loader.load()

# Show how many documents were loaded, then the first one.
print(len(textDocs), textDocs[0], sep="\n")

10
page_content='The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.' metadata={'source': 'data.json', 'seq_num': 1, 'title': 'eiffel tower', 'tags': 'landmark,Paris,France,Eiffel Tower'}


In [15]:
# Splitter configured with chunk_size=256 and chunk_overlap=50, both measured
# with len (i.e. in characters).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50, length_function=len)
docs = text_splitter.split_documents(textDocs)

# Show how many chunks were produced, then the first one.
print(len(docs), docs[0], sep="\n")

10
page_content='The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.' metadata={'source': 'data.json', 'seq_num': 1, 'title': 'eiffel tower', 'tags': 'landmark,Paris,France,Eiffel Tower'}


In [16]:
# Give every chunk a deterministic id (source file, record number, chunk position).
# Without explicit ids, new random ids are generated on each run, so re-running
# this cell against the same persist_directory would store every chunk again.
doc_ids = [
    f"{doc.metadata.get('source')}:{doc.metadata.get('seq_num')}:{position}"
    for position, doc in enumerate(docs)
]
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    ids=doc_ids,
    persist_directory=persist_directory,
)

In [8]:
query = "Who painted the Mona Lisa, and why is it famous?"

# metadata_func stores titles lower-cased, so a title filter only matches when
# the value is lower-cased as well ("Mona Lisa" would match nothing).
filter_title = "mona lisa"

# Optional metadata filter: None searches the whole collection; set it to
# {"title": filter_title} to restrict the search to documents with that title.
metadata_filter = None

results = vectordb.similarity_search(query, k=5, filter=metadata_filter)

# Print results
for res in results:
    print('content:', res.page_content)
    print('seq_num:', res.metadata['seq_num'], '-', res.metadata['source'])
    print('title:', res.metadata['title'])
    print("------------")

content: The Mona Lisa is a famous portrait painting by Leonardo da Vinci.
seq_num: 5 - data.json
title: mona lisa
------------
content: The Colosseum in Rome is an ancient amphitheater used for gladiatorial contests and public spectacles.
seq_num: 10 - data.json
title: the colosseum
------------
content: The Apollo 11 mission was the first to land humans on the Moon in 1969.
seq_num: 7 - data.json
title: apollo 11 mission
------------
content: The Great Wall of China is a series of fortifications made of stone, brick, and other materials.
seq_num: 2 - data.json
title: great wall of china
------------
content: The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.
seq_num: 1 - data.json
title: eiffel tower
------------


In [9]:
# Re-open the Chroma vector store persisted in `persist_directory` and run a
# similarity search that also returns a relevance score per hit.
query = "what is programming language?"
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
results = vectordb.similarity_search_with_relevance_scores(query, k=5)

# Each result is a (document, score) pair.
# NOTE(review): the recorded output contains negative scores — presumably the
# relevance normalisation does not match the collection's distance metric; confirm.
for res in results:
    data, score = res
    meta = data.metadata
    print('content:', data.page_content)
    print('seq_num:', meta['seq_num'], '-', meta['source'])
    print('tags:', meta['tags'])
    print('title:', meta['title'])
    print('score:', score)
    print("------------")

  vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)


content: Python is a high-level programming language known for its simplicity and readability.
seq_num: 4 - data.json
tags: programming,Python,software,technology
title: python programming language
score: 0.4027660194964654
------------
content: The Mediterranean diet is known for its health benefits and emphasis on fruits, vegetables, and olive oil.
seq_num: 8 - data.json
tags: diet,health,Mediterranean,nutrition
title: mediterranean diet
score: -0.3175599833835492
------------
content: The Colosseum in Rome is an ancient amphitheater used for gladiatorial contests and public spectacles.
seq_num: 10 - data.json
tags: landmark,Rome,Colosseum,history
title: the colosseum
score: -0.3339251824767211
------------
content: The Mona Lisa is a famous portrait painting by Leonardo da Vinci.
seq_num: 5 - data.json
tags: art,painting,Mona Lisa,Leonardo da Vinci
title: mona lisa
score: -0.3798670793439547
------------
content: The Great Wall of China is a series of fortifications made of stone, b

  results = vectordb.similarity_search_with_relevance_scores(query, k=5)
