In [11]:
import csv
from pathlib import Path

import chromadb
from llama_index import download_loader, Document
from llama_index.schema import TextNode
from llama_index.text_splitter import SentenceSplitter

In [7]:
def read_csv(file_path: str) -> list:
    """Load every data row of a CSV file as a dictionary keyed by the header row.

    Args:
        file_path: Path of the CSV file. Its first line supplies the column names.

    Returns:
        A list with one dict per data row, mapping each column name to that
        row's cell value as a string. The list is empty when the file has no
        data rows.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # newline="" hands line-ending handling to the csv module, so line breaks
    # embedded inside quoted fields are kept exactly as written in the file.
    with open(file_path, mode="r", newline="") as csv_file:
        return list(csv.DictReader(csv_file))

In [4]:
# Sentence-aware splitter used to break each document's text into chunks.
# chunk_size is the per-chunk size limit; the separator is left at the
# library default (an explicit " " separator was tried and disabled).
text_splitter = SentenceSplitter(chunk_size=1024)

In [22]:
# Load the event rows, then wrap each row in a Document: the text is the row
# dict's string form, and the metadata carries the event's id and name.
events = read_csv("./events.csv")
documents = [
    Document(
        text=str(event),
        metadata={"event_id": event["Id"], "event_name": event["Name"]},
    )
    for event in events
]

In [9]:
# Split every document into chunks. doc_idxs runs parallel to text_chunks and
# records, for each chunk, the index of the document it was cut from, so the
# source document's metadata can be attached to the chunk later on.
text_chunks = []
doc_idxs = []
for source_idx, source_doc in enumerate(documents):
    for chunk in text_splitter.split_text(source_doc.text):
        text_chunks.append(chunk)
        doc_idxs.append(source_idx)

In [12]:
# Build one TextNode per chunk and tag it with its source document's metadata.
nodes = []
for text_chunk, src_idx in zip(text_chunks, doc_idxs):
    node = TextNode(text=text_chunk)
    # Copy the dict instead of assigning it directly: otherwise every chunk of
    # a document (and the document itself) would share one metadata object,
    # and editing one node's metadata would silently change all the others.
    node.metadata = dict(documents[src_idx].metadata)
    nodes.append(node)

In [13]:
# Sanity check: show the first node's content together with its metadata.
first_node_preview = nodes[0].get_content(metadata_mode="all")
print(first_node_preview)

event_id: 1
event_name: The Phantom of the Opera

{'Id': '1', 'Name': 'The Phantom of the Opera', 'One Line Description': 'Experience the iconic musical', 'Location': 'Majestic Theatre NYC', 'Time': 'September 5 2023 8:00 PM', 'Avg. Rating': '4.6', 'Ticket Price': '$75'}


In [14]:
# Chroma client used for the vector store (chromadb is imported with the other
# dependencies in the notebook's import cell).
# NOTE(review): Client() with no arguments is presumably the in-memory,
# non-persistent client - confirm against the installed Chroma version.
chroma_client = chromadb.Client()

In [15]:
# get_or_create_collection keeps this cell re-runnable: create_collection
# fails when a collection named "events" already exists on the client, which
# is what happens when the cell is executed a second time in the same kernel.
collection = chroma_client.get_or_create_collection(name="events")

In [39]:
# Collect, in document order, the three parallel lists that are handed to the
# Chroma collection: document ids, metadata dicts and raw text.
ids, metadata, text = [], [], []
for document in documents:
    ids.append(document.id_)
    metadata.append(document.metadata)
    text.append(document.text)


In [40]:
# Store every event in the collection. No embeddings are passed, so Chroma
# computes them itself (the recorded output shows it fetching the
# all-MiniLM-L6-v2 ONNX model on first use).
collection.add(ids=ids, documents=text, metadatas=metadata)

/Users/sabhyachhabria/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:09<00:00, 8.88MiB/s]


In [52]:
# Ask the collection for the two entries closest to a free-text query.
# NOTE(review): the date constraint in the query text is not enforced - the
# recorded output includes an October 2023 event - and the stored metadata
# (event_id, event_name) has no date field to filter on.
results = collection.query(
    n_results=2,
    query_texts=["broadway in nyc after Jan 2024"],
)

In [53]:
# Show the raw query response: ids, distances, metadatas and documents of the matches.
results

{'ids': [['b2c81f66-697a-4288-a552-0501e803e1f9',
   '962fac33-0ec7-4794-bddb-b10d359d80ff']],
 'distances': [[1.1672581434249878, 1.1732980012893677]],
 'metadatas': [[{'event_id': '76', 'event_name': 'Hamilton'},
   {'event_id': '35', 'event_name': 'The Color Purple'}]],
 'embeddings': None,
 'documents': [['{\'Id\': \'76\', \'Name\': \'Hamilton\', \'One Line Description\': "Don\'t miss the cultural phenomenon", \'Location\': \'Richard Rodgers Theatre NYC\', \'Time\': \'March 25 2025 7:00 PM\', \'Avg. Rating\': \'4.9\', \'Ticket Price\': \'$90\'}',
   "{'Id': '35', 'Name': 'The Color Purple', 'One Line Description': 'Moving musical experience', 'Location': 'Bernard B. Jacobs Theatre NYC', 'Time': 'October 15 2023 2:00 PM', 'Avg. Rating': '4.8', 'Ticket Price': '$85'}"]]}