In [2]:
from pathlib import Path
from llama_index import download_loader, Document
from llama_index.text_splitter import SentenceSplitter
from llama_index.schema import TextNode
import csv



In [3]:
def read_csv(file_path: str) -> None:
    events_list = []
    with open(file_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            events_list.append(row)
    return events_list

In [4]:
text_splitter = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

In [5]:
events = read_csv("./events.csv")
documents = [Document(text=str(t), metadata={"event_id": t["Id"], "event_name":  t["Name"] }) for t in events]

In [6]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_splitter.split_text(doc.text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [7]:
nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idxs[idx]]
    node.metadata = src_doc.metadata
    nodes.append(node)

In [8]:
print(nodes[0].get_content(metadata_mode="all"))

event_id: 1
event_name: The Phantom of the Opera

{'Id': '1', 'Name': 'The Phantom of the Opera', 'One Line Description': 'Experience the iconic musical', 'Location': 'Majestic Theatre NYC', 'Time': 'September 5 2023 8:00 PM', 'Avg. Rating': '4.6', 'Ticket Price': '$75'}


In [9]:
import chromadb
chroma_client = chromadb.PersistentClient("./datastore/")

In [28]:
collection = chroma_client.create_collection(name="events")

ValueError: Collection events already exists.

In [11]:
ids = [doc.id_ for doc in documents]
metadata = [doc.metadata for doc in documents]
text = [doc.text for doc in documents]


In [12]:
collection.add(
    documents=text,
    metadatas=metadata,
    ids=ids
)

In [23]:
results = collection.query(
    query_texts=["broadway in nyc after Jan 2025 after 8:00 PM"],
    n_results=5
)

In [33]:
collection.delete(ids)

In [36]:
chroma_client.delete_collection("events")

Collection(name=events)