In [1]:
import os
import fitz
import pandas as pd
from llama_index.core.program import LLMTextCompletionProgram
from pydantic import BaseModel
from llama_index.llms.ollama import Ollama

from typing_extensions import Literal as LiteralExt 

from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import MarkdownTextSplitter, RecursiveCharacterTextSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore


from uuid import uuid4

import chromadb
from langchain_chroma import Chroma
from langchain_core.documents import Document



In [2]:
df = pd.read_csv("./data/rag-data.csv")

In [8]:
client = chromadb.PersistentClient(path="./chroma_store")
collection_name="wr-uae"
collection = client.get_or_create_collection(collection_name)
embed_model = OllamaEmbeddings(model="nomic-embed-text:v1.5")


vector_store = Chroma(
    collection_name=collection_name,
    embedding_function=embed_model,
    client=client
)
# vector_store = ChromaVectorStore(chroma_collection=collection)
# index = Ve∏ctorStoreIndex.from_vector_store(vector_store, embed_model=embed_model_hf)

In [6]:
vector_store.similarity_search("")

[]

In [9]:
docs = []
for _, row in df.iterrows():
    chunk = row["chunk"]
    category = row["category"]
    
    docs.append(Document(page_content=chunk, metadata={"category": category}))
    

In [10]:
uuids = [str(uuid4()) for _ in range(len(docs))]


In [11]:
vector_store.add_documents(documents=docs, ids=uuids)

['c524a8b1-d5a6-480e-8381-e3b0a1f343a4',
 'f1faa66d-c2df-4fd5-814b-294cb4926f8d',
 '44933832-bdb9-47ae-bdbb-123725faa84a',
 '7bbe31cc-34ea-495a-bf4e-42e5d03bdf8c',
 '1ede5ed8-ca3f-4d93-a30f-b48c69062af3',
 '77c36bd6-0db1-4bb0-95e5-7d0e54bd96f6',
 '92f678ec-d357-469c-8f56-f2b06b462479',
 'b7cd9191-c103-417d-b03e-d2ec814f78db',
 '594f1e9b-51fc-4ad7-a58a-9c23a1ce0e5a',
 '12a1f433-e8fa-43cb-97a3-c69fd73e7381',
 'd5640bfc-81c9-4a52-8e74-2e612d4355f8',
 '4ba70fcc-50ee-4d3f-9380-d3dd1963de39',
 'ae16c38a-bbd8-4a08-8a8e-6fa4487db706',
 '3a1e8412-06e4-4f1a-8f86-28c3ec635711',
 'ac9e1b6b-bbd3-41cd-915d-9f3f5bbd9840',
 '77db07b3-c171-49d1-b864-edc976df686b',
 'a8e37408-d330-4c29-91d3-78201c09197f',
 'dab7cfb7-92bf-4144-96f6-7fdd71ed4c31',
 'e844f5c1-d5ca-4961-b8a7-7ab03a2d075c',
 'ad6871df-7491-4fd4-bd94-40840f003e0a',
 'f8fa6af9-1865-49d3-8d7f-dc6108a5425a',
 'b08a47c0-3344-470f-bf3c-10e0e00dd75f',
 '5083fdaf-0d5b-439f-919d-8c767c8a57df',
 'ba0649fc-7074-4a72-901d-052f177ce962',
 '3081a5ad-18bc-

In [139]:
collection = client.get_or_create_collection(collection_name)


In [7]:
vector_store.get(ids=["2396616a-3906-4d6c-b73d-85da215241ee"])

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}