In [None]:
import chromadb
# Client handle used by the cells below to create and delete "my_collection".
chroma_client = chromadb.Client()

In [None]:
# get_or_create_collection keeps this cell idempotent: re-running it against a client that
# already holds "my_collection" returns the existing collection instead of raising.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [None]:
# Add the prepared documents, with their metadata and ids, to the collection.
# NOTE(review): `documents`, `metadatas` and `ids` are not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. TODO: build them (presumably from
# the chunk_section output) in a cell that runs before this one.
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

In [None]:
# Remove "my_collection" from the client.
# NOTE(review): under Restart & Run All this runs before the `collection.query(...)` cell at the
# end of the notebook. Presumably this is a manual cleanup cell; confirm and move it last.
chroma_client.delete_collection(name="my_collection")

In [1]:
#CHUNKING LOGIC
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import csv

# Load every row of the CSV into memory as a list of dicts keyed by the header row.
# newline='' hands newline handling to the csv module, as its documentation requires, so that
# line breaks embedded inside quoted fields are parsed correctly.
with open('league_lore.csv', 'r', encoding='utf8', newline='') as file:
    csv_reader = csv.DictReader(file)
    championdata = list(csv_reader)

# Splitter settings passed to chunk_section by the cells below.
chunk_size = 300     # upper bound handed to the splitter; lengths are measured with len()
chunk_overlap = 20   # overlap handed to the splitter, in the same unit
#chunks_result = chunk_section({"text": csv_reader, "source": "your_source"}, chunk_size, chunk_overlap)

# def chunk_section(section, chunk_size, chunk_overlap):
#     text_splitter = RecursiveCharacterTextSplitter(
#         separators=["\n\n", "\n", " ", ""],
#         chunk_size=chunk_size,
#         chunk_overlap=chunk_overlap,
#         length_function=len)
    
#     #repeat this function for each {key:value} in list of dictionaries (loop through list of dictionaries)
#     #at each iteration, append the new lore + metadata to the lists
#     chunks = text_splitter.create_documents(
#         #Find a way to retrieve the lore and metadata columns from the csv. sample_section is a dictionary. The code below is creating a list that contains the value associated with the key in dictionary sample_section
#         texts=[championdata["Lore"]], 
#         metadatas=[{"Champion": championdata["Champion"], "Region": championdata["Region"], "Link": championdata["Link"]}])

#     return [{"Lore": chunk.page_content, "Champion": chunk.metadata["Champion"], "Region": chunk.metadata["Region"], "Link": chunk.metadata["Link"]} for chunk in chunks]

def chunk_section(championdata, chunk_size, chunk_overlap):
    """Split every champion's lore into chunks and tag each chunk with its metadata.

    Args:
        championdata: Iterable of mappings, each providing the keys "Lore", "Link",
            "Champion" and "Region".
        chunk_size: Passed to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Passed to RecursiveCharacterTextSplitter as ``chunk_overlap``.

    Returns:
        A flat list with one dict per chunk, in input order, holding the keys
        "Link", "Lore", "Champion" and "Region".

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    def _records_for(entry):
        # Read "Lore" first so a malformed row fails on the same key as before.
        lore = entry["Lore"]
        meta = {
            "Link": entry["Link"],
            "Champion": entry["Champion"],
            "Region": entry["Region"],
        }
        pieces = splitter.create_documents(texts=[lore], metadatas=[meta])
        return [
            {
                "Link": entry["Link"],
                "Lore": piece.page_content,
                "Champion": piece.metadata["Champion"],
                "Region": piece.metadata["Region"],
            }
            for piece in pieces
        ]

    return [record for entry in championdata for record in _records_for(entry)]


In [None]:
#SCALE CHUNKING
from functools import partial

import ray


def _chunk_row(row, chunk_size, chunk_overlap):
    """Chunk a single champion row; chunk_section takes a list of rows, so wrap the row."""
    return chunk_section([row], chunk_size, chunk_overlap)


# championdata is a plain Python list (built from csv.DictReader) and has no flat_map method,
# so wrap it in a Ray Dataset first; flat_map then calls the helper once per row.
chunks_ds = ray.data.from_items(championdata).flat_map(
    partial(_chunk_row, chunk_size=chunk_size, chunk_overlap=chunk_overlap))
# Both names stay bound so that cells referring to either one keep working.
chunks_cd = chunks_ds
print(f"{chunks_ds.count()} chunks")

In [None]:
# TEST CHUNKING SIZE AND OVERLAP
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import csv

# Define the chunk size and overlap
chunk_size = 300
chunk_overlap = 20

# Open and read the CSV file. newline='' hands newline handling to the csv module, as its
# documentation requires, so line breaks inside quoted fields are parsed correctly.
with open('league_lore.csv', 'r', encoding='utf8', newline='') as file:
    csv_reader = csv.DictReader(file)
    championdata = list(csv_reader)

# Call the chunk_section method
chunks_result = chunk_section(championdata, chunk_size, chunk_overlap)

# Print the resulting chunks
for chunk in chunks_result:
    print(chunk)


In [None]:
#EMBEDDING 
import os

import numpy as np
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from ray.data import ActorPoolStrategy

class EmbedChunks:
    """Callable that attaches embeddings to a batch of text chunks.

    Intended for use with ``map_batches``: the constructor builds the embedding model once,
    and each call embeds one batch.

    Args:
        model_name: "text-embedding-ada-002" selects OpenAIEmbeddings; any other value is
            passed to HuggingFaceEmbeddings as the model name.
        device: Device string handed to HuggingFaceEmbeddings for loading and encoding.
            Defaults to "cuda"; ignored for the OpenAI model.

    Raises:
        KeyError: From the constructor when the OpenAI model is selected and
            OPENAI_API_BASE or OPENAI_API_KEY is missing from the environment.
    """

    def __init__(self, model_name, device="cuda"):
        if model_name == "text-embedding-ada-002":
            # Credentials are read from the environment so they never appear in the notebook.
            self.embedding_model = OpenAIEmbeddings(
                model=model_name,
                openai_api_base=os.environ["OPENAI_API_BASE"],
                openai_api_key=os.environ["OPENAI_API_KEY"])
        else:
            self.embedding_model = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={"device": device},
                encode_kwargs={"device": device, "batch_size": 100})

    def __call__(self, batch):
        """Embed one batch and return its columns plus an "embeddings" column.

        Two column layouts are accepted:
          * "text" + "source": returned as {"text", "source", "embeddings"}.
          * "Lore" + "Champion", "Region", "Link" (the keys chunk_section emits):
            returned with those four columns plus "embeddings".

        Raises:
            KeyError: If the batch has neither a "text" nor a "Lore" column, or lacks one
                of the companion columns for the layout it matches.
        """
        if "text" in batch:
            text_column, carried = "text", ("source",)
        elif "Lore" in batch:
            text_column, carried = "Lore", ("Champion", "Region", "Link")
        else:
            raise KeyError("batch must contain a 'text' or 'Lore' column")

        embeddings = self.embedding_model.embed_documents(batch[text_column])
        result = {text_column: batch[text_column]}
        result.update((name, batch[name]) for name in carried)
        result["embeddings"] = embeddings
        return result

In [None]:
# Embed chunks
embedding_model_name = "thenlper/gte-base"
# `chunks_cd` is the name the SCALE CHUNKING cell assigns to the chunk dataset.
embedded_chunks = chunks_cd.map_batches(
    EmbedChunks,
    fn_constructor_kwargs={"model_name": embedding_model_name},
    batch_size=100,
    num_gpus=1,
    compute=ActorPoolStrategy(size=2))

In [None]:
# chunk_section takes the loaded rows plus the two splitter settings, not a file path.
# Only the first few chunks are shown to keep the cell output short.
chunk_section(championdata, chunk_size, chunk_overlap)[:5]

In [None]:
# Display `documents`.
# NOTE(review): `documents` is not assigned anywhere in the visible notebook — TODO confirm where it is built.
documents

In [None]:
# Display `metadatas`.
# NOTE(review): `metadatas` is not assigned anywhere in the visible notebook — TODO confirm where it is built.
metadatas

In [None]:
# Display `ids`.
# NOTE(review): `ids` is not assigned anywhere in the visible notebook — TODO confirm where it is built.
ids

In [None]:
# Query the collection with one natural-language question, asking for a single result
# (n_results=1) and only the document text in the response (include=['documents']).
results = collection.query(
    query_texts=["Which champions are from Shurima"],
    n_results=1,
    include=['documents']
)

# Bare expression so the notebook renders the query response as the cell output.
results