# Read the JSON dataset

In [None]:
import json
# Load the whole dataset into memory. JSON text is UTF-8, so decode it
# explicitly instead of relying on the platform's locale encoding.
with open("dataset/dataset.json", "r", encoding="utf-8") as fp:
    dataset = json.load(fp)

In [None]:
# Number of top-level entries parsed from dataset/dataset.json.
len(dataset)

## The dataset is split into chunks, each chunk being one use-case/test-cases example

### Embed data

In [None]:
# Notebook switches: CREATE_DB gates the cell that builds the Chroma collection,
# LOAD_DB gates the cell that opens an existing one. In a fresh run at least one
# of them must be True, otherwise `db` is never defined for the query cells.
CREATE_DB = False
LOAD_DB = True

In [None]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

In [None]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Custom embedding function using the Gemini AI API for document retrieval.

    This class extends the EmbeddingFunction class and implements the __call__ method
    to generate embeddings for a given set of documents using the Gemini AI API.

    The API key is read from the GEMINI_API_KEY environment variable on every
    call; no credential is kept in the source.
    """

    def __call__(self, input: Documents) -> Embeddings:
        """
        Embed the given documents with the "models/embedding-001" model.

        Parameters:
        - input (Documents): A collection of documents to be embedded.

        Returns:
        - Embeddings: The "embedding" entry of the genai.embed_content response.

        Raises:
        - ValueError: If the GEMINI_API_KEY environment variable is unset or empty.
        """
        # Take the key from the environment so that no secret lives in the notebook.
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        return genai.embed_content(
            model=model,
            content=input,
            task_type="retrieval_document",
            title=title
        )["embedding"]

In [None]:
import chromadb
from typing import List
def create_chroma_db(documents:List, path:str, name:str):
    """
    Build a persistent Chroma collection and fill it with the given documents.

    The collection is created with GeminiEmbeddingFunction as its embedding
    function. Documents are added one per ``add`` call, each using its position
    in ``documents`` (converted to a string) as its id.

    Parameters:
    - documents: An iterable of documents to be added to the Chroma database.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - tuple: ``(collection, name)`` - the created Chroma collection and the
      collection name that was passed in.
    """
    client = chromadb.PersistentClient(path=path)
    collection = client.create_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

    # One add() per document, keyed by its index in the input.
    for position, document in enumerate(documents):
        collection.add(documents=document, ids=str(position))

    return collection, name

In [None]:
if CREATE_DB:
    # Raw string: "\c" is not a valid escape sequence, so the plain literal
    # triggers a SyntaxWarning on recent Python versions. The raw form yields
    # exactly the same path text (RAG, backslash, contents).
    db, name = create_chroma_db(
        documents=dataset,
        path=r"RAG\contents",
        name="usecase_embeddings",
    )

In [None]:
def load_chroma_collection(path, name):
    """
    Open an existing Chroma collection stored under ``path``.

    The collection is fetched with GeminiEmbeddingFunction attached as its
    embedding function.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The loaded Chroma Collection.
    """
    client = chromadb.PersistentClient(path=path)
    return client.get_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

In [None]:
if LOAD_DB:
    # Raw string: "\c" is not a valid escape sequence, so the plain literal
    # triggers a SyntaxWarning on recent Python versions. The raw form yields
    # exactly the same path text (RAG, backslash, contents).
    db = load_chroma_collection(path=r"RAG\contents", name="usecase_embeddings")

In [None]:
def get_relevant_passage(query, db, n_results):
    """
    Query the collection and return the documents matched for ``query``.

    Parameters:
    - query: The text submitted as the single query.
    - db: An object exposing ``query(query_texts=..., n_results=...)`` whose
      result is indexable by ``'documents'``.
    - n_results: Passed through unchanged as the requested number of results.

    Returns:
    - The first entry of the result's ``'documents'`` field, i.e. the documents
      belonging to the one query text that was submitted.
    """
    results = db.query(query_texts=[query], n_results=n_results)
    # Only one query text was sent, so take the first (only) result group.
    return results['documents'][0]

# Test

In [None]:
# Example usage: ask the collection for up to 5 documents matching a sample query.
relevant_text = get_relevant_passage(query="Add task to list", db=db, n_results=5)

In [None]:
# Display the documents retrieved by the example query as the cell output.
relevant_text

In [None]:
# JSONL file whose records are read and enriched with a retrieved example.
DATASET_PATH = "dataset/dataset-20.jsonl"

In [None]:
# NOTE(review): read_jsonl / write_jsonl are presumably supplied by this wildcard
# import of the `jsonl` module - confirm, and prefer importing them by name.
from jsonl import *
# Rebinds `dataset` (previously the parsed dataset.json) to at most the first
# 100 entries returned by read_jsonl.
dataset = read_jsonl(DATASET_PATH)[:100]

In [None]:
# Attach to every record the stored example retrieved for its scenario.
for data in dataset:
    usecase = data["usecase"]

    # Retrieve a single stored document, using the scenario text as the query.
    relevant_text = get_relevant_passage(query=usecase["scenario"], db=db, n_results=1)

    # The retrieved document is decoded as JSON.
    rag_ex = json.loads(relevant_text[0])

    # Normalise the key spelling: "testCases" -> "testcases".
    if "testCases" in rag_ex:
        rag_ex["testcases"] = rag_ex.pop("testCases")

    data["rag-example"] = rag_ex
   


In [None]:
# Save the records, now carrying their "rag-example" field, under a new file name
# (write_jsonl is expected to come from the `jsonl` module imported earlier).
write_jsonl("dataset/dataset-20-rag.jsonl", dataset)