# Read the JSON dataset

In [3]:
import json
# Read the whole JSON dataset into memory.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open("dataset/dataset.json", "r", encoding="utf-8") as fp:
    dataset = json.load(fp)

In [4]:
# Number of top-level entries in the loaded JSON dataset.
len(dataset)

281

## The dataset will be chunked, with each chunk being one use-case/test-cases example

### Embed data

In [5]:
# Switches for the cells below:
# CREATE_DB gates the cell that calls create_chroma_db on `dataset`.
CREATE_DB = False
# LOAD_DB gates the cell that calls load_chroma_collection.
LOAD_DB = True

In [7]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

In [8]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Custom embedding function using the Gemini AI API for document retrieval.

    This class extends the EmbeddingFunction class and implements the __call__ method
    to generate embeddings for a given set of documents using the Gemini AI API.

    The API key is read from the GEMINI_API_KEY environment variable on every call.

    Parameters:
    - input (Documents): A collection of documents to be embedded.

    Returns:
    - Embeddings: Embeddings generated for the input documents.

    Raises:
    - ValueError: If GEMINI_API_KEY is unset or empty.
    """
    def __call__(self, input: Documents) -> Embeddings:
        # Never hardcode the key: a key committed with the notebook is leaked
        # to everyone who can read the file and must be revoked.
        gemini_api_key = os.environ.get("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        # The response is indexed by "embedding" to extract the vectors.
        return genai.embed_content(
            model=model,
            content=input,
            task_type="retrieval_document",
            title=title
        )["embedding"]

In [9]:
import chromadb
from typing import List
def create_chroma_db(documents:List, path:str, name:str):
    """
    Build a new persistent Chroma collection and fill it with the given documents.

    Parameters:
    - documents: An iterable of documents; each one is added under its position
      in the iterable, converted to a string, as its id.
    - path (str): The path handed to the persistent Chroma client.
    - name (str): The name of the collection to create.

    Returns:
    - tuple: The created collection and the collection name, in that order.
    """
    client = chromadb.PersistentClient(path=path)
    collection = client.create_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

    # Insert the documents one at a time, numbering them from zero.
    position = 0
    for document in documents:
        collection.add(documents=document, ids=str(position))
        position += 1

    return collection, name

In [10]:
# Build the collection from `dataset` only when creation is switched on.
if CREATE_DB:
    db, name = create_chroma_db(
        documents=dataset,
        # Raw string: "\c" is not a recognised escape sequence, so the plain
        # literal triggers an invalid-escape warning. The value is unchanged.
        path=r"RAG\contents",
        name="usecase_embeddings",
    )

In [11]:
def load_chroma_collection(path, name):
    """
    Open an existing Chroma collection by name.

    Parameters:
    - path (str): The path handed to the persistent Chroma client.
    - name (str): The name of the collection to fetch.

    Returns:
    - The collection returned by the client, bound to a GeminiEmbeddingFunction.
    """
    client = chromadb.PersistentClient(path=path)
    return client.get_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

In [12]:
# Open the previously created collection only when loading is switched on.
if LOAD_DB:
    # Raw string: "\c" is not a recognised escape sequence, so the plain
    # literal triggers an invalid-escape warning. The value is unchanged.
    db = load_chroma_collection(path=r"RAG\contents", name="usecase_embeddings")

⚠️ It looks like you upgraded from a version below 0.6 and could benefit from vacuuming your database. Run chromadb utils vacuum --help for more information.


In [13]:
def get_relevant_passage(query, db, n_results):
    """
    Query the collection with a single text and return the documents found for it.

    Parameters:
    - query: The query text; it is sent to the collection as a one-element list.
    - db: An object exposing query(query_texts=..., n_results=...).
    - n_results: Passed through to db.query unchanged.

    Returns:
    - The first entry of the response's 'documents' field, i.e. the documents
      returned for the single query text.
    """
    response = db.query(query_texts=[query], n_results=n_results)
    documents_per_query = response['documents']
    return documents_per_query[0]

# Test

In [14]:
# Example usage: request 5 results from the loaded collection for a sample query.
relevant_text = get_relevant_passage(query="Add task to list", db=db, n_results=5)

In [15]:
# Display the documents returned by the example query.
relevant_text

['{"usecase": {"name": "Add a new task in To Do list", "description": "User should be able to add a new task", "preconditions": "User should be logged in", "steps": ["User should be logged in", "User should be on the To Do list page", "User should click on the Add button", "User should enter the task name and description", "User should choose the Deadline", "User should set the importance of the task on a scale of 1 to 10", "User should click on the Submit button"]}, "testcases": [{"name": "Correctly add a new task", "description": "User should be able to add a new task", "input": {"taskName": "Task 1", "taskDescription": "Task 1 description", "deadline": "2024-12-31", "importance": "5"}, "expected": {"outcome": "The task is added to the list", "status": "Pass"}}, {"name": "Given Deadline is in the past", "description": "User should not be able to set the deadline in the past", "input": {"taskName": "Task 1", "taskDescription": "Task 1 description", "deadline": "2021-01-01", "importanc

In [22]:
# JSONL file read by the next cell via read_jsonl.
DATASET_PATH = "dataset/dataset-20.jsonl"

In [23]:
from jsonl import *
# Rebinds `dataset` (previously the JSON data loaded at the top of the notebook)
# to at most the first 100 entries read from DATASET_PATH.
dataset = read_jsonl(DATASET_PATH)[:100]

In [28]:
# For every record, look up the closest stored example and attach it in place
# under the "rag-example" key.
for idx, data in enumerate(dataset):
    usecase = data["usecase"]

    # Ask the collection for the single best match to this record's scenario.
    relevant_text = get_relevant_passage(
        query=usecase["scenario"],
        db=db,
        n_results=1,
    )

    # The retrieved document is JSON text; decode it before attaching.
    rag_ex = json.loads(relevant_text[0])

    # Rename the "testCases" key to "testcases" when the former is present.
    if "testCases" in rag_ex:
        rag_ex["testcases"] = rag_ex.pop("testCases")

    data["rag-example"] = rag_ex
   


In [33]:
# Write the records, now carrying their "rag-example" entries, to a new JSONL file.
write_jsonl("dataset/dataset-20-rag.jsonl", dataset)