# Generate synthetic test set

## Import libraries

In [None]:
import os, sys

# Make the repository root and its `src` directory importable from this notebook
# (paths are resolved relative to the current working directory).
sys.path.extend(
    os.path.join(os.getcwd(), relative_dir) for relative_dir in ('..', '../src')
)

In [None]:
from dotenv import load_dotenv

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

from langchain_groq import ChatGroq
from langchain_cohere import CohereEmbeddings
from langchain_core.rate_limiters import InMemoryRateLimiter

from src.vectorstore import PineconeVectorDB

In [None]:
# Load variables from a local .env file into the process environment (python-dotenv).
# The cells below read PINECONE_INDEX via os.getenv; the Groq/Cohere clients
# presumably pick up their API keys from the environment too — confirm against your .env.
load_dotenv()

## Generation

### Load documents

In [None]:
# Name of the Pinecone index, taken from the environment (None if the variable is unset).
pinecone_index_name = os.environ.get('PINECONE_INDEX')

# Embedding model handed to the vector DB wrapper.
document_embedding = CohereEmbeddings(model='embed-multilingual-v3.0')

# Project wrapper around the Pinecone index; the vector store it returns is queried below.
vector_db = PineconeVectorDB(
    index_name=pinecone_index_name,
    embedding=document_embedding,
)
vector_store = vector_db.get_vectorstore()

In [None]:
# The query text only serves to drive a similarity search; the goal is to pull
# back as many stored documents as possible, not to answer this question.
sample_question = 'Hãy giới thiệu về trường Đại học Khoa học Tự nhiên TP.HCM?'

# Deliberately large `k` so the search returns (ideally) every document in the store.
# NOTE(review): this only covers the whole store if it holds at most N documents
# and the backend accepts a `k` this large — confirm.
N = 10_000

knowledge_base = vector_store.similarity_search(
    sample_question,
    k=N,
)

### Generate data

In [None]:
# A single limiter instance is passed to both Groq models, so they share one budget.
limiter = InMemoryRateLimiter(
    requests_per_second=0.3,  # 0.3 req/s, i.e. about one request every 3.3 seconds
    check_every_n_seconds=0.1,
    max_bucket_size=30,
)

# Settings common to the generator and the critic model.
groq_options = {"rate_limiter": limiter, "max_retries": 3}

# Larger model writes the questions; smaller model acts as the critic.
generator_llm = ChatGroq(model="llama3-70b-8192", **groq_options)
critic_llm = ChatGroq(model="llama3-8b-8192", **groq_options)

# Embedding model passed to the test set generator.
embedding = CohereEmbeddings(model='embed-multilingual-v3.0', max_retries=10)

In [8]:
# Build the ragas generator from the generator LLM, the critic LLM and the embedding model.
generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embedding)

# NOTE(review): `batch_size` is not referenced anywhere in the visible code —
# confirm whether batching several documents per call was intended.
batch_size = 2

# One entry per document: the sub-testset generated from that document alone.
testset = []

# An explicit loop with `append` (rather than a comprehension) keeps the results
# gathered so far in `testset` if a later call fails.
# NOTE(review): every call receives a single document, so multi_context questions
# can only draw on that one document — confirm this is intended.
for document in knowledge_base:
    sub_testset = generator.generate_with_langchain_docs(
        documents=[document],
        test_size=10,  # 10 samples requested per document
        distributions={simple: 0.5, reasoning: 0.3, multi_context: 0.2},
    )
    testset.append(sub_testset)