In [None]:

!python -m pip install ragas  sentence_transformers xmltodict python-dotenv

In [None]:
! python -m pip install --upgrade google.generativeai

In [1]:
from dotenv import load_dotenv, dotenv_values
import google.generativeai as genai
from IPython.display import Markdown, display
import os

# Load .env file

load_dotenv()


my_api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=my_api_key)

In [8]:
import pandas as pd
from langchain_community.document_loaders import PubMedLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAI

from langchain_community.chat_models import ChatOllama

data_generation_model = GoogleGenerativeAI(model="gemini-pro")


In [10]:
data_generation_model.invoke("Hello")

'Hello there! How can I assist you today?'

In [11]:

critic_model = GoogleGenerativeAI(model="gemini-pro",temperature=1)

In [12]:
critic_model.invoke("hello")

'Hi, my name is Helper. I am an artificial intelligence chatbot that can help you with a variety of questions and tasks. \n\nHow can I help you improve your products or services today?'

In [13]:
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)




In [14]:

loader = PubMedLoader("cancer", load_max_docs=5)

In [15]:

documents = loader.load()

In [16]:
from pprint import pprint
pprint(documents)

[Document(metadata={'uid': '39162093', 'Title': 'Interpretability of operative and pathological reports for radiotherapy planning of sinonasal carcinomas: An ancillary study of the GORTEC 2016-02 SANTAL trial.', 'Published': '2024-08-20', 'Copyright Information': '© 2024 ARS‐AAOA, LLC.'}, page_content='Interpretation of surgical mapping is essential for postoperative radiotherapy planning. Operative and pathological reports lack comprehensive information on margins quality and tissue block mapping. Standardizing reports is essential to reduce uncertainties, aiming for less morbid poRT.'),
 Document(metadata={'uid': '39162089', 'Title': 'Unraveling the role of oligodendrocytes and myelin in pain.', 'Published': '2024-08-20', 'Copyright Information': '© 2024 The Author(s). Journal of Neurochemistry published by John Wiley & Sons Ltd on behalf of International Society for Neurochemistry.'}, page_content='Oligodendrocytes, the myelin-producing cells in the central nervous system (CNS), are

In [17]:
generator = TestsetGenerator.from_langchain(
    data_generation_model,
    critic_model,
    embeddings
)
     

In [18]:

distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}
     

In [19]:

testset = generator.generate_with_langchain_docs(documents, 5, distributions)

embedding nodes:   0%|          | 0/10 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/5 [00:00<?, ?it/s]

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._co

In [20]:
test_df = testset.to_pandas()

In [21]:
test_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How do sourdough strains mitigate intestinal i...,"[['Lactiplantibacillus plantarum', 'Pediococcu...",Sourdough strains mitigate intestinal injury i...,simple,"[{'uid': '39162079', 'Title': 'Comparison of t...",True
1,What was the patient's initial symptom upon vi...,[Hemangioma is a common vascular neoplasm that...,The patient visited the hospital with a compla...,simple,"[{'uid': '39162066', 'Title': 'Cavernous Heman...",True
2,Does RT planning precision improve post-surger...,[Interpretation of surgical mapping is essenti...,The answer to given question is not present in...,multi_context,"[{'uid': '39162093', 'Title': 'Interpretabilit...",True
3,How did sourdough postbiotics affect gut damag...,"[['Lactiplantibacillus plantarum', 'Pediococcu...",Sourdough postbiotics derived from heat-killed...,multi_context,"[{'uid': '39162079', 'Title': 'Comparison of t...",True
4,How is margin accuracy in poRT surgical mappin...,[Interpretation of surgical mapping is essenti...,The answer to given question is not present in...,multi_context,"[{'uid': '39162093', 'Title': 'Interpretabilit...",True


In [22]:
test_df.to_csv('eval_set.csv', sep=',', encoding='utf-8', index=False, header=True)