In [None]:
import configparser
import boto3
import s3fs

from langchain_community.document_loaders import DirectoryLoader #for reading with langchain
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.chat_models import BedrockChat
from langchain.embeddings import HuggingFaceEmbeddings

from llama_index.core import  SimpleDirectoryReader #for reading with llama_index

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

from Core.testset_cleaning import clean_testset, display_row_info

In [1]:

# Read the AWS credentials from a local INI file kept next to the project
# (one directory above this notebook).
credentials_path = '../aws_credentials.cfg'
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check it to fail with a clear message instead
# of a confusing NoSectionError further down.
if not config.read(credentials_path):
    raise FileNotFoundError(f"AWS credentials file not found or unreadable: {credentials_path}")

# All three values are expected in the [default] section of the file.
aws_access_key_id = config.get('default', 'aws_access_key_id')
aws_secret_access_key = config.get('default', 'aws_secret_access_key')
region_name = config.get('default', 'region_name')

# boto3 session bound to these credentials; later cells create clients from it.
session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=region_name
)

In [2]:
# S3 filesystem handle authenticated with the key pair loaded from the credentials file.
s3 = s3fs.S3FileSystem(
    key=aws_access_key_id,
    secret=aws_secret_access_key,
)

# NOTE(review): despite the name, this looks like "<bucket>/<key prefix>" rather
# than a bare bucket name -- confirm against the code that consumes it.
s3_bucket_name = "airliquide-alit-gio-aiops-dev/chaima/indexes_md"

# Import your documents 

You need to import your documents to generate questions with RAGAS.
For now, the cells below show how to load your local documents with either llama_index or langchain.

It should be possible to load documents from your S3 bucket with llama_index S3Reader, feel free to implement it :) 

- Loading local documents with langchain (example with markdown files):

In [None]:
# Root folder to scan; replace the placeholder with the location of your documents.
path = "path/to/your/docs"

# Walk the folder recursively, pick up every Markdown file and parse each one
# with UnstructuredMarkdownLoader, showing a progress bar while loading.
loader = DirectoryLoader(
    path=path,
    glob="**/*.md",
    recursive=True,
    loader_cls=UnstructuredMarkdownLoader,
    show_progress=True,
)
documents = loader.load()

# Number of documents loaded.
print(len(documents))

- Loading local documents with llama_index:

In [None]:
from llama_index.core import SimpleDirectoryReader

# Root folder to scan (e.g. "pdf_md" or "cloud-services/docs"); replace the placeholder.
path = "path/to/your/docs"
# Only files with these extensions are read.
required_exts = [".md"]

# Recursive reader restricted to the extensions above.
reader = SimpleDirectoryReader(input_dir=path, recursive=True, required_exts=required_exts)
documents = reader.load_data()

# Number of documents loaded (shown as the cell output).
len(documents)

Loading LLM on AWS BedrockChat for RAGAS

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.chat_models import BedrockChat

# Bedrock runtime client used for question generation, created from the
# credentialed boto3 session and pointed at the regional runtime endpoint.
bedrock_client = session.client(
    service_name='bedrock-runtime',
    region_name=region_name,
    endpoint_url=f"https://bedrock-runtime.{region_name}.amazonaws.com",
)

# Settings of the generation LLM: Bedrock model identifier and sampling temperature.
config_generator = {
    "model_id": "mistral.mistral-large-2402-v1:0",
    "model_kwargs": {"temperature": 0.4},
}

# Chat LLM wrapper around the Bedrock client; the settings dict supplies
# exactly the `model_id` and `model_kwargs` keyword arguments.
bedrock_generator_model = BedrockChat(client=bedrock_client, **config_generator)

# Embedding model: a Hugging Face model loaded by name (not served by Bedrock,
# despite the variable name).
bedrock_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

Generate questions and answers with RAGAS

In [8]:
from ragas.testset.evolutions import multi_context, reasoning, simple
from ragas.testset.generator import TestsetGenerator

# The same Bedrock chat model serves as both the question generator and the critic.
generator = TestsetGenerator.from_langchain(  # or from_llama_index
    embeddings=bedrock_embeddings,
    generator_llm=bedrock_generator_model,
    critic_llm=bedrock_generator_model,
)

# Share of each question evolution type in the generated test set.
distributions = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

# Generate 15 samples from the loaded documents
# (use generate_with_llamaindex_docs for llama_index documents).
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=15,
    distributions=distributions,
)

Cleaning the testset

In [None]:
# Convert the generated test set to a DataFrame and run the project's cleaning
# helper on it. The filter below relies on the cleaned frame having a
# 'not legit' column -- presumably added by clean_testset; verify in Core.testset_cleaning.
testset_df = clean_testset(testset.to_pandas())

# Keep only the rows not flagged as 'not legit', restricted to the columns of interest.
# The filter is applied to `testset_df` (the DataFrame): the original indexed
# `testset`, the object `.to_pandas()` was called on, so the cleaned frame was
# never filtered. `testset` is deliberately left untouched so the cell can be re-run.
kept_columns = ['question', 'contexts', 'ground_truth', 'metadata', 'evolution_type']
testset_df = testset_df.loc[testset_df['not legit'].eq(False), kept_columns]