In [None]:
import os
from tqdm import tqdm
from datasets import load_dataset
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain_community.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from elasticsearch import Elasticsearch
from getpass import getpass
from utils import *
from dotenv import load_dotenv
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

from ragas import evaluate


# Load variables from the local .env file into the process environment first.
load_dotenv()

# Settings read from the environment; each one is None when the variable is unset.
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
HUGGINGFACE_USERNAME = os.environ.get('HUGGINGFACE_USERNAME')
HUGGINGFACE_DATASET_NAME = os.environ.get('HUGGINGFACE_DATASET_NAME')

ELASTIC_CLOUD_ID = os.environ.get('ELASTIC_CLOUD_ID')
ELASTIC_API_KEY = os.environ.get('ELASTIC_API_KEY')

QA_VALIDATION_DATASET = os.environ.get('QA_VALIDATION_DATASET')
QA_VALIDATION_TOKEN = os.environ.get('QA_VALIDATION_TOKEN')

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Identifier of the embedding model.
model_name = "NeuML/pubmedbert-base-embeddings"

from langchain_community.embeddings import HuggingFaceEmbeddings
from pprint import pprint

# Validation dataset (without RAG answers): https://huggingface.co/datasets/prio7777777/pubmed-qa-validation

In [None]:
'''
Example on how to run a validation for a given configuration
NOTE: this has not been tested holistically, but the code should work
'''


In [None]:
## embedding model, target device and generator model id shared by the cells below
model_name = "NeuML/pubmedbert-base-embeddings"
device = 'cuda:0'
model_id = "llama2:latest"

## the same device is used for loading the model and for encoding
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device),
)

## indexes to compare; each name is reported as one configuration
indexes = [
    'pubmedbert-sentence-transformer-50',
    'pubmedbert-sentence-transformer-400',
    'pubmedbert-recursive-character-400-overlap-50',
]
# indexes = ['pubmedbert-sentence-transformer-100']

## define the LLM model to use | later this can be overwritten by the user
## alternatives that were tried:
# llm = prepare_llm(HUGGINGFACE_TOKEN,model_id=model_id,use_openai=True)
# llm = Ollama(model = "llama2:latest")
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

## output files: raw answers written by run_config, and the formatted validation set
save_path = '../data/chunking_test.csv'
save_path_result = "../data/chunking_test_formatted.csv"


#### Evaluate the influence of chunking size and chunk overlap

In [None]:
## collects one metrics row per evaluated index (filled by the evaluation loop)
chunking_configuration_results = list()

In [None]:
## Evaluate every index in `indexes`: generate answers, convert them into a
## RAGAs dataset, score them, and store per-index plus aggregated results.

## start from an empty list so re-running this cell does not append duplicate rows
chunking_configuration_results = []

## metrics reported per configuration (also the column order of the summary csv)
metric_names = ['answer_relevancy', 'context_precision', 'context_recall']

## to_csv does not create missing folders, so make sure the output directory exists
results_dir = '../data/chunking_configurations'
os.makedirs(results_dir, exist_ok=True)

## settings for testset_to_validation; identical for every index, so built once
config_2 = {
    'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
    'QA_VALIDATION_DATASET': QA_VALIDATION_DATASET,
    'save_path': save_path,
    'save_path_result': save_path_result,
}

for index_name in indexes:
    elastic_vector_search = ElasticsearchStore(
        es_cloud_id=ELASTIC_CLOUD_ID,
        index_name=index_name,
        embedding=embeddings,
        es_api_key=ELASTIC_API_KEY,
    )

    ## connectivity check: shows the cluster info for the store just created
    print(elastic_vector_search.client.info())

    ## settings for run_config; depends on the index under test
    config_1 = {
        "index_name": index_name,
        'evaluation_dataset_path': QA_VALIDATION_DATASET,
        'HUGGINGFACE_TOKEN': HUGGINGFACE_TOKEN,
        'HUGGINGFACE_DATASET_NAME': HUGGINGFACE_DATASET_NAME,
        'llm': llm,
        'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
        'save_path': save_path,
        'max_retrieved_docs': 3,
        'OPENAI_API_KEY': OPENAI_API_KEY
    }

    answers = run_config(elastic_vector_search=elastic_vector_search,
                         use_ensemble_retriever=False,
                         verbose=False,
                         save=True,
                         **config_1)

    ## this is a Dataset on which the RAGAs metrics can be applied
    result_dataset = testset_to_validation(save=True, **config_2)

    ## get ragas metrics (faithfulness is intentionally left out of this comparison)
    resulted_metrics = evaluate(
        result_dataset,
        metrics=[
            context_precision,
            # faithfulness,
            answer_relevancy,
            context_recall,
        ],
    )

    chunking_configuration_results.append({
        'configuration': index_name,
        **{name: resulted_metrics[name] for name in metric_names},
    })

    ## save individual results
    resulted_metrics.to_pandas().to_csv(f'{results_dir}/{index_name}_results.csv', index=False)


## save the results
df = pd.DataFrame(chunking_configuration_results, columns=['configuration'] + metric_names)
df.to_csv(f'{results_dir}/chunking_configuration_results.csv', index=False)

In [None]:

## End-to-end validation of a single index: answers -> RAGAs dataset -> metrics

## index to query and the vector store bound to it (reuses the embeddings defined earlier)
index_name = 'pubmedbert-sentence-transformer-400'
elastic_vector_search = ElasticsearchStore(
    index_name=index_name,
    embedding=embeddings,
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
)

## define the LLM model to use | later this can be overwritten by the user
llm = prepare_llm(HUGGINGFACE_TOKEN, model_id=model_id, use_openai=True)

## csv file that receives the generated answers
save_path = '../data/rag_validation_answers_400.csv'

## keyword arguments for run_config
config_1 = {
    "index_name": index_name,
    'evaluation_dataset_path': QA_VALIDATION_DATASET,
    'HUGGINGFACE_TOKEN': HUGGINGFACE_TOKEN,
    'HUGGINGFACE_DATASET_NAME': HUGGINGFACE_DATASET_NAME,
    'llm': llm,
    'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
    'save_path': save_path,
    'max_retrieved_docs': 3,
}

## Writes a csv to save_path holding, for each question of the validation
## dataset (questions generated with RAGas from the new dataset), the question
## and its result. Takes about 20-30 mins on T4 GPU.
answers = run_config(
    elastic_vector_search=elastic_vector_search,
    use_ensemble_retriever=False,
    verbose=True,
    config_name='new_dataset_400',
    save=True,
    **config_1,
)

## keyword arguments for testset_to_validation
config_2 = {
    'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
    'QA_VALIDATION_DATASET': QA_VALIDATION_DATASET,
    'save_path': save_path,
    'save_path_result': '../data/validation_400_gpt_3-5-turbo.csv',
}

## this is a Dataset on which the RAGAs metrics can be applied
result_dataset = testset_to_validation(save=True, **config_2)

## get ragas metrics
resulted_metrics = evaluate(
    result_dataset,
    metrics=[context_precision, faithfulness, answer_relevancy, context_recall],
)

pprint(resulted_metrics)

### Influence of weight of ensemble retriever in context_precision

As we deal with a medical application, we desire precision over recall (in terms of IR).
We will analyze this in the context of the weight of the BM25 retriever in the ensemble retriever.

In [None]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader

## Load the abstracts dataset, using the 'abstract' column as page content.
## The access token is read from the environment (see HUGGINGFACE_TOKEN) so that
## no credential is stored in the notebook itself.
## NOTE(review): assumes HUGGINGFACE_TOKEN has read access to this dataset - confirm.
## Any token that was ever committed in this cell should be revoked.
loader = HuggingFaceDatasetLoader(
    "MaraEliana/pubmed-abstracts",
    page_content_column='abstract',
    use_auth_token=HUGGINGFACE_TOKEN,
)
data = loader.load()

In [None]:
## OpenAI chat model with temperature 0
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

## 'train' split of the validation dataset
eval_dataset = load_dataset(
    QA_VALIDATION_DATASET,
    token=QA_VALIDATION_TOKEN,
)['train']

In [None]:
index_name = 'pubmedbert-sentence-transformer-400'

elastic_vector_search = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    index_name=index_name,
    embedding=embeddings,
    es_api_key=ELASTIC_API_KEY,
)


def load_ensemble_retriever(index_name, _elastic_vector_search, neuro_weight=0, max_retrieved_docs=20):
    """Build the ensemble retriever for one index.

    Args:
        index_name: Index name; passed to get_splitter_per_index to obtain the
            text splitter for that index.
        _elastic_vector_search: Vector store handed to create_ensemble_retriever.
        neuro_weight: Forwarded unchanged to create_ensemble_retriever. Defaults
            to 0, the value this helper always used before it became a parameter.
            NOTE(review): presumably the weight of the dense retriever inside the
            ensemble - confirm against create_ensemble_retriever.
        max_retrieved_docs: Forwarded unchanged to create_ensemble_retriever.
            Defaults to 20, the value this helper always used before.

    Returns:
        The object returned by create_ensemble_retriever.
    """
    text_splitter = get_splitter_per_index(index_name)
    return create_ensemble_retriever(
        _elastic_vector_search,
        text_splitter,
        neuro_weight=neuro_weight,
        max_retrieved_docs=max_retrieved_docs,
    )


## buffer ensemble retriever for consecutive uses
ensemble_retriever = load_ensemble_retriever(index_name, elastic_vector_search)


In [None]:
## "stuff" question-answering chain over the ensemble retriever; verbose output
## is switched on both for the chain and through chain_type_kwargs
rag = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=ensemble_retriever,
    chain_type="stuff",
    chain_type_kwargs=dict(verbose=True),
    verbose=True,
)

In [None]:
## cap on how many validation questions are sent through the chain
max_questions_to_evaluate = 25
counter = 0
answers = []

## counter is the 1-based number of questions answered so far
for counter, example in enumerate(tqdm(eval_dataset, desc="generate RAG answers"), start=1):
    answers.append(rag(example['question']))
    if counter == max_questions_to_evaluate:
        break

In [None]:
## chain outputs as a frame; 'query' holds the question that was asked
answers_df = pd.DataFrame(answers)

## rename to the column names used for evaluation
## NOTE(review): 'contexts' is filled from the dataset's ground_truth_context
## column, not from documents returned by the retriever - confirm this is the
## intended input for the context_precision evaluation.
columns_mapping = {'question': 'question', 'result': 'answer', 'ground_truth_context':'contexts'} #'ground_truth': 'ground_truths',

## attach the validation rows to the answers, drop helper columns, then rename
result_df = (
    answers_df
    .merge(eval_dataset.to_pandas(), left_on='query', right_on='question', how='inner')
    .drop(columns=['query','question_type','episode_done'])
    .rename(columns=columns_mapping)
)

## wrap every context value in a single-element list
result_df['contexts'] = result_df['contexts'].apply(lambda ctx: [ctx])
result_df_dataset = Dataset.from_pandas(result_df)

In [None]:
## score the assembled dataset on context_precision only
resulted_metrics = evaluate(result_df_dataset, metrics=[context_precision])

pprint(resulted_metrics)

### Generation of synthetic evaluation set

In [None]:
# Controls how many documents are sampled and how many test questions are
# requested in the test set generation cell.
TEST_SET_SIZE = 100

In [None]:
# %pip install ragas 
from ragas.testset import TestsetGenerator
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from ragas.llms import LangchainLLM
import random
#https://docs.ragas.io/en/latest/howtos/customisations/llms.html

# Draw the source documents with a dedicated, seeded generator so the same
# subset is selected on every run, without touching the global random state.
SAMPLE_SEED = 42
# random.sample raises ValueError when asked for more items than exist.
sample_size = min(TEST_SET_SIZE, len(data))
sub_data = random.Random(SAMPLE_SEED).sample(data, sample_size)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Report only whether the key is present; printing the key itself would store
# the secret in the notebook output.
print(f"OPENAI_API_KEY set: {bool(OPENAI_API_KEY)}")

# Add custom llms and embeddings
generator_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY))
critic_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)) ## should be gpt-4 but we dont have access
# Reuse the embeddings object defined earlier in the notebook.
embeddings_model = embeddings

# Change resulting question type distribution
testset_distribution = {
    "simple": 0.25,
    "reasoning": 0.25,
    "multi_context": 0.25,
    "conditional": 0.25,
}

# percentage of conversational question
chat_qa = 0.1


test_generator = TestsetGenerator(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings_model=embeddings_model,
    testset_distribution=testset_distribution,
    chat_qa=chat_qa,
)

# Request TEST_SET_SIZE questions from the sampled documents.
testset = test_generator.generate(sub_data, test_size=TEST_SET_SIZE)