In [1]:
import os
from tqdm import tqdm
from datasets import load_dataset
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain_community.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from elasticsearch import Elasticsearch
from getpass import getpass
from utils import *
from dotenv import load_dotenv
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

from ragas import evaluate


# Read key/value pairs from a .env file into the process environment so the
# os.getenv(...) lookups that follow can see them.
load_dotenv()

# --- Hugging Face Hub access ---
# Each value is None when the corresponding environment variable is not set.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
HUGGINGFACE_USERNAME = os.environ.get("HUGGINGFACE_USERNAME")
HUGGINGFACE_DATASET_NAME = os.environ.get("HUGGINGFACE_DATASET_NAME")

# --- Elastic Cloud deployment used as the vector store ---
ELASTIC_CLOUD_ID = os.environ.get("ELASTIC_CLOUD_ID")
ELASTIC_API_KEY = os.environ.get("ELASTIC_API_KEY")

# --- QA validation dataset and the token used to access it ---
QA_VALIDATION_DATASET = os.environ.get("QA_VALIDATION_DATASET")
QA_VALIDATION_TOKEN = os.environ.get("QA_VALIDATION_TOKEN")

# --- OpenAI ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Name of the embedding model checkpoint used throughout the notebook.
model_name = "NeuML/pubmedbert-base-embeddings"

from langchain_community.embeddings import HuggingFaceEmbeddings
from pprint import pprint

# Validation dataset (without RAG answers): https://huggingface.co/datasets/prio7777777/pubmed-qa-validation

In [2]:
'''
Example on how to run a validation for a given configuration
NOTE: this has not been tested holistically, but the code should work
'''


'\nExample on how to run a validation for a given configuration\nNOTE: this has not been tested holistically, but the code should work\n'

In [18]:
# ---------------------------------------------------------------------------
# Run configuration: embedding model, indexes to compare, LLM and output paths
# ---------------------------------------------------------------------------

# Embedding checkpoint and the device it is placed on (first CUDA GPU).
model_name = "NeuML/pubmedbert-base-embeddings"
device = 'cuda:0'
# Model identifier handed to prepare_llm(...) in a later cell.
model_id = "llama2:latest"

# The device is passed both for model construction and for encoding.
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device),
)

# Elasticsearch indexes to compare, one per chunking configuration.
# Alternative single-index run: indexes = ['pubmedbert-sentence-transformer-100']
indexes = [
    'pubmedbert-sentence-transformer-50',
    'pubmedbert-sentence-transformer-400',
    'pubmedbert-recursive-character-400-overlap-50',
]

# LLM used to generate the answers | later this can be overwritten by the user.
# Alternatives that were tried:
#   llm = prepare_llm(HUGGINGFACE_TOKEN,model_id=model_id,use_openai=True)
#   llm = Ollama(model = "llama2:latest")
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

# CSV paths passed to run_config / testset_to_validation via their config dicts.
save_path = '../data/chunking_test.csv'
save_path_result = "../data/chunking_test_formatted.csv"


#### Evaluate the influence of chunking size and chunk overlap

In [19]:
# Collects one metrics dict per evaluated index. The evaluation loop in the next
# cell appends to this list, so re-run this cell before re-running the loop;
# otherwise rows from the previous run are kept and duplicated.
chunking_configuration_results = []

In [20]:
# Evaluate every chunking configuration (one Elasticsearch index each) with the
# same RAG pipeline and collect the RAGAs metrics so they can be compared.

# Directory that receives the per-index and the aggregated metric files.
results_dir = '../data/chunking_configurations'
# Create it up front: a missing folder would otherwise raise only when the first
# CSV is written, i.e. after the expensive generation and evaluation steps.
os.makedirs(results_dir, exist_ok=True)

# Arguments for testset_to_validation. They do not depend on the index,
# so the dict is built once instead of on every iteration.
config_2 = {
    'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
    'QA_VALIDATION_DATASET': QA_VALIDATION_DATASET,
    'save_path': save_path,
    'save_path_result': save_path_result,
}

for index_name in indexes:
    # Vector store bound to the index of the current chunking configuration.
    elastic_vector_search = ElasticsearchStore(
        es_cloud_id=ELASTIC_CLOUD_ID,
        index_name=index_name,
        embedding=embeddings,
        es_api_key=ELASTIC_API_KEY,
    )

    # Connectivity check: prints the cluster information.
    print(elastic_vector_search.client.info())

    # Arguments for run_config; only index_name changes between iterations.
    config_1 = {
        "index_name": index_name,
        'evaluation_dataset_path': QA_VALIDATION_DATASET,
        'HUGGINGFACE_TOKEN': HUGGINGFACE_TOKEN,
        'HUGGINGFACE_DATASET_NAME': HUGGINGFACE_DATASET_NAME,
        'llm': llm,
        'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
        'save_path': save_path,
        'max_retrieved_docs': 3,
        'OPENAI_API_KEY': OPENAI_API_KEY
    }

    # Generate the RAG answers for the validation questions.
    # NOTE(review): save_path is the same for every index, so presumably each
    # iteration overwrites the previous answers file - confirm this is intended.
    answers = run_config(elastic_vector_search=elastic_vector_search,
                         use_ensemble_retriever=False,
                         verbose=False,
                         save=True,
                         **config_1)

    ## this is a Dataset on which the RAGAs metrics can be applied
    result_dataset = testset_to_validation(save=True, **config_2)

    ## get ragas metrics (faithfulness is left out of this comparison)
    resulted_metrics = evaluate(
        result_dataset,
        metrics=[
            context_precision,
            # faithfulness,
            answer_relevancy,
            context_recall,
        ],
    )

    # One summary row per configuration for the aggregated table.
    chunking_configuration_results.append({
        'configuration': index_name,
        'answer_relevancy': resulted_metrics['answer_relevancy'],
        'context_precision': resulted_metrics['context_precision'],
        'context_recall': resulted_metrics['context_recall'],
    })

    ## save individual results
    resulted_metrics.to_pandas().to_csv(f'{results_dir}/{index_name}_results.csv', index=False)


## save the results
df = pd.DataFrame(chunking_configuration_results, columns=['configuration', 'answer_relevancy', 'context_precision', 'context_recall'])
df.to_csv(f'{results_dir}/chunking_configuration_results.csv', index=False)

{'name': 'instance-0000000001', 'cluster_name': '3799852da3b9401fb819ca73ae522ae6', 'cluster_uuid': 'okQgd_O8RiqhrBY6O1QGQg', 'version': {'number': '8.12.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '48a287ab9497e852de30327444b0809e55d46466', 'build_date': '2024-02-19T10:04:32.774273190Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


generate RAG answers: 100%|██████████| 67/67 [02:46<00:00,  2.48s/it]


Evaluating:   0%|          | 0/201 [00:00<?, ?it/s]

Task exception was never retrieved
future: <Task finished name='Task-4567' coro=<AsyncClient.aclose() done, defined at c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpcore\_async\connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpcore\_async\connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpcore\_async\connection.py", line 171, in aclose
    

{'name': 'instance-0000000001', 'cluster_name': '3799852da3b9401fb819ca73ae522ae6', 'cluster_uuid': 'okQgd_O8RiqhrBY6O1QGQg', 'version': {'number': '8.12.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '48a287ab9497e852de30327444b0809e55d46466', 'build_date': '2024-02-19T10:04:32.774273190Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


generate RAG answers: 100%|██████████| 67/67 [02:39<00:00,  2.37s/it]


Evaluating:   0%|          | 0/201 [00:00<?, ?it/s]

{'name': 'instance-0000000001', 'cluster_name': '3799852da3b9401fb819ca73ae522ae6', 'cluster_uuid': 'okQgd_O8RiqhrBY6O1QGQg', 'version': {'number': '8.12.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '48a287ab9497e852de30327444b0809e55d46466', 'build_date': '2024-02-19T10:04:32.774273190Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


generate RAG answers: 100%|██████████| 67/67 [02:19<00:00,  2.09s/it]


Evaluating:   0%|          | 0/201 [00:00<?, ?it/s]

Task exception was never retrieved
future: <Task finished name='Task-6258' coro=<AsyncClient.aclose() done, defined at c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpcore\_async\connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpcore\_async\connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  File "c:\Users\priot\anaconda3\envs\nlp\lib\site-packages\httpcore\_async\connection.py", line 171, in aclose
    

In [None]:
# pd.set_option('display.max_rows', 67)

# Keyword arguments forwarded to testset_to_validation; the two paths come from
# the configuration cell.
config_2 = dict(
    QA_VALIDATION_TOKEN=QA_VALIDATION_TOKEN,
    QA_VALIDATION_DATASET=QA_VALIDATION_DATASET,
    save_path=save_path,
    save_path_result=save_path_result,
)

## Dataset the RAGAs metrics can be applied to, shown below as a DataFrame
result_dataset = testset_to_validation(save=True, **config_2)
result_df = result_dataset.to_pandas()
result_df

In [None]:
## RAGAs evaluation restricted to a single metric: faithfulness
resulted_metrics = evaluate(result_dataset, metrics=[faithfulness])

# Notes kept from experimentation:
#   faithfulness       -> PROBLEMATIC
#   context_precision, context_recall, answer_relevancy -> the other available metrics

In [None]:

## Single-configuration run: generate RAG answers for one index.
## The vector store reuses the `embeddings` object created in an earlier cell.

## index to query
index_name = 'pubmedbert-sentence-transformer-400'

## vector store bound to that index
elastic_vector_search = ElasticsearchStore(
    index_name=index_name,
    embedding=embeddings,
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
)

## LLM used for answering | later this can be overwritten by the user
llm = prepare_llm(HUGGINGFACE_TOKEN, model_id=model_id, use_openai=True)

## CSV file passed to run_config as save_path
save_path = '../data/rag_validation_answers_400.csv'

## keyword arguments for run_config
config_1 = dict(
    index_name=index_name,
    evaluation_dataset_path=QA_VALIDATION_DATASET,
    HUGGINGFACE_TOKEN=HUGGINGFACE_TOKEN,
    HUGGINGFACE_DATASET_NAME=HUGGINGFACE_DATASET_NAME,
    llm=llm,
    QA_VALIDATION_TOKEN=QA_VALIDATION_TOKEN,
    save_path=save_path,
    max_retrieved_docs=3,
)

## Saves the results under save_path as a csv file containing the question and
## the result for each question of the validation dataset (questions generated
## with RAGAs from the new dataset).
## Takes about 20-30 mins on a T4 GPU.
answers = run_config(
    elastic_vector_search=elastic_vector_search,
    use_ensemble_retriever=False,
    verbose=True,
    config_name='new_dataset_400',
    save=True,
    **config_1,
)


In [3]:

# Keyword arguments for testset_to_validation; the formatted result goes to a
# file named after the evaluated setup.
config_2 = dict(
    QA_VALIDATION_TOKEN=QA_VALIDATION_TOKEN,
    QA_VALIDATION_DATASET=QA_VALIDATION_DATASET,
    save_path=save_path,
    save_path_result='../data/validation_400_gpt_3-5-turbo.csv',
)

## Dataset the RAGAs metrics can be applied to
result_dataset = testset_to_validation(save=True, **config_2)

## compute all four RAGAs metrics on it
resulted_metrics = evaluate(
    result_dataset,
    metrics=[context_precision, faithfulness, answer_relevancy, context_recall],
)

pprint(resulted_metrics)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`


Evaluating:   0%|          | 0/268 [00:00<?, ?it/s]

{'answer_relevancy': 0.8942793135767543,
 'context_precision': 0.9253731342358208,
 'context_recall': 0.9253731343283582,
 'faithfulness': 0.825542328042328}


In [9]:
# Look up a single aggregate score (answer relevancy) in the RAGAs result computed above.
resulted_metrics['answer_relevancy']

0.8942793135767543

In [15]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader

# Load the PubMed abstracts dataset from the Hugging Face Hub, using the
# 'abstract' column as the page content of each document.
#
# The access token is taken from the environment (HUGGINGFACE_TOKEN, read from
# .env in the first cell) instead of being written into the notebook: a token
# stored in a notebook is exposed to everyone who can read the file or its
# history. The token that was previously hard-coded here should be revoked.
loader = HuggingFaceDatasetLoader(
    "MaraEliana/pubmed-abstracts",
    use_auth_token=HUGGINGFACE_TOKEN,
    page_content_column='abstract',
)
data = loader.load()



Generating train split:   0%|          | 0/69698 [00:00<?, ? examples/s]

In [None]:
# query = "What is the role of artificial intelligence in nephrology?"
# results = elastic_vector_search.similarity_search(query,k=50)


# titles_elastic = [res.metadata["Title"] for res in results]
# for res in results:
#     print(res.metadata['Title'])

### For Mara

Using Ollama, generate a larger validation dataset of 5000 items (set `TEST_SET_SIZE` to 5000).
Replace `ChatOpenAI` with the Llama model (lines 15/16 of the test-set generation cell below).
Save the resulting CSV locally and send it to me so I can upload it to Hugging Face.

In [None]:
# Used by the generation cell below both as the number of documents sampled from
# `data` and as the `test_size` requested from the test-set generator.
TEST_SET_SIZE = 100

In [None]:
# %pip install ragas 
from ragas.testset import TestsetGenerator
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from ragas.llms import LangchainLLM
import random
#https://docs.ragas.io/en/latest/howtos/customisations/llms.html

# Generate a synthetic QA test set with RAGAs from a random subset of the
# loaded documents.

# Source documents: TEST_SET_SIZE documents drawn at random from `data`.
sub_data = random.sample(data, TEST_SET_SIZE)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Only report whether the key is available. Printing the key itself would store
# the secret in the notebook's cell output, where it is shared with the file.
print('OPENAI_API_KEY set:', bool(OPENAI_API_KEY))

# Add custom llms and embeddings
generator_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY))
critic_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)) ## should be gpt-4 but we dont have access
# Reuse the embedding model created earlier in the notebook.
embeddings_model = embeddings

# Change resulting question type distribution
testset_distribution = {
    "simple": 0.25,
    "reasoning": 0.25,
    "multi_context": 0.25,
    "conditional": 0.25,
}

# percentage of conversational question
chat_qa = 0.1


test_generator = TestsetGenerator(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings_model=embeddings_model,
    testset_distribution=testset_distribution,
    chat_qa=chat_qa,
)

# test_size is set to TEST_SET_SIZE, the same value used for the document sample above.
testset = test_generator.generate(sub_data, test_size=TEST_SET_SIZE)

In [None]:
test_df = testset.to_pandas()
test_df


In [None]:
# test_df.to_csv('testset.csv',index=False)

# index = 4

In [None]:
# from pprint import pprint

# pprint(test_df.iloc[index]['question'])

# ## this is the answer
# pprint(test_df.iloc[index]['ground_truth'])


In [None]:
# ## relevant contexts split by \n
# pprint(test_df.iloc[index]['ground_truth_context'][0])