In [3]:
import os
from tqdm import tqdm
from datasets import load_dataset
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain_community.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from elasticsearch import Elasticsearch
from getpass import getpass
from utils import *
from dotenv import load_dotenv
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

from ragas import evaluate


# Populate the process environment from the local .env file before reading it.
load_dotenv()

# Hugging Face access token plus the user/dataset identifiers used by the helpers
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
HUGGINGFACE_USERNAME = os.environ.get('HUGGINGFACE_USERNAME')
HUGGINGFACE_DATASET_NAME = os.environ.get('HUGGINGFACE_DATASET_NAME')

# Elastic Cloud connection settings (passed to ElasticsearchStore below)
ELASTIC_CLOUD_ID = os.environ.get('ELASTIC_CLOUD_ID')
ELASTIC_API_KEY = os.environ.get('ELASTIC_API_KEY')

# Question/answer validation dataset and the token needed to read it
QA_VALIDATION_DATASET = os.environ.get('QA_VALIDATION_DATASET')
QA_VALIDATION_TOKEN = os.environ.get('QA_VALIDATION_TOKEN')

# Key handed to ChatOpenAI
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Model id given to HuggingFaceEmbeddings
model_name = "NeuML/pubmedbert-base-embeddings"

from langchain_community.embeddings import HuggingFaceEmbeddings
from pprint import pprint

# Validation dataset (without RAG answers): https://huggingface.co/datasets/prio7777777/pubmed-qa-validation

In [None]:
'''
Example of how to run a validation for a given configuration.
NOTE: this has not been tested end to end, but the code should work.
'''


In [6]:
## identifiers shared by the cells below: embedding model, device, LLM model id
model_name = "NeuML/pubmedbert-base-embeddings"
device = 'cuda:0'
model_id = "llama2:latest"

## the same device is used both to load the embedding model and to encode texts
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device),
)

## Elasticsearch indexes compared in the chunking experiment
indexes = [
    'pubmedbert-sentence-transformer-50',
    'pubmedbert-sentence-transformer-400',
    'pubmedbert-recursive-character-400-overlap-50',
]

## LLM used to generate the answers (temperature 0); alternatives tried earlier
## were prepare_llm(..., use_openai=True) and a local Ollama "llama2:latest" model
llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

## CSV paths handed to run_config (save_path) and testset_to_validation (both)
save_path = '../data/chunking_test.csv'
save_path_result = "../data/chunking_test_formatted.csv"


#### Evaluate the influence of chunking size and chunk overlap

In [None]:
## one summary row (index name + RAGAS scores) is appended per evaluated index
chunking_configuration_results = list()

In [None]:
## Evaluate every index in `indexes` with the same LLM and retrieval settings.
## Each iteration generates answers, converts them into a RAGAS dataset, scores
## it, and appends one summary row to `chunking_configuration_results`.

## Folder receiving the per-index and the summary CSV files. It is created up
## front (together with its parent ../data) so that a missing directory cannot
## make to_csv fail after the expensive retrieval + evaluation work is done.
results_dir = '../data/chunking_configurations'
os.makedirs(results_dir, exist_ok=True)

## metric columns of the summary table, in output order
## (faithfulness is intentionally not evaluated in this sweep)
summary_metrics = ['answer_relevancy', 'context_precision', 'context_recall']

for index_name in indexes:
    elastic_vector_search = ElasticsearchStore(
        es_cloud_id=ELASTIC_CLOUD_ID,
        index_name=index_name,
        embedding=embeddings,
        es_api_key=ELASTIC_API_KEY,
    )

    ## quick connectivity check: prints the cluster info for this store
    print(elastic_vector_search.client.info())

    ## keyword arguments for run_config
    config_1 = {
        "index_name": index_name,
        'evaluation_dataset_path': QA_VALIDATION_DATASET,
        'HUGGINGFACE_TOKEN': HUGGINGFACE_TOKEN,
        'HUGGINGFACE_DATASET_NAME': HUGGINGFACE_DATASET_NAME,
        'llm': llm,
        'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
        'save_path': save_path,
        'max_retrieved_docs': 3,
        'OPENAI_API_KEY': OPENAI_API_KEY
    }

    ## NOTE: every index writes its answers to the same `save_path`, so the file
    ## only ever holds the answers of the index currently being processed
    answers = run_config(elastic_vector_search=elastic_vector_search,
                         use_ensemble_retriever=False,
                         verbose=False,
                         save=True,
                         **config_1)

    ## keyword arguments for testset_to_validation
    config_2 = {
        'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
        'QA_VALIDATION_DATASET': QA_VALIDATION_DATASET,
        'save_path': save_path,
        'save_path_result': save_path_result,
    }

    ## this is a Dataset on which the RAGAs metrics can be applied
    result_dataset = testset_to_validation(save=True, **config_2)

    ## get ragas metrics
    resulted_metrics = evaluate(
        result_dataset,
        metrics=[
            context_precision,
            # faithfulness,
            answer_relevancy,
            context_recall,
        ],
    )

    ## summary row: index name followed by the aggregate score of each metric
    chunking_configuration_results.append(
        {
            'configuration': index_name,
            **{metric: resulted_metrics[metric] for metric in summary_metrics},
        }
    )

    ## save individual (per-question) results
    resulted_metrics.to_pandas().to_csv(f'{results_dir}/{index_name}_results.csv', index=False)


## save the summary over all indexes
df = pd.DataFrame(chunking_configuration_results, columns=['configuration', *summary_metrics])
df.to_csv(f'{results_dir}/chunking_configuration_results.csv', index=False)

In [None]:

## first define embeddings for the db


## define what index to use and instantiate the vector store

index_name = 'pubmedbert-sentence-transformer-400'

## vector store bound to the chosen index, reusing the embeddings defined earlier
elastic_vector_search = ElasticsearchStore(
    index_name=index_name,
    embedding=embeddings,
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
)

## LLM that answers the questions | later this can be overwritten by the user
llm = prepare_llm(HUGGINGFACE_TOKEN, model_id=model_id, use_openai=True)

## where run_config stores the generated answers
save_path = '../data/rag_validation_answers_400.csv'

## keyword arguments for run_config
config_1 = dict(
    index_name=index_name,
    evaluation_dataset_path=QA_VALIDATION_DATASET,
    HUGGINGFACE_TOKEN=HUGGINGFACE_TOKEN,
    HUGGINGFACE_DATASET_NAME=HUGGINGFACE_DATASET_NAME,
    llm=llm,
    QA_VALIDATION_TOKEN=QA_VALIDATION_TOKEN,
    save_path=save_path,
    max_retrieved_docs=3,
)

## Writes a CSV under `save_path` holding, for every question of the validation
## dataset (questions generated with RAGAs from the new dataset), the question
## and the produced result.
## Takes about 20-30 mins on a T4 GPU.
answers = run_config(
    elastic_vector_search=elastic_vector_search,
    use_ensemble_retriever=False,
    verbose=True,
    config_name='new_dataset_400',
    save=True,
    **config_1,
)


In [None]:

## keyword arguments for testset_to_validation
config_2 = dict(
    QA_VALIDATION_TOKEN=QA_VALIDATION_TOKEN,
    QA_VALIDATION_DATASET=QA_VALIDATION_DATASET,
    save_path=save_path,
    save_path_result='../data/validation_400_gpt_3-5-turbo.csv',
)

## Dataset on which the RAGAs metrics can be applied
result_dataset = testset_to_validation(save=True, **config_2)

## score the dataset with all four RAGAs metrics and show the aggregate result
resulted_metrics = evaluate(
    result_dataset,
    metrics=[context_precision, faithfulness, answer_relevancy, context_recall],
)

pprint(resulted_metrics)

### Influence of weight of ensemble retriever in context_precision

As we deal with a medical application, we prefer precision over recall (in the information-retrieval sense).
We analyze this by varying the weight of the BM25 retriever in the ensemble retriever.

In [1]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader

## Load the PubMed abstracts dataset; the `abstract` column becomes the page
## content of each loaded document.
## The access token is taken from HUGGINGFACE_TOKEN (read from the environment in
## the configuration cell) instead of being written into the notebook.
## NOTE(review): the token that used to be hard-coded here is exposed in the
## notebook history and should be revoked.
loader = HuggingFaceDatasetLoader(
    "MaraEliana/pubmed-abstracts",
    use_auth_token=HUGGINGFACE_TOKEN,
    page_content_column='abstract',
)
data = loader.load()



In [4]:
## chat model used to answer the questions (temperature 0)
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

## only the 'train' split of the validation dataset is used
eval_dataset = load_dataset(
    QA_VALIDATION_DATASET,
    token=QA_VALIDATION_TOKEN,
)['train']



In [12]:
index_name = 'pubmedbert-sentence-transformer-400'

elastic_vector_search = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    index_name=index_name,
    embedding=embeddings,
    es_api_key=ELASTIC_API_KEY,
)

def load_ensemble_retriever(index_name, _elastic_vector_search, neuro_weight=0, max_retrieved_docs=20):
    """Build the ensemble retriever for one index.

    The text splitter matching ``index_name`` is looked up with
    ``get_splitter_per_index`` and handed, together with the vector store, to
    ``create_ensemble_retriever``.

    Args:
        index_name: name of the Elasticsearch index the splitter is chosen for.
        _elastic_vector_search: vector store used by the ensemble retriever.
        neuro_weight: forwarded unchanged to ``create_ensemble_retriever``
            (presumably the weight of the dense retriever relative to BM25 --
            TODO confirm in utils). Defaults to 0, the value previously
            hard-coded here.
        max_retrieved_docs: forwarded unchanged to ``create_ensemble_retriever``.
            Defaults to 20, the value previously hard-coded here.

    Returns:
        Whatever ``create_ensemble_retriever`` returns.
    """
    text_splitter = get_splitter_per_index(index_name)
    return create_ensemble_retriever(
        _elastic_vector_search,
        text_splitter,
        neuro_weight=neuro_weight,
        max_retrieved_docs=max_retrieved_docs,
    )

## buffer ensemble retriever for consecutive uses
ensemble_retriever = load_ensemble_retriever(index_name, elastic_vector_search)


Using the latest cached version of the dataset since MaraEliana/pubmed-abstracts couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\priot\.cache\huggingface\datasets\MaraEliana___pubmed-abstracts\default\0.0.0\110dab8c7d5a1e9a2f94f11694acfd43ce4df88e (last modified on Tue Feb 27 17:19:04 2024).


In [13]:
## question-answering chain: retrieved documents are "stuffed" into one prompt;
## verbose logging is enabled on both the outer chain and the inner chain
rag = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=ensemble_retriever,
    chain_type="stuff",
    chain_type_kwargs=dict(verbose=True),
    verbose=True,
)

In [14]:
## upper bound on how many validation questions are sent through the RAG chain
max_questions_to_evaluate = 25

answers = []

## The limit is checked BEFORE calling the chain and with >=, so a limit of 0
## (or less) sends no question to the LLM instead of silently running the
## whole dataset.
for position, example in enumerate(tqdm(eval_dataset, desc="generate RAG answers")):
    if position >= max_questions_to_evaluate:
        break
    answers.append(rag(example['question']))

generate RAG answers:   0%|          | 0/67 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" although artificial intelligence ( ai ) has had a profound impact on areas such as image recognition, comparable advances in drug discovery are rare. this article quantifies the stages of drug discovery in which improvements in the time taken, success rate or affordability will have the most profound overall impact on bringing new drugs to market. changes in clinical success rates will have the most profound impact on improving success in drug discovery ; in other words, the quality of decisions regarding which compound to take forward ( and how to conduct clinical trials ) are more important than speed or cost. although cur

generate RAG answers:   1%|▏         | 1/67 [00:05<06:18,  5.74s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" artificial intelligence ( ai ) is changing healthcare and the practice of medicine as data - driven science and machine - learning technologies, in particular, are contributing to a variety of medical and clinical tasks. such advancements have also raised many questions, especially about public trust. as a response to these concerns there has been a concentrated effort from public bodies, policy - makers and technology companies leading the way in ai to address what is identified as a \ " public trust deficit \ ". this paper argues that a focus 

generate RAG answers:   3%|▎         | 2/67 [00:10<05:37,  5.19s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" an inverse relationship between risk of schizophrenia and premorbid iq is a robust empirical finding. cognitive impairment may be a core feature of schizophrenia in addition to the clinical symptoms that have historically defined the disorder. "

" the authors sought to clarify the relationship between iq and subsequent risk for schizophrenia. "

" although schizophrenia is characterized by impairments in intelligence and the loss of brain volume, the relationship between changes in iq and brain measures is not clear. "

" there is uncertainty a

generate RAG answers:   4%|▍         | 3/67 [00:14<04:44,  4.44s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" following the innovations and new discoveries of the last 10 \ u2009years in the field of lung ultrasound ( lus ), a multidisciplinary panel of international lus experts from six countries and from different fields ( clinical and technical ) reviewed and updated the original international consensus for point - of - care lus, dated 2012. as a result, a total of 20 statements have been produced. each statement is complemented by guidelines and future developments proposals. the statements are furthermore classified based on their nature as technic

generate RAG answers:   6%|▌         | 4/67 [00:20<05:20,  5.09s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" direct pcr can be used to successfully generate full str profiles from dna present on the surface of objects. str profiles are only of use in cases where a potential donor profile is available for comparison, and dna is of sufficient dna quality and quantity to generate a reliable profile. often, no donor information is available and only trace dna is present on items. as a result, alternative techniques are required to generate genetic data that can provide investigative leads. massively parallel sequencing ( mps ) offers the ability to detect 

generate RAG answers:   7%|▋         | 5/67 [00:24<05:07,  4.96s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
facilitate better classification and enable enhanced risk prediction for relevant outcomes. "

" risk calculators are an underused tool for surgeons and trainees when determining and communicating surgical risk. we summarize some of the more common risk calculators and discuss their evolution and limitations. we also describe artificial intelligence models, which have the potential to help clinicians better understand and use risk assessment. "

" previous studies employed varying methods, predictors, and endpoints to determine how to best predict

generate RAG answers:   9%|▉         | 6/67 [00:29<05:01,  4.95s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" the information generated through drug profiling can be used to infer a common source between one or several seizures as well as drug trafficking routes to provide insights into drug markets. although well established, it is time - consuming and ineffective to compare all drug profiles manually. in recent years, there has been a push to automate processes to enable a more efficient comparison of illicit drug specimens. various chemometric methods have been employed to compare and interpret forensic case data promptly. the intelligence that is pr

generate RAG answers:  10%|█         | 7/67 [00:35<05:08,  5.14s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" machine learning has proven useful in analyzing complex biological data and has greatly influenced the course of research in structural biology and precision medicine. deep neural network models oftentimes fail to predict the structure of complex proteins and are heavily dependent on experimentally determined structures for their training and validation. single - particle cryogenic electron microscopy ( cryoem ) is also advancing the understanding of biology and will be needed to complement these models by continuously supplying high - quality e

generate RAG answers:  12%|█▏        | 8/67 [00:39<04:36,  4.69s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" the centers for disease control and prevention ( cdc ) utilizes a blood lead reference value ( blrv ) to identify children with elevated blood lead levels ( blls ). at or above the blrv, the cdc recommends actions be taken to reduce children's blls. in 2021, the cdc updated its blrv to 3. 5 \ u00a0 \ u03bcg / dl. to align with the cdc's updated blrv, the fda is updating its interim reference levels ( irls ) for lead from food to 2. 2 \ u00a0 \ u03bcg / day for children and 8. 8 \ u00a0 \ u03bcg / day for females of childbearing age. the updated 

generate RAG answers:  13%|█▎        | 9/67 [00:43<04:26,  4.59s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" artificial intelligence and algorithms are increasingly able to replace human workers in cognitively sophisticated tasks, including ones related to justice. many governments and international organizations are discussing policies related to the application of algorithmic judges in courts. in this paper, we investigate the public perceptions of algorithmic judges. across two experiments ( n \ u2009 = \ u20091, 822 ), and an internal meta - analysis ( n \ u2009 = \ u20093, 039 ), our results show that even though court users acknowledge several ad

generate RAG answers:  15%|█▍        | 10/67 [00:48<04:23,  4.62s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" in a year when disagreements over scientific matters like covid - 19 continue to occupy political discourse, the surfacing of a spate of high - profile research errors is regrettable. it's crucial that the public trusts science at a time when so many topics - artificial intelligence, climate change, and pandemics - cast shadows of uncertainty on the future. errors, intentional or not, erode confidence in science. it's not surprising that science integrity has become a focal point for major institutions in the united states, from the white house 

generate RAG answers:  16%|█▋        | 11/67 [00:51<04:03,  4.35s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" this short review aims at providing the readers with an update on the current status, as well as future perspectives in the quickly evolving field of radiomics applied to the field of pet / ct imaging. numerous pitfalls have been identified in study design, data acquisition, segmentation, features calculation and modeling by the radiomics community, and these are often the same issues across all image modalities and clinical applications, however some of these are specific to pet / ct ( and spect / ct ) imaging and therefore the present paper fo

generate RAG answers:  18%|█▊        | 12/67 [00:58<04:35,  5.01s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
- 19. communicated by ramaswamy h. sarma. "

" nidra ( sleep ), ahara ( food ) and brahmacharya ( abstinence ) are the three sub - pillars of health and alterations in these basic pillars of health can lead to mortality and morbidity. among these, nidra has a critical role in the biological and psychological functioning of the body. the circadian rhythm is the physiological machinery that controls and regulates physiological activities throughout the 24 hours in conjunction with the day and night. the synchronicity of the circadian rhythm and adeq

generate RAG answers:  19%|█▉        | 13/67 [01:02<04:20,  4.82s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" web - based public reporting by means of dashboards has become an essential tool for governments worldwide to monitor covid - 19 information and communicate it to the public. the actionability of such dashboards is determined by their fitness for purpose - meeting a specific information need - and fitness for use - placing the right information into the right hands at the right time and in a manner that can be understood. "

" natural language processing models such as chatgpt can generate text - based content and are poised to become a major in

generate RAG answers:  21%|██        | 14/67 [01:08<04:22,  4.96s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
factors. to conclude, most of the current literature is probably quite optimistic with internal validation using loo cv. more efforts should be made to encourage the use of external validation with external test sets to further improve generalizability of the models. "

" machine learning models may outperform traditional statistical regression algorithms for predicting clinical outcomes. proper validation of building such models and tuning their underlying algorithms is necessary to avoid over - fitting and poor generalizability, which smaller da

generate RAG answers:  22%|██▏       | 15/67 [01:12<04:09,  4.80s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
quality and lack of data access. this audit required 115 person - hours across 8 - 10 months. our recommendations for performing reliability and fairness audits include verifying data validity, analyzing model performance on intersectional subgroups, and collecting clinician - patient linkages as necessary for label generation by clinicians. those responsible for ai models should require such audits before model deployment and mediate between model auditors and impacted stakeholders. "

" artificial intelligence systems for health care, like any o

generate RAG answers:  24%|██▍       | 16/67 [01:17<04:12,  4.95s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
show efficacy for psychotic symptoms, but adjunctive buspirone may be associated with improvement in extrapyramidal symptoms and cognitive deficits in schizophrenia. due to the preliminary nature of this meta - analysis, larger sample size and higher quality rcts are needed to confirm these finding. "

prolonged post symptomatic viral shredding. common adverse effects of leflunomide were hyperlipidemia, leucopenia, neutropenia and liver - function alteration. leflunomide / teriflunomide may serve as an agent of importance to achieve faster virolog

generate RAG answers:  25%|██▌       | 17/67 [01:21<03:47,  4.56s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" physicians in everyday clinical practice are under pressure to innovate faster than ever because of the rapid, exponential growth in healthcare data. \ " big data \ " refers to extremely large data sets that cannot be analyzed or interpreted using traditional data processing methods. in fact, big data itself is meaningless, but processing it offers the promise of unlocking novel insights and accelerating breakthroughs in medicine - which in turn has the potential to transform current clinical practice. physicians can analyze big data, but at pre

generate RAG answers:  27%|██▋       | 18/67 [01:31<05:06,  6.25s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
over the first test dataset remained the same at 72. 8 % ( f - score 0. 721 ). the accuracy over the combined test datasets was then 72. 4 % ( f - score 0. 720 ), a 2 % improvement. through fine - tuning a machine - learning model on task - specific data, the accuracy achieved in categorizing tweets was close to that expected by a single human annotator. regular training of machine - learning models with recent data is advisable to maximize accuracy. "

" class - prediction accuracy provides a quick but superficial way of determining classifier pe

generate RAG answers:  28%|██▊       | 19/67 [01:35<04:19,  5.41s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" the biological significance of proteins attracted the scientific community in exploring their characteristics. the studies shed light on the interaction patterns and functions of proteins in a living body. due to their practical difficulties, reliable experimental techniques pave the way for introducing computational methods in the interaction prediction. automated methods reduced the difficulties but could not yet replace experimental studies as the field is still evolving. interaction prediction problem being critical needs highly accurate res

generate RAG answers:  30%|██▉       | 20/67 [01:40<04:16,  5.47s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" several factors are involved in obtaining the competence of providing spiritual care in nursing students. the purpose of this study was to explain the relationship between moral intelligence and the professional self - concept with the competency of the nursing students in providing spiritual care to promote nursing education. "

" clinical competency is one of the most important requirements in nursing profession, based on which nurses are assessed. to obtain an effective and improved form of clinical competency, several factors are observed an

generate RAG answers:  31%|███▏      | 21/67 [01:45<03:59,  5.21s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" several factors are involved in obtaining the competence of providing spiritual care in nursing students. the purpose of this study was to explain the relationship between moral intelligence and the professional self - concept with the competency of the nursing students in providing spiritual care to promote nursing education. "

" the aim of this study was to identify the relationship between perceived competence in spiritual care and spiritual intelligence among nursing students. "

" some nurses leave their job because of working conditions, 

generate RAG answers:  33%|███▎      | 22/67 [01:49<03:46,  5.03s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" the recognition of child physical abuse can be challenging and often requires a multidisciplinary assessment. deep learning models, based on clinical characteristics, laboratory studies, and imaging findings, were developed to facilitate unbiased identification of children who may have been abused. "

" child maltreatment remains a serious public health issue in the united states. therefore, it is important to engage in quality control of the assessment, prevention, and treatment services for families affected by maltreatment. parenting capacity

generate RAG answers:  34%|███▍      | 23/67 [01:54<03:41,  5.03s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" improving the rate of polyp detection is an important measure to prevent colorectal cancer ( crc ). real - time automatic polyp detection systems, through deep learning methods, can learn and perform specific endoscopic tasks previously performed by endoscopists. the purpose of this study was to explore whether a high - performance, real - time automatic polyp detection system could improve the polyp detection rate ( pdr ) in the actual clinical environment. "

" artificial intelligence ( ai ) for polyp detection is being introduced to colonosco

generate RAG answers:  36%|███▌      | 24/67 [02:00<03:39,  5.11s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
" improving the rate of polyp detection is an important measure to prevent colorectal cancer ( crc ). real - time automatic polyp detection systems, through deep learning methods, can learn and perform specific endoscopic tasks previously performed by endoscopists. the purpose of this study was to explore whether a high - performance, real - time automatic polyp detection system could improve the polyp detection rate ( pdr ) in the actual clinical environment. "

" artificial intelligence ( ai ) tools aimed at improving polyp detection have been s

generate RAG answers:  36%|███▌      | 24/67 [02:05<03:44,  5.23s/it]


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m





In [15]:
# Column names produced by the merge -> column names used for the ragas evaluation.
# (The 'ground_truth' -> 'ground_truths' rename is currently disabled.)
# NOTE(review): 'contexts' is filled from the dataset's ground_truth_context column,
# not from documents returned by the retrieval chain - confirm that this is what
# the context metric is supposed to score.
columns_mapping = {
    'question': 'question',
    'result': 'answer',
    'ground_truth_context': 'contexts',
}

# Pair each generated answer with its validation row (answer 'query' == dataset
# 'question'), drop the columns that are no longer needed, then apply the renames.
answers_df = pd.DataFrame(answers)
result_df = (
    answers_df
    .merge(eval_dataset.to_pandas(), how='inner', left_on='query', right_on='question')
    .drop(columns=['query', 'question_type', 'episode_done'])
    .rename(columns=columns_mapping)
)

# Wrap every context value in a one-element list before building the Dataset.
result_df['contexts'] = result_df['contexts'].apply(lambda context: [context])
result_df_dataset = Dataset.from_pandas(result_df)

In [16]:
# Score the prepared dataset with ragas; context_precision is the only metric requested.
resulted_metrics = evaluate(result_df_dataset, metrics=[context_precision])

pprint(resulted_metrics)

Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

{'context_precision': 0.9200}


### For Mara

Using Ollama, generate a larger validation dataset of 5000 items (set `TEST_SET_SIZE` to 5000).
Replace `ChatOpenAI` with the Llama model in the `generator_llm` and `critic_llm` definitions (lines 15-16 of the generation cell).
Save the resulting CSV locally and send it to me so I can upload it to Hugging Face.

In [None]:
# Size of the generated validation set: used both as the number of items sampled
# from `data` and as the `test_size` passed to the test-set generator.
TEST_SET_SIZE = 100

In [None]:
# %pip install ragas 
from ragas.testset import TestsetGenerator
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from ragas.llms import LangchainLLM
import random
# Generate a synthetic QA test set with ragas. Wrapping custom LLMs is described at
# https://docs.ragas.io/en/latest/howtos/customisations/llms.html
sub_data = random.sample(data, TEST_SET_SIZE)  # unseeded: a different subset of `data` on every run

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
print("OPENAI_API_KEY set:", bool(OPENAI_API_KEY))  # report presence only; printing the key would store the secret in the saved cell output

# Add custom llms and embeddings
generator_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY))
critic_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)) ## should be gpt-4 but we dont have access
embeddings_model = embeddings  # reuse the embeddings object created earlier in the notebook

# Change resulting question type distribution (the four weights sum to 1.0)
testset_distribution = {
    "simple": 0.25,
    "reasoning": 0.25,
    "multi_context": 0.25,
    "conditional": 0.25,
}

# share of conversational questions (0.1 = 10%)
chat_qa = 0.1


test_generator = TestsetGenerator(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings_model=embeddings_model,
    testset_distribution=testset_distribution,
    chat_qa=chat_qa,
)

# test_size is tied to TEST_SET_SIZE, the same value used to size the sample above.
# NOTE(review): presumably test_size is the number of items to generate - confirm
# against the ragas TestsetGenerator documentation.
testset = test_generator.generate(sub_data, test_size=TEST_SET_SIZE)