In [132]:
import os
from tqdm import tqdm
from datasets import load_dataset
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain_community.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from elasticsearch import Elasticsearch
from getpass import getpass
from utils import *
from dotenv import load_dotenv
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

from ragas import evaluate


# Read the variables defined in a local .env file into the process environment.
load_dotenv()

# Hugging Face account settings.
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
HUGGINGFACE_USERNAME = os.environ.get('HUGGINGFACE_USERNAME')
HUGGINGFACE_DATASET_NAME = os.environ.get('HUGGINGFACE_DATASET_NAME')

# Elastic Cloud connection settings.
ELASTIC_CLOUD_ID = os.environ.get('ELASTIC_CLOUD_ID')
ELASTIC_API_KEY = os.environ.get('ELASTIC_API_KEY')

# Validation dataset settings.
QA_VALIDATION_DATASET = os.environ.get('QA_VALIDATION_DATASET')
QA_VALIDATION_TOKEN = os.environ.get('QA_VALIDATION_TOKEN')

# Identifier of the embedding model used for the vector store.
model_name = "NeuML/pubmedbert-base-embeddings"

from langchain_community.embeddings import HuggingFaceEmbeddings
from pprint import pprint

# Validation dataset (without RAG answers): https://huggingface.co/datasets/prio7777777/pubmed-qa-validation

In [None]:
# End-to-end validation example: build the embeddings and vector store, generate
# RAG answers for the validation questions, turn them into a dataset, and score
# that dataset with the RAGAs metrics. The steps must run in this order because
# the answers CSV written by the first step is passed on to the second.
'''
Example on how to run a validation for a given configuration
NOTE: this has not been tested holistically, but the code should work
'''

## First define the embeddings used by the vector store.

# Re-assigns the same model identifier that the first cell already sets.
model_name = "NeuML/pubmedbert-base-embeddings"
# NOTE(review): hard-coded to the first CUDA GPU; presumably this fails on a
# CPU-only machine — confirm, or make the device configurable.
device = 'cuda:0'

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs={'device': device}
)

## Define which index to use and instantiate the vector store.

index_name = 'qa_project_pubmedbert-400-new-dataset'

elastic_vector_search = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    index_name=index_name,
    embedding=embeddings,
    es_api_key=ELASTIC_API_KEY,
)

## Define the LLM to use | later this can be overwritten by the user.
## `prepare_llm` is not defined in this notebook; presumably it comes from the
## `from utils import *` star import — verify.
llm = prepare_llm(HUGGINGFACE_TOKEN)

## Create the configuration for the run_config function.
save_path = 'rag_validation_answers_400.csv'

config_1 = {
    "index_name": index_name,
    'evaluation_dataset_path': QA_VALIDATION_DATASET,
    'HUGGINGFACE_TOKEN': HUGGINGFACE_TOKEN,
    'HUGGINGFACE_DATASET_NAME': HUGGINGFACE_DATASET_NAME,
    'llm': llm,
    'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
    'save_path': save_path
}

## This will save the results under the given path as a CSV file.
## The file will contain the question and the result for each question in the
## validation dataset (questions generated with RAGAs from the new dataset).
## Takes about 20-30 mins on a T4 GPU.
## `run_config` is presumably provided by the `utils` star import — verify.
answers = run_config(elastic_vector_search=elastic_vector_search,
                     use_ensemble_retriever=False,
                     verbose=True,
                     config_name='new_dataset_400',
                     save=True,
                     **config_1)

# NOTE(review): the key 'A_VALIDATION_DATASET' looks like a typo for
# 'QA_VALIDATION_DATASET'. These keys are expanded into keyword arguments below,
# so check them against the parameter names of `testset_to_validation`.
config_2 = {
    'QA_VALIDATION_TOKEN': QA_VALIDATION_TOKEN,
    'A_VALIDATION_DATASET': QA_VALIDATION_DATASET,
    'save_path_answers': save_path,
    'save_path_result': 'validation_400.csv' 
}

## This is a Dataset on which the RAGAs metrics can be applied.
result_dataset = testset_to_validation(save=True,**config_2)

## Compute the RAGAs metrics on the resulting dataset.
resulted_metrics = evaluate(
    result_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
)

pprint(resulted_metrics)

In [31]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader

# Load the PubMed abstracts dataset, using each row's `abstract` column as the
# document text.
#
# The access token is taken from HUGGINGFACE_TOKEN (read from the environment in
# the first cell) instead of being hard-coded, so the secret never lands in the
# notebook. The token that used to be embedded here was exposed and should be
# revoked. The token in the environment needs read access to this dataset.
loader = HuggingFaceDatasetLoader(
    "MaraEliana/pubmed-abstracts",
    use_auth_token=HUGGINGFACE_TOKEN,
    page_content_column='abstract',
)
data = loader.load()

In [32]:
# Show how many documents were loaded and preview only the first few; displaying
# the whole list writes every abstract into the notebook output.
print(f"{len(data)} documents loaded")
data[:3]

[Document(page_content='"A meta-analysis of 63 studies showed a significant negative association between intelligence and religiosity. The association was stronger for college students and the general population than for participants younger than college age; it was also stronger for religious beliefs than religious behavior. For college students and the general population, means of weighted and unweighted correlations between intelligence and the strength of religious beliefs ranged from -.20 to -.25 (mean r = -.24). Three possible interpretations were discussed. First, intelligent people are less likely to conform and, thus, are more likely to resist religious dogma. Second, intelligent people tend to adopt an analytic (as opposed to intuitive) thinking style, which has been shown to undermine religious beliefs. Third, several functions of religiosity, including compensatory control, self-regulation, self-enhancement, and secure attachment, are also conferred by intelligence. Intelli

In [33]:
from pprint import pprint
# List the metadata field names attached to one loaded document
# (index 77 appears to be an arbitrary pick).
pprint(data[77].metadata.keys())

dict_keys(['title', 'publication_date', 'id', 'authors'])


In [57]:
# Retrieve the 50 nearest documents for a sample question from the vector store.
query = "What is the role of artificial intelligence in nephrology?"
results = elastic_vector_search.similarity_search(query, k=50)

# Keep the titles of the hits, in result order, in `titles_elastic`.
titles_elastic = list(map(lambda hit: hit.metadata["Title"], results))

# Print one title per line.
for res in results:
    print(res.metadata['Title'])

Artificial Intelligence in Nephrology Core Concepts Clinical Applications and Perspectives
Artificial Intelligence in Kidney Cancer
Kidney cancer management 3 0 can artificial intelligence make us better
Artificial Intelligence in Pediatric Nephrology A Call for Action
Artificial intelligence and machine learning in nephropathology
The promise of artificial intelligence for kidney pathophysiology
Artificial intelligence the future of urinary stone management
Time for a full digital approach in nephropathology a systematic review of current artificial intelligence applications and future directions
Artificial intelligence approaches to improve kidney care
Machine learning in medicine Medical droids tricorders and a computer named Hal 9000
Overcoming barriers to implementation of artificial intelligence in gastroenterology
Artificial intelligence enabled decision support in nephrology
Artificial intelligence in the diagnosis treatment and prevention of urinary stones
Artificial intellige

In [22]:
## vLLM does not currently support Windows: the install below fails there while
## building the wheel (see the recorded error output).
## On Windows, try the Docker deployment instead:
## https://docs.vllm.ai/en/latest/serving/deploying_with_docker.html
%pip install vllm

Collecting vllm
  Using cached vllm-0.3.0.tar.gz (264 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [28 lines of output]
        device: torch.device = torch.device(torch._C._get_default_device()),  # torch.device('cpu'),
      Traceback (most recent call last):
        File "c:\Users\priot\anaconda3\envs\computing\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 353, in <module>
          main()
        File "c:\Users\priot\anaconda3\envs\computing\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 335, in main
          json_out['return_val'] = hook(**hook_input['kwargs'])
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "c:\Users\priot\anaconda3\envs\computing\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 118, in get_requires_for_build_wheel
          return hook(config_settings)
                 ^^^^^^^^^^^^^^^^^^^^^
        File "C

In [18]:
from langchain_community.llms import VLLMOpenAI

# Client for a locally running vLLM OpenAI-compatible server.
#
# Start the server before running this cell:
#
#   !python -m vllm.entrypoints.openai.api_server --model HuggingFaceH4/zephyr-7b-alpha --host 0.0.0.0 --port 8080
#
# The client must use the same port and model as that command. Previously it
# pointed at port 8000 and requested "tiiuae/falcon-7b", which cannot match the
# server started above. Both values are defined once here so the two stay in sync;
# if you start the server with different settings, change these constants too.
#
# https://python.langchain.com/docs/integrations/llms/vllm
VLLM_PORT = 8080
VLLM_MODEL = "HuggingFaceH4/zephyr-7b-alpha"

llm = VLLMOpenAI(
    openai_api_key="EMPTY",  # placeholder value, as in the LangChain vLLM example
    openai_api_base=f"http://localhost:{VLLM_PORT}/v1",
    model_name=VLLM_MODEL,
    model_kwargs={"stop": ["."]},  # stop generating at the first period
)
print(llm.invoke("Rome is"))

APIConnectionError: Connection error.

In [34]:
# Preview only the first few documents; displaying the whole list writes every
# abstract into the notebook output.
data[:3]

[Document(page_content='"A meta-analysis of 63 studies showed a significant negative association between intelligence and religiosity. The association was stronger for college students and the general population than for participants younger than college age; it was also stronger for religious beliefs than religious behavior. For college students and the general population, means of weighted and unweighted correlations between intelligence and the strength of religious beliefs ranged from -.20 to -.25 (mean r = -.24). Three possible interpretations were discussed. First, intelligent people are less likely to conform and, thus, are more likely to resist religious dogma. Second, intelligent people tend to adopt an analytic (as opposed to intuitive) thinking style, which has been shown to undermine religious beliefs. Third, several functions of religiosity, including compensatory control, self-regulation, self-enhancement, and secure attachment, are also conferred by intelligence. Intelli

In [36]:
# Number of documents sampled for test-set generation; the same value is passed
# to the generator as the requested test size.
TEST_SET_SIZE = 100

In [37]:
# %pip install ragas
from ragas.testset import TestsetGenerator
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from ragas.llms import LangchainLLM
import random
# https://docs.ragas.io/en/latest/howtos/customisations/llms.html

# Draw the documents with a dedicated, seeded generator so the same subset is
# picked on every run, and never ask for more documents than are available
# (random.sample raises ValueError in that case).
SAMPLE_SEED = 42
sub_data = random.Random(SAMPLE_SEED).sample(data, min(TEST_SET_SIZE, len(data)))

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Do not print the key: cell output is saved with the notebook and would leak it.
# Only check that it is present.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in .env "
        "before generating the test set."
    )

# Add custom LLMs and embeddings.
generator_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY))
# The critic should be gpt-4, but we do not have access to it.
critic_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY))
# Reuses the `embeddings` object created in the validation example cell.
embeddings_model = embeddings

# Change the resulting question type distribution.
testset_distribution = {
    "simple": 0.25,
    "reasoning": 0.25,
    "multi_context": 0.25,
    "conditional": 0.25,
}

# Fraction of conversational questions.
chat_qa = 0.1


test_generator = TestsetGenerator(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings_model=embeddings_model,
    testset_distribution=testset_distribution,
    chat_qa=chat_qa,
)

# `test_size` is the number of questions requested. The recorded run returned
# fewer rows than TEST_SET_SIZE, so do not rely on getting exactly that many.
testset = test_generator.generate(sub_data, test_size=TEST_SET_SIZE)

[output redacted: this cell printed the OpenAI API key; the exposed key must be revoked]


2080it [29:34,  3.09it/s]

In [38]:
# Convert the generated test set into a pandas DataFrame and display it.
test_df = testset.to_pandas()
test_df


Unnamed: 0,question,ground_truth_context,ground_truth,question_type,episode_done
0,How does the success rate of AI-designed compo...,"[- ""The AI-designed compounds met 9.5 objectiv...",[The success rate of AI-designed compounds in ...,conditional,True
1,What is the recommended approach for building ...,"[- ""This paper argues that a focus on trust as...",[The recommended approach for building a relat...,conditional,True
2,What is the relationship between risk of schiz...,"[- ""An inverse relationship between risk of sc...",[The relationship between risk of schizophreni...,simple,True
3,"According to international guidelines, what ar...","[- ""This can largely replace the previous radi...","[According to the given context, the two recom...",reasoning,True
4,In what ways does direct PCR coupled with MPS ...,"[- ""Direct PCR can be used to successfully gen...",[Direct PCR coupled with MPS enhances forensic...,conditional,True
...,...,...,...,...,...
62,What are the possible roles of AI in neurology?,"[- ""The present times are witness to artificia...",[The possible roles of AI in neurology include...,simple,True
63,What were the predictors of job stress in midw...,"[- ""Results showed that the most predictors of...",[The predictors of job stress in midwives were...,reasoning,True
64,Can AI help estimate interrupted time-series d...,"[- ""The interrupted time-series (ITS) concept ...","[Yes, AI can help estimate interrupted time-se...",conditional,True
65,What ethical concerns arise from the developme...,[- Precision medicine development is driven by...,[The answer to the question cannot be determin...,conditional,True


In [39]:
# Write the generated test set to disk, without the DataFrame index column.
test_df.to_csv('testset.csv',index=False)

# Row position of the example inspected in the following cells.
index = 4

In [40]:
from pprint import pprint

# Pretty-print the selected row's question first, then its ground-truth answer.
for column in ('question', 'ground_truth'):
    pprint(test_df.iloc[index][column])


('In what ways does direct PCR coupled with MPS enhance forensic analysis by '
 'producing SNP data from small amounts of DNA on different surfaces?')
['Direct PCR coupled with MPS enhances forensic analysis by allowing the '
 'generation of SNP data from small amounts of DNA on different surfaces. This '
 'method can successfully generate full STR profiles from DNA present on the '
 'surface of objects. MPS, or massively parallel sequencing, can detect trace '
 'levels of DNA and improve the success of DNA analysis from touched items. In '
 'this study, direct PCR coupled with MPS was used to generate forensic '
 'intelligence SNP data from latent DNA. The study analyzed 60 touched samples '
 'across different substrates (glass slide, fuse, zip-lock bag, and wire) and '
 'donors using the Ion AmpliSeq Library Preparation Kit and Ion Torrent PGM. '
 'The results demonstrate the successful recovery of SNPs, concordance with '
 'reference samples, and genotype reproducibility from differ

In [41]:
## First ground-truth context entry of the selected row; the relevant sentences
## inside it are separated by "\n".
pprint(test_df.iloc[index]['ground_truth_context'][0])

('- "Direct PCR can be used to successfully generate full STR profiles from '
 'DNA present on the surface of objects."\n'
 '- "Massively parallel sequencing (MPS) offers the ability to detect trace '
 'levels of DNA and improve DNA analysis success from touched items."\n'
 '- "Here, we present the first application of direct PCR coupled with MPS to '
 'generate forensic intelligence SNP data from latent DNA."\n'
 '- "The panels assessed are (1) the HIrisplex System that targets 24 SNPs to '
 'simultaneously predict hair and eye, and (2) the Precision ID Ancestry Panel '
 'that targets 165 autosomal SNPs indicative of biogeographic ancestry."\n'
 '- "For each panel, we analysed 60 touched samples across five individuals '
 'and four substrates (glass slide, fuse, zip-lock bag and wire) using Ion '
 'AmpliSeq Library Preparation Kit on the automated Ion Chef System and Ion '
 'Torrent PGM."\n'
 '- "We examine the SNP recovery, concordance with reference samples and the '
 'genotype repr