In [34]:
import os
from tqdm import tqdm
from datasets import load_dataset
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain_community.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from opensearch_utils import *
from huggingface_utils import *
from elasticsearch import Elasticsearch
from getpass import getpass

from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
HUGGINGFACE_USERNAME = os.getenv('HUGGINGFACE_USERNAME')
HUGGINGFACE_DATASET_NAME = os.getenv('HUGGINGFACE_DATASET_NAME')
ELASTIC_CLOUD_ID = os.getenv('ELASTIC_CLOUD_ID')
ELASTIC_API_KEY = os.getenv('ELASTIC_API_KEY')

model_name = "NeuML/pubmedbert-base-embeddings"

from langchain_community.embeddings import HuggingFaceEmbeddings

device = 'cpu' # make sure you are on gpu
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs={'device': device}
)

In [2]:
index_name = 'qa_project_pubmedbert-50' # previously index="test_pubmed_split"
elastic_vector_search = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    index_name=index_name,
    embedding=embeddings,
    es_api_key=ELASTIC_API_KEY,
)

In [3]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader

loader = HuggingFaceDatasetLoader(HUGGINGFACE_DATASET_NAME,use_auth_token=HUGGINGFACE_TOKEN,page_content_column='Abstract')
data = loader.load()



In [4]:
from pprint import pprint
pprint(data[77].metadata.keys())

dict_keys(['Journal', 'Title', 'Authors', 'Author_Information', 'DOI', 'Misc', 'Published_date'])


In [57]:
query = "What is the role of artificial intelligence in nephrology?"
results = elastic_vector_search.similarity_search(query,k=50)


titles_elastic = [res.metadata["Title"] for res in results]
for res in results:
    print(res.metadata['Title'])

Artificial Intelligence in Nephrology Core Concepts Clinical Applications and Perspectives
Artificial Intelligence in Kidney Cancer
Kidney cancer management 3 0 can artificial intelligence make us better
Artificial Intelligence in Pediatric Nephrology A Call for Action
Artificial intelligence and machine learning in nephropathology
The promise of artificial intelligence for kidney pathophysiology
Artificial intelligence the future of urinary stone management
Time for a full digital approach in nephropathology a systematic review of current artificial intelligence applications and future directions
Artificial intelligence approaches to improve kidney care
Machine learning in medicine Medical droids tricorders and a computer named Hal 9000
Overcoming barriers to implementation of artificial intelligence in gastroenterology
Artificial intelligence enabled decision support in nephrology
Artificial intelligence in the diagnosis treatment and prevention of urinary stones
Artificial intellige

In [24]:
# %pip install pymed

from pymed import PubMed

pubmed = PubMed(tool="MyTool", email='test')

results = pubmed.query('artificial intelligence in nephrology', max_results=20)

## see: https://github.com/gijswobben/pymed/blob/master/pymed/article.py
'''
these dont match with the ones I see in the browser
'''

for i,res in enumerate(results):
    print(i,") ",res.title)

In [50]:
# %pip install biopython
from Bio import Entrez

def search(query):
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='150',
                            retmode='xml',
                            term=query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

results=search('What is the role of artificial intelligence in nephrology?')
id_list = results['IdList']
papers = fetch_details(id_list)

titles_bio = [paper['MedlineCitation']['Article']['ArticleTitle'] for paper in papers['PubmedArticle']]
for i, paper in enumerate(papers['PubmedArticle']):
         print("{}) {}".format(i+1, paper['MedlineCitation']['Article']['ArticleTitle']))


1) Artificial Intelligence in Nephrology: Core Concepts, Clinical Applications, and Perspectives.
2) Intradialytic Hypotension: Mechanisms and Outcome.
3) Artificial intelligence-enabled decision support in nephrology.
4) Role of Artificial Intelligence in Kidney Disease.
5) A Review of the Role of Artificial Intelligence in Healthcare.
6) Exploring the Potential of Chatbots in Critical Care Nephrology.
7) Artificial Intelligence in Pediatric Nephrology-A Call for Action.
8) Artificial intelligence at the time of COVID-19: who does the lion's share?
9) Toward generalizing the use of artificial intelligence in nephrology and kidney transplantation.
10) Artificial Intelligence in Nephrology: How Can Artificial Intelligence Augment Nephrologists' Intelligence?
11) Identification of Markers for Diagnosis and Treatment of Diabetic Kidney Disease Based on the Ferroptosis and Immune.
12) Lipophagy deficiency exacerbates ectopic lipid accumulation and tubular cells injury in diabetic nephropat

In [58]:
## check overlap between my results and the ones from the browser
## nu au punctuatie unele titluri

import re
## strip punctuation and lowercase
titles_elastic = [re.sub(r'[^\w\s]', '', title.lower()) for title in titles_elastic]
titles_bio = [re.sub(r'[^\w\s]', '', title.lower()) for title in titles_bio]

common_articles = set(titles_elastic).intersection(set(titles_bio))
common_articles

'''
CONCLUZIE:

rezultatele de pe pubmed prin api sunt destul de diferite de cele de la noi si e de asteptat nu putem sa folosim 
pe alea pentru evaluare ca baseline
'''

{'artificial intelligence in nephrology core concepts clinical applications and perspectives',
 'artificial intelligence in surgical training for kidney cancer a systematic review of the literature'}

In [9]:
# %pip install xmltodict
# https://python.langchain.com/docs/integrations/document_loaders/pubmed
# https://docs.ragas.io/en/stable/concepts/testset_generation.html#testset-generation

from langchain_community.document_loaders import PubMedLoader

loader = PubMedLoader('intelligence in nephrology',load_max_docs=50)
docs = loader.load()

for doc in docs:
    print(doc.metadata['Title'])

Older Tissue Age Derived From Abdominal Computed Tomography Biomarkers of Muscle, Fat, and Bone Is Associated With Chronic Conditions and Higher Mortality.
Reducing echocardiographic examination time through routine use of fully automated software: a comparative study of measurement and report creation time.
Echocardiographic artificial intelligence for pulmonary hypertension classification.
Artificial intelligence and Machine Learning Trends in Kidney Care.
Artificial Intelligence Electrocardiography Detecting Thyrotoxic Periodic Paralysis Following a SARS-CoV-2 Infection.
Reducing Barriers and Improving Motivations of Students Enrolled in Summer Health Professions Exposure Programs.
Impact of metformin on cardiovascular and kidney outcome based on kidney function status in type 2 diabetic patients: a multicentric, retrospective cohort study.
Efficacy and safety of mTOR inhibition in cutaneous sarcoidosis: a single-centre trial.
The multilevel extensive diversity across the cynomolgus

In [22]:
## We currently do not support Windows...
## try https://docs.vllm.ai/en/latest/serving/deploying_with_docker.html
%pip install vllm

Collecting vllm
  Using cached vllm-0.3.0.tar.gz (264 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [28 lines of output]
        device: torch.device = torch.device(torch._C._get_default_device()),  # torch.device('cpu'),
      Traceback (most recent call last):
        File "c:\Users\priot\anaconda3\envs\computing\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 353, in <module>
          main()
        File "c:\Users\priot\anaconda3\envs\computing\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 335, in main
          json_out['return_val'] = hook(**hook_input['kwargs'])
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "c:\Users\priot\anaconda3\envs\computing\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 118, in get_requires_for_build_wheel
          return hook(config_settings)
                 ^^^^^^^^^^^^^^^^^^^^^
        File "C

In [18]:
from langchain_community.llms import VLLMOpenAI
'''
Run this to open server before running this cell:

!python -m vllm.entrypoints.openai.api_server --model HuggingFaceH4/zephyr-7b-alpha --host 0.0.0.0 --port 8080

'''
# https://python.langchain.com/docs/integrations/llms/vllm

llm = VLLMOpenAI(
    openai_api_key="EMPTY",
    openai_api_base="http://localhost:8000/v1",
    model_name="tiiuae/falcon-7b",
    model_kwargs={"stop": ["."]},
)
print(llm.invoke("Rome is"))

APIConnectionError: Connection error.

In [37]:
# %pip install ragas 
from ragas.testset import TestsetGenerator
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from ragas.llms import LangchainLLM
#https://docs.ragas.io/en/latest/howtos/customisations/llms.html

sub_data = data[:5]

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
print(OPENAI_API_KEY)

# Add custom llms and embeddings
generator_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY))
critic_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-4", openai_api_key=OPENAI_API_KEY))
embeddings_model = embeddings

# Change resulting question type distribution
testset_distribution = {
    "simple": 0.25,
    "reasoning": 0.5,
    "multi_context": 0.0,
    "conditional": 0.25,
}

# percentage of conversational question
chat_qa = 0.2


test_generator = TestsetGenerator(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings_model=embeddings_model,
    testset_distribution=testset_distribution,
    chat_qa=chat_qa,
)

testset = test_generator.generate(sub_data, test_size=5)

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




RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}