In [1]:
%pip install python-dotenv
%pip install litellm
%pip install -qU trulens_eval pydantic fastapi kaleido python-multipart uvicorn cohere openai tiktoken "llama-index"
%pip install transformers
%pip install sentence-transformers
%pip install pinecone-client
%pip install datasets
%pip install accelerate
%pip install einops
%pip install langchain
%pip install xformers
%pip install bitsandbytes
%pip install matplotlib seaborn tqdm
%pip install chromadb
%pip install evaluate
%pip install rouge_score
%pip install bert_score
%pip install rank_bm25

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting litellm
  Downloading litellm-1.28.11-py3-none-any.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=1.0.0 (from litellm)
  Downloading openai-1.13.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting tiktoken>=0.4.0 (from litellm)
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m86.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai>=1.0.0->litellm)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import os
from dotenv import load_dotenv

# load environment variables from .env file
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [4]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

## Data Preparation

In [5]:
import json

file_path = "pubmed_intelligence.json"
with open(file_path, "r", encoding="utf-8") as file:
    docs = json.load(file)

len(docs)

58730

In [6]:
from transformers import LlamaTokenizer

hf_auth = os.environ.get('HF_AUTH')
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf",use_auth_token=hf_auth)



In [None]:
tokenizer.save_pretrained('./tokenizer')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/tokenizer.model',
 './tokenizer/added_tokens.json')

In [7]:
def token_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)

In [7]:
token_counts = [token_len(doc['Abstract']) for doc in docs]
min_tokens=min(token_counts)
avg_tokens=int(sum(token_counts) / len(token_counts))
max_tokens=max(token_counts)

print(f"""Min: {min_tokens}
Avg: {avg_tokens}
Max: {max_tokens}""")

Min: 1
Avg: 365
Max: 18575


In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    length_function=token_len,
    separators=['\n\n', '\n', ' ', '']
)

In [9]:
from tqdm.auto import tqdm

documents = []

for doc in tqdm(docs):
    uid = doc['PMID']
    chunks = text_splitter.split_text(doc['Abstract'])
    for i, chunk in enumerate(chunks):
        documents.append({
            'id': f'{uid}-{i}',
            'text': chunk,
            'source': doc
        })
len(documents)

  0%|          | 0/58730 [00:00<?, ?it/s]

258608

In [15]:
import pandas as pd
data = pd.DataFrame(documents)
data.head()
data.to_excel('documents.xlsx', index=False)

In [9]:
import pandas as pd
data = pd.read_excel('documents.xlsx')

## Document Retriever


In [1]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os
import pinecone
from tqdm import tqdm

In [2]:
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'
device = 'cuda:0'

embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Semantic Search

In [23]:
data['text'] = data['text'].astype(str)

In [24]:
from langchain.vectorstores import Chroma
vectordb = Chroma.from_texts(texts=list(data['text']), embedding=embed_model, persist_directory="chroma_db")

In [26]:
query = 'What are the three successive steps involved in the novel mass detection process described for identifying breast abnormalities on mammographic images, according to the paper?'

vectordb.similarity_search(
    query,  # the search query
    k=5,  # returns top 3 most relevant chunks of text
)

[Document(page_content='Female breast cancer is the second most common cancer in the world. Several efforts in artificial intelligence have been made to help improving the diagnostic accuracy at earlier stages. However, the identification of breast abnormalities, like masses, on mammographic images is not a trivial task, especially for dense breasts. In this paper we describe our novel mass detection process that includes three successive steps of enhancement, characterization and classification. The'),
 Document(page_content='for imaging the breast. In the first half of the 20th century, the diagnosis was in practice only clinical, with consequent diagnostic delay and an unfavorable prognosis in the short term. The rise of organized mammography screening has led to a remarkable reduction in mortality through the early detection of breast malignancies. This historical review aims to offer a complete panorama of the development of mammography and breast'),
 Document(page_content='PURPOS

### lexicographical search

In [27]:
# facet search
author = 'Hamrouni K'
start_date = '2013/01/01'
end_date = '2023/01/01'

# filtered_docs = [doc['Abstract'] for doc in docs if (author in doc['Authors']) and (doc['ArticleDate']>start_date and doc['ArticleDate']<end_date)]
filtered_docs = [doc['Abstract'] for doc in docs if (doc['ArticleDate']>start_date and doc['ArticleDate']<end_date)]
print(len(filtered_docs))

43093


In [28]:
from langchain.retrievers import BM25Retriever
bm25_retriever = BM25Retriever.from_texts(filtered_docs)

In [29]:
bm25_retriever.get_relevant_documents(query)

[Document(page_content='Female breast cancer is the second most common cancer in the world. Several efforts in artificial intelligence have been made to help improving the diagnostic accuracy at earlier stages. However, the identification of breast abnormalities, like masses, on mammographic images is not a trivial task, especially for dense breasts. In this paper we describe our novel mass detection process that includes three successive steps of enhancement, characterization and classification. The proposed enhancement system is based mainly on the analysis of the breast texture. First of all, a filtering step with morphological operators and soft thresholding is achieved. Then, we remove from the filtered breast region, all the details that may interfere with the eventual masses, including pectoral muscle and galactophorous tree. The pixels belonging to this tree will be interpolated and replaced by the average of the neighborhood. In the characterization process, measurement of the

### combine semantic search and lexicographical search

In [30]:
from langchain.retrievers import EnsembleRetriever
faiss_retriever = vectordb.as_retriever(search_kwargs={"k":5,})

# combine lexicographical search and semantic search
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever,faiss_retriever],
                                       weights=[0.5,0.5])


## Text Generation Pipeline


In [31]:
from torch import cuda, bfloat16
import os
import transformers
model_id = 'meta-llama/Llama-2-13b-chat-hf'

In [32]:
bitsAndBites_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [33]:
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=os.environ.get('HF_AUTH')
)



config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

In [34]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bitsAndBites_config,
    device_map='auto',
    token=os.environ.get('HF_AUTH')
)
model.eval()
print(f"Model loaded ")

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded 


In [None]:
model.save_pretrained('./model')

You can even check the memory footprint of your model using the `get_memory_footprint` method.


In [35]:
model.get_memory_footprint()

7100747776

In [36]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,
    task='text-generation',
    temperature=0.01,
    max_new_tokens=512,
    repetition_penalty=1.1
)

In [37]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

In [38]:
query = 'What are the three successive steps involved in the novel mass detection process described for identifying breast abnormalities on mammographic images, according to the paper?'
llm(prompt=query)

  warn_deprecated(


'\n\nAnswer: The three successive steps involved in the novel mass detection process described for identifying breast abnormalities on mammographic images, according to the paper, are:\n\n1. Pre-processing: This step involves correcting for image artifacts and enhancing the contrast of the mammographic images to improve the visibility of any potential abnormalities.\n2. Feature extraction: This step involves extracting relevant features from the pre-processed mammographic images, such as shape, size, and margin, that can be used to distinguish between normal and abnormal tissue.\n3. Classification: This step involves using a machine learning algorithm to classify each feature extractor as either normal or abnormal based on the extracted features and the known characteristics of benign and malignant lesions.'

## Question Answering Chain


In [39]:
from langchain.chains import RetrievalQA

rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    verbose=True,
    retriever=ensemble_retriever,
    chain_type_kwargs={
        "verbose": True },
)

In [40]:
query = 'What are the three successive steps involved in the novel mass detection process described for identifying breast abnormalities on mammographic images, according to the paper?'

rag_pipeline(query)

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Female breast cancer is the second most common cancer in the world. Several efforts in artificial intelligence have been made to help improving the diagnostic accuracy at earlier stages. However, the identification of breast abnormalities, like masses, on mammographic images is not a trivial task, especially for dense breasts. In this paper we describe our novel mass detection process that includes three successive steps of enhancement, characterization and classification. The proposed enhancement system is based mainly on the analysis of the breast texture. First of all, a filtering step with morphological operators and soft thresholding is achie

{'query': 'What are the three successive steps involved in the novel mass detection process described for identifying breast abnormalities on mammographic images, according to the paper?',
 'result': ' According to the paper, the three successive steps involved in the novel mass detection process for identifying breast abnormalities on mammographic images are:\n\nStep 1: Enhancement - The first step involves filtering the breast region using morphological operators and soft thresholding to remove noise and enhance the texture of the image.\n\nStep 2: Characterization - The second step involves measuring the Gaussian density in the wavelet domain to segment the masses and distinguish them from other structures in the breast tissue.\n\nStep 3: Classification - The third step involves using a comparative classification mechanism based on Bayesian regularization back-propagation networks and ANFIS techniques to classify the detected masses as either benign or malignant.'}

## Evaluation

### Automatic metircs evaluation

#### dataset preparation

In [41]:
from datasets import load_dataset

# Load PubMedQA dataset
pubmedqa_dataset = load_dataset("pubmed_qa",'pqa_artificial')

Downloading readme:   0%|          | 0.00/4.87k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/233M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211269 [00:00<?, ? examples/s]

In [42]:
questions = []
answers = []

for qa_pair in pubmedqa_dataset['train']:

  contexts = " ".join(qa_pair['context']['contexts'])
  if 'intelligence' in contexts:
    questions.append(qa_pair['question'])
    answers.append(qa_pair['long_answer'])

In [44]:
print(f'number of QA-pairs related to context of intelligence: {len(questions)}')
# print(questions)
# print(answers)

number of QA-pairs related to context of intelligence: 187


In [43]:
df = pd.DataFrame({'questions': questions, 'answers': answers})
df.to_excel('evaluation_dataset_pubmedQA_intelligence.xlsx',index=False)

In [None]:
rag_answers = []
for i,quest in enumerate(questions[80:]):
    ans = rag_pipeline(quest)
    rag_answers.append(ans)

In [54]:
df = pd.DataFrame(rag_answers)
df.to_excel('evaluation_resultQA_pubmedQA_intelligence.xlsx', index=False)

In [65]:
df = pd.read_excel('evaluation_resultQA_pubmedQA_intelligence.xlsx')

### Compute Automatic Evaluation Metrics

Bert Score

In [None]:
import numpy as np
### your code ###
bertscore_simple = bertscore.compute(predictions=list(df['result']), references=answers, lang="en")
bertscore_simple_averaged={}

for key in bertscore_simple.keys():
  if key!='hashcode':
    bertscore_simple_averaged[key]=np.mean(bertscore_simple[key])

### your code ###
print("Simple system:")
print(bertscore_simple_averaged)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Simple system:
{'precision': 0.8648118072299547, 'recall': 0.88223060901447, 'f1': 0.8732190144959316}


In [67]:
import evaluate
### your code ###
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")
### your code ###

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

BLUE score

In [68]:
### your code ###
bleu_simple = bleu.compute(predictions=list(df['result']), references=answers)
### your code ###
print("Simple system:")
print(bleu_simple)


Simple system:
{'bleu': 0.07127069789930258, 'precisions': [0.20629601142731568, 0.07926287744227353, 0.04767246214245653, 0.033099070505554294], 'brevity_penalty': 1.0, 'length_ratio': 2.1125812441968432, 'translation_length': 18202, 'reference_length': 8616}


Rouge score

In [69]:
### your code ###
rouge_simple = rouge.compute(predictions=list(df['result']),references=answers)
### your code ###
print("Simple system:")
print(rouge_simple)

Simple system:
{'rouge1': 0.2817572778268289, 'rouge2': 0.12232437196132802, 'rougeL': 0.20828801645958467, 'rougeLsum': 0.20830019256068882}


### Manual Evaluation

#### data preparation

In [None]:
import pandas as pd
questions = pd.read_excel('questions.xlsx')
questions.head()

Unnamed: 0,question,Pmid
0,how were students identified as gifted with le...,22057201
1,what are the specific areas of healthcare ment...,35892459
2,How do the adaptation mechanisms of cancer cel...,37540301
3,How can the study of neural computation in hum...,28728020
4,"Are the importance of flow, playfulness and ex...",37195831


In [None]:
answers = []
for quest in questions['question']:
    ans = rag_pipeline(quest)
    answers.append(ans)

In [None]:
df = pd.DataFrame(answers)
df.to_excel('evaluation_QA_manual.xlsx', index=False)