In [1]:
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter,CharacterTextSplitter,TokenTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_transformers import LongContextReorder
from getpass import getpass
from langchain import HuggingFaceHub, PromptTemplate, LLMChain
import os
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
import torch


In [2]:
df = pd.read_parquet("data/Pubmed_abstract_10k.parquet")

In [3]:
loader = DataFrameLoader(df,page_content_column="abstract")
documents = loader.load()

splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=64)

#splitter = RecursiveCharacterTextSplitter(chunk_size=500,
#                                 chunk_overlap = 200)

texts = splitter.split_documents(documents)

In [4]:
len(texts)

18537

In [5]:
#model_name = "sentence-transformers/all-mpnet-base-v2"
model_name = "intfloat/e5-large-unsupervised"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)



In [6]:
db = FAISS.from_documents(texts, embeddings)

In [8]:
#!unzip faiss_retriever.zip

Archive:  faiss_retriever.zip
   creating: faiss_retriever_db_2/
  inflating: faiss_retriever_db_2/index.faiss  
  inflating: faiss_retriever_db_2/index.pkl  


In [9]:
db2 = FAISS.load_local(folder_path="./faiss_retriever_db_2/", embeddings=embeddings
                       ,allow_dangerous_deserialization=True)

In [7]:
faiss_retriever = db.as_retriever(search_kwargs={"k": 10})

In [8]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [9]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
model_id = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=False, device_map='auto')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [40]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128)
llm = HuggingFacePipeline(
    pipeline = pipe,
    model_kwargs={"temperature": 0, "max_length": 1024}
)

In [41]:
# PROMPT
PROMPT_TEMPLATE="Your are a medical assistant for question-answering tasks. Answer the Question using the provided Contex only. Your answer should be in your own words and be no longer than 150 words. \n\n Context: {context} \n\n Question: {question} \n\n Answer:"
PROMPT = PromptTemplate.from_template(PROMPT_TEMPLATE)

# RAG pipeline
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=faiss_retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)

In [42]:
query = "What are the main causes for diabetes?"
result = qa_chain({"query": query})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [25]:
print(result['result'].split('Answer:')[1].strip())

Diabetes is a chronic disease that occurs when the pancreas does not produce enough insulin or when the body cannot effectively use the insulin it produces. Insulin is a hormone that regulates blood sugar levels. There are several causes of diabetes, including genetics, lifestyle factors, and environmental factors.

Genetics plays a significant role in the development of diabetes. People with a family history of diabetes are at a higher risk of developing the disease. Certain genetic mutations can also increase the risk of diabetes.

Lifestyle factors such as obesity and physical inactivity also play a crucial role.



In [15]:
query = "How to avoid getting diabetes?"
result = qa_chain({"query": query})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [27]:
print(result['result'].split('Answer:')[1].strip())

Diabetes mellitus is a chronic disease that affects the way your body metabolizes sugar (glucose). It can lead to serious health problems, including heart disease, stroke, kidney disease, and blindness. There are several things you can do to help prevent diabetes.

1. Eat a healthy diet. A healthy diet includes plenty of fruits, vegetables, whole grains, and lean protein. Avoid sugary drinks and processed foods.

2. Exercise regularly. Regular exercise can help you maintain a healthy weight and improve your insulin sensitivity.



In [29]:
query = "What are the safest cryopreservation methods?"
result = qa_chain({"query": query})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [44]:
print(result['result'].split('Answer:')[1].strip())

The safest cryopreservation methods are those that use a slow cooling rate and a low concentration of cryoprotectant.
