# Implementing RAG with Langchain and Hugging Face on Mini-bioasq Dataset

## Libraries

In [1]:
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.vectorstores import FAISS
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


## Document Loading

In [2]:
dataset_name = "rag-datasets/mini-bioasq"
page_content_column = "passage" 
name = "text-corpus"

loader = HuggingFaceDatasetLoader(dataset_name, page_content_column, name)

data = loader.load()
data[:2]



[Document(page_content='"New data on viruses isolated from patients with subacute thyroiditis de Quervain \\nare reported. Characteristic morphological, cytological, some physico-chemical \\nand biological features of the isolated viruses are described. A possible role \\nof these viruses in human and animal health disorders is discussed. The isolated \\nviruses remain unclassified so far."', metadata={'id': 9797}),
 Document(page_content='"We describe an improved method for detecting deficiency of the acid hydrolase, \\nalpha-1,4-glucosidase in leukocytes, the enzyme defect in glycogen storage \\ndisease Type II (Pompe disease). The procedure requires smaller volumes of blood \\nand less time than previous methods. The assay involves the separation of \\nleukocytes by Peter\'s method for beta-glucosidase and a modification of Salafsky \\nand Nadler\'s fluorometric method for alpha-glucosidase."', metadata={'id': 11906})]

## Document Transformers

In [3]:
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=30)
chunked_docs = splitter.split_documents(data)

## Text Embedding

In [4]:
modelPath = "BAAI/bge-small-en"
embeddings = HuggingFaceEmbeddings(model_name=modelPath)
db = FAISS.from_documents(chunked_docs, embeddings)

In [5]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [6]:
model_id= "facebook/opt-350m"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [7]:
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

In [8]:
retriever = db.as_retriever()
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [9]:
question = "Is the protein Papilin secreted?"

In [10]:
llm_chain.invoke({"context": "", "question": question})

'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n\n\n</s>\n<|user|>\nIs the protein Papilin secreted?\n</s>\n<|assistant|>\n\n                                                                                                                                                                                                                                                                                                                                                                                                                 '

In [11]:
rag_chain.invoke(question)

'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(page_content=\'"Papilins are homologous, secreted extracellular matrix proteins which share a \\\\ncommon order of protein domains. They occur widely, from nematodes to man, and \\\\ncan differ in the number of repeats of a given type of domain. Within one \\\\nspecies the number of repeats can vary by differential RNA splicing. A \\\\ndistinctly conserved cassette of domains at the amino-end of papilins is \\\\nhomologous with a cassette of protein domains at the carboxyl-end of the ADAMTS \\\\nsubgroup of secreted, matrix-associated\', metadata={\'id\': 15094122}), Document(page_content=\'a broad band of about 900,000 apparent molecular weight and \\\\nthe core protein as a narrow band of approximately 400,000. The core protein was \\\\nformed by some cell lines and by other cells on incubation with 1 mM \\\\n4-methylumbelliferyl xyloside, which inhibited formation of the \\\\n