**Let's illustrate building a RAG using an open-source LLM, embeddings model, and LangChain.**


**Install the required dependencies:**

In [1]:
!pip install -q torch transformers accelerate bitsandbytes transformers sentence-transformers faiss-gpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q langchain langchain-community

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.8/321.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.1/127.1 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.0/145.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.36.1-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.0.1 (from gradio)
  Downloading gradio_client-1.0.1-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

**Prepare the data**

In this example demonstrates how to interact with the WordPress REST API to retrieve posts

In [5]:
import requests
from bs4 import BeautifulSoup  # for html editing
import gradio as gr

# Global variable to store fetched data
fetched_data = None

class WordPressAPILoader:
    def __init__(self, base_url, post_type='posts'):
        self.base_url = base_url
        self.post_type = post_type

    def fetch_posts(self, count=15):
        url = f"{self.base_url}/wp-json/wp/v2/{self.post_type}"
        params = {
            'per_page': count
        }
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an error for bad status codes
        return response.json()

    def load(self, count=15):
        data = self.fetch_posts(count)
        return data

# Function to clean HTML tags from content
def clean_html(raw_html):
    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.get_text()

def fetch_and_display_posts(base_url):
    global fetched_data  # Declare the global variable
    try:
        loader = WordPressAPILoader(base_url)
        fetched_data = loader.load()  # Store data in the global variable

        results = []
        for post in fetched_data[:3]:  # Displaying the first 3 entries
            title = clean_html(post['title']['rendered'])
            content = clean_html(post['content']['rendered'])
            results.append(f"Title: {title}\n\nContent: {content}\n\n")

        return "\n\n".join(results)
    except Exception as e:
        return str(e)

# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_display_posts,  # Function to wrap
    inputs=gr.Textbox(label="WordPress Site Base URL"),  # Input component
    outputs=gr.Textbox(label="Posts"),  # Output component
    title="WordPress Posts Viewer",  # Title of the app
    description="Enter the base URL of a WordPress site to fetch and display the first few posts."  # Description of the app
)

# Launch the interface
iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://d13252a8ea54c0810c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




The content of individual posts might exceed the input limit of an embedding model. To embed all the available content, we need to divide the documents into appropriately sized chunks.

In [6]:
from langchain.schema import Document
# Convert the data to Document objects
documents = []
for post in fetched_data:
    cleaned_content = clean_html(post['content']['rendered'])
    doc = Document(page_content=cleaned_content, metadata={'title': post['title']['rendered']})
    documents.append(doc)

# Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
# It splits text into chunks of 1000 characters each with a 150-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# Split the text into documents using the text splitter.
docs = text_splitter.split_documents(documents)

# Print the first chunk of the first document to verify
print(docs[0].page_content)

SPOILER ALERT: This post contains spoilers from the series premiere of “Orphan Black: Echoes,” now streaming on AMC+.

	The series premiere of AMC and BBC America’s “Orphan Black: Echoes” offers only a few concrete things about Lucy, the enigmatic character played by Krysten Ritter.

	First and foremost, she’s innately resourceful, which comes in handy when she wakes up with no memory of who she is, with a kind but mysterious woman (Keeley Hawes) interrogating her about anything she might know. After a violent reaction, she MacGyvers her way out of the containment facility where she’s being held and right past a few clues that suggest her origins may lie in a slimy vat of goo. But before Lucy can investigate, she goes like a thief in the night, popping up two years later as a field-hand living the quiet life in the country.



The simplest and most common method for chunking involves setting a fixed chunk size and deciding if there should be any overlap between them. Overlapping chunks help maintain semantic context. For general text, the recommended tool is the RecursiveCharacterTextSplitter, which we'll use here.

In [7]:
# Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
# It splits text into chunks of 1000 characters each with a 150-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# Split the text into documents using the text splitter.
docs = text_splitter.split_documents(documents)

# Print the first chunk of the first document to verify
print(docs[3].page_content)

Keeley Hawes
Courtesy of Sophie Giraud/AMC




	Simultaneous to all of this, we see a few glimpses inside the Addictive Foundation, a medical technology organization that prints working replicas of vital organs. Run by Hawes’ unnamed woman from the opening sequence, the foundation is the one tracking Lucy, who was copied from DNA that did not have enough detail for memory recall. In other words, she’s a defect in their program and she’s a liability. In the final scene, the woman reveals herself to be Kira Manning, the daughter of Sarah Manning, one of the many clones played by Emmy winner Tatiana Maslany in BBC America’s original “Orphan Black” series.

	With confirmation this is a sequel series set 30 years after the original, Ritter spoke to Variety about why she likes playing “an experiment gone wrong” and what it is like to act in the aforementioned vat of goo.


In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=30)

chunked_docs = splitter.split_documents(docs)


Once the documents are properly sized, we can create a database with their embeddings.

We'll generate document chunk embeddings using the HuggingFaceEmbeddings and the sentence-transformers/all-MiniLM-l6-v2 model. Many other embedding models are available on the Hub, and you can monitor the top-performing ones on the Massive Text Embedding Benchmark (MTEB) Leaderboard.

For the vector database, we'll use FAISS, a library developed by Facebook AI that provides efficient similarity search and clustering of dense vectors. FAISS is a widely used library for nearest neighbor search in large datasets.

We'll use the LangChain API to access both the embeddings model and FAISS.

In [9]:
db = FAISS.from_documents(chunked_docs,HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-l6-v2'))

  warn_deprecated(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


We need a method to retrieve documents based on an unstructured query. We'll use the as_retriever method with our database as the backbone:

1.search_type="similarity" indicates that we want to perform a similarity search
between the query and documents.

2.search_kwargs={'k': 4} tells the retriever to return the top 4 results.

In [10]:
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4}
)


With the vector database and retriever now set up, the next step is to configure the model.

**Load quantized model**


In this example, we selected the HuggingFaceH4/zephyr-7b-beta model, which is small yet powerful.

To speed up inference, we'll use the quantized version of the model.

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = 'HuggingFaceH4/zephyr-7b-beta'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

**Setup the LLM chain**


Finally, we have everything needed to set up the LLM chain.

First, create a text generation pipeline with the loaded model and its tokenizer.

Next, create a prompt template that matches the model's format. If you change the model checkpoint, ensure the formatting is updated accordingly.

In [12]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

  warn_deprecated(


In [13]:
from langchain_core.runnables import RunnablePassthrough

retriever = db.as_retriever()

rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)


**Compare the answers**


Let's check if using RAG makes any difference in generating answers.

In [14]:
question  = input("Please enter a string: ")

Please enter a string: spoilers from the series premiere of “Orphan Black: Echoes,” 


In [16]:
llm_chain.invoke({"context":"", "question": question})

'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n\n\n</s>\n<|user|>\nspoilers from the series premiere of “Orphan Black: Echoes,” \n</s>\n<|assistant|>\n\n  According to reports, in the series premiere of "Orphan Black: Echoes," Sarah Manning (played by Tatiana Maslany) is seen struggling with her identity and past traumas as she tries to protect her daughter Kira (played by Skyler Wexler) from a dangerous new enemy. The episode also introduces new characters, including a woman named Evie Cho (played by Julia Chan), who has ties to both Sarah\'s past and the mysterious organization known as Neolution. Additionally, we learn that Sarah\'s sister Cosima (also played by Maslany) may be in danger after being taken captive by an unknown assailant. Overall, the premiere sets up a complex web of mysteries and intrigue for the rest of the season.'

In [17]:
rag_chain.invoke(question)

'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(page_content=\'SPOILER ALERT: This post contains spoilers from the series premiere of “Orphan Black: Echoes,” now streaming on AMC+.\\n\\n\\tThe series premiere of AMC and BBC America’s “Orphan Black: Echoes” offers only a few concrete things about Lucy, the enigmatic character played by Krysten Ritter.\', metadata={\'title\': \'‘Orphan Black: Echoes’ Star Krysten Ritter on Her Character’s Mysterious Past: ‘She Is an Experiment Gone Wrong’\'}), Document(page_content=\'That was so appealing to me — I love that she has somebody to love. As a mother myself, I loved that I got to play that mother-daughter relationship between Lucy and Charlie, who she definitely sees as a stepdaughter. That dynamic is something that I definitely understood.\\n\\nThe final shot of the premiere confirms this is a true sequel series to “Orphan Black.” Was that daunting at all to be leading the next chap

ui interface of llm_chain

In [18]:
import gradio as gr

# Define the function that will be called with the user's input
def process_question(question):
    # Simulate the llm_chain.invoke function
    result = llm_chain.invoke({"context":"", "question": question})
    return result

# Simulate the llm_chain.invoke function for demonstration purposes
def llm_chain_invoke(params):
    context = params.get("context", "")
    question = params.get("question", "")
    # Process the question here
    response = f"Processed question: {question}"
    return response

# Create the Gradio interface
with gr.Blocks() as demo:
    question_input = gr.Textbox(label="Ask question:")
    output = gr.Textbox(label="Answer")

    def on_submit(question):
        return process_question(question)

    question_input.submit(on_submit, question_input, output)

# Launch the Gradio interface
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://2fd54638daaa6bb889.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




ui interface of rag_chain

In [19]:
import gradio as gr

# Define the function that will be called with the user's input
def process_question(question):
    # Simulate the rag_chain.invoke function
    result = rag_chain.invoke( question)
    return result

# Create the Gradio interface
with gr.Blocks() as demo:
    question_input = gr.Textbox(label="Ask Question")
    output = gr.Textbox(label="Answer")

    def on_submit(question):
        return process_question(question)

    question_input.submit(on_submit, question_input, output)

# Launch the Gradio interface
demo.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://213bde73896a769dc3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


