# Retrieval Augmented Generation (RAG) Basics

In this notebook, we cover the basics of Retrieval Augmented Generation (RAG). RAG is a technique that combines the best of both worlds: retrieval and generation. A retriever first selects the passages from a large corpus that are most relevant to the question, and a generator then uses those passages as context to produce the answer.

References:


In [None]:
import os
import re
import string
from openai import OpenAI

# Helper Functions

In [None]:
def preprocess_text(text: str):
    """Normalize ``text`` for token-level comparison.

    The text is lower-cased, every ASCII punctuation character listed in
    ``string.punctuation`` is removed, each run of whitespace is collapsed
    to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    # Mapping a character to None in a translation table deletes it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = text.lower()
    without_punctuation = lowered.translate(drop_punctuation)
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip()

def tokenize(text: str):
    """Split ``text`` into a list of normalized word tokens.

    The text is first passed through ``preprocess_text`` and the result is
    then split on whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of token strings (empty if nothing remains after
        normalization).
    """
    normalized = preprocess_text(text)
    return normalized.split()

In [None]:
def retrieve_relevant_chunks(query, corpus, top_n=2):
    """Return the ``top_n`` chunks of ``corpus`` most similar to ``query``.

    Similarity is the Jaccard index between the token set of the query and
    the token set of each chunk: ``|intersection| / |union|``.

    Args:
        query: The question text.
        corpus: An indexable sequence of text chunks.
        top_n: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_n`` chunks, ordered from most to least
        similar. Chunks with equal similarity keep their corpus order
        because the sort is stable. An empty corpus yields an empty list.
    """
    query_tokens = set(tokenize(query))
    similarities = []
    for chunk in corpus:
        chunk_tokens = set(tokenize(chunk))
        union_size = len(query_tokens | chunk_tokens)
        # When both the query and the chunk tokenize to nothing (empty or
        # punctuation-only text) the union is empty; score the pair as 0
        # instead of dividing by zero.
        if union_size:
            similarity = len(query_tokens & chunk_tokens) / union_size
        else:
            similarity = 0.0
        similarities.append(similarity)
    ranked = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)
    return [corpus[index] for index, _ in ranked[:top_n]]


In [None]:
def answer_question(query: str, corpus: list, top_n=2):
    """Answer ``query`` using the most relevant chunks of ``corpus`` as context.

    The ``top_n`` chunks chosen by ``retrieve_relevant_chunks`` are joined
    into a context block and sent, together with the question, to the
    OpenAI chat completions API.

    Args:
        query: The question to answer.
        corpus: A list of text chunks to retrieve context from.
        top_n: Maximum number of chunks to include in the context.

    Returns:
        The model's answer with surrounding whitespace removed, or a fixed
        apology string when no chunks were retrieved.
    """
    relevant_chunks = retrieve_relevant_chunks(query, corpus, top_n)
    if not relevant_chunks:
        return "I'm sorry, I don't know the answer to that question."
    # Separate chunks with a real newline; the previous "/n" inserted the
    # two literal characters '/' and 'n' between chunks.
    context = "\n".join(relevant_chunks)
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    # The v1 OpenAI client exposes chat completions at
    # `client.chat.completions`; `client.chat_completion` does not exist.
    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": f"Based on the provided context, answer the following question: {query}\n\nContext:\n{context}",
            },
            {
                "role": "user",
                "content": query,
            },
        ],
        max_tokens=100,
        temperature=0,
    )
    # `message.content` can be None; fall back to an empty string so that
    # `.strip()` does not raise.
    content = chat_completion.choices[0].message.content
    return (content or "").strip()

In [None]:
# Example usage: ask one question against a tiny in-memory corpus.
# answer_question reads the OPENAI_API_KEY environment variable and calls
# the OpenAI API, so this cell needs network access and a valid key.
query = "Who is the president of the United States?"

# Three paraphrases of the same fact; the retriever picks the closest ones.
corpus = [
    "The president of the United States is Joe Biden.",
    "Joe Biden is the current president of the United States.",
    "The current president of the United States is Joe Biden.",
]

answer = answer_question(query=query, corpus=corpus)
print(answer)