In [1]:
import minsearch
import json
import requests

In [2]:
# Download the course documents (a JSON list with one entry per course, each
# holding a 'course' name and a list of 'documents').
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Bound the wait and fail loudly on HTTP errors; otherwise an error page would
# only surface later as a confusing JSON decoding failure.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into one list, tagging every document with
# the course it belongs to so it can be filtered on later.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Copy instead of mutating, so `documents_raw` stays exactly as downloaded.
        documents.append({**doc, 'course': course_name})

In [3]:
from minsearch import Index

# Build the search index over the flattened documents: free-text matching on
# the question/text/section fields, exact-match filtering on the course name.
index = Index(
    keyword_fields=['course'],
    text_fields=['question', 'text', 'section'],
)

# Kept as the cell's last expression, so the fitted index is shown as output.
index.fit(documents)

<minsearch.minsearch.Index at 0x11294bb90>

In [4]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up the documents most relevant to ``query`` in the module-level ``index``.

    Args:
        query: Free-text query (e.g. the student's question).
        course: Value the 'course' keyword field must match. Defaults to the
            course this notebook was originally hard-wired to.
        num_results: Maximum number of results requested from the index.

    Returns:
        Whatever ``index.search`` returns -- the rest of the notebook treats it
        as a list of dicts with 'section', 'question' and 'text' keys.
    """
    # Matches in 'question' are weighted up and matches in 'section' weighted
    # down; 'text' has no entry here (presumably it keeps minsearch's default
    # weight -- confirm against the minsearch docs).
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
        # NOTE(review): presumably adds an id to every returned document -- confirm.
        output_ids=True,
    )

In [None]:
from textwrap import dedent
# Prompt template for the two-step flow used below: the model is asked to reply
# with a JSON object whose "action" is either SEARCH (CONTEXT is still the
# literal "EMPTY") or ANSWER (with "source" naming where the answer came from).
# - {question} and {context} are filled in via TEMPLATE.format(...).
# - {{ and }} are escaped braces: str.format renders them as the literal { }
#   of the JSON examples.
# - dedent() strips the common leading indentation; the backslash right after
#   the opening quotes keeps the prompt from starting with an empty line.
TEMPLATE = dedent("""\
                You're given a QUESTION from a course student and that you need to answer with 
                your own knowledge and provided CONTEXT.
                At the beginning the context is EMPTY.

                <QUESTION>{question}</QUESTION>
                <CONTEXT>{context}</CONTEXT>
                
                If CONTEXT is EMPTY, you can use our FAQ database.
                In this case, use the following output template:
                {{
                "action": "SEARCH",
                "reasoning": "<add your reasoning here>"
                }} 
                
                If you can answer the QUESTION using CONTEXT, use this template:
                {{
                "action": "ANSWER",
                "answer": "<your answer>",
                "source": "CONTEXT"
                }}
                If the context doesn't contain the answer, use your own knowledge to answer the question
                {{
                "action": "ANSWER",
                "answer": "<your answer>", 
                "source": "OWN_KNOWLEDGE"
                }}  
                """ 
                )
def build_context(search_results: list[dict]) -> str:
    """Render search results as the text placed inside the prompt's CONTEXT.

    Each result becomes a three-line block (``section:``, ``question:``,
    ``answer:``); blocks are separated by one blank line.

    Args:
        search_results: Dicts that each provide the 'section', 'question' and
            'text' keys.

    Returns:
        The joined context string, or an empty string when there are no results.

    Raises:
        KeyError: If a result lacks one of the three keys.
    """
    # Every label starts at the beginning of its line. The previous version had
    # a stray space after each newline, which indented "question:" and
    # "answer:" and did not match the prompt recorded in the notebook output.
    context_blocks = [
        f"section: {d['section']}\nquestion: {d['question']}\nanswer: {d['text']}"
        for d in search_results
    ]
    return "\n\n".join(context_blocks)

In [7]:
from openai import OpenAI

# One shared client for the whole notebook. No credentials are passed here, so
# they must come from the environment (presumably OPENAI_API_KEY -- confirm).
client = OpenAI()


def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the model's reply text.

    Args:
        prompt: Full prompt text, sent as the only message of the conversation.
        model: Chat model to query. Defaults to the model this notebook was
            originally hard-wired to.

    Returns:
        The ``content`` of the first choice in the chat completion response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [11]:
# First round: nothing has been retrieved yet, so the literal "EMPTY" is passed
# as the context and the model gets to decide whether it needs a search.
question = "Can I still join the course?"
context = "EMPTY"

prompt = TEMPLATE.format(context=context, question=question)
answer = llm(prompt)
print(answer)

{
"action": "SEARCH",
"reasoning": "The context is empty, and I need to check the FAQ database for information regarding course enrollment."
}


In [12]:
# Second round: retrieve matching documents for the same question and render
# the prompt again, this time with the retrieved text as CONTEXT.
search_results = search(question)
context = build_context(search_results)
prompt = TEMPLATE.format(context=context, question=question)

In [13]:
# Show the fully rendered prompt (question plus retrieved context) as the cell output.
prompt

'You\'re given a QUESTION from a course student and that you need to answer with \nyour own knowledge and provided CONTEXT.\nAt the beginning the context is EMPTY.\n\n<QUESTION>Can I still join the course?</QUESTION>\n<CONTEXT>section: General course-related questions\nquestion: Course - Can I still join the course after the start date?\nanswer: Yes, even if you don\'t register, you\'re still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don\'t leave everything for the last minute.\n\nsection: General course-related questions\nquestion: Course - When will the course start?\nanswer: The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours\'\' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this lin

In [12]:
# `client` and `llm` come from the earlier OpenAI cell. They are deliberately
# not imported/defined a second time here: a duplicate definition would
# silently shadow the first one and let the two drift apart.

def rag(query):
    """Answer ``query`` with retrieval: search, build the prompt, ask the model.

    Uses the same building blocks as the step-by-step cells above (`search`,
    `build_context`, `TEMPLATE`, `llm`). The previous version called
    `build_prompt`, which is not defined in this notebook and raised NameError.

    Args:
        query: The student's question.

    Returns:
        The model's reply text as returned by `llm`.
    """
    search_results = search(query)
    context = build_context(search_results)
    prompt = TEMPLATE.format(question=query, context=context)
    return llm(prompt)