### Import packages and API Key

In [28]:
import minsearch
import json
from openai import OpenAI
from dotenv import load_dotenv
import os
from openai import OpenAI
# Load environment variables via python-dotenv before the key is read below.
load_dotenv()

# Single OpenAI client shared by the rest of the notebook. The key is taken
# from the OPENAI_API_KEY environment variable, so it never appears in the
# notebook itself (the env file is ignored in git).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

### Download minsearch

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch/minsearch.py

--2025-06-04 14:28:08--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5488 (5.4K) [text/plain]
Saving to: ‘minsearch.py.4’


2025-06-04 14:28:08 (47.4 MB/s) - ‘minsearch.py.4’ saved [5488/5488]



### Load documents and flatten them into a single list

In [29]:
# Read the raw FAQ dump. The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# docs_raw is iterated as a list of {'course': ..., 'documents': [...]} entries.
# Flatten it into one list of documents, tagging each document with the course
# it belongs to so the course can later be used as a filter field.
documents = []

for course_dict in docs_raw:
    course_name = course_dict['course']
    for doc in course_dict['documents']:
        doc['course'] = course_name  # tagged in place; docs_raw entries get the key too
        documents.append(doc)

In [4]:
# Build the search index over the flattened FAQ documents:
#   - "course" is registered as a keyword field (used with filter_dict in search),
#   - "question", "text" and "section" are registered as text fields.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

# Kept as the cell's last expression so the fitted index is shown as the output.
index.fit(documents)

<minsearch.Index at 0x7e3239486840>

### Create reusable functions. 

In [5]:
def search(query, course='data-engineering-zoomcamp', num_results=10):
    """Look up ``query`` in the notebook-level ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field must equal (passed as ``filter_dict``).
            Defaults to 'data-engineering-zoomcamp', the value that used to be
            hard-coded here.
        num_results: Maximum number of hits requested from the index
            (default 10, the previously hard-coded value).

    Returns:
        The result of ``index.search`` unchanged. The prompt builder in this
        notebook iterates over it and reads the 'section', 'question' and
        'text' keys of every hit.
    """
    # Per-field boost factors handed to the index as boost_dict; 'text' is not
    # listed and therefore gets no explicit boost.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [12]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of dicts, each providing the 'section',
            'question' and 'text' keys. An empty iterable yields a prompt
            with an empty CONTEXT.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.

    Raises:
        KeyError: If an entry lacks one of the three keys read here.
    """
    prompt_template = """
    You're a course teaching assistant. Answer the Question based on the CONTEXT from the FAQ database. 
    Use only the facts from the CONTEXT when answering the QUESTION. 
    If the CONTEXT doesn't contain the answer, output NONE
    
    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """

    # The parameter used to be misspelled, so the body read a same-named global
    # instead of the argument. It now reads the argument it is given.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [13]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text; sent as the only message, with role "user".
        model: Name of the chat model to call. Defaults to 'gpt-4o', the value
            that used to be hard-coded here.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

### Create the RAG pipeline

In [37]:
# Sample question used to exercise the pipeline.
query = 'How many Zoomcamps are there in a year?'


def rag(query):
    """Answer ``query`` in three steps: search, build the prompt, ask the LLM.

    The hits returned by ``search`` are passed to ``build_prompt`` together
    with the question, and the resulting prompt is sent to ``llm``; whatever
    ``llm`` returns is the return value.
    """
    return llm(build_prompt(query, search(query)))

### Return Answer

In [36]:
# Run the pipeline on the sample question; as the cell's last expression, the
# returned answer string is displayed as the cell output.
rag(query)

'There are 3 Zoomcamps in a year, as of 2024. They are for separate courses: Data-Engineering (Jan - Apr), MLOps (May - Aug), and Machine Learning (Sep - Jan).'