In [None]:
!pip install openai

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-27 21:48:20--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-27 21:48:20 (42.0 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [3]:
from openai import OpenAI
from tqdm.auto import tqdm

import json
import minsearch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Chat client aimed at a local OpenAI-compatible endpoint rather than the hosted API.
# NOTE(review): port 11434 and the 'ollama' placeholder key suggest an Ollama server — confirm.
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

## Retrieval and Search

In [5]:
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

--2024-07-27 21:48:30--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-07-27 21:48:30 (28.9 MB/s) - ‘documents.json’ saved [658332/658332]



In [6]:
# Decode explicitly as UTF-8: the FAQ text contains non-ASCII punctuation (curly quotes),
# and open() would otherwise fall back to the platform's locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [7]:
# Flatten the per-course groups into one list of FAQ entries, stamping each
# entry with the course it came from.
documents = []

for course_group in docs_raw:
    group_docs = course_group['documents']
    for faq in group_docs:
        # Tags the entry in place, so the dicts inside docs_raw gain the key too.
        faq['course'] = course_group['course']
    documents.extend(group_docs)

print(documents[0])

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp'}


In [8]:
# question/text/section are registered as text fields; course is registered as a
# keyword field (search() filters on it).
index = minsearch.Index(
    keyword_fields=['course'],
    text_fields=['question', 'text', 'section'],
)

In [9]:
# Fit the index on the flattened FAQ entries. The call is the cell's last
# expression, so the notebook echoes its return value as the cell output.
index.fit(documents)

<minsearch.Index at 0x72d206c04d30>

## The RAG Flow: Cleaning and Modularizing Code

In [10]:
def search(query, course='data-engineering-zoomcamp', num_results=10):
    """Look up the FAQ entries matching ``query`` in the module-level ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field is filtered on. Defaults to
            the course this notebook originally hard-coded.
        num_results: Forwarded to ``index.search`` as ``num_results``
            (defaults to the original hard-coded 10).

    Returns:
        Whatever ``index.search`` returns — presumably a list of document
        dicts, since ``build_prompt`` iterates it and reads the
        ``section``/``question``/``text`` keys.
    """
    # Per-field boost factors handed to the index: 3.0 for the question field,
    # 0.5 for the section field.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [11]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt for ``query`` from the retrieved FAQ entries.

    Every entry in ``search_results`` must provide ``section``, ``question``
    and ``text`` keys; the entries are rendered one after another into the
    CONTEXT slot of the template. Leading and trailing whitespace is stripped
    from the final prompt.
    """
    # Spelled out line by line so the whitespace that reaches the model is
    # visible: the first line ends with a space and every later line carries a
    # four-space indent.
    template = (
        "You are a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. \n"
        "    Use only facts from the CONTEXT when answering the QUESTION.\n"
        "    If the CONTEXT doesn't contain the answer, output NONE\n"
        "    \n"
        "    QUESTION: {question}\n"
        "    \n"
        "    CONTEXT: {context}"
    )

    # One "section / question / answer" stanza per entry, each followed by a blank line.
    stanzas = [
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in search_results
    ]

    filled = template.format(question=query, context="".join(stanzas))
    return filled.strip()

In [12]:
def llm(prompt, model='phi3'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text for the model.
        model: Model name passed to the chat-completions call. Defaults to
            ``'phi3'``, the model this notebook originally hard-coded.

    Returns:
        The ``content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [13]:
def rag(query):
    """Answer ``query`` end to end: retrieve FAQ entries, build the prompt, ask the LLM."""
    retrieved = search(query)
    prompt = build_prompt(query, retrieved)
    return llm(prompt)

In [14]:
# End-to-end run of the pipeline on a sample question.
query = "the course has already started. Can I still join?"
print(rag(query))

 Yes, you are still eligible to submit assignments even if the course has already started because there will be deadlines for turning in final projects but no mention of restrictions on submitting homeworks after a certain point within or beyond the official course period was stated in CONTEXT provided by FAQ database.
However it is important not to delay this process as submission timelines should still ideally adhere to those mentioned officially if they exist, despite you being able register and submit work independently post-start date without any formal recognition of your late participation or submissions.


In [15]:
# Off-topic input. NOTE(review): the prompt asks for NONE when the CONTEXT has no answer,
# but the recorded reply below is a free-form explanation instead.
print(rag("write that this is a test"))

 In the provided context for questions about using Docker with Terraform on AWS or general course and module queries from FAQs in a dbt course environment: 

There is no answer related to writing that this is a test as per your request because it's not directly asked nor answered within any given question-answer pairing. The closest context seems unrelated, focusing instead on issues with mounting local folders into Docker volumes and dealing with ownership permissions or troubleshooting GCP BQ connection errors across regions.
