In [1]:
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py

--2024-06-23 15:12:55--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.1’


2024-06-23 15:12:56 (18.3 MB/s) - ‘minsearch.py.1’ saved [3832/3832]



In [2]:
import minsearch

In [3]:
import json

In [4]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [7]:
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [8]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [12]:
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["course"])

In [10]:
q = 'the course has already started, can i still enroll?'

In [13]:
index.fit(documents)

<minsearch.Index at 0x709b4001f970>

In [16]:
boost = {'question': 3.0, 'section':0.5}
index.search(query = q, filter_dict={'course': 'data-engineering-zoomcamp'}, boost_dict = boost, num_results=5)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [19]:
from openai import OpenAI

In [40]:
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')

In [41]:
response = client.chat.completions.create(
    model='phi3',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

" I'm afraid that if a course is already underway and you haven't been able to enroll through its regular process, it may be challenging to get in at this point. However, policies vary by institution and specific program. Here are some steps you could consider:\n\n1. Check the drop deadline: Many courses have an official cut-off date after which new students cannot join due to logistical constraints (e.g., seating, materials distribution). If you haven't reached this point, inquire directly with the course instructor or department about the possibility of joining late.\n\n2. Speak with an academic advisor: Your institution's academic advisors can provide valuable insights into your options for enrolling in a full-term course after it has started and help you understand how best to navigate this situation.\n\n3. Request special permission: Some institutions might offer the opportunity to join a latecomer through exceptional circumstances, such as emergencies or unique learning situation

In [42]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [43]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [44]:
def llm(prompt):
    response = client.chat.completions.create(
        model='phi3',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [45]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [46]:
rag(query)

' In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n\nFor running Kafka in terminal, specifically for Python Kafka with a permission denied error on ./build.sh, use the command to change permissions:\nchmod +x build.sh\n\n(Note: The above Java-related information is not directly related to running Kafka via terminal but addresses how to run specific Java programs in relation to Kafka within the context provided.)'

In [47]:
rag('the course has already started, can I still enroll?')

" Yes, even if you don't register before the start date, you're still eligible to submit the homeworks and access materials after the course finishes at your own pace. However, there will be deadlines for turning in the final projects and specific registration instructions prior to the course starting on 15th Jan 2024 at 17h00."

In [48]:
from elasticsearch import Elasticsearch

In [49]:
es_client = Elasticsearch('http://localhost:9200') 

In [50]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/Ie6F1aMZSLuSZ2W8ARLjTQ] already exists')

In [51]:
from tqdm.auto import tqdm

In [52]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████████████████████| 948/948 [00:26<00:00, 36.04it/s]


In [53]:
query = 'I just disovered the course. Can I still join it?'

In [54]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [55]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [56]:
rag(query)

" Yes, you can still join the course even after the start date as long as you are eligible to submit homeworks. However, there will be deadlines for turning in final projects, so it's not advisable to leave everything until last minute.\n\nAdditionally, all materials from the course will be kept available after its completion. This allows you to follow the course at your own pace and continue working on preparations such as homework assignments or a final capstone project for future cohorts."

## homework

In [57]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [58]:
from elasticsearch import Elasticsearch

In [59]:
es_client = Elasticsearch('http://localhost:9200') 

In [61]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions-hw"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions-hw'})

In [62]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████████████████████| 948/948 [00:25<00:00, 36.47it/s]


In [63]:
query = 'How do I execute a command in a running docker container?'

In [65]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [74]:
def search(query):
    boost = {'question': 4.0}

    results = response.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [75]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [76]:
rag(query)

' To execute a command in a running docker container, you can connect to it using `pgcli` and then type your commands directly into the terminal. For instance:\n\n$ docker run -it --rm --network pg-network ai2ys/dockerized-pgcli:4.0.1\n175dd47cda07:/#\nUsername: root\nPassword for root: [Your Password]\nServer: PostgreSQL 16.1 (Debian 16.1-1.pgdg120+1)\nVersion: 4e9d8f3a5bce\n> psql -h pg-database -U root -p 5432 -d ny_taxi\nPassword for user root: [Your Password]\n=> \\dt'