In [1]:
from qdrant_client import QdrantClient, models
import minsearch
from groq import Groq
import json
import os


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import requests

# Use the raw content URL from GitHub (not the HTML "blob" page) so the
# response body is plain JSON.
docs_url = 'https://raw.githubusercontent.com/Saheed388/llm-rag-workshop/main/notebooks/documents.json'

# The timeout keeps this cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP failures (404, rate limiting, ...) here
# rather than as a confusing JSON decode error on the next line.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of FAQ entries, tagging
# every entry with the course it came from so it can be filtered on later.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [6]:
import minsearch

# Fields handed to minsearch as free-text fields vs. keyword fields.
# `search()` boosts "question"/"section" and filters on "course".
TEXT_FIELDS = ["question", "text", "section"]
KEYWORD_FIELDS = ["course"]

index = minsearch.Index(text_fields=TEXT_FIELDS, keyword_fields=KEYWORD_FIELDS)

# Kept as the last expression so the cell still displays the fitted index.
index.fit(documents)

<minsearch.Index at 0x2104c3fec30>

In [2]:

# Groq API client. The key is read from the GROQ_API_KEY environment variable
# so that no credential is stored in the notebook itself.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


In [7]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a keyword search over the FAQ index.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` field must equal. Defaults to the
            data-engineering course, which was previously hard-coded.
        num_results: Maximum number of hits requested from the index.

    Returns:
        Whatever ``index.search`` returns for these arguments (the rest of
        the notebook iterates over it as a sequence of FAQ dicts).
    """
    # Matches in the question text count more than matches in the section
    # title; the weights are passed to the index unchanged.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )


In [8]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each with ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    # One formatted entry per hit; each entry ends with a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()

In [12]:
def llm(prompt, model="llama-3.3-70b-versatile"):
    """Send ``prompt`` to the Groq chat API and return the reply text.

    Args:
        prompt: Full prompt text, sent as a single ``user`` message.
        model: Groq model identifier; defaults to the model this notebook
            originally hard-coded.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # Return the text instead of printing it: rag() passes this value back to
    # its caller, which previously always received None.
    return response.choices[0].message.content


In [13]:
def rag(query):
    """Answer ``query`` with keyword-search RAG.

    Retrieves FAQ entries with ``search``, turns them into a prompt with
    ``build_prompt`` and returns whatever ``llm`` returns for that prompt.
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [14]:
# Try the keyword-search pipeline (the rag() defined just above) on a sample question.
rag('how do I run kafka?')


To run Kafka, follow these steps based on the provided context:

**For Java Kafka:**
1. Go to the project directory.
2. Run the command: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`

**For Python Kafka:**
1. Create a virtual environment by running: `python -m venv env`
2. Activate the virtual environment: 
   - For MacOS and Linux, run: `source env/bin/activate`
   - For Windows, run: `env\Scripts\activate`
3. Install the required packages: `pip install -r ../requirements.txt`
4. Ensure Docker images are up and running before executing the Python file.

Note: If you encounter a permission denied error while running `build.sh`, use the command `chmod +x build.sh` to grant execution permission. Additionally, if you're using Python Kafka, consider installing `kafka-python-ng` instead of `kafka-python` to avoid potential issues.


In [15]:
# Second sample question for the keyword-search pipeline.
rag('the course has already started, can I still enroll?')


Yes, you can still enroll in the course even after it has started. According to the FAQ, "even if you don't register, you're still eligible to submit the homeworks." However, be aware that there will be deadlines for turning in the final projects, so it's recommended not to leave everything for the last minute.


RAG WITH VECTOR SEARCH

In [16]:
# Client for the Qdrant server; a server is expected to be reachable at localhost:6333.
qds_client = QdrantClient("http://localhost:6333")

In [17]:
# Vector size used when the collection is created below. It has to agree with
# the embedding size produced by `model_handle` (512 is assumed for this
# model — TODO confirm against the model card if the model is changed).
EMBEDDING_DIMENSIONALITY = 512
# Embedding model named in every models.Document this notebook builds
# (both when indexing the FAQ entries and when embedding a query).
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [23]:
# Name of the Qdrant collection that every qds_client call in this notebook targets.
collection_name = "zoomcamp-PRAC"


In [10]:
# Remove a collection of this name left over from an earlier run — presumably
# so that the create_collection call below starts from a clean slate.
qds_client.delete_collection(collection_name=collection_name)


False

In [33]:
# Vector settings for the collection: vectors of EMBEDDING_DIMENSIONALITY
# components, compared with cosine distance.
vector_params = models.VectorParams(
    size=EMBEDDING_DIMENSIONALITY,
    distance=models.Distance.COSINE,
)

qds_client.create_collection(
    collection_name=collection_name,
    vectors_config=vector_params,
)

In [25]:
# Index the "course" payload field with a keyword schema; vector_search()
# filters on exactly this field.
qds_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [26]:
# Peek at one flattened FAQ entry to see the fields available for embedding and payload.
documents[2]


{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [27]:
# One point per FAQ entry: the list position is the point id, question and
# answer text joined by a space is the text to embed with `model_handle`, and
# the whole entry dict is stored as the payload.
points = [
    models.PointStruct(
        id=idx,
        vector=models.Document(
            text=entry['question'] + ' ' + entry['text'],
            model=model_handle,
        ),
        payload=entry,
    )
    for idx, entry in enumerate(documents)
]

In [28]:
# Send every point to the collection in a single upsert call.
# NOTE(review): the vectors are models.Document objects rather than numeric
# vectors, so the text is presumably embedded while this call is processed —
# confirm against the qdrant-client documentation.
qds_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [29]:
# Sample question for trying out vector search. Note that vector_search() takes
# its own `question` parameter, so this global is only used if passed explicitly.
question = 'I just discovered the course. Can I still join it?'


In [30]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Retrieve FAQ entries from Qdrant for ``question``.

    Args:
        question: Query text; wrapped in a ``models.Document`` together with
            ``model_handle`` and passed as the query.
        course: Value the ``course`` payload field must match. Defaults to
            the data-engineering course, which was previously hard-coded.
        limit: Maximum number of points requested.

    Returns:
        A list with the payload of each returned point, in the order Qdrant
        returned them.
    """
    # Marker printed so the notebook output shows which retrieval path ran.
    print('vector_search is used')

    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course),
            )
        ]
    )

    query_points = qds_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=course_filter,
        limit=limit,
        with_payload=True,
    )

    return [point.payload for point in query_points.points]

In [31]:
def rag(query):
    """Answer ``query`` with vector-search RAG.

    This redefines the earlier keyword-based ``rag``: retrieval now goes
    through ``vector_search``; prompt construction and the LLM call are the same.
    """
    hits = vector_search(query)
    return llm(build_prompt(query, hits))

In [32]:
# Same sample question as before, now answered through the vector-search rag() defined just above.
rag('how do I run kafka?')


vector_search is used
To run Kafka, you need to follow these steps based on the provided context:

1. **For Java Kafka**: In the project directory, run the command: 
   `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`

   Make sure to replace `<jar_name>` with your actual jar file name. Also, ensure that the `StreamsConfig.BOOTSTRAP_SERVERS_CONFIG` is set to the correct server URL in your scripts (e.g., `JsonConsumer.java`, `JsonProducer.java`) and that the cluster key and secrets are updated in `Secrets.java`.

2. **For Python Kafka (producer.py)**: 
   - Create a virtual environment using `python -m venv env` (run only once).
   - Activate the virtual environment using `source env/bin/activate` (for MacOS, Linux) or `env\Scripts\activate` (for Windows).
   - Install packages using `pip install -r ../requirements.txt`.
   - Run your python files within this activated environment.

3. **If you encounter the `NoBrokersAvailable` error**, 