In [2]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Fail fast on network problems: the timeout keeps the kernel from hanging on a
# stalled connection, and raise_for_status() surfaces HTTP errors before .json()
# tries to parse an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of FAQ records, tagging each
# record with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Build a new dict rather than mutating the parsed JSON, so documents_raw
        # stays exactly as downloaded and re-running the cell is harmless.
        documents.append({**doc, 'course': course_name})

In [2]:
# Spot-check one flattened record; it should carry the 'course' key set in the loading cell.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
from openai import OpenAI

# Shared client used by llm() below.
# NOTE(review): no credentials are passed explicitly; presumably the client picks
# the API key up from the environment — confirm it is set before running this cell.
openai_client = OpenAI()

In [4]:
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for a question and its retrieved FAQ entries.

    Args:
        query: The user's question; inserted into the QUESTION slot.
        search_results: Iterable of dicts, each providing 'section', 'question'
            and 'text' keys; rendered in order into the CONTEXT slot.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One text block per retrieved entry, each terminated by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    # The final strip() removes the blank line left after the last entry.
    return template.format(question=query, context="".join(entries)).strip()

In [5]:
def llm(prompt):
    """Send `prompt` as a single user message to the chat model and return the reply text.

    Uses the module-level `openai_client` with the 'gpt-4o-mini' model.

    Args:
        prompt: Text placed in the content of the single user message.

    Returns:
        The message content of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai_client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
def rag(query):
    """Answer `query` by retrieving documents, building a prompt and calling the LLM.

    NOTE(review): `search` is not defined in any cell visible in this notebook;
    it must already exist in the kernel when this function is called — confirm
    where it comes from, otherwise a fresh Restart & Run All fails here.

    Args:
        query: The user's question, passed to both `search` and `build_prompt`.

    Returns:
        Whatever `llm` returns for the built prompt.
    """
    retrieved_docs = search(query)
    full_prompt = build_prompt(query, retrieved_docs)
    return llm(full_prompt)

In [8]:
# Smoke-test the pipeline end to end with a sample question.
rag('how do I run kafka?')

'To run Kafka, you will need to execute specific commands based on your setup:\n\n1. **For Java Kafka**: In your project directory, run the following command in the terminal to run the producer:\n   ```bash\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n2. **For Python Kafka**: Ensure that you have created a virtual environment and installed the necessary requirements. To do this, you can follow these steps:\n   - Create a virtual environment (run only once):\n     ```bash\n     python -m venv env\n     source env/bin/activate  # Use `env\\Scripts\\activate` for Windows\n     pip install -r ../requirements.txt\n     ```\n   - To activate the virtual environment in the future, use:\n     ```bash\n     source env/bin/activate  # Use `env\\Scripts\\activate` for Windows\n     ```\n   - Deactivate it when done:\n     ```bash\n     deactivate\n     ```\n\nMake sure to have Docker images up and running if required.'

## RAG with Vector Search

In [6]:
from qdrant_client import QdrantClient, models

In [7]:
# Client for the Qdrant instance at localhost:6333.
# NOTE(review): none of the visible cells start that server — it presumably has
# to be running already; confirm before executing the cells below.
qd_client = QdrantClient("http://localhost:6333")

In [14]:
# Vector size used when the collection is created.
# NOTE(review): 512 is assumed to be the output size of the model named below — confirm.
EMBEDDING_DIMENSIONALITY = 512
# Embedding model identifier passed with every models.Document (indexing and querying).
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [12]:
# Collection name shared by the Qdrant calls in the cells below.
collection_name = "zoomcamp-faq"

In [6]:
# Destructive: removes the collection named by collection_name.
# Presumably done so the create call in the next cell starts from a clean slate — confirm.
qd_client.delete_collection(collection_name=collection_name)

True

In [7]:
# Create the collection: vectors have EMBEDDING_DIMENSIONALITY components and
# are compared with cosine distance.
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [8]:
# Index the 'course' payload field as a keyword; vector_search filters on this field.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [11]:
# Build one point per FAQ record; the record's position in `documents` is its id.
points = []

for i, doc in enumerate(documents):
    # Text to embed: the question followed by its answer.
    text = doc['question'] + ' ' + doc['text']
    # No vector is computed in this cell; the text and model handle are wrapped
    # in a Document (presumably embedded when the points are upserted — confirm).
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        # The whole record is stored as payload, including its 'course' key.
        payload=doc
    )
    points.append(point)

In [12]:
# Send all points to the collection in a single upsert call.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [8]:
# Sample question.
# NOTE(review): not referenced by any later cell visible here — confirm it is still needed.
question = 'I just discovered the course. Can I still join it?'

In [9]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Return the payloads of the stored FAQ entries closest to `question` within one course.

    Relies on the module-level `qd_client`, `collection_name` and `model_handle`.

    Args:
        question: Query text; wrapped in a `models.Document` together with
            `model_handle` and sent as the query.
        course: Value the 'course' payload field must match. Defaults to
            'data-engineering-zoomcamp', the value that was previously hard-coded.
        limit: Maximum number of points requested. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        A list with the payload of each returned point, in the order the
        query returned them.
    """
    print('vector_search is used')

    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle
        ),
        # Restrict candidates to points whose 'course' payload equals `course`.
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=limit,
        with_payload=True
    )

    return [point.payload for point in query_points.points]

In [10]:
def rag(query):
    """Answer `query` using vector search for retrieval.

    This definition replaces the earlier `rag` in this notebook, which
    retrieved documents through `search` instead of `vector_search`.

    Args:
        query: The user's question, passed to both `vector_search` and `build_prompt`.

    Returns:
        Whatever `llm` returns for the built prompt.
    """
    retrieved_docs = vector_search(query)
    full_prompt = build_prompt(query, retrieved_docs)
    return llm(full_prompt)

In [15]:
# Same sample question as in the earlier call, now answered through vector_search.
rag('how do I run kafka?')

vector_search is used


"To run Kafka, you'll need to follow these steps:\n\n1. **For Java Kafka**: In your project directory, run the command:\n   ```\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n   This will start the producer. You can replace `JsonProducer.java` with other scripts like `JsonConsumer.java`, depending on what you intend to run.\n\n2. **Ensure Kafka Broker is Running**: If you encounter the error `kafka.errors.NoBrokersAvailable: NoBrokersAvailable`, it likely means your Kafka broker docker container is not working. You can check this by running `docker ps`. If it's not running, navigate to the docker compose yaml file directory and execute:\n   ```\n   docker compose up -d\n   ```\n   to start all instances.\n\n3. **Check Configuration**: Make sure that the `StreamsConfig.BOOTSTRAP_SERVERS_CONFIG` in your scripts is set to the correct server URL and that your cluster key and secrets are updated in the `Secrets.java` file. \n\nTh