In [2]:
from qdrant_client import QdrantClient, models
from openai import OpenAI
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import requests

# Download the FAQ documents: a JSON list of courses, each carrying its own list of documents.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail here with a clear HTTP error instead of an opaque JSON decode error on the next line.
docs_response.raise_for_status()
docs_raw = docs_response.json()

In [4]:
# Flatten the per-course document lists into one list, tagging every document
# (in place) with the name of the course it belongs to.
documents = []

for course in docs_raw:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
    documents.extend(course['documents'])

In [5]:
# Address of the Qdrant instance; this notebook expects one running locally.
QDRANT_URL = "http://localhost:6333"

qd_client = QdrantClient(QDRANT_URL)
# No arguments are passed, so the OpenAI client uses the library's default configuration.
openai_client = OpenAI()

In [6]:
# Collection to query and the embedding model handle used for the query text.
collection_name = "zoomcamp-rag"
model_handle = "jinaai/jina-embeddings-v2-small-en"

# Display the collection info to confirm the collection exists before searching it.
qd_client.get_collection(collection_name=collection_name)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=948, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=512, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), qua

In [7]:
def search_in_course(query, course, limit=3):
    """Run a vector search over the collection, restricted to one or more courses.

    Args:
        query: Text to search for. It is wrapped in ``models.Document`` together
            with ``model_handle`` so the query is embedded with that model.
        course: A single course name, a list/tuple/set of course names, or
            ``None`` to search across all three zoomcamp courses listed below.
        limit: Maximum number of closest matches to return.

    Returns:
        Whatever ``qd_client.query_points`` returns; callers read the matches
        from its ``.points`` attribute.
    """
    if course is None:
        # No course requested: search across every known course.
        course_list = ['machine-learning-zoomcamp', 'data-engineering-zoomcamp', 'mlops-zoomcamp']
    elif isinstance(course, (list, tuple, set, frozenset)):
        # Several courses given: pass them through instead of nesting them in another list.
        course_list = list(course)
    else:
        course_list = [course]

    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(  # embed the query text locally with the model in `model_handle`
            text=query,
            model=model_handle
        ),
        query_filter=models.Filter(  # keep only points whose "course" payload is in course_list
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchAny(any=course_list)
                )
            ]
        ),
        limit=limit,  # top closest matches
        with_payload=True  # include the stored payload (metadata) in the results
    )

    return results

In [8]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the question and the retrieved points.

    Args:
        query: The user's question, inserted under "Question:".
        search_results: Object exposing ``.points``; each point's ``payload``
            must contain the keys ``'section'`` and ``'text'``.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    # One "section/answer" entry per retrieved point, each followed by a blank line.
    context = "".join(
        f"section: {point.payload['section']}\nanswer:{point.payload['text']}\n\n"
        for point in search_results.points
    )
    # Only the outer whitespace is stripped; interior lines keep their indentation.
    template = """
        You're a course teacher assistant. Answer the question based on the context.
        Use only the facts from the context. If relevant information is missing, say you do not know.
        Question:
        {query}
        Context:
        {context}
        """.strip()
    filled = template.format(query=query, context=context)
    return filled.strip()

In [9]:
def llm(client, prompt, model='gpt-4.1-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        client: Object exposing ``chat.completions.create`` (the OpenAI client here).
        prompt: Full prompt text, sent as the content of one ``user`` message.
        model: Name of the chat model to call.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    user_message = {'role': 'user', 'content': prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [11]:
def rag(query, course=None, verbose_search=False, verbose_prompt=False):
    """Answer ``query`` by retrieving course documents and passing them to the LLM.

    Args:
        query: The user's question. It drives both the search and the prompt.
        course: Course name to restrict the search to, or ``None`` to let
            ``search_in_course`` apply its own default.
        verbose_search: If true, print the retrieved points.
        verbose_prompt: If true, print the prompt sent to the LLM.

    Returns:
        The text returned by ``llm`` for the built prompt.
    """
    # Use the `query` argument, never a notebook-level variable, so the function
    # works on a fresh kernel and answers the question it was actually given.
    search_results = search_in_course(query, course)
    if verbose_search:
        print(search_results.points)
    prompt = build_prompt(query, search_results)
    if verbose_prompt:
        print(prompt)
    return llm(openai_client, prompt)

In [12]:
# Ask a sample question; show the raw search hits but not the assembled prompt.
q = "How to build model on dbt project?"
output = rag(q, verbose_search=True, verbose_prompt=False)

[ScoredPoint(id=295, version=0, score=0.9089782, payload={'text': 'Before you can develop some data model on dbt, you should create development environment and set some parameter on it. After the model being developed, we should also create deployment environment to create and run some jobs.', 'section': 'Module 4: analytics engineering with dbt', 'course': 'data-engineering-zoomcamp'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=304, version=0, score=0.8591342, payload={'text': 'When running trying to run the dbt project on prod there is some things you need to do and check on your own:\nFirst Make the pull request and Merge the branch into the main.\nMake sure you have the latest version, if you made changes to the repo in another place.\nCheck if the dbt_project.yml file is accessible to the project, if not check this solution (Dbt: This dbt Cloud run was cancelled because a valid dbt project was not found.).\nCheck if the name you gave to the dataset on BigQuery 

In [13]:
# print() so the newlines in the answer are rendered; a bare `output` would show the string repr.
print(output)

To build a model on a dbt project, you should follow these steps based on the provided context:

1. **Set up your development environment** and configure necessary parameters before starting to develop the data model.
2. **Develop the model** within this environment.
3. After development, **create a deployment environment** to schedule and run dbt jobs.
4. When ready to deploy to production, do the following:
   - Make a pull request and merge your branch into the main branch.
   - Ensure you have the latest version of the project (especially if changes were made elsewhere).
   - Verify that the `dbt_project.yml` file is accessible to the project.
   - Confirm that the dataset name you specified in BigQuery matches the dataset configured in the production environment on dbt Cloud.
5. Use dbt’s **"ref" keyword** in your SQL models to manage dependencies between models. This helps dbt understand execution order automatically.

This is the process based on the available information from t