# Defining the LLM client

In [14]:
from dataclasses import dataclass, field

import yaml
from openai import AzureOpenAI

@dataclass
class LLMServiceConfig:
    """Connection settings for the LLM service used by this notebook.

    Attributes:
        host: Endpoint URL of the service, including the protocol.
        api_key: Secret key for the service. It is excluded from the generated
            repr so that displaying the object in a cell does not leak it.
        api_version: API version string handed to the client.
        model: Model name taken from the service configuration.
    """

    host: str
    api_key: str = field(repr=False)  # still compared and stored, just never printed
    api_version: str
    model: str

# Read the service settings from a YAML file four directories above the
# working directory, so the API key is not hard-coded in the notebook.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale.
with open("../../../../OPENAI_API_KEY.yaml", encoding="utf-8") as f:
    details = yaml.safe_load(f)["Glossary Terms Extraction Service"]

# The endpoint URL is assembled from the separately stored protocol and host.
service_config = LLMServiceConfig(
    host=f"{details['protocol']}://{details['host']}",
    api_key=details['api_key'],
    api_version=details['api_version'],
    model=details['model']
)

# Azure OpenAI client built from the settings loaded above.
client = AzureOpenAI(
    api_version=service_config.api_version,
    azure_endpoint=service_config.host,
    api_key=service_config.api_key
)

# DLT

## Q1

In [1]:
!pip install -q "dlt[qdrant]" "qdrant-client[fastembed]"

In [2]:
# Print the version reported by the installed dlt command-line tool.
!dlt --version

dlt 1.12.3


## Q2

In [13]:
import dlt
import os
import yaml
from openai import AzureOpenAI
import requests

In [7]:
@dlt.resource
def zoomcamp_data():
    """Yield every document from the workshop's documents.json, tagged with its course.

    The file is downloaded each time the resource is iterated. It is read as a
    list of course entries, each holding a 'course' name and a 'documents'
    list; the course name is copied onto every document before it is yielded.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    # Bound the wait and fail loudly on HTTP errors, rather than hanging or
    # trying to parse an error page as JSON.
    docs_response = requests.get(docs_url, timeout=30)
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    for course in documents_raw:
        course_name = course['course']

        for doc in course['documents']:
            # Flatten the nesting: each yielded row carries its own course.
            doc['course'] = course_name
            yield doc

In [15]:
from dlt.destinations import qdrant

# dlt destination backed by the "db.qdrant" path, given relative to the
# kernel's working directory.
qdrant_destination = qdrant(qd_path="db.qdrant")

In [16]:
# Assemble the pipeline that writes into the Qdrant destination, run the
# zoomcamp_data resource through it, and print the trace of that run.
pipeline = dlt.pipeline(
    pipeline_name="zoomcamp_pipeline",
    dataset_name="zoomcamp_tagged_data",
    destination=qdrant_destination,
)
load_info = pipeline.run(zoomcamp_data())
print(pipeline.last_trace)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.50it/s]


Run started at 2025-07-07 13:28:55.396693+00:00 and COMPLETED in 12.10 seconds with 4 steps.
Step extract COMPLETED in 2.68 seconds.

Load package 1751894941.367451 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.10 seconds.
Normalized data for the following tables:
- zoomcamp_data: 948 row(s)
- _dlt_pipeline_state: 1 row(s)

Load package 1751894941.367451 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs

Step load COMPLETED in 3.35 seconds.
Pipeline zoomcamp_pipeline load step completed in 3.34 seconds
1 load package(s) were loaded to destination qdrant and into dataset zoomcamp_tagged_data
The qdrant destination used /Users/vpankrat/Courses/LLMzoomcamp/cohorts/2025/workshops/dlt/db.qdrant location to store data
Load package 1751894941.367451 is LOADED and contains no failed jobs

Step run COMPLETED in 12.09 seconds.
Pipeline zoomcamp_pipeline load step completed in 3.34 seconds
1 load package

**Number of inserted rows: 948**

## Q3

In [18]:
import json

In [23]:
# Show the kernel's working directory, which relative paths such as
# 'db.qdrant' are resolved against.
print(os.getcwd())

/Users/vpankrat/Courses/LLMzoomcamp/cohorts/2025/workshops/dlt


In [29]:
# meta.json inside the local Qdrant folder lists each collection together
# with its vector configuration; parse it and pretty-print the result.
meta_path = 'db.qdrant/meta.json'
with open(meta_path) as meta_file:
    data = json.loads(meta_file.read())
print(json.dumps(data, indent=4))

{
    "collections": {
        "zoomcamp_tagged_data": {
            "vectors": {
                "fast-bge-small-en": {
                    "size": 384,
                    "distance": "Cosine",
                    "hnsw_config": null,
                    "quantization_config": null,
                    "on_disk": null,
                    "datatype": null,
                    "multivector_config": null
                }
            },
            "shard_number": null,
            "sharding_method": null,
            "replication_factor": null,
            "write_consistency_factor": null,
            "on_disk_payload": null,
            "hnsw_config": null,
            "wal_config": null,
            "optimizers_config": null,
            "init_from": null,
            "quantization_config": null,
            "sparse_vectors": null,
            "strict_mode_config": null
        },
        "zoomcamp_tagged_data__dlt_loads": {
            "vectors": {
                "fast-bge-small-e

**Model used: fast-bge-small-en**