# Homework - Workshop: Open-Source Data Ingestion


**Install Required Python Libraries:**

`dlt[qdrant]`: This installs the dlt (data load tool) library together with the optional dependencies it needs to work with Qdrant. dlt is a Python library for building data pipelines; the `[qdrant]` extra is required because this notebook uses the Qdrant vector database as the pipeline destination.

`qdrant-client[fastembed]`: This installs qdrant-client, the official Python client for interacting with a Qdrant vector database. The `[fastembed]` extra also installs fastembed, a library for efficiently generating embeddings (numerical representations of text or other data).


---



In [None]:
!pip install -q "dlt[qdrant]" "qdrant-client[fastembed]"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/353.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m353.7/353.7 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m986.0/986.0 kB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Show DLT version


---



In [None]:
!pip show dlt

Name: dlt
Version: 1.13.0
Summary: dlt is an open-source python-first scalable data loading library that does not require any backend to run.
Home-page: https://github.com/dlt-hub
Author: 
Author-email: "dltHub Inc." <services@dlthub.com>
License: 
Location: /usr/local/lib/python3.11/dist-packages
Requires: click, fsspec, gitpython, giturlparse, hexbytes, humanize, jsonpath-ng, orjson, packaging, pathvalidate, pendulum, pluggy, pytz, pyyaml, requests, requirements-parser, rich-argparse, semver, setuptools, simplejson, sqlglot, tenacity, tomlkit, typing-extensions, tzdata
Required-by: 


Import Required Libraries

---



In [None]:
import dlt
import requests

`@dlt.resource` declares the function as a dlt resource.

`write_disposition` defines how the data is loaded at the destination; here it is set to `replace`, so each run replaces the previously loaded data.

---



In [None]:
@dlt.resource(name='zoomcamp_data', write_disposition='replace')
def zoomcamp_data():
    """Yield every document from the workshop's documents.json, tagged with its course.

    The JSON file is a list of course entries; each entry has a 'course' name
    and a list of 'documents'. Every document is yielded individually with the
    course name copied into its 'course' key.

    Yields:
        dict: one document, with the added 'course' key.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    # A timeout keeps the pipeline from hanging forever on a stalled connection.
    docs_response = requests.get(docs_url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    for course in documents_raw:
        course_name = course['course']

        for doc in course['documents']:
            # Flatten the nesting: each document carries its own course name.
            doc['course'] = course_name
            yield doc

Define the destination, which is the Qdrant vector database. Data will be stored in the `db.qdrant` directory, created in the current working directory.

---



In [None]:
from dlt.destinations import qdrant

# Qdrant destination that stores its data on disk under the given path.
destination_vector_db = qdrant(qd_path="db.qdrant")

Define the pipeline

In [None]:
# Pipeline that loads into the Qdrant destination under the
# "zoomcamp_tagged_data" dataset.
llm_pipeline = dlt.pipeline(
    dataset_name="zoomcamp_tagged_data",
    destination=destination_vector_db,
    pipeline_name="zoomcamp_pipeline",
)

Run the pipeline

---



In [None]:
# Run the resource through the pipeline and keep the returned load summary.
load_info = llm_pipeline.run(zoomcamp_data())

# The trace reports each step of the run (extract, normalize, load) and its timing.
print(llm_pipeline.last_trace)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model_optimized.onnx:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Run started at 2025-07-09 22:07:43.318980+00:00 and COMPLETED in 17.70 seconds with 4 steps.
Step extract COMPLETED in 0.63 seconds.

Load package 1752098867.6977303 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.17 seconds.
Normalized data for the following tables:
- zoomcamp_data: 948 row(s)
- _dlt_pipeline_state: 1 row(s)

Load package 1752098867.6977303 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs

Step load COMPLETED in 12.53 seconds.
Pipeline zoomcamp_pipeline load step completed in 12.49 seconds
1 load package(s) were loaded to destination qdrant and into dataset zoomcamp_tagged_data
The qdrant destination used /content/db.qdrant location to store data
Load package 1752098867.6977303 is LOADED and contains no failed jobs

Step run COMPLETED in 17.70 seconds.
Pipeline zoomcamp_pipeline load step completed in 12.49 seconds
1 load package(s) were loaded to destination qdrant and into d

Get Load Info

---



In [None]:
# Show the summary object returned by llm_pipeline.run().
print(load_info)

Pipeline zoomcamp_pipeline load step completed in 12.49 seconds
1 load package(s) were loaded to destination qdrant and into dataset zoomcamp_tagged_data
The qdrant destination used /content/db.qdrant location to store data
Load package 1752098867.6977303 is LOADED and contains no failed jobs


In [None]:
# Once the dlt pipeline run has finished, list the current working directory
# to check that db.qdrant was created there.
!ls

db.qdrant  sample_data


In [None]:
# If 'db.qdrant' shows up in the previous listing, inspect what the
# db.qdrant directory contains:
!ls db.qdrant

collection  meta.json


In [None]:
# Confirm that the metadata file exists at the expected path
# (this lists the meta.json file itself, not a subdirectory).
!ls db.qdrant/meta.json

db.qdrant/meta.json


Find the embedding model

In [None]:
import json

# Path to the metadata file inside the db.qdrant directory; change it if the
# store was created somewhere else.
meta_file_path = 'db.qdrant/meta.json'

try:
    with open(meta_file_path, 'r') as meta_file:
        meta_data = json.loads(meta_file.read())
except FileNotFoundError:
    print(f"Error: {meta_file_path} not found. Please check the path.")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {meta_file_path}.")
else:
    # Pretty-print the parsed metadata so the collection settings are easy to read.
    print(json.dumps(meta_data, indent=2))

{
  "collections": {
    "zoomcamp_tagged_data": {
      "vectors": {
        "fast-bge-small-en": {
          "size": 384,
          "distance": "Cosine",
          "hnsw_config": null,
          "quantization_config": null,
          "on_disk": null,
          "datatype": null,
          "multivector_config": null
        }
      },
      "shard_number": null,
      "sharding_method": null,
      "replication_factor": null,
      "write_consistency_factor": null,
      "on_disk_payload": null,
      "hnsw_config": null,
      "wal_config": null,
      "optimizers_config": null,
      "init_from": null,
      "quantization_config": null,
      "sparse_vectors": null,
      "strict_mode_config": null
    },
    "zoomcamp_tagged_data__dlt_pipeline_state": {
      "vectors": {
        "fast-bge-small-en": {
          "size": 384,
          "distance": "Cosine",
          "hnsw_config": null,
          "quantization_config": null,
          "on_disk": null,
          "datatype": null,
   

In [None]:
# Vector settings of the 'zoomcamp_tagged_data' collection, as read from meta.json.
vector_info = meta_data['collections']['zoomcamp_tagged_data']['vectors']

# The dictionary holds a single entry ('fast-bge-small-en'), so its first key
# is the name we are looking for.
keys = vector_info.keys()
first_key = tuple(keys)[0]

print(first_key)

fast-bge-small-en
