In [1]:
#Import libraries
import os
import requests
import dlt
import lancedb
from dlt.destinations.adapters import lancedb_adapter

In [2]:
# Get data using requests URL

data = requests.get(
    "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1"
).json()

In [3]:
# A resource function that yields documents to be loaded
@dlt.resource
def qa_docs():
    for course in data:
        yield course["documents"]

# Configure and run the pipeline with a name, destination, and dataset name
pipeline = dlt.pipeline(pipeline_name="from_json", destination="lancedb", dataset_name="qanda")
load_info = pipeline.run(qa_docs, table_name="documents")

print(load_info)

Pipeline from_json load step completed in 0.17 seconds
1 load package(s) were loaded to destination LanceDB and into dataset qanda
The LanceDB destination used <dlt.destinations.impl.lancedb.configuration.LanceDBCredentials object at 0x7fc9921a4430> location to store data
Load package 1721568921.7144134 is LOADED and contains no failed jobs


In [4]:
# Connect to LanceDB
db = lancedb.connect("./.lancedb")
print(db.table_names())

['qanda____dlt_loads', 'qanda____dlt_pipeline_state', 'qanda____dlt_version', 'qanda___dltSentinelTable', 'qanda___documents', 'qanda_embedded____dlt_loads', 'qanda_embedded____dlt_pipeline_state', 'qanda_embedded____dlt_version', 'qanda_embedded___dltSentinelTable', 'qanda_embedded___documents']


In [5]:
db_table = db.open_table("qanda___documents")

db_table.to_pandas()

Unnamed: 0,id__,text,section,question,_dlt_load_id,_dlt_id
0,65735d27-fa75-592d-8f0c-1dc8b6f1a2f0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,1721567581.230033,BxSs1zmVZlG5yQ
1,860a33a5-7c1c-554a-a71d-7fc850c31d40,GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,1721567581.230033,Ib1gRaPWLMuy+A
2,b1ab99b9-53e4-5ae8-9ab0-19ffeec89327,"Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,1721567581.230033,DPx/UJd+owySmg
3,6c9890dc-05fa-5ff7-9f65-80e93d417a6e,You don't need it. You're accepted. You can al...,General course-related questions,Course - I have registered for the Data Engine...,1721567581.230033,BwN5BMiZhyXtkg
4,adc506f0-c9fc-5961-a207-1b863a9bd476,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,1721567581.230033,9osD9hG4wICBvQ
...,...,...,...,...,...,...
1891,d7fb7f43-2f8f-5599-a4a4-e15ad8a1ecaf,Problem description\nThis is the step in the c...,Module 6: Best practices,Github actions: Permission denied error when e...,1721568921.7144134,55q5xf8UYw3oJw
1892,f999585f-f8bc-565c-899c-aefeab097c3f,Problem description\nWhen a docker-compose fil...,Module 6: Best practices,Managing Multiple Docker Containers with docke...,1721568921.7144134,YIq1JrbVx6vNUQ
1893,9fca796d-d8bb-57d4-a15e-25946506ebb9,Problem description\nIf you are having problem...,Module 6: Best practices,AWS regions need to match docker-compose,1721568921.7144134,aDXIZzuym5Yi1A
1894,c5c57b41-6120-5dec-b075-7ede0ac9f82e,Problem description\nPre-commit command was fa...,Module 6: Best practices,Isort Pre-commit,1721568921.7144134,Gdt+/4STkSNHEA


#### **Load and Embed Data**

In [6]:
# Define the embedding model via ENV variables
os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL_PROVIDER"] = "sentence-transformers"
os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL"] = "all-MiniLM-L6-v2"

# Confirgure and run the pipeline specifying the content to embed using the lancedb_adapter
pipeline = dlt.pipeline(pipeline_name="from_json_embedded", destination="lancedb", dataset_name="qanda_embedded")

load_info = pipeline.run(lancedb_adapter(qa_docs, embed=["text", "question"]), table_name="documents")
print(load_info)

Pipeline from_json_embedded load step completed in 6.88 seconds
1 load package(s) were loaded to destination LanceDB and into dataset qanda_embedded
The LanceDB destination used <dlt.destinations.impl.lancedb.configuration.LanceDBCredentials object at 0x7fc9c80a43a0> location to store data
Load package 1721568922.6106348 is LOADED and contains no failed jobs


In [7]:
db = lancedb.connect("./.lancedb")
print(db.table_names())

['qanda____dlt_loads', 'qanda____dlt_pipeline_state', 'qanda____dlt_version', 'qanda___dltSentinelTable', 'qanda___documents', 'qanda_embedded____dlt_loads', 'qanda_embedded____dlt_pipeline_state', 'qanda_embedded____dlt_version', 'qanda_embedded___dltSentinelTable', 'qanda_embedded___documents']


In [8]:
db_table = db.open_table("qanda_embedded___documents")

# Convert to a pandas df
db_table.to_pandas()

Unnamed: 0,id__,vector__,text,section,question,_dlt_load_id,_dlt_id
0,2ae76761-6a33-5b23-a3e5-eb1dbc335d1e,"[-0.00035093643, -0.062014256, -0.037999917, 0...",The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,1721568308.926745,W2T8ZSwSD1UwXg
1,c3631277-5a79-575c-9cb6-20194013ba61,"[0.020011364, -0.011535534, 0.013017162, -0.00...",GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,1721568308.926745,XLluZDhv1HAk3A
2,99db4267-6560-5dd0-96e2-06da87567c70,"[0.0148575725, -0.06664996, -0.013571216, 0.02...","Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,1721568308.926745,6yOvcaji/idMBg
3,c0032292-678c-5af9-9960-a2a23deb3403,"[-0.023312073, -0.09461492, 0.056361627, -0.00...",You don't need it. You're accepted. You can al...,General course-related questions,Course - I have registered for the Data Engine...,1721568308.926745,xB2zlw1h/XFZxQ
4,8faabaf7-f916-586b-a49c-5dc767ddd938,"[0.026537647, -0.017796647, 0.00211565, 0.0064...",You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,1721568308.926745,KR+0a8bPFzX99g
...,...,...,...,...,...,...,...
1891,efa1c97c-b129-5f9e-a06e-a2b0f6538670,"[0.016619338, -0.033603135, -0.09334716, -0.02...",Problem description\nThis is the step in the c...,Module 6: Best practices,Github actions: Permission denied error when e...,1721568922.6106348,4MqSc5RDgup8zw
1892,b416b512-684b-5e77-b653-40033e785f69,"[0.026872844, -0.0019949211, 0.008369117, -0.0...",Problem description\nWhen a docker-compose fil...,Module 6: Best practices,Managing Multiple Docker Containers with docke...,1721568922.6106348,bjbzk8xSFn8B1A
1893,a6741ee5-f1cc-50f3-b268-f7cbdf0770f2,"[0.035137586, 0.056265555, 0.024428478, -0.065...",Problem description\nIf you are having problem...,Module 6: Best practices,AWS regions need to match docker-compose,1721568922.6106348,ai12PpE+lHnH8Q
1894,dd8146e3-d884-5813-a432-b002a8827d2d,"[0.033809785, -0.0031219632, 0.0017484893, 0.0...",Problem description\nPre-commit command was fa...,Module 6: Best practices,Isort Pre-commit,1721568922.6106348,xuoTtZ+YmuuKgA


dlt automatically stores the vectors in the vector database LanceDB