In [1]:
import requests
# Display the installed requests version so the run can be reproduced later
requests.__version__

'2.32.3'

In [3]:
# Eagerly fetch the first few pages of the paginated API into a list.
# Every fetched page stays in memory, so this form is only meant for small tests.

BASE_API_URL = "https://us-central1-dlthub-analytics.cloudfunctions.net/data_engineering_zoomcamp_api"
fetch_pages = 2  # upper bound on the number of pages downloaded while testing

res_data = []  # one entry per fetched page; each entry is that page's decoded JSON

page_number = 1
while True:
    params = {'page': page_number}
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(BASE_API_URL, params=params, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error body as data.
    response.raise_for_status()
    page_data = response.json()

    # An empty page signals that the API has no more data.
    if not page_data:
        break

    res_data.append(page_data)
    page_number += 1

    # limit the number of pages for testing
    if page_number > fetch_pages:
        break
res_data[0]


[{'End_Lat': 40.742963,
  'End_Lon': -73.980072,
  'Fare_Amt': 45.0,
  'Passenger_Count': 1,
  'Payment_Type': 'Credit',
  'Rate_Code': None,
  'Start_Lat': 40.641525,
  'Start_Lon': -73.787442,
  'Tip_Amt': 9.0,
  'Tolls_Amt': 4.15,
  'Total_Amt': 58.15,
  'Trip_Distance': 17.52,
  'Trip_Dropoff_DateTime': '2009-06-14 23:48:00',
  'Trip_Pickup_DateTime': '2009-06-14 23:23:00',
  'mta_tax': None,
  'store_and_forward': None,
  'surcharge': 0.0,
  'vendor_name': 'VTS'},
 {'End_Lat': 40.740187,
  'End_Lon': -74.005698,
  'Fare_Amt': 6.5,
  'Passenger_Count': 1,
  'Payment_Type': 'Credit',
  'Rate_Code': None,
  'Start_Lat': 40.722065,
  'Start_Lon': -74.009767,
  'Tip_Amt': 1.0,
  'Tolls_Amt': 0.0,
  'Total_Amt': 8.5,
  'Trip_Distance': 1.56,
  'Trip_Dropoff_DateTime': '2009-06-18 17:43:00',
  'Trip_Pickup_DateTime': '2009-06-18 17:35:00',
  'mta_tax': None,
  'store_and_forward': None,
  'surcharge': 1.0,
  'vendor_name': 'VTS'},
 {'End_Lat': 40.718043,
  'End_Lon': -74.004745,
  'Fare_

<hr>

In [3]:
# Example of a generator built with `yield`
def number_well():
    """Endless counter: hands out 0, 1, 2, ... one value per `next()` call."""
    upcoming = 0
    while True:
        # Execution pauses at the yield until the consumer asks for another value
        yield upcoming
        upcoming = upcoming + 1
nw = number_well()


In [4]:
# Advance the generator by one step; the first call returns 0 and each
# later call returns the next integer
next(nw)

0

<hr>

In [5]:
# Use Python's generator feature to yield each page sequentially to avoid memory overload

def paginated_getter():
    """Yield the API's pages one at a time, starting from page 1.

    Each yielded item is the decoded JSON body of one page. Iteration ends
    when a page comes back empty, or when a request / JSON decode fails; in
    the failure case the error is printed and the generator simply stops, so
    pages that were already yielded are unaffected.
    """
    page_number = 1
    while True:
        params = {'page': page_number}
        # Guard only the network call and decoding. Keeping the `yield`
        # outside the try block means unrelated errors are never swallowed.
        try:
            # A timeout keeps a stalled connection from blocking forever.
            response = requests.get(BASE_API_URL, params=params, timeout=30)
            response.raise_for_status()
            page_json = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures on older requests versions.
            print(e)
            break

        print(f'Got page {page_number} with {len(page_json)} records')

        # An empty page means there is nothing left to fetch.
        if not page_json:
            break

        yield page_json
        page_number += 1
res1 = paginated_getter()

In [6]:
# Pull the first page from the generator (the HTTP request happens on this
# call, not when the generator was created) and show its first record
next(res1)[0]

Got page 1 with 1000 records


{'End_Lat': 40.742963,
 'End_Lon': -73.980072,
 'Fare_Amt': 45.0,
 'Passenger_Count': 1,
 'Payment_Type': 'Credit',
 'Rate_Code': None,
 'Start_Lat': 40.641525,
 'Start_Lon': -73.787442,
 'Tip_Amt': 9.0,
 'Tolls_Amt': 4.15,
 'Total_Amt': 58.15,
 'Trip_Distance': 17.52,
 'Trip_Dropoff_DateTime': '2009-06-14 23:48:00',
 'Trip_Pickup_DateTime': '2009-06-14 23:23:00',
 'mta_tax': None,
 'store_and_forward': None,
 'surcharge': 0.0,
 'vendor_name': 'VTS'}

<hr>

In [7]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator


def paginated_getter():
    """Yield pages of taxi-ride records through dlt's RESTClient.

    NOTE: this rebinds the name `paginated_getter`, replacing the
    requests-based generator defined in an earlier cell.
    """
    # Page-number pagination: pages are numbered 1, 2, 3, ... starting at 1.
    # The API provides no total page count, so pagination should stop when a
    # page contains no result items.
    page_paginator = PageNumberPaginator(base_page=1, total_path=None)

    client = RESTClient(
        base_url="https://us-central1-dlthub-analytics.cloudfunctions.net",
        paginator=page_paginator,
    )

    # Hand pages to the caller one by one instead of collecting them in a list;
    # "data_engineering_zoomcamp_api" is the endpoint serving the taxi ride data.
    yield from client.paginate("data_engineering_zoomcamp_api")


# Print only the first page, then stop iterating
for page_data in paginated_getter():
    print(page_data)
    break

[{'End_Lat': 40.742963, 'End_Lon': -73.980072, 'Fare_Amt': 45.0, 'Passenger_Count': 1, 'Payment_Type': 'Credit', 'Rate_Code': None, 'Start_Lat': 40.641525, 'Start_Lon': -73.787442, 'Tip_Amt': 9.0, 'Tolls_Amt': 4.15, 'Total_Amt': 58.15, 'Trip_Distance': 17.52, 'Trip_Dropoff_DateTime': '2009-06-14 23:48:00', 'Trip_Pickup_DateTime': '2009-06-14 23:23:00', 'mta_tax': None, 'store_and_forward': None, 'surcharge': 0.0, 'vendor_name': 'VTS'}, {'End_Lat': 40.740187, 'End_Lon': -74.005698, 'Fare_Amt': 6.5, 'Passenger_Count': 1, 'Payment_Type': 'Credit', 'Rate_Code': None, 'Start_Lat': 40.722065, 'Start_Lon': -74.009767, 'Tip_Amt': 1.0, 'Tolls_Amt': 0.0, 'Total_Amt': 8.5, 'Trip_Distance': 1.56, 'Trip_Dropoff_DateTime': '2009-06-18 17:43:00', 'Trip_Pickup_DateTime': '2009-06-18 17:35:00', 'mta_tax': None, 'store_and_forward': None, 'surcharge': 1.0, 'vendor_name': 'VTS'}, {'End_Lat': 40.718043, 'End_Lon': -74.004745, 'Fare_Amt': 12.5, 'Passenger_Count': 5, 'Payment_Type': 'Credit', 'Rate_Code': N

<hr>

In [18]:
# Sample ride record with nested dicts (`time`, `coordinates`) and a nested
# list of dicts (`passengers`)
data = [
    {
        "vendor_name": "VTS",
        "record_hash": "b00361a396177a9cb410ff61f20015ad",
        "time": {"pickup": "2009-06-14 23:23:00", "dropoff": "2009-06-14 23:48:00"},
        "coordinates": {
            "start": {"lon": -73.787442, "lat": 40.641525},
            "end": {"lon": -73.980072, "lat": 40.742963},
        },
        "passengers": [
            {"name": "John", "rating": 4.9},
            {"name": "Jack", "rating": 3.9},
        ],
    },
]

In [19]:
import dlt

# Define a dlt pipeline with automatic normalization
pipeline = dlt.pipeline(
    pipeline_name="ny_taxi_data",  # the local DuckDB file is named after this (ny_taxi_data.duckdb)
    destination="duckdb", 
    dataset_name="taxi_rides", # dataset the tables are loaded into (not the database file name)
)
pipeline

<dlt.pipeline.pipeline.Pipeline at 0x7fcf9008b400>

In [20]:
# Run the pipeline with raw nested data.
# The nested `passengers` list ends up in its own child table
# (`rides__passengers`) next to the parent `rides` table.
info = pipeline.run(
    data,
    table_name="rides", # Table name
    # NOTE(review): "replace" presumably overwrites the table's existing rows
    # on every run instead of appending; confirm against the dlt docs
    write_disposition="replace")

# Print the load summary
print(info)

Pipeline ny_taxi_data load step completed in 0.10 seconds
1 load package(s) were loaded to destination duckdb and into dataset taxi_rides
The duckdb destination used duckdb:////home/hyderreza/codehub/zoomcamp/dlt/ny_taxi_data.duckdb location to store data
Load package 1742176828.350733 is LOADED and contains no failed jobs


In [21]:
# Show the trace of the most recent run: status and timing of each step
# (extract, normalize, load) plus the row counts per normalized table
print(pipeline.last_trace)

Run started at 2025-03-17 02:00:28.300072+00:00 and COMPLETED in 0.20 seconds with 4 steps.
Step extract COMPLETED in 0.03 seconds.

Load package 1742176828.350733 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.02 seconds.
Normalized data for the following tables:
- rides: 1 row(s)
- rides__passengers: 2 row(s)

Load package 1742176828.350733 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs

Step load COMPLETED in 0.11 seconds.
Pipeline ny_taxi_data load step completed in 0.10 seconds
1 load package(s) were loaded to destination duckdb and into dataset taxi_rides
The duckdb destination used duckdb:////home/hyderreza/codehub/zoomcamp/dlt/ny_taxi_data.duckdb location to store data
Load package 1742176828.350733 is LOADED and contains no failed jobs

Step run COMPLETED in 0.20 seconds.
Pipeline ny_taxi_data load step completed in 0.10 seconds
1 load package(s) were loaded to destination duckdb an

In [None]:
# Read the parent `rides` table back as a pandas DataFrame; the nested dict
# fields show up as flattened columns such as `time__pickup` and
# `coordinates__start__lon`
pipeline.dataset(dataset_type="default").rides.df()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   vendor_name              1 non-null      object             
 1   record_hash              1 non-null      object             
 2   time__pickup             1 non-null      datetime64[us, UTC]
 3   time__dropoff            1 non-null      datetime64[us, UTC]
 4   coordinates__start__lon  1 non-null      float64            
 5   coordinates__start__lat  1 non-null      float64            
 6   coordinates__end__lon    1 non-null      float64            
 7   coordinates__end__lat    1 non-null      float64            
 8   _dlt_load_id             1 non-null      object             
 9   _dlt_id                  1 non-null      object             
dtypes: datetime64[us, UTC](2), float64(4), object(4)
memory usage: 208.0+ bytes


In [None]:
# Read the child table built from the nested `passengers` list.
# NOTE(review): `_dlt_parent_id` presumably points at the parent row's
# `_dlt_id` in `rides`; confirm against the dlt docs
pipeline.dataset().rides__passengers.df()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            2 non-null      object 
 1   rating          2 non-null      float64
 2   _dlt_parent_id  2 non-null      object 
 3   _dlt_list_idx   2 non-null      int64  
 4   _dlt_id         2 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 208.0+ bytes


In [27]:
# Display the nested input record for comparison with the normalized tables
data

[{'vendor_name': 'VTS',
  'record_hash': 'b00361a396177a9cb410ff61f20015ad',
  'time': {'pickup': '2009-06-14 23:23:00', 'dropoff': '2009-06-14 23:48:00'},
  'coordinates': {'start': {'lon': -73.787442, 'lat': 40.641525},
   'end': {'lon': -73.980072, 'lat': 40.742963}},
  'passengers': [{'name': 'John', 'rating': 4.9},
   {'name': 'Jack', 'rating': 3.9}]}]

In [7]:
import duckdb

# 1. Open (or create) the file-backed DuckDB database "ny_taxi_manual.db".
#    A file path is passed to connect(), so this database is stored on disk
#    rather than in memory.
conn = duckdb.connect("ny_taxi_manual.db")

# 2. Create the rides Table
# Since our dataset has nested structures, we must manually flatten it before inserting data.
# IF NOT EXISTS leaves an already existing table (and its rows) untouched, and
# record_hash is declared as the PRIMARY KEY.
conn.execute("""
CREATE TABLE IF NOT EXISTS rides (
    record_hash TEXT PRIMARY KEY,
    vendor_name TEXT,
    pickup_time TIMESTAMP,
    dropoff_time TIMESTAMP,
    start_lon DOUBLE,
    start_lat DOUBLE,
    end_lon DOUBLE,
    end_lat DOUBLE
);
""")


<duckdb.duckdb.DuckDBPyConnection at 0x7fbae00465b0>

In [8]:
# Inspect what the table holds before inserting anything
existing_data = conn.execute("SELECT * FROM rides").df()
existing_data

Unnamed: 0,record_hash,vendor_name,pickup_time,dropoff_time,start_lon,start_lat,end_lon,end_lat


In [9]:

# 3. Insert Data Manually
# The JSON record contains nested fields, so each ride has to be unpacked into
# a flat tuple before it can be inserted into DuckDB.
data = [
    {
        "vendor_name": "VTS",
        "record_hash": "b00361a396177a9cb410ff61f20015ad",
        "time": {"pickup": "2009-06-14 23:23:00", "dropoff": "2009-06-14 23:48:00"},
        "coordinates": {
            "start": {"lon": -73.787442, "lat": 40.641525},
            "end": {"lon": -73.980072, "lat": 40.742963},
        },
    },
]

# One tuple per ride, ordered as:
# (record_hash, vendor_name, pickup, dropoff, start_lon, start_lat, end_lon, end_lat)
flattened_data = [
    (
        ride["record_hash"],
        ride["vendor_name"],
        *(ride["time"][moment] for moment in ("pickup", "dropoff")),
        *(
            ride["coordinates"][endpoint][axis]
            for endpoint in ("start", "end")
            for axis in ("lon", "lat")
        ),
    )
    for ride in data
]

flattened_data

[('b00361a396177a9cb410ff61f20015ad',
  'VTS',
  '2009-06-14 23:23:00',
  '2009-06-14 23:48:00',
  -73.787442,
  40.641525,
  -73.980072,
  40.742963)]

In [10]:

# Insert into DuckDB
# record_hash is the table's PRIMARY KEY and the database file persists between
# runs, so a plain INSERT raises a constraint error when this cell is re-run.
# ON CONFLICT ... DO NOTHING skips rows whose record_hash is already stored,
# which makes the cell safe to execute repeatedly.
conn.executemany("""
INSERT INTO rides (record_hash, vendor_name, pickup_time, dropoff_time, start_lon, start_lat, end_lon, end_lat)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT (record_hash) DO NOTHING
""", flattened_data)

print("Data successfully loaded into DuckDB!")

Data successfully loaded into DuckDB!


In [11]:


# 4. Query Data in DuckDB
# Now that the data is loaded, we can query it using DuckDB’s SQL engine.
df = conn.execute("SELECT * FROM rides").df()

# The result is already materialized in `df`, so the connection can be closed;
# any later cell that needs the database has to reconnect first.
conn.close()
df

Unnamed: 0,record_hash,vendor_name,pickup_time,dropoff_time,start_lon,start_lat,end_lon,end_lat
0,b00361a396177a9cb410ff61f20015ad,VTS,2009-06-14 23:23:00,2009-06-14 23:48:00,-73.787442,40.641525,-73.980072,40.742963
