In [1]:
pip install "pyiceberg[s3fs,sql-sqlite]"

Defaulting to user installation because normal site-packages is not writeable
Collecting pyiceberg[s3fs,sql-sqlite]
  Downloading pyiceberg-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting requests<3.0.0,>=2.20.0
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 KB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting strictyaml<2.0.0,>=1.7.0
  Downloading strictyaml-1.7.3-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.9/123.9 KB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tenacity<9.0.0,>=8.2.3
  Downloading tenacity-8.5.0-py3-none-any.whl (28 kB)
Collecting pydantic!=2.4.0,!=2.4.1,<3.0,>=2.0
  Downloading pydantic-2.9.2-py3-none-any.whl (434 kB)
[2K     [90m━━━

In [None]:
pip install numpy pandas

# Configure the SqlCatalog (using a SQLite catalog)

In [7]:
import os
from pyiceberg.catalog.sql import SqlCatalog

# Local directory that holds the SQLite catalog database file and is also
# passed to the catalog as its "warehouse" location.
warehouse_path = "/tmp/warehouse"

# The directory must exist before the SQLite file can be created inside it;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(warehouse_path, exist_ok=True)

# Catalog properties: a SQLite database URI plus a file:// warehouse location.
catalog_properties = {
    "uri": f"sqlite:///{warehouse_path}/pyiceberg_catalog.db",
    "warehouse": f"file://{warehouse_path}",
}

# Create the SQL-backed catalog, named "default".
catalog = SqlCatalog("default", **catalog_properties)


# Download Dataset

In [6]:
!curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -o /tmp/yellow_tripdata_2023-01.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 45.4M  100 45.4M    0     0  28.9M      0  0:00:01  0:00:01 --:--:-- 28.9M


# Load data into a PyArrow Table

In [8]:
pip install pyarrow

Defaulting to user installation because normal site-packages is not writeable
Collecting pyarrow
  Downloading pyarrow-18.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-18.0.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pyarrow.parquet as pq

# Read the downloaded Parquet file into an in-memory PyArrow table.
taxi_parquet_path = "/tmp/yellow_tripdata_2023-01.parquet"
df = pq.read_table(taxi_parquet_path)

In [6]:
# Inspect the column names and Arrow types read from the Parquet file.
df.schema

VendorID: int64
tpep_pickup_datetime: timestamp[us]
tpep_dropoff_datetime: timestamp[us]
passenger_count: double
trip_distance: double
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 2492

# Define the Iceberg Table Schema

In [None]:
from pyiceberg.schema import Schema, NestedField
from pyiceberg.types import LongType, TimestampType, DoubleType, StringType

# Column name and Iceberg type for every column, in the same order as the
# Arrow schema printed above. LongType is used for the 64-bit integer columns.
taxi_columns = [
    ("VendorID", LongType()),
    ("tpep_pickup_datetime", TimestampType()),
    ("tpep_dropoff_datetime", TimestampType()),
    ("passenger_count", DoubleType()),
    ("trip_distance", DoubleType()),
    ("RatecodeID", DoubleType()),
    ("store_and_fwd_flag", StringType()),
    ("PULocationID", LongType()),
    ("DOLocationID", LongType()),
    ("payment_type", LongType()),
    ("fare_amount", DoubleType()),
    ("extra", DoubleType()),
    ("mta_tax", DoubleType()),
    ("tip_amount", DoubleType()),
    ("tolls_amount", DoubleType()),
    ("improvement_surcharge", DoubleType()),
    ("total_amount", DoubleType()),
    ("congestion_surcharge", DoubleType()),
    ("airport_fee", DoubleType()),
]

# Build the Iceberg schema: field IDs are assigned sequentially starting at 1
# and every column is optional (required=False).
iceberg_schema = Schema(
    *(
        NestedField(id=field_id, name=column_name, field_type=column_type, required=False)
        for field_id, (column_name, column_type) in enumerate(taxi_columns, start=1)
    )
)


# Now create the table with the specified schema

In [None]:
from pyiceberg.exceptions import NamespaceAlreadyExistsError

# The table identifier below lives in the "default" namespace, which must exist
# before create_table is called; a brand-new catalog has no namespaces yet.
try:
    catalog.create_namespace("default")
except NamespaceAlreadyExistsError:
    # The SQLite catalog file persists on disk, so the namespace may already
    # be there from an earlier run of this notebook.
    pass

# Register the table in the catalog using the schema defined above.
table_ice = catalog.create_table(
    "default.taxi_dataset_ice",
    schema=iceberg_schema,
)

# Append data to the Iceberg table

In [25]:
# Append the rows of the Arrow table `df` to the Iceberg table.
table_ice.append(df)

# Read data from Iceberg Table

In [26]:
from pyiceberg.expressions import GreaterThanOrEqual
# Define a scan over the Iceberg table created above (`table_ice`):
# keep rows with trip_distance >= 10.0, project three columns, and cap the
# result at 100 rows.
scan = table_ice.scan(
    row_filter=GreaterThanOrEqual("trip_distance", 10.0),
    selected_fields=("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"),
    limit=100,
)

# Read data in Apache Arrow format

In [27]:
# Materialize the scan result as a PyArrow table.
sc_record = scan.to_arrow()

In [28]:
# Display the Arrow table: its schema followed by a preview of each column.
sc_record

pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[us]
tpep_dropoff_datetime: timestamp[us]
----
VendorID: [[2,2,1,2,2,...,2,2,2,2,2]]
tpep_pickup_datetime: [[2023-01-01 00:27:12.000000,2023-01-01 00:09:29.000000,2023-01-01 00:13:30.000000,2023-01-01 00:41:41.000000,2023-01-01 00:22:39.000000,...,2023-01-01 00:56:24.000000,2023-01-01 00:55:38.000000,2023-01-01 00:13:36.000000,2023-01-01 00:51:18.000000,2023-01-01 00:27:34.000000]]
tpep_dropoff_datetime: [[2023-01-01 00:49:56.000000,2023-01-01 00:29:23.000000,2023-01-01 00:44:00.000000,2023-01-01 01:19:32.000000,2023-01-01 01:30:45.000000,...,2023-01-01 01:26:29.000000,2023-01-01 01:25:34.000000,2023-01-01 00:48:23.000000,2023-01-01 01:11:18.000000,2023-01-01 01:05:05.000000]]

# Read data in Pandas

In [29]:
# Convert the result of the same scan to a pandas DataFrame.
sc_record_pandas = scan.to_pandas()
# Last expression of the cell, so the DataFrame is rendered as the cell output.
sc_record_pandas

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime
0,2,2023-01-01 00:27:12,2023-01-01 00:49:56
1,2,2023-01-01 00:09:29,2023-01-01 00:29:23
2,1,2023-01-01 00:13:30,2023-01-01 00:44:00
3,2,2023-01-01 00:41:41,2023-01-01 01:19:32
4,2,2023-01-01 00:22:39,2023-01-01 01:30:45
...,...,...,...
95,2,2023-01-01 00:56:24,2023-01-01 01:26:29
96,2,2023-01-01 00:55:38,2023-01-01 01:25:34
97,2,2023-01-01 00:13:36,2023-01-01 00:48:23
98,2,2023-01-01 00:51:18,2023-01-01 01:11:18


# Load Iceberg Tables from an already configured Catalog

In [15]:
from pyiceberg.catalog import load_catalog
# Re-open the existing SQLite-backed catalog by pointing load_catalog at the
# same database file that was configured at the top of this notebook.
catalog_dev = load_catalog("docs", **{"uri": f"sqlite:///{warehouse_path}/pyiceberg_catalog.db"})
# Load the table that this notebook registered in that catalog. The variable
# name is kept unchanged so any later cells that reference it keep working.
cust_table = catalog_dev.load_table("default.taxi_dataset_ice")