In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by the rest of the notebook: Delta Lake plus a
# Unity Catalog-backed catalog named `nyc_taxi`, which is made the default.
# NOTE(review): an earlier note here said to remove the Delta extension because
# it conflicts with Unity Catalog's catalog routing, yet the extension is still
# configured below — confirm which of the two is intended.
spark = (
    SparkSession.builder.appName("BronzeIngestion")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    # Delta and Unity Catalog connector jars are resolved at session start-up.
    .config(
        "spark.jars.packages",
        "io.delta:delta-spark_2.13:4.0.0,io.unitycatalog:unitycatalog-spark_2.13:0.3.0",
    )
    # Unity Catalog configuration
    # The Delta catalog class lives in the `org.apache.spark.sql.delta.catalog`
    # package; the previous value omitted the `sql` segment and named a class
    # that does not exist.
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.sql.catalog.nyc_taxi", "io.unitycatalog.spark.UCSingleCatalog")
    .config("spark.sql.catalog.nyc_taxi.uri", "http://localhost:8080")
    # Empty token — presumably the local server runs without auth; confirm
    # before pointing this at a shared deployment.
    .config("spark.sql.catalog.nyc_taxi.token", "")
    # Set nyc_taxi as default catalog so unqualified names resolve against it
    .config("spark.sql.defaultCatalog", "nyc_taxi")
    # Performance configs
    # NOTE(review): 80g driver memory is hard-coded — confirm the host has it.
    .config("spark.driver.memory", "80g")
    .config("spark.sql.shuffle.partitions", "12")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.parquet.compression.codec", "snappy")
    .getOrCreate()
)

# Verify catalog is set correctly
print(f"Default catalog: {spark.catalog.currentCatalog()}")
print(f"Spark version: {spark.version}")

:: loading settings :: url = jar:file:/home/administrator/Desktop/datascience/github/nyc-taxi-eta/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/administrator/.ivy2.5.2/cache
The jars for the packages stored in: /home/administrator/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
io.unitycatalog#unitycatalog-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d54c983d-a8c1-4b2c-b5d1-2a9774025189;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
	found io.unitycatalog#unitycatalog-spark_2.13;0.3.0 in central
	found io.unitycatalog#unitycatalog-client;0.3.0 in central
	found org.slf4j#slf4j-api;2.0.13 in central
	found org.apache.logging.log4j#log4j-slf4j2-impl;2.24.3 in central
	found org.apache.logging.log4j#log4j-api

Default catalog: nyc_taxi
Spark version: 4.0.0


In [2]:
# Load the bronze yellow-taxi trips table by its fully qualified
# catalog.schema.table name, then preview it with show()'s default settings
# (first 20 rows, long values truncated).
df = spark.table("nyc_taxi.bronze.yellow_taxi_trips")
df.show(n=20, truncate=True)

25/11/30 19:07:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-----------+--------------------+---------------------+---------------+-------------+------------------+---------+---------+-----------------+------------------+---------+------------+--------+---------+-------+-------+---------+---------+----+
|vendor_name|trip_pickup_datetime|trip_dropoff_datetime|passenger_count|trip_distance|         start_lon|start_lat|rate_code|store_and_forward|           end_lon|  end_lat|payment_type|fare_amt|surcharge|mta_tax|tip_amt|tolls_amt|total_amt|year|
+-----------+--------------------+---------------------+---------------+-------------+------------------+---------+---------+-----------------+------------------+---------+------------+--------+---------+-------+-------+---------+---------+----+
|        VTS| 2009-08-12 07:28:00|  2009-08-12 07:36:00|              1|          1.8|               0.0|      0.0|     NULL|             NULL|               0.0|      0.0|        CASH|     6.9|      0.0|   NULL|    0.0|      0.0|      6.9|2009|
|        VTS| 20

In [3]:
# Distinct values of the `vendor_name` column as a plain Python list.
# The rows are collected through the DataFrame API and unwrapped on the driver,
# so no round trip through the RDD API is needed. Order is whatever Spark
# returns; it is not sorted.
vendor_ids = [row[0] for row in df.select("vendor_name").distinct().collect()]
print(vendor_ids)

[Stage 15:>                                                         (0 + 1) / 1]

['DDS', 'VTS', 'CMT']


                                                                                

In [4]:
# Stop the Spark session created in the first cell; later cells that use
# `spark` or `df` will need the session to be rebuilt first.
spark.stop()