Takes data from the yellow trips raw table in the bronze layer, cleanses it, and loads it into the yellow trips cleansed table in the silver layer.

In [0]:
from pyspark.sql.functions import col, when, timestamp_diff, min, max

In [0]:
# Bronze-layer source: the raw yellow-trip records.
bronze_table = "nyctaxi.01_bronze.yellow_trips_raw"
df = spark.read.table(bronze_table)


In [0]:
# Show the latest and earliest pickup timestamps present in the raw data.
# `max` / `min` here are the Spark column aggregates imported from
# pyspark.sql.functions, not the Python builtins.
pickup_range = df.agg(
    max("tpep_pickup_datetime"),
    min("tpep_pickup_datetime"),
)
pickup_range.display()

In [0]:
# Drop rows whose pickup time falls outside the expected window:
# start date inclusive, end date exclusive.
pickup_window = (
    "tpep_pickup_datetime >= '2025-01-01' AND tpep_pickup_datetime < '2025-07-01'"
)
df = df.where(pickup_window)

In [0]:
# Applying transformations


def _decode(code_column, labels, output_name):
    """Build a CASE expression that maps codes in `code_column` to labels.

    Codes are tested in the order they appear in `labels`; any value that
    matches none of them falls through to "Unknown". The resulting column
    is named `output_name`.
    """
    source = col(code_column)
    pairs = iter(labels.items())
    first_code, first_label = next(pairs)
    chain = when(source == first_code, first_label)
    for code, label in pairs:
        chain = chain.when(source == code, label)
    return chain.otherwise("Unknown").alias(output_name)


# Code -> label lookups, listed in the order the conditions are evaluated.
vendor_names = {
    1: "Creative Mobile Technologies, LLC",
    2: "Curb Mobility, LLC",
    6: "Myle Technologies Inc",
    7: "Helix",
}

rate_types = {
    1: "Standard rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated fare",
    6: "Group ride",
}

payment_types = {
    0: "Flex Fare trip",
    1: "Credit card",
    2: "Cash",
    3: "No charge",
    4: "Dispute",
    6: "Voided trip",
}

# Minutes between pickup and dropoff, as computed by Spark's timestamp_diff.
trip_duration = timestamp_diff(
    "MINUTE",
    col("tpep_pickup_datetime"),
    col("tpep_dropoff_datetime"),
).alias("trip_duration")

# NOTE(review): `total_amount` is kept but no tip column is selected. If the
# bronze table carries `tip_amount` (the TLC data dictionary lists one),
# confirm that leaving it out of the silver table is intentional.
df = df.select(
    _decode("VendorID", vendor_names, "vendor"),
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    trip_duration,
    "passenger_count",
    "trip_distance",
    _decode("RatecodeID", rate_types, "rate_type"),
    "store_and_fwd_flag",
    # Location id columns are renamed to snake_case.
    col("PULocationID").alias("pu_location_id"),
    col("DOLocationID").alias("do_location_id"),
    # The numeric payment code is replaced by its label under the same name.
    _decode("payment_type", payment_types, "payment_type"),
    "fare_amount",
    "extra",
    "mta_tax",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
    col("Airport_fee").alias("airport_fee"),
    "cbd_congestion_fee",
    "processed_timestamp",
)

In [0]:
# Preview the transformed DataFrame in the notebook output.
df.display()

In [0]:
# Save the cleansed trips as the silver-layer table; "overwrite" mode
# replaces whatever the table previously held.
silver_table = "nyctaxi.02_silver.yellow_trips_cleansed"
overwrite_writer = df.write.mode("overwrite")
overwrite_writer.saveAsTable(silver_table)

In [0]:
# Read the silver table back and display it to check what was stored.
cleansed = spark.read.table("nyctaxi.02_silver.yellow_trips_cleansed")
cleansed.display()