In [0]:
import sys
sys.path.insert(0, "../utils")
from logger import log_silver_ingestion
from silver_ingestion import run_silver_pipeline
import pyspark.sql.functions as F
import uuid

In [0]:
# 1. Define Transform Logic (Specific to the dataset)
def green_taxi_derivations(df):
    """Add the green-taxi derived columns to ``df``.

    Columns written:
      * ``trip_duration``: ``dropoff_datetime`` minus ``pickup_datetime``
        (difference of their unix timestamps), cast to int.
      * ``driver_pay``: ``fare_amount`` + ``tip_amount``.
      * ``store_and_fwd_flag``: True when the flag equals "Y", False for any
        other non-null value, and NULL when the incoming flag is NULL.

    Args:
        df: Spark DataFrame containing ``pickup_datetime``,
            ``dropoff_datetime``, ``fare_amount``, ``tip_amount`` and
            ``store_and_fwd_flag``.

    Returns:
        The DataFrame with the three columns above added or overwritten.
    """
    flag = F.col("store_and_fwd_flag")
    return (
        df.withColumn(
            "trip_duration",
            (
                F.unix_timestamp("dropoff_datetime")
                - F.unix_timestamp("pickup_datetime")
            ).cast("int"),
        )
        .withColumn("driver_pay", F.col("fare_amount") + F.col("tip_amount"))
        # Deliberately no .otherwise(False): a missing flag must stay NULL.
        # Coercing it to False would hide missing data and make the
        # "store_and_fwd_flag_null" validation rule impossible to trigger.
        .withColumn("store_and_fwd_flag", F.when(flag.isNotNull(), flag == "Y"))
    )



# 2. Define the Config Dictionary
green_config = {
    # Dataset title and table locations
    "dataset_name": "green_taxi_trips",
    "bronze_table": "nyc_taxi.bronze.green_taxi_trips",
    "silver_table": "nyc_taxi.silver.green_taxi_trips",
    "quarantine_table": "nyc_taxi.quarantine.green_taxi_trips",
    "init_run": False,
    "lookback_minutes": 5,
    # Renaming dictionary: source column name -> silver column name
    "rename_mapping": {
        "lpep_pickup_datetime": "pickup_datetime",
        "lpep_dropoff_datetime": "dropoff_datetime",
        "VendorID": "vendor_id",
        "RatecodeID": "rate_code_id",
        "PULocationID": "pickup_location_id",
        "DOLocationID": "dropoff_location_id",
        "payment_type": "payment_type_id",
        "extra": "extra_charge",
        "run_id": "bronze_id",
    },
    # Type mapping: target data type per column (keys use the renamed columns)
    "type_mapping": {
        "year": "int",
        "month": "int",
        "vendor_id": "int",
        "pickup_datetime": "timestamp",
        "dropoff_datetime": "timestamp",
        "trip_distance": "double",
        "passenger_count": "int",
        "rate_code_id": "int",
        "pickup_location_id": "int",
        "dropoff_location_id": "int",
        "payment_type_id": "int",
        "fare_amount": "double",
        "extra_charge": "double",
        "mta_tax": "double",
        "tip_amount": "double",
        "tolls_amount": "double",
        "improvement_surcharge": "double",
        "congestion_surcharge": "double",
        "cbd_congestion_fee": "double",
        "trip_type": "int",
        "total_amount": "double",
        "ehail_fee": "double",
        "store_and_fwd_flag": "boolean",
        "bronze_id": "string",
    },
    # Hash columns for de-duplication
    "hash_columns": [
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "rate_code_id",
        "pickup_location_id",
        "dropoff_location_id",
        "payment_type_id",
        "trip_type",
    ],
    # Quarantine rules. Each expression evaluates to its reason label when the
    # condition holds and to NULL otherwise (F.when without .otherwise).
    # NOTE(review): presumably any non-null label sends the row to
    # quarantine_table -- confirm in run_silver_pipeline.
    "validation_rules": [
        F.when(F.col("vendor_id").isNull(), F.lit("vendor_id_null")),
        F.when(F.col("passenger_count").isNull(), F.lit("passenger_count_null")),
        F.when(~F.col("passenger_count").between(1, 6), F.lit("invalid_passenger_count")),
        F.when(F.col("trip_distance").isNull(), F.lit("trip_distance_null")),
        F.when(F.col("trip_distance") < 0, F.lit("invalid_trip_distance")),
        F.when(F.col("store_and_fwd_flag").isNull(), F.lit("store_and_fwd_flag_null")),
        F.when(F.col("rate_code_id").isNull(), F.lit("rate_code_id_null")),
        F.when(
            F.col("pickup_location_id").isNull() | F.col("dropoff_location_id").isNull(),
            F.lit("pickup_dropoff_location_null"),
        ),
        F.when(F.col("pickup_datetime").isNull(), F.lit("pickup_datetime_null")),
        F.when(F.col("dropoff_datetime").isNull(), F.lit("dropoff_datetime_null")),
        F.when(
            F.col("pickup_datetime") > F.col("dropoff_datetime"),
            F.lit("pickup_datetime_after_dropoff"),
        ),
        F.when(F.col("trip_type").isNull(), F.lit("trip_type_null")),
        F.when(F.col("payment_type_id").isNull(), F.lit("payment_type_id_null")),
        F.when(F.col("fare_amount").isNull(), F.lit("fare_amount_null")),
        F.when(F.col("fare_amount") < 0, F.lit("invalid_fare_amount")),
        F.when(F.col("extra_charge").isNull(), F.lit("extra_charge_null")),
        F.when(F.col("extra_charge") < 0, F.lit("extra_charge_invalid")),
        F.when(F.col("mta_tax").isNull(), F.lit("mta_tax_null")),
        F.when(~F.col("mta_tax").isin(0, 0.5), F.lit("mta_tax_invalid")),
        F.when(F.col("tip_amount").isNull(), F.lit("tip_amount_null")),
        # A negative tip is an invalid value, not a missing one; it gets its
        # own label so it is distinguishable from the null check above.
        F.when(F.col("tip_amount") < 0, F.lit("tip_amount_invalid")),
        F.when(F.col("tolls_amount").isNull(), F.lit("tolls_amount_null")),
        F.when(F.col("tolls_amount") < 0, F.lit("tolls_amount_invalid")),
        F.when(F.col("improvement_surcharge").isNull(), F.lit("improvement_surcharge_null")),
        F.when(
            ~F.col("improvement_surcharge").isin(0, 0.3, 1),
            F.lit("improvement_surcharge_invalid"),
        ),
        F.when(F.col("congestion_surcharge").isNull(), F.lit("congestion_surcharge_null")),
        F.when(
            ~F.col("congestion_surcharge").isin(0, 2.75),
            F.lit("congestion_surcharge_invalid"),
        ),
        # trip_duration is created by green_taxi_derivations, so these two
        # rules require the derived transformations to run before validation.
        F.when(F.col("trip_duration").isNull(), F.lit("trip_duration_null")),
        F.when(F.col("trip_duration") < 0, F.lit("invalid_trip_duration")),
    ],
    # Dataset-specific derivation functions, injected into the pipeline
    "derived_transformations": [green_taxi_derivations],
}



In [0]:
# 3. Run the silver pipeline for this dataset, tagging the run with a fresh UUID.
# `spark` is not defined in this notebook; it is expected to be supplied by the
# notebook runtime.
run_silver_pipeline(
    spark=spark,
    config=green_config,
    run_id=str(uuid.uuid4()),
    log_fn=log_silver_ingestion,
)