In [0]:
import sys
sys.path.insert(0, "../utils")
from logger import log_silver_ingestion
from silver_ingestion import run_silver_pipeline
import pyspark.sql.functions as F
import uuid

In [0]:
# 1. Define Transform Logic (Specific to the dataset)
def hvfhv_derivations(df):
    """Add ``total_amount`` and convert the Y/N flag columns to booleans.

    Args:
        df: Spark DataFrame that already exposes the fare component columns
            (``fare_amount``, ``tolls_amount``, ``bcf_amount``, ``sales_tax``,
            ``airport_fee``, ``congestion_surcharge``, ``cbd_congestion_fee``,
            ``tips``) and the four ``*_flag`` columns.

    Returns:
        A new DataFrame with ``total_amount`` added and each flag column
        replaced by ``True`` where its value equals ``"Y"`` and ``False``
        otherwise (NULL flags therefore become ``False``).
    """
    # Components are summed in this order; a NULL component contributes 0.
    amount_columns = (
        "fare_amount",
        "tolls_amount",
        "bcf_amount",
        "sales_tax",
        "airport_fee",
        "congestion_surcharge",
        "cbd_congestion_fee",
        "tips",
    )
    flag_columns = (
        "shared_request_flag",
        "shared_match_flag",
        "access_a_ride_flag",
        "wav_request_flag",
    )

    # Spark's `+` returns NULL as soon as one operand is NULL, so a single
    # missing surcharge would otherwise wipe out the whole total (and make the
    # downstream `total_amount < 0` check a no-op for that row).
    total_amount = F.lit(0.0)
    for amount_column in amount_columns:
        total_amount = total_amount + F.coalesce(F.col(amount_column), F.lit(0.0))

    result = df.withColumn("total_amount", total_amount)
    for flag_column in flag_columns:
        result = result.withColumn(
            flag_column,
            F.when(F.col(flag_column) == "Y", True).otherwise(False),
        )
    return result



# 2. Define the Config Dictionary
# Configuration consumed by run_silver_pipeline for the HVFHV trips dataset.
# NOTE(review): the exact semantics of each key (e.g. "init_run",
# "lookback_minutes") are defined by run_silver_pipeline, which is not visible
# in this notebook — confirm against utils/silver_ingestion.
hvfhv_config = {
    # Title and Locations
    "dataset_name": "hvfhv_trips",
    "bronze_table": "nyc_taxi.bronze.hvfhv_trips",
    "silver_table": "nyc_taxi.silver.hvfhv_trips",
    "quarantine_table": "nyc_taxi.quarantine.hvfhv_trips",
    "init_run": True,
    "lookback_minutes": 5,
    # Rename and Type Mapping
    "rename_mapping": {
        "hvfhs_license_num": "license_number",
        "trip_miles": "trip_distance",
        "trip_time": "trip_duration",
        "base_passenger_fare": "fare_amount",
        "tolls": "tolls_amount",
        "bcf": "bcf_amount",
        "PULocationID": "pickup_location_id",
        "DOLocationID": "dropoff_location_id",
        "run_id": "bronze_id",
    },
    # Each column appears exactly once: duplicate keys in a dict literal are
    # silently overridden by the last occurrence, which hides copy-paste drift.
    "type_mapping": {
        "year": "int",
        "month": "int",
        "license_number": "string",
        "dispatching_base_num": "string",
        "originating_base_num": "string",
        "request_datetime": "timestamp",
        "on_scene_datetime": "timestamp",
        "pickup_datetime": "timestamp",
        "dropoff_datetime": "timestamp",
        "trip_distance": "double",
        "pickup_location_id": "int",
        "dropoff_location_id": "int",
        "trip_duration": "int",
        "fare_amount": "double",
        "tolls_amount": "double",
        "bcf_amount": "double",
        "sales_tax": "double",
        "congestion_surcharge": "double",
        "airport_fee": "double",
        "cbd_congestion_fee": "double",
        "total_amount": "double",
        "driver_pay": "double",
        "tips": "double",
        "shared_request_flag": "boolean",
        "shared_match_flag": "boolean",
        "access_a_ride_flag": "boolean",
        "wav_request_flag": "boolean",
        "bronze_id": "string",
    },
    # Hashing columns for de-duplication
    "hash_columns": [
        "license_number",
        "dispatching_base_num",
        "originating_base_num",
        "request_datetime",
        "on_scene_datetime",
        "pickup_datetime",
        "dropoff_datetime",
        "pickup_location_id",
        "dropoff_location_id",
    ],
    # Quarantine Rules
    # Each rule evaluates to its reason label when the condition holds and to
    # NULL otherwise (F.when without .otherwise).
    "validation_rules": [
        # Required identifiers and timestamps
        F.when(F.col("license_number").isNull(), F.lit("license_number_null")),
        F.when(F.col("dispatching_base_num").isNull(), F.lit("dispatching_base_num_null")),
        F.when(F.col("originating_base_num").isNull(), F.lit("originating_base_num_null")),
        F.when(F.col("request_datetime").isNull(), F.lit("request_datetime_null")),
        F.when(F.col("on_scene_datetime").isNull(), F.lit("on_scene_datetime_null")),
        F.when(F.col("pickup_datetime").isNull(), F.lit("pickup_datetime_null")),
        F.when(F.col("dropoff_datetime").isNull(), F.lit("dropoff_datetime_null")),
        # Chronological ordering: request -> on scene -> pickup -> dropoff
        F.when(F.col("on_scene_datetime") < F.col("request_datetime"), F.lit("on_scene_before_request")),
        F.when(F.col("pickup_datetime") < F.col("on_scene_datetime"), F.lit("pickup_before_on_scene")),
        F.when(F.col("dropoff_datetime") < F.col("pickup_datetime"), F.lit("dropoff_before_pickup")),
        # Required locations
        F.when(F.col("pickup_location_id").isNull(), F.lit("pickup_location_null")),
        F.when(F.col("dropoff_location_id").isNull(), F.lit("dropoff_location_null")),
        # Non-negative measures and amounts
        F.when(F.col("trip_distance") < 0, F.lit("trip_distance_negative")),
        F.when(F.col("trip_duration") < 0, F.lit("trip_duration_negative")),
        F.when(F.col("fare_amount") < 0, F.lit("fare_amount_negative")),
        F.when(F.col("tolls_amount") < 0, F.lit("tolls_amount_negative")),
        F.when(F.col("bcf_amount") < 0, F.lit("bcf_amount_negative")),
        F.when(F.col("sales_tax") < 0, F.lit("sales_tax_negative")),
        F.when(F.col("congestion_surcharge") < 0, F.lit("congestion_surcharge_negative")),
        F.when(F.col("airport_fee") < 0, F.lit("airport_fee_negative")),
        # cbd_congestion_fee feeds total_amount like the other fees, so it gets
        # the same sign check as its siblings.
        F.when(F.col("cbd_congestion_fee") < 0, F.lit("cbd_congestion_fee_negative")),
        F.when(F.col("total_amount") < 0, F.lit("total_amount_negative")),
        F.when(F.col("driver_pay") < 0, F.lit("driver_pay_negative")),
        F.when(F.col("tips") < 0, F.lit("tips_negative")),
        # A shared ride cannot be matched unless it was requested as shared
        F.when(
            (F.col("shared_match_flag") == True) & (F.col("shared_request_flag") == False),
            F.lit("shared_match_but_not_request"),
        ),
    ],
    # Dataset-specific transform functions injected into the generic pipeline
    "derived_transformations": [hvfhv_derivations],
}



In [0]:
# 3. Run the pipeline
# A fresh UUID string is generated as the run_id on every execution of this cell.
run_silver_pipeline(
    spark=spark,
    config=hvfhv_config,
    run_id=str(uuid.uuid4()),
    log_fn=log_silver_ingestion,
)