In [0]:
import sys
sys.path.insert(0, "../utils")
from logger import log_silver_ingestion
from silver_ingestion import run_silver_pipeline
import pyspark.sql.functions as F
import uuid

In [0]:
# 1. Define Transform Logic (Specific to the dataset)
def fhv_derivations(df):
    """Add the FHV-specific derived columns to ``df`` and return the result.

    Columns written:
      * ``trip_duration`` -- ``dropoff_datetime`` minus ``pickup_datetime``,
        each converted with ``unix_timestamp`` (epoch seconds), cast to long.
      * ``shared_ride_flag`` -- overwritten with True where the current value
        equals 1 and False otherwise. A null flag makes the comparison null,
        which falls through to ``otherwise`` and therefore becomes False.

    Args:
        df: DataFrame that already exposes ``pickup_datetime``,
            ``dropoff_datetime`` and ``shared_ride_flag`` columns.

    Returns:
        A new DataFrame with the two columns above added/replaced.
    """
    dropoff_epoch = F.unix_timestamp('dropoff_datetime')
    pickup_epoch = F.unix_timestamp('pickup_datetime')
    duration_seconds = (dropoff_epoch - pickup_epoch).cast("long")

    # Collapse the raw flag into a strict True/False value (never null).
    is_shared_ride = F.when(F.col("shared_ride_flag") == 1, True).otherwise(False)

    return (
        df.withColumn("trip_duration", duration_seconds)
        .withColumn("shared_ride_flag", is_shared_ride)
    )



# 2. Define the Config Dictionary
# Dataset-specific settings handed to run_silver_pipeline. How each key is
# interpreted is decided inside silver_ingestion (not visible from this
# notebook); the per-key notes below are inferred from the key names and the
# values themselves -- confirm against that module.
fhv_config = {
    "dataset_name": "fhv_trips",
    "bronze_table": "nyc_taxi.bronze.fhv_trips",
    "silver_table": "nyc_taxi.silver.fhv_trips",
    "quarantine_table": "nyc_taxi.quarantine.fhv_trips",
    "init_run": False,
    "lookback_minutes": 5,
    # Source column name -> target column name.
    "rename_mapping": {
        "dropOff_datetime": "dropoff_datetime",
        "PULocationID": "pickup_location_id",
        "DOLocationID": "dropoff_location_id",
        "Affiliated_base_number": "affiliated_base_num",
        "SR_Flag": "shared_ride_flag",
        "run_id": "bronze_id",
    },
    # Column name (post-rename) -> Spark type name.
    "type_mapping": {
        "year": "int",
        "month": "int",
        "affiliated_base_num": "string",
        "dispatching_base_num": "string",
        "pickup_datetime": "timestamp",
        "dropoff_datetime": "timestamp",
        "pickup_location_id": "int",
        "dropoff_location_id": "int",
        "shared_ride_flag": "boolean",
    },
    # NOTE(review): the location ids are validated below but are not part of
    # this list -- confirm that is intentional.
    "hash_columns": [
        "affiliated_base_num",
        "dispatching_base_num",
        "pickup_datetime",
        "dropoff_datetime",
        "shared_ride_flag",
    ],
    # Each rule is a Column that evaluates to its label string when the
    # condition holds and to null otherwise (F.when without .otherwise).
    "validation_rules": [
        F.when(
            F.col("pickup_location_id").isNull() | F.col("dropoff_location_id").isNull(),
            F.lit("pickup_dropoff_location_null"),
        ),
        F.when(
            F.col("pickup_datetime").isNull(),
            F.lit("pickup_datetime_null"),
        ),
        F.when(
            F.col("dropoff_datetime").isNull(),
            F.lit("dropoff_datetime_null"),
        ),
        F.when(
            F.col("pickup_datetime") > F.col("dropoff_datetime"),
            F.lit("pickup_datetime_after_dropoff"),
        ),
        # trip_duration is produced by fhv_derivations, so the derivations
        # presumably run before these rules are evaluated.
        F.when(
            F.col("trip_duration") < 0,
            F.lit("trip_duration_negative"),
        ),
        # NOTE(review): fhv_derivations rewrites every null shared_ride_flag to
        # False, so if derivations do run first this rule can never match --
        # confirm the ordering in run_silver_pipeline.
        F.when(
            F.col("shared_ride_flag").isNull(),
            F.lit("shared_ride_flag_null"),
        ),
        F.when(
            F.col("affiliated_base_num").isNull(),
            F.lit("affiliated_base_num_null"),
        ),
        F.when(
            F.col("dispatching_base_num").isNull(),
            F.lit("dispatching_base_num_null"),
        ),
    ],
    # Callables taking a DataFrame and returning a DataFrame; the function
    # object itself is passed, not its result.
    "derived_transformations": [fhv_derivations],
}



In [0]:
# 3. Run the silver pipeline for this dataset
# `spark` is not defined in any cell of this notebook -- presumably the session
# object supplied by the notebook runtime; confirm before running elsewhere.
run_silver_pipeline(
    spark=spark,
    config=fhv_config,
    run_id=str(uuid.uuid4()),  # new random identifier on every execution
    log_fn=log_silver_ingestion,
)