In [0]:
import sys
sys.path.insert(0, "../utils")
from bronze_logger import log_bronze_ingestion
import uuid
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, current_timestamp, lit

In [0]:
# Run-scoped settings for the taxi-zone bronze ingestion.

# Unique identifier for this execution; stamped on every ingested row and
# passed to the ingestion logger so rows and log entries can be correlated.
run_id = str(uuid.uuid4())

# Was 'yellow_taxi_trips', which does not match the taxi-zone paths and table
# configured below (apparent copy-paste leftover) and would file this run's
# log entries under the wrong dataset.
# NOTE(review): confirm 'taxi_zones' is the dataset key expected by log consumers.
dataset_name = 'taxi_zones'

# Source CSV location and target locations/tables.
landing_path = 's3://nyc-tlc-data-398563707364/landing/taxi_zone/'
# NOTE(review): bronze_path is not referenced by the cells visible here — kept
# in case another cell or an external-table definition relies on it.
bronze_path = 's3://nyc-tlc-data-398563707364/bronze/taxi_zone/'
catalog_table = 'nyc_taxi.bronze.taxi_zones'
log_table = 'nyc_taxi.logs.bronze_ingestion_logs'

In [0]:
# Read the taxi-zone CSV from the landing path, add ingestion audit columns,
# overwrite the bronze catalog table with the result, and record the outcome
# (success or failure) through log_bronze_ingestion.
try:
    # An explicit schema is supplied, so Spark does not infer column types.
    csv_schema = StructType(
        [
            StructField("locationid", IntegerType(), True),
            StructField("borough", StringType(), True),
            StructField("zone", StringType(), True),
            StructField("service_zone", StringType(), True),
        ]
    )

    df = (
        spark.read.csv(path=landing_path, header=True, schema=csv_schema, sep=",")
        # Audit columns: load timestamp, originating file name (taken from
        # the reader's `_metadata` column), and this run's identifier.
        .withColumn("_ingest_ts", current_timestamp())
        .withColumn("_source_file", col("_metadata.file_name"))
        .withColumn("run_id", lit(run_id))
    )

    # `mode` / `saveAsTable` belong to DataFrameWriter, reached through
    # `df.write`. Calling `.mode(...)` directly on the DataFrame raised
    # AttributeError, so the table was never written.
    df.write.mode("overwrite").saveAsTable(catalog_table)

    log_bronze_ingestion(
        spark=spark,
        run_id=run_id,
        dataset_name=dataset_name,
        catalog_table=catalog_table,
        log_table=log_table,
        success=True
    )
except Exception as e:
    print(e)
    log_bronze_ingestion(
        spark=spark,
        run_id=run_id,
        dataset_name=dataset_name,
        catalog_table=catalog_table,
        success=False,
        log_table=log_table,
        error_msg=str(e)
    )
    # Re-raise after the failure has been logged so the notebook/job run is
    # reported as failed instead of completing as if the ingest succeeded.
    raise