In [9]:
from pyspark.sql.types import StructType, StructField, StringType, DecimalType, TimestampType
from pyspark.sql.functions import create_map, lit

In [None]:
try:
    pipeline_id = dbutils.widgets.get("pipeline_id")
    run_id = dbutils.widgets.get("run_id")
    task_id = dbutils.widgets.get("task_id")
    processed_timestamp = dbutils.widgets.get("processed_timestamp")
    catalog = dbutils.widgets.get("catalog")
    
except KeyError:
    pipeline_id = "default_pipeline"
    run_id = "default_run"
    task_id = "default_task"
    processed_timestamp = ""




In [11]:
schema = StructType([
    StructField("ride_id",StringType(),True),
    StructField("rideable_type",StringType(),True),
    StructField("started_at",TimestampType(),True),
    StructField("ended_at",TimestampType(),True),
    StructField("start_station_name",StringType(),True),
    StructField("start_station_id",StringType(),True),
    StructField("end_station_name",StringType(),True),
    StructField("end_station_id",StringType(),True),
    StructField("start_lat",DecimalType(),True),
    StructField("start_lng",DecimalType(),True),
    StructField("end_lat",DecimalType(),True),
    StructField("end_lng",DecimalType(),True),
    StructField("member_casual",StringType(),True),
])

In [None]:
df = spark.read.csv(f"/Volumes/{catalog}/00_landing/source_citibike_data/JC-202503-citibike-tripdata.csv",schema=schema,header=True)

In [13]:
display(df)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,29DAF43DD84B4B7A,electric_bike,2025-03-20 18:58:31.217,2025-03-20 19:00:46.466,6 St & Grand St,HB302,Mama Johnson Field - 4 St & Jackson St,HB404,41,-74,41,-74,member
1,B11B4220F7195025,electric_bike,2025-03-29 11:01:25.124,2025-03-29 11:11:09.383,Heights Elevator,JC059,Jersey & 3rd,JC074,41,-74,41,-74,member
2,18D5B30305F602B9,electric_bike,2025-03-01 16:05:32.346,2025-03-01 16:07:43.156,Jersey & 3rd,JC074,Hamilton Park,JC009,41,-74,41,-74,member
3,532EB2D9DB68567D,electric_bike,2025-03-21 18:44:15.137,2025-03-21 18:51:00.763,Jersey & 3rd,JC074,Jersey & 6th St,JC027,41,-74,41,-74,member
4,EA7C9C945D7D57AA,electric_bike,2025-03-20 11:08:27.226,2025-03-20 11:12:28.545,6 St & Grand St,HB302,Madison St & 1 St,HB402,41,-74,41,-74,member
5,DA232FF47222E86C,classic_bike,2025-03-13 11:11:25.452,2025-03-13 11:15:29.146,6 St & Grand St,HB302,Madison St & 1 St,HB402,41,-74,41,-74,member
6,416547516DE5132F,electric_bike,2025-03-28 21:51:52.621,2025-03-28 21:57:01.336,Hilltop,JC019,Leonard Gordon Park,JC080,41,-74,41,-74,member
7,E25EDA33910F90F0,electric_bike,2025-03-13 18:21:57.969,2025-03-13 18:26:48.536,Hilltop,JC019,Leonard Gordon Park,JC080,41,-74,41,-74,member
8,D209FF2521E26D16,classic_bike,2025-03-01 14:59:20.947,2025-03-01 15:06:34.299,Jackson Square,JC063,Bergen Ave,JC095,41,-74,41,-74,member
9,BC9F0D06A5AFF751,electric_bike,2025-03-04 09:55:18.140,2025-03-04 09:59:59.031,6 St & Grand St,HB302,Southwest Park - Jackson St & Observer Hwy,HB401,41,-74,41,-74,member


In [14]:
df = df.withColumn("metadata", 
              create_map(
                  lit("pipeline_id"), lit(pipeline_id),
                  lit("run_id"), lit(run_id),
                  lit("task_id"), lit(task_id),
                  lit("processed_timestamp"), lit(processed_timestamp)
                  ))

In [7]:
display(df)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,metadata
0,29DAF43DD84B4B7A,electric_bike,2025-03-20 18:58:31.217,2025-03-20 19:00:46.466,6 St & Grand St,HB302,Mama Johnson Field - 4 St & Jackson St,HB404,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"
1,B11B4220F7195025,electric_bike,2025-03-29 11:01:25.124,2025-03-29 11:11:09.383,Heights Elevator,JC059,Jersey & 3rd,JC074,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"
2,18D5B30305F602B9,electric_bike,2025-03-01 16:05:32.346,2025-03-01 16:07:43.156,Jersey & 3rd,JC074,Hamilton Park,JC009,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"
3,532EB2D9DB68567D,electric_bike,2025-03-21 18:44:15.137,2025-03-21 18:51:00.763,Jersey & 3rd,JC074,Jersey & 6th St,JC027,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"
4,EA7C9C945D7D57AA,electric_bike,2025-03-20 11:08:27.226,2025-03-20 11:12:28.545,6 St & Grand St,HB302,Madison St & 1 St,HB402,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"
5,DA232FF47222E86C,classic_bike,2025-03-13 11:11:25.452,2025-03-13 11:15:29.146,6 St & Grand St,HB302,Madison St & 1 St,HB402,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"
6,416547516DE5132F,electric_bike,2025-03-28 21:51:52.621,2025-03-28 21:57:01.336,Hilltop,JC019,Leonard Gordon Park,JC080,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"
7,E25EDA33910F90F0,electric_bike,2025-03-13 18:21:57.969,2025-03-13 18:26:48.536,Hilltop,JC019,Leonard Gordon Park,JC080,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"
8,D209FF2521E26D16,classic_bike,2025-03-01 14:59:20.947,2025-03-01 15:06:34.299,Jackson Square,JC063,Bergen Ave,JC095,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"
9,BC9F0D06A5AFF751,electric_bike,2025-03-04 09:55:18.140,2025-03-04 09:59:59.031,6 St & Grand St,HB302,Southwest Park - Jackson St & Observer Hwy,HB401,41,-74,41,-74,member,"{'pipeline_id': 'default_pipeline', 'run_id': 'default_run', 'task_id': 'default_task', 'processed_timestamp': ''}"


In [None]:
df.write.mode("overwrite").option("overwriteSchema","true").saveAsTable(f"{catalog}.01_bronze.jc_citibike")