In [0]:
# Make sure the `gold` database exists; the dimension tables below are saved as gold.<table>.
# IF NOT EXISTS makes this statement safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS gold")

DataFrame[]

**Dimension: Rider**
- Contains one record per rider.
- Uses a surrogate key (`rider_key`) for joins in fact tables.



In [0]:
from pyspark.sql.functions import (
    col,
    floor,
    monotonically_increasing_id,
    months_between,
    year,
)

# Rider dimension: bronze.riders plus a surrogate key and the rider's age
# (in completed years) on the day the account was opened.
dim_rider = (
    spark.table("bronze.riders")
    # Surrogate key. monotonically_increasing_id() yields unique 64-bit ids that
    # are NOT consecutive and depend on partitioning, so the values change every
    # time this cell is re-run; fact tables must be rebuilt against the new keys.
    .withColumn("rider_key", monotonically_increasing_id())
    .withColumn(
        "age_at_account_start",
        # A plain difference of calendar years overstates the age by one whenever
        # the account was opened before the birthday in that year. Counting whole
        # months between the two dates and taking floor(months / 12) gives the
        # number of completed years instead. The cast keeps the column an
        # integer, matching the type produced by a year() difference.
        floor(
            months_between(
                col("account_start_date").cast("date"),
                col("birthday").cast("date"),
            )
            / 12
        ).cast("int"),
    )
    .select(
        "rider_key",
        "rider_id",
        "first",
        "last",
        "address",
        "birthday",
        "account_start_date",
        "is_member",
        "age_at_account_start",
    )
)

# Replace the existing gold.dim_rider table with the freshly built dimension.
dim_rider.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("gold.dim_rider")



**Dimension: Date**
- Conformed date dimension built from the distinct trip start dates and payment dates.


In [0]:
from pyspark.sql.functions import (
    col,
    date_format,
    dayofmonth,
    dayofweek,
    month,
    quarter,
    year,
)

# Calendar dates on which a trip started.
trip_dates = spark.table("bronze.trips").select(
    col("started_at").cast("date").alias("full_date")
)

# Calendar dates on which a payment was made. The column is aliased explicitly:
# union() matches columns by position, not by name, so naming both sides
# "full_date" makes the intent visible instead of relying on that rule.
payment_dates = spark.table("bronze.payments").select(
    col("date").cast("date").alias("full_date")
)

# One row per distinct calendar date seen in either source.
# Rows without a date are dropped: they would otherwise become a dimension
# member whose date_key and every attribute are NULL, which no fact row can
# ever join to.
dates_df = (
    trip_dates
    .union(payment_dates)
    .filter(col("full_date").isNotNull())
    .distinct()
)

# Derive the calendar attributes from full_date.
dim_date = (
    dates_df
    # date_key is the date rendered as a yyyyMMdd string, e.g. "20240131".
    .withColumn("date_key", date_format(col("full_date"), "yyyyMMdd"))
    .withColumn("day", dayofmonth(col("full_date")))
    .withColumn("day_name", date_format(col("full_date"), "EEEE"))
    .withColumn("month", month(col("full_date")))
    .withColumn("month_name", date_format(col("full_date"), "MMMM"))
    .withColumn("quarter", quarter(col("full_date")))
    .withColumn("year", year(col("full_date")))
    # dayofweek() returns 1 for Sunday through 7 for Saturday.
    .withColumn("is_weekend", dayofweek(col("full_date")).isin([1, 7]))
)

# Replace the existing gold.dim_date table with the freshly built dimension.
dim_date.write.format("delta").mode("overwrite").saveAsTable("gold.dim_date")

**Dimension: Station**
- Contains the descriptive attributes of each station (name, latitude, longitude).
- Preserves station_id as business key and adds station_key as surrogate key.


In [0]:
from pyspark.sql.functions import monotonically_increasing_id

# Station dimension: every row of bronze.stations, projected to the business
# key (station_id), its descriptive attributes, and a generated surrogate key.
# The generated ids are unique but not consecutive.
dim_station = spark.table("bronze.stations").select(
    monotonically_increasing_id().alias("station_key"),
    "station_id",
    "name",
    "latitude",
    "longitude",
)

# Replace the existing gold.dim_station table with the freshly built dimension.
(
    dim_station.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gold.dim_station")
)
