###### Read Silver Airport Attributes

Load the Silver flight-market data, which carries the origin and destination airport attributes used as the source for the airport dimension.

In [0]:
from pyspark.sql.functions import col

# Location of the Silver-layer flight market data (read below as a Delta table).
SILVER_PATH = "wasbs://silver@flightdatastorage.blob.core.windows.net/flight_market/"

# Lazily reference the Silver data; nothing is read until an action runs.
silver_reader = spark.read.format("delta")
df_silver = silver_reader.load(SILVER_PATH)


In [0]:
# Print the column names and types of the Silver DataFrame to confirm which
# ORIGIN_* / DEST_* airport attributes are available for the dimension.
df_silver.printSchema()

root
 |-- ITIN_ID: string (nullable = true)
 |-- MKT_ID: string (nullable = true)
 |-- MARKET_COUPONS: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: string (nullable = true)
 |-- ORIGIN_AIRPORT_SEQ_ID: string (nullable = true)
 |-- ORIGIN_CITY_MARKET_ID: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_COUNTRY: string (nullable = true)
 |-- ORIGIN_STATE_FIPS: string (nullable = true)
 |-- ORIGIN_STATE_ABR: string (nullable = true)
 |-- ORIGIN_STATE_NM: string (nullable = true)
 |-- ORIGIN_WAC: string (nullable = true)
 |-- DEST_AIRPORT_ID: string (nullable = true)
 |-- DEST_AIRPORT_SEQ_ID: string (nullable = true)
 |-- DEST_CITY_MARKET_ID: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_COUNTRY: string (nullable = true)
 |-- DEST_STATE_FIPS: string (nullable = true)
 |-- DEST_STATE_ABR: string (nullable = true)
 |-- DEST_STATE_NM: string (nullable = true)
 |-

###### Extract Airport Master Records

Consolidate origin and destination airports into a single airport master dataset.

In [0]:
from pyspark.sql.functions import col

def _airport_side(prefix):
    """Project one side of df_silver onto the shared airport column names.

    `prefix` is the Silver column family to read from: "ORIGIN" or "DEST".
    The airport code lives in the bare prefix column (ORIGIN / DEST); every
    other attribute lives in a `<prefix>_<ATTRIBUTE>` column.
    """
    return df_silver.select(
        col(f"{prefix}_AIRPORT_SEQ_ID").alias("airport_seq_id"),
        col(f"{prefix}_AIRPORT_ID").alias("airport_id"),
        col(prefix).alias("airport_code"),
        col(f"{prefix}_CITY_MARKET_ID").alias("city_market_id"),
        col(f"{prefix}_STATE_FIPS").alias("state_fips"),
        col(f"{prefix}_STATE_ABR").alias("state_abr"),
        col(f"{prefix}_STATE_NM").alias("state_name"),
        col(f"{prefix}_COUNTRY").alias("country"),
        col(f"{prefix}_WAC").alias("wac"),
    )


# Airports as they appear on each end of a market record.
origin_airports = _airport_side("ORIGIN")
dest_airports = _airport_side("DEST")

# Stack both sides by column name, then keep a single row per airport_seq_id.
airports_both_sides = origin_airports.unionByName(dest_airports)
df_airport_master = airports_both_sides.dropDuplicates(["airport_seq_id"])


###### Initialize SCD Type 2 Columns

Add effective date tracking columns required to support historical versioning of airport attributes.

In [0]:
from pyspark.sql.functions import current_date, lit

# SCD Type 2 bookkeeping columns: every row starts as the open-ended,
# current version that becomes effective on the load date.
scd_effective_from = current_date().alias("effective_from")
scd_effective_to = lit("9999-12-31").cast("date").alias("effective_to")
scd_is_current = lit(True).alias("is_current")

# Keep all master columns and append the three tracking columns in order.
df_airport_scd = df_airport_master.select(
    "*",
    scd_effective_from,
    scd_effective_to,
    scd_is_current,
)


###### Generate Airport Surrogate Key

Creates a deterministic surrogate key using the airport natural key and effective start date.

In [0]:
from pyspark.sql.functions import sha2, concat_ws

# Hash input: airport_seq_id and effective_from, both as strings, joined by "||".
airport_key_input = concat_ws(
    "||",
    col("airport_seq_id").cast("string"),
    col("effective_from").cast("string"),
)

# The surrogate key is the SHA-256 hex digest of that input, so the same
# airport_seq_id and effective_from always produce the same airport_key.
airport_key_expr = sha2(airport_key_input, 256)

df_dim_airport_staged = df_airport_scd.withColumn("airport_key", airport_key_expr)


###### Load Existing Airport Dimension

Checks whether the Gold airport dimension already exists, so that SCD Type 2 change detection and versioning are only applied when there is an existing dimension to compare against.

In [0]:
from delta.tables import DeltaTable

# Storage location of the Gold-layer airport dimension.
GOLD_DIM_AIRPORT_PATH = "wasbs://gold@flightdatastorage.blob.core.windows.net/dim_airport/"

# True when a Delta table is already present at the Gold path; later cells
# use this flag to choose between an initial load and change detection.
dim_airport_exists = DeltaTable.isDeltaTable(spark, GOLD_DIM_AIRPORT_PATH)

###### Detect New and Changed Airport Records

Compares incoming airport attributes against the current Gold dimension to identify new airports and attribute changes requiring new SCD versions.

In [0]:
from functools import reduce

# Attributes whose change requires a new SCD Type 2 version of an airport.
tracked_cols = [
    "airport_code",
    "city_market_id",
    "state_fips",
    "state_abr",
    "state_name",
    "country",
    "wac"
]

if dim_airport_exists:
    # Only the current version of each airport is compared against staging.
    df_dim_airport_current = (
        spark.read
             .format("delta")
             .load(GOLD_DIM_AIRPORT_PATH)
             .filter(col("is_current"))
    )

    stg = df_dim_airport_staged.alias("stg")
    cur = df_dim_airport_current.alias("cur")

    join_cond = col("stg.airport_seq_id") == col("cur.airport_seq_id")

    # Null-safe inequality: a plain `!=` evaluates to NULL when either side is
    # NULL, so an attribute moving to/from NULL would be silently filtered out.
    change_cond = reduce(
        lambda a, b: a | b,
        [
            ~col(f"stg.{c}").eqNullSafe(col(f"cur.{c}"))
            for c in tracked_cols
        ]
    )

    df_changes = (
        stg
        .join(cur, join_cond, "left")
        # No match in Gold -> new airport; match with a differing tracked
        # attribute -> changed airport needing a new version.
        .where(col("cur.airport_seq_id").isNull() | change_cond)
        # Keep only the staged side so df_changes has the same schema in both
        # branches and downstream name-based selects are not ambiguous.
        .select("stg.*")
    )

else:
    # First load: every staged airport is new.
    df_changes = df_dim_airport_staged


###### Insert New Airport Dimension Versions

Selects the dimension columns of new airport records and of new SCD versions for changed airports, ready to be inserted into the Gold dimension.

In [0]:
# Columns of the rows to insert into the Gold dimension. The surrogate key
# generated during staging is included so inserted versions keep their key.
staged_cols = [
    "airport_key",
    "airport_id",
    "airport_seq_id",
    "airport_code",
    "city_market_id",
    "state_fips",
    "state_abr",
    "state_name",
    "country",
    "wac",
    "effective_from",
    "effective_to",
    "is_current"
]

# Select through df_dim_airport_staged column references rather than bare
# names: when df_changes is the staged/current join, both sides expose the
# same column names and a name-only select would be ambiguous. This also
# works when df_changes is df_dim_airport_staged itself (initial load).
df_changes_to_insert = df_changes.select(
    *[df_dim_airport_staged[c] for c in staged_cols]
)


###### Persist Airport Dimension

Writes the conformed airport dimension to the Gold layer with SCD Type 2 history preserved.

In [0]:
# Reuses GOLD_DIM_AIRPORT_PATH, DeltaTable and dim_airport_exists defined in
# the "Load Existing Airport Dimension" cell instead of redefining the path.

# Rows to write: only the detected new/changed airports, with the full staged
# schema (including airport_key). Appending df_dim_airport_staged here would
# re-insert every airport on every run.
df_airport_versions = (
    df_changes
    .select(*[df_dim_airport_staged[c] for c in df_dim_airport_staged.columns])
    # Staging holds one row per airport_seq_id; this only removes fan-out
    # introduced by the join against the existing dimension.
    .dropDuplicates(["airport_seq_id"])
)

if dim_airport_exists:
    # Close the superseded current version of each changed airport BEFORE the
    # append, so the merge can never match the rows inserted below.
    # NOTE(review): the closed row's effective_to is set to the new version's
    # effective_from (half-open interval) — confirm this matches the
    # convention expected by consumers of the dimension.
    (
        DeltaTable.forPath(spark, GOLD_DIM_AIRPORT_PATH)
        .alias("tgt")
        .merge(
            df_airport_versions
            .select("airport_seq_id", "effective_from")
            .alias("src"),
            "tgt.airport_seq_id = src.airport_seq_id AND tgt.is_current = true",
        )
        .whenMatchedUpdate(
            set={
                "effective_to": "src.effective_from",
                "is_current": "false",
            }
        )
        .execute()
    )

(
    df_airport_versions
    .write
    .format("delta")
    .mode("append")  # history is preserved: new versions are only ever added
    .save(GOLD_DIM_AIRPORT_PATH)
)
