In [0]:
from pyspark.sql.functions import * 

In [0]:
customers = spark.read.format('delta').table('stoyan.silver__customers')

In [0]:
customers.createOrReplaceTempView("customers")

In [0]:
spark.sql("""
CREATE OR REPLACE TABLE stoyan.dim_customers AS
WITH ranked_customers AS (
-- Step 1: Select customer data along with row numbering for each customer

    SELECT
        customer_id AS cust_nk,                 -- Customer identifier
        cust_first_name,-- Customer first name
        cust_last_name, -- Customer last name
        cust_address_country_id,-- Customer address country ID
        cust_address_state_province,-- Customer address state/province
        cust_address_city,-- Customer address city
        cust_address_postal_code,-- Customer address postal code
        cust_address_street_address,-- Customer address street address
        phone_number AS cust_phone_number,-- Customer phone number
        cust_email,-- Customer email
        account_mgr_id,-- Account manager ID
        date_of_birth,-- Customer date of birth
        marital_status, -- Customer marital status
        gender, -- Customer gender
        consume_timestamp,-- Timestamp of when the customer record was ingested
        ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY consume_timestamp DESC) AS row_num, -- Row number to identify most recent record
        -- Lead function to get the next address and marital status for changes over time
        LEAD(cust_address_country_id) OVER (PARTITION BY customer_id ORDER BY consume_timestamp) AS next_cust_address_country_id,
        LEAD(cust_address_state_province) OVER (PARTITION BY customer_id ORDER BY consume_timestamp) AS next_cust_address_state_province,
        LEAD(cust_address_city) OVER (PARTITION BY customer_id ORDER BY consume_timestamp) AS next_cust_address_city,
        LEAD(cust_address_postal_code) OVER (PARTITION BY customer_id ORDER BY consume_timestamp) AS next_cust_address_postal_code,
        LEAD(cust_address_street_address) OVER (PARTITION BY customer_id ORDER BY consume_timestamp) AS next_cust_address_street_address,
        LEAD(marital_status) OVER (PARTITION BY customer_id ORDER BY consume_timestamp) AS next_marital_status,
        LEAD(consume_timestamp) OVER (PARTITION BY customer_id ORDER BY consume_timestamp) AS next_consume_timestamp
    FROM stoyan.silver__customers
),
effective_customers AS (
    -- Step 2: Select the "effective" customer records based on changes over time

    SELECT
        cust_nk,                    -- Customer identifier (primary key)
        cust_first_name,            -- Customer first name
        cust_last_name,             -- Customer last name
        cust_address_country_id,        -- Customer address country ID
        cust_address_state_province,-- Customer address state/province
        cust_address_city,          -- Customer address city
        cust_address_postal_code,   -- Customer address postal code
        cust_address_street_address,-- Customer address street address
        cust_phone_number,          -- Customer phone number
        cust_email,                 -- Customer email
        account_mgr_id,             -- Account manager ID
        date_of_birth,              -- Customer date of birth
        marital_status,             -- Customer marital status
        gender,                     -- Customer gender
        consume_timestamp AS effective_from,                          -- Timestamp of when the customer data is effective
        COALESCE(next_consume_timestamp, '9999-12-31') AS effective_to, -- Timestamp of when the customer data stops being effective (default to max date if not available)
        consume_timestamp AS insert_timestamp,-- Timestamp when the record was inserted
        consume_timestamp AS update_timestamp,-- Timestamp when the record was last updated
        row_num,                              -- Row number to determine the most recent record

        -- Step 3: Detect changes in customer data
        CASE
            WHEN cust_address_country_id != next_cust_address_country_id 
              OR cust_address_state_province != next_cust_address_state_province 
              OR cust_address_city != next_cust_address_city 
              OR cust_address_postal_code != next_cust_address_postal_code 
              OR cust_address_street_address != next_cust_address_street_address 
              OR marital_status != next_marital_status 
              THEN 1                     -- Mark as changed if any of the above fields are different from the next record
            ELSE 0                       -- Otherwise, no change
        END AS is_changed
    FROM ranked_customers
)
-- Final selection of customer data, ensuring only changed or most recent records are included

SELECT
    MD5(CAST(cust_nk AS STRING)) AS cust_sk,-- Generate a surrogate key (cust_sk) based on customer ID
    cust_nk,                                -- Customer natural key (cust_nk)
    cust_first_name,                        -- Customer first name
    cust_last_name,                         -- Customer last name
    cust_address_country_id,                -- Customer address country ID
    cust_address_state_province,            -- Customer address state/province
    cust_address_city,                      -- Customer address city
    cust_address_postal_code,               -- Customer address postal code
    cust_address_street_address,            -- Customer address street address
    cust_phone_number,                      -- Customer phone number
    cust_email,                             -- Customer email
    account_mgr_id,                         -- Account manager ID
    date_of_birth,                          -- Customer date of birth
    marital_status,                         -- Customer marital status
    gender,                                 -- Customer gender
    effective_from,                         -- Effective from timestamp
    effective_to,                           -- Effective to timestamp
    insert_timestamp,                       -- Insert timestamp
    update_timestamp,                       -- Update timestamp
    row_num                                 -- Row number indicating the most recent record
FROM effective_customers
-- Only include records that are either marked as changed or the most recent record for each customer
WHERE is_changed = 1 OR row_num = 1
""") 

num_affected_rows,num_inserted_rows


In [0]:
spark.sql('select * from stoyan.dim_customers').display()

cust_sk,cust_nk,cust_first_name,cust_last_name,cust_address_country_id,cust_address_state_province,cust_address_city,cust_address_postal_code,cust_address_street_address,cust_phone_number,cust_email,account_mgr_id,date_of_birth,marital_status,gender,effective_from,effective_to,insert_timestamp,update_timestamp,row_num
38b3eff8baf56627478ec76a704e9b52,101,Constantin,Welles,US,IN,Kokomo,46901,514 W Superior St,+1-317-123-4104,Constantin.Welles@ANHINGA.EXAMPLE.COM,145,1972-02-20,married,M,2024-12-15T18:51:31.301Z,9999-12-31,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,1
ec8956637a99787bd197eacd77acce5e,102,Harrison,Pacino,US,IN,Indianapolis,46218,2515 Bloyd Ave,+1-317-123-4111,Harrison.Pacino@ANI.EXAMPLE.COM,145,1953-03-02,single,M,2024-12-15T18:51:31.301Z,9999-12-31,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,1
0a09c8844ba8f0936c20bd791130d6b6,144,Sivaji,Landis,US,IA,Cedar Rapids,52401,221 3Rd Ave Se # 300,+1-319-123-4301,Sivaji.Landis@GOLDENEYE.EXAMPLE.COM,145,1970-02-09,married,M,2024-12-15T18:51:31.301Z,2024-12-15 18:51:31.301,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,1
0a09c8844ba8f0936c20bd791130d6b6,144,Sivaji,Landis,US,South Carolina,Nelsonmouth,81406,"64461 Billy Coves Apt. 235 East Courtneyberg, NV 50319",+1-319-123-4301,Sivaji.Landis@GOLDENEYE.EXAMPLE.COM,145,1970-02-09,single,M,2024-12-15T18:51:31.301Z,2024-12-15 18:51:31.301,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,2
2b24d495052a8ce66358eb576b8912c8,145,Mammutti,Pacino,US,WI,Eau Claire,54701,2120 Heights Dr,+1-745-123-4306,Mammutti.Pacino@GREBE.EXAMPLE.COM,145,1946-02-19,single,M,2024-12-15T18:51:31.301Z,2024-12-15 18:51:31.301,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,1
2b24d495052a8ce66358eb576b8912c8,145,Mammutti,Pacino,US,Kansas,Warnerland,54191,USNV Adams FPO AP 80202,+1-745-123-4306,Mammutti.Pacino@GREBE.EXAMPLE.COM,145,1946-02-19,single,M,2024-12-15T18:51:31.301Z,2024-12-15 18:51:31.301,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,2
a5e00132373a7031000fd987a3c9f87b,146,Elia,Fawcett,US,WI,Milwaukee,53217,8989 N Port Washington Rd,+1-414-123-4307,Elia.Fawcett@JACANA.EXAMPLE.COM,145,1963-03-12,married,F,2024-12-15T18:51:31.301Z,2024-12-15 18:51:31.301,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,1
a5e00132373a7031000fd987a3c9f87b,146,Elia,Fawcett,US,Colorado,Matthewberg,59432,"95643 Shawn Heights Richardborough, ND 52255",+1-414-123-4307,Elia.Fawcett@JACANA.EXAMPLE.COM,145,1963-03-12,married,F,2024-12-15T18:51:31.301Z,2024-12-15 18:51:31.301,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,2
8d5e957f297893487bd98fa830fa6413,147,Ishwarya,Roberts,US,WI,Milwaukee,53223,6555 W Good Hope Rd,+1-414-123-4308,Ishwarya.Roberts@LAPWING.EXAMPLE.COM,145,1944-03-21,single,F,2024-12-15T18:51:31.301Z,2024-12-15 18:51:31.301,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,1
8d5e957f297893487bd98fa830fa6413,147,Ishwarya,Roberts,US,Michigan,Richardschester,68851,"7784 Stevens Flats Markborough, NC 73331",+1-414-123-4308,Ishwarya.Roberts@LAPWING.EXAMPLE.COM,145,1944-03-21,married,F,2024-12-15T18:51:31.301Z,2024-12-15 18:51:31.301,2024-12-15T18:51:31.301Z,2024-12-15T18:51:31.301Z,2
