In [5]:
%%sql
/*
-- 1. CREATE TABLES (Using MD5 Hash Keys)
*/
/*
 ---------------------------
   Dimensions
 --------------------------- 
*/
CREATE TABLE IF NOT EXISTS gold_dim_date (
  date_pk INT,
  full_date DATE NOT NULL,
  day_of_month INT,
  month_name STRING,
  month_number INT,
  year INT,
  day_of_week STRING,
  day_of_week_sort INT, 
  is_weekend BOOLEAN,
  calendar_quarter INT
) USING DELTA;

CREATE TABLE IF NOT EXISTS gold_dim_user (
  user_pk STRING NOT NULL,
  spotify_user_id STRING NOT NULL,
  display_name STRING,
  email STRING,
  country_code STRING,
  subscription_type STRING,
  follower_count BIGINT,
  explicit_filter_enabled BOOLEAN,
  explicit_filter_locked BOOLEAN,
  spotify_uri STRING,
  profile_image_url STRING,
  snapshot_date DATE,
  ingested_at TIMESTAMP,
  last_updated_at TIMESTAMP
) USING DELTA;

CREATE TABLE IF NOT EXISTS gold_dim_artist (
  artist_pk STRING NOT NULL,
  artist_id STRING NOT NULL,
  artist_name STRING,
  popularity INT,
  followers BIGINT,
  artist_genres STRING,
  external_url STRING,
  artist_image_url STRING,
  last_updated_at TIMESTAMP
) USING DELTA;

CREATE TABLE IF NOT EXISTS gold_dim_track (
  track_pk STRING NOT NULL,
  spotify_track_id STRING NOT NULL,
  recco_id STRING,
  track_name STRING,
  artist_id STRING,
  artist_name STRING,
  album_id STRING,
  album_name STRING,
  duration_ms BIGINT,
  popularity INT,
  valence FLOAT,
  energy FLOAT,
  danceability FLOAT,
  tempo FLOAT,
  loudness_db FLOAT,
  acousticness FLOAT,
  instrumentalness FLOAT,
  speechiness FLOAT,
  liveness FLOAT,
  mood_category STRING,
  release_date DATE,
  last_updated_at TIMESTAMP
) USING DELTA;

CREATE TABLE IF NOT EXISTS gold_dim_playlist (
  playlist_pk STRING NOT NULL,
  playlist_id STRING NOT NULL,
  owner_spotify_id STRING,
  snapshot_date DATE,
  last_updated_at TIMESTAMP
) USING DELTA;

CREATE TABLE IF NOT EXISTS gold_dim_artist_genres (
  artist_fk STRING NOT NULL, 
  genre_name STRING NOT NULL,
  ingested_at TIMESTAMP
) USING DELTA;

/*
 ---------------------------
   Facts
 --------------------------- 
*/
CREATE TABLE IF NOT EXISTS gold_fact_user_activity (
  activity_pk STRING NOT NULL,
  user_fk STRING NOT NULL,
  artist_fk STRING,
  track_fk STRING,
  playlist_fk STRING,
  activity_date_id INT NOT NULL,
  activity_timestamp TIMESTAMP NOT NULL,
  activity_hour INT,
  activity_type STRING NOT NULL,
  duration_ms BIGINT,
  rank_value INT,
  event_flag TINYINT,
  source_table STRING NOT NULL,
  activity_hash STRING,
  ingested_at TIMESTAMP NOT NULL
) USING DELTA;


/*
---------------------------
   Seed Unknown rows (Using MD5 of 'UNKNOWN')
--------------------------- 
*/
/*
INSERT INTO gold_dim_user (user_pk, spotify_user_id, display_name, email, country_code, subscription_type, follower_count, last_updated_at)
VALUES (md5('UNKNOWN'), 'UNKNOWN', 'Unknown User', 'unknown@unknown.local', 'XX', 'Unknown', 0, current_timestamp());

INSERT INTO gold_dim_artist (artist_pk, artist_id, artist_name, popularity, followers, artist_genres, last_updated_at)
VALUES (md5('UNKNOWN'), 'UNKNOWN', 'Unknown Artist', 0, 0, 'Unknown', current_timestamp());

INSERT INTO gold_dim_artist_genres (artist_fk, genre_name, ingested_at)
VALUES (md5('UNKNOWN'), 'Unknown', current_timestamp());

INSERT INTO gold_dim_track (track_pk, spotify_track_id, track_name, artist_id, duration_ms, popularity, valence, energy, danceability, last_updated_at)
VALUES (md5('UNKNOWN'), 'UNKNOWN', 'Unknown Track', 'UNKNOWN', 0, 0, 0.0, 0.0, 0.0, current_timestamp());

INSERT INTO gold_dim_playlist (playlist_pk, playlist_id, owner_spotify_id, last_updated_at)
VALUES (md5('UNKNOWN'), 'UNKNOWN', 'UNKNOWN', current_timestamp());*/

StatementMeta(, fc39db64-3ddc-4a7a-a6b6-10f729ac80ea, 49, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

In [None]:
%%sql

from pyspark.sql.functions import (
    col, expr, dayofmonth, month, year, date_format,
    when, lit, quarter, dayofweek
)

# 1. Define start and end date
start_date = "2015-01-01"
end_date = "2030-12-31"

# 2. Generate sequence of dates
df_dates = spark.sql(f"""
    SELECT explode(
        sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day)
    ) AS full_date
""")

# 3. Calculate dimension columns
df_gold_date = (
    df_dates
        .withColumn("date_pk", date_format(col("full_date"), "yyyyMMdd").cast("int"))
        .withColumn("day_of_month", dayofmonth(col("full_date")))
        .withColumn("month_name", date_format(col("full_date"), "MMMM"))
        .withColumn("month_number", month(col("full_date")))
        .withColumn("year", year(col("full_date")))
        .withColumn("day_of_week", date_format(col("full_date"), "EEEE"))
        .withColumn("day_of_week_sort", ((dayofweek(col("full_date")) + 5) % 7) + 1)
        .withColumn(
            "is_weekend",
            when(date_format(col("full_date"), "E").isin("Sat", "Sun"), True).otherwise(False)
        )
        .withColumn("calendar_quarter", quarter(col("full_date")))
)

# 4. Insert into Delta Table
df_gold_date.write.format("delta").mode("overwrite").saveAsTable("gold_dim_date")

print("Gold Date Dimension populated successfully.")


StatementMeta(, fc39db64-3ddc-4a7a-a6b6-10f729ac80ea, -1, Cancelled, , Cancelled)

In [6]:
%%sql
-----------------------------------------------------------------------
-- 1) gold_dim_user
-----------------------------------------------------------------------
MERGE INTO gold_dim_user AS target
USING (
  SELECT *
  FROM (
    SELECT
      md5(spotify_user_id) AS user_pk_new,
      spotify_user_id,
      COALESCE(display_name, 'Unknown User') AS display_name,
      COALESCE(email, 'unknown@unknown.local') AS email,
      COALESCE(country, 'XX') AS country_code,
      COALESCE(subscription_type, 'Unknown') AS subscription_type,
      COALESCE(follower_count, 0) AS follower_count,
      COALESCE(explicit_filter_enabled, false) AS explicit_filter_enabled,
      COALESCE(explicit_filter_locked, false) AS explicit_filter_locked,
      spotify_uri,
      CASE WHEN images IS NOT NULL AND size(images) > 0 THEN images[0].url ELSE NULL END AS profile_image_url,
      snapshot_date,
      ingested_at,
      current_timestamp() AS last_updated_at,

      ROW_NUMBER() OVER (
        PARTITION BY spotify_user_id
        ORDER BY ingested_at DESC, snapshot_date DESC
      ) AS rn

    FROM silver_user_profile
    WHERE spotify_user_id IS NOT NULL
  ) s
  WHERE rn = 1
) src
ON target.spotify_user_id = src.spotify_user_id
WHEN MATCHED THEN UPDATE SET
  display_name = src.display_name,
  email = src.email,
  country_code = src.country_code,
  subscription_type = src.subscription_type,
  follower_count = src.follower_count,
  explicit_filter_enabled = src.explicit_filter_enabled,
  explicit_filter_locked = src.explicit_filter_locked,
  spotify_uri = src.spotify_uri,
  profile_image_url = src.profile_image_url,
  snapshot_date = src.snapshot_date,
  ingested_at = src.ingested_at,
  last_updated_at = src.last_updated_at
WHEN NOT MATCHED THEN INSERT (
  user_pk, spotify_user_id, display_name, email, country_code,
  subscription_type, follower_count, explicit_filter_enabled,
  explicit_filter_locked, spotify_uri, profile_image_url,
  snapshot_date, ingested_at, last_updated_at
) VALUES (
  src.user_pk_new, src.spotify_user_id, src.display_name, src.email,
  src.country_code, src.subscription_type, src.follower_count,
  src.explicit_filter_enabled, src.explicit_filter_locked,
  src.spotify_uri, src.profile_image_url,
  src.snapshot_date, src.ingested_at, src.last_updated_at
);



-----------------------------------------------------------------------
-- 2) gold_dim_artist (FIXED – deterministic)
-----------------------------------------------------------------------
MERGE INTO gold_dim_artist tgt
USING (
  SELECT *
  FROM (
    SELECT
      md5(artist_id) AS artist_pk_new,
      artist_id,
      artist_name,
      COALESCE(popularity, 0) AS popularity,
      COALESCE(followers, 0) AS followers,
      CASE WHEN artist_genres IS NOT NULL THEN concat_ws(',', artist_genres) END AS artist_genres,
      external_url,
      artist_image_url,
      current_timestamp() AS last_updated_at,

      ROW_NUMBER() OVER (
        PARTITION BY artist_id
        ORDER BY
          current_timestamp() DESC,
          popularity DESC,
          followers DESC,
          artist_name ASC,
          md5(concat_ws('|',
            COALESCE(artist_name,''),
            COALESCE(popularity,0),
            COALESCE(followers,0),
            COALESCE(external_url,'')
          )) DESC
      ) AS rn

    FROM (
      SELECT artist_id, artist_name, popularity, followers, artist_genres, external_url, artist_image_url
      FROM silver_artists_raw

      UNION ALL
      SELECT artist_id, artist_name, artist_popularity AS popularity, artist_followers AS followers,
             artist_genres, artist_uri AS external_url, artist_image_url
      FROM silver_followed_artists

      UNION ALL
      SELECT artist_id, artist_name, popularity, followers, artist_genres, external_url, artist_image_url
      FROM silver_top_artists
    ) all_artists
    WHERE artist_id IS NOT NULL
  ) ranked
  WHERE rn = 1
) src
ON tgt.artist_id = src.artist_id
WHEN MATCHED THEN UPDATE SET
  artist_name = src.artist_name,
  popularity = src.popularity,
  followers = src.followers,
  artist_genres = src.artist_genres,
  external_url = src.external_url,
  artist_image_url = src.artist_image_url,
  last_updated_at = src.last_updated_at
WHEN NOT MATCHED THEN INSERT (
  artist_pk, artist_id, artist_name, popularity, followers,
  artist_genres, external_url, artist_image_url, last_updated_at
) VALUES (
  src.artist_pk_new, src.artist_id, src.artist_name, src.popularity,
  src.followers, src.artist_genres, src.external_url,
  src.artist_image_url, src.last_updated_at
);



-----------------------------------------------------------------------
-- 2.5) gold_dim_artist_genres (insert-only, already safe)
-----------------------------------------------------------------------
MERGE INTO gold_dim_artist_genres tgt
USING (
  SELECT DISTINCT
    md5(artist_id) AS artist_fk,
    trim(genre) AS genre_name,
    current_timestamp() AS ingested_at
  FROM (
    SELECT artist_id, artist_genres FROM silver_artists_raw
    UNION ALL
    SELECT artist_id, artist_genres FROM silver_followed_artists
    UNION ALL
    SELECT artist_id, artist_genres FROM silver_top_artists
  ) a
  LATERAL VIEW explode(a.artist_genres) AS genre
  WHERE artist_id IS NOT NULL AND trim(genre) <> ''
) src
ON tgt.artist_fk = src.artist_fk AND tgt.genre_name = src.genre_name
WHEN NOT MATCHED THEN INSERT (artist_fk, genre_name, ingested_at)
VALUES (src.artist_fk, src.genre_name, src.ingested_at);



-----------------------------------------------------------------------
-- 3) gold_dim_track (FIXED – deterministic)
-----------------------------------------------------------------------
MERGE INTO gold_dim_track tgt
USING (
  SELECT *
  FROM (
    SELECT
      md5(t.track_id) AS track_pk_new,
      t.track_id AS spotify_track_id,
      t.recco_id,
      t.track_name,
      t.artist_id,
      t.artist_name,
      t.album_id,
      t.album_name,
      CAST(t.duration_ms AS BIGINT) AS duration_ms,
      CAST(t.popularity AS INT) AS popularity,

      af.valence, af.energy, af.danceability, af.tempo,
      af.loudness AS loudness_db, af.acousticness,
      af.instrumentalness, af.speechiness, af.liveness,

      CASE
        WHEN af.valence IS NOT NULL AND af.energy IS NOT NULL AND af.danceability IS NOT NULL THEN
          CASE
            WHEN af.valence > 0.7 AND af.energy > 0.6 THEN 'High Energy/Happy'
            WHEN af.valence > 0.5 AND af.energy > 0.5 THEN 'Energetic'
            WHEN af.valence < 0.3 AND af.energy < 0.4 THEN 'Calm/Sad'
            WHEN af.danceability > 0.7 AND af.energy > 0.5 THEN 'Danceable'
            ELSE 'Neutral'
          END
        ELSE 'Unknown'
      END AS mood_category,

      CASE 
        WHEN t.release_date IS NULL OR t.release_date = '' THEN NULL
        WHEN TO_DATE(t.release_date) < TO_DATE('1900-01-01') THEN NULL 
        ELSE TO_DATE(t.release_date)
      END AS release_date,

      current_timestamp() AS last_updated_at,

      ROW_NUMBER() OVER (
        PARTITION BY t.track_id
        ORDER BY
          t.priority DESC,
          t.popularity DESC,
          t.track_name ASC,
          md5(concat_ws('|',
            t.track_name,
            t.artist_name,
            t.album_name,
            CAST(t.duration_ms AS STRING),
            CAST(t.popularity AS STRING)
          )) DESC
      ) AS rn

    FROM (
      SELECT spotify_id AS track_id, recco_id, track_name, artist_recco_id AS artist_id,
             artist_name, NULL AS album_id, NULL AS album_name,
             duration_ms, popularity, NULL AS release_date, 1 AS priority
      FROM silver_tracks_raw

      UNION ALL
      SELECT track_id, NULL AS recco_id, track_name, artist_id, artist_name,
             album_id, album_name, duration_ms, popularity, release_date, 2 AS priority
      FROM silver_saved_tracks

      UNION ALL
      SELECT track_id, NULL AS recco_id, track_name, artist_id, artist_name,
             album_id, album_name, duration_ms, popularity, release_date, 2 AS priority
      FROM silver_top_tracks
    ) t
    LEFT JOIN silver_audio_features_raw af ON af.spotify_id = t.track_id
  ) ranked
  WHERE rn = 1 AND spotify_track_id IS NOT NULL
) src
ON tgt.spotify_track_id = src.spotify_track_id
WHEN MATCHED THEN UPDATE SET
  recco_id = src.recco_id,
  track_name = src.track_name,
  artist_id = src.artist_id,
  artist_name = src.artist_name,
  album_id = src.album_id,
  album_name = src.album_name,
  duration_ms = src.duration_ms,
  popularity = src.popularity,
  valence = src.valence,
  energy = src.energy,
  danceability = src.danceability,
  tempo = src.tempo,
  loudness_db = src.loudness_db,
  acousticness = src.acousticness,
  instrumentalness = src.instrumentalness,
  speechiness = src.speechiness,
  liveness = src.liveness,
  mood_category = src.mood_category,
  release_date = src.release_date,
  last_updated_at = src.last_updated_at
WHEN NOT MATCHED THEN INSERT (
  track_pk, spotify_track_id, recco_id, track_name, artist_id, artist_name,
  album_id, album_name, duration_ms, popularity, valence, energy,
  danceability, tempo, loudness_db, acousticness, instrumentalness,
  speechiness, liveness, mood_category, release_date, last_updated_at
) VALUES (
  src.track_pk_new, src.spotify_track_id, src.recco_id, src.track_name,
  src.artist_id, src.artist_name, src.album_id, src.album_name,
  src.duration_ms, src.popularity, src.valence, src.energy,
  src.danceability, src.tempo, src.loudness_db, src.acousticness,
  src.instrumentalness, src.speechiness, src.liveness,
  src.mood_category, src.release_date, src.last_updated_at
);



-----------------------------------------------------------------------
-- 4) gold_dim_playlist
-----------------------------------------------------------------------
MERGE INTO gold_dim_playlist tgt
USING (
  SELECT *
  FROM (
    SELECT
      md5(playlist_id) AS playlist_pk_new,
      playlist_id,
      added_by_id AS owner_spotify_id, -- Alias defined here
      snapshot_date,
      current_timestamp() AS last_updated_at,

      ROW_NUMBER() OVER (
        PARTITION BY playlist_id
        ORDER BY 
          snapshot_date DESC,
          added_by_id ASC, -- <<< FIXED: Used original column name instead of alias
          md5(concat_ws('|', playlist_id, added_by_id, CAST(snapshot_date AS STRING))) DESC -- <<< FIXED here too
      ) AS rn

    FROM silver_playlist_tracks
    WHERE playlist_id IS NOT NULL
  ) ranked
  WHERE rn = 1
) src
ON tgt.playlist_id = src.playlist_id
WHEN MATCHED THEN UPDATE SET
  owner_spotify_id = src.owner_spotify_id,
  snapshot_date = src.snapshot_date,
  last_updated_at = src.last_updated_at
WHEN NOT MATCHED THEN INSERT (
  playlist_pk, playlist_id, owner_spotify_id, snapshot_date, last_updated_at
) VALUES (
  src.playlist_pk_new, src.playlist_id, src.owner_spotify_id,
  src.snapshot_date, src.last_updated_at
);


-----------------------------------------------------------------------
-- 5) gold_fact_user_activity (same as original)
-----------------------------------------------------------------------
WITH activity_staging AS (
  SELECT spotify_user_id, played_at AS activity_timestamp, track_id, artist_id,
         context_uri AS playlist_id, CAST(duration_ms AS BIGINT) AS duration_ms,
         NULL AS rank_value, 0 AS event_flag,
         'silver_recently_played' AS source_table,
         CAST(snapshot_date AS TIMESTAMP) AS ingested_at
  FROM silver_recently_played WHERE played_at IS NOT NULL

  UNION ALL
  SELECT spotify_user_id, CAST(added_at AS TIMESTAMP), track_id, artist_id,
         NULL, CAST(duration_ms AS BIGINT), NULL, 1, 'silver_saved_tracks',
         CAST(snapshot_date AS TIMESTAMP)
  FROM silver_saved_tracks WHERE added_at IS NOT NULL

  UNION ALL
  SELECT spotify_user_id, CAST(snapshot_date AS TIMESTAMP), NULL, artist_id,
         NULL, NULL, NULL, 1, 'silver_followed_artists',
         CAST(snapshot_date AS TIMESTAMP)
  FROM silver_followed_artists

  UNION ALL
  SELECT spotify_user_id, added_at, track_id, artist_id, playlist_id,
         CAST(duration_ms AS BIGINT), NULL, 1, 'silver_playlist_tracks',
         CAST(snapshot_date AS TIMESTAMP)
  FROM silver_playlist_tracks WHERE added_at IS NOT NULL

  UNION ALL
  SELECT spotify_user_id, CAST(snapshot_date AS TIMESTAMP), track_id, artist_id,
         NULL, CAST(duration_ms AS BIGINT), rank, 0, 'silver_top_tracks',
         CAST(snapshot_date AS TIMESTAMP)
  FROM silver_top_tracks

  UNION ALL
  SELECT spotify_user_id, CAST(snapshot_date AS TIMESTAMP), NULL, artist_id,
         NULL, NULL, rank, 0, 'silver_top_artists',
         CAST(snapshot_date AS TIMESTAMP)
  FROM silver_top_artists
)

INSERT INTO gold_fact_user_activity (
  activity_pk, user_fk, artist_fk, track_fk, playlist_fk,
  activity_date_id, activity_timestamp, activity_hour, activity_type,
  duration_ms, rank_value, event_flag, source_table,
  activity_hash, ingested_at
)
SELECT
  md5(concat(COALESCE(a.spotify_user_id,''), CAST(a.activity_timestamp AS STRING),
             COALESCE(a.track_id,''), a.source_table)) AS activity_pk,

  COALESCE(md5(a.spotify_user_id), md5('UNKNOWN')) AS user_fk,
  COALESCE(md5(a.artist_id), md5('UNKNOWN')) AS artist_fk,
  COALESCE(md5(a.track_id), md5('UNKNOWN')) AS track_fk,
  COALESCE(md5(a.playlist_id), md5('UNKNOWN')) AS playlist_fk,

  CAST(date_format(a.activity_timestamp, 'yyyyMMdd') AS INT) AS activity_date_id,
  a.activity_timestamp,
  HOUR(a.activity_timestamp) AS activity_hour,

  CASE
    WHEN a.source_table = 'silver_recently_played' THEN 'Played'
    WHEN a.source_table = 'silver_saved_tracks' THEN 'Saved'
    WHEN a.source_table = 'silver_followed_artists' THEN 'Followed'
    WHEN a.source_table = 'silver_playlist_tracks' THEN 'PlaylistAdd'
    WHEN a.source_table = 'silver_top_tracks' THEN 'TopTrackRank'
    WHEN a.source_table = 'silver_top_artists' THEN 'TopArtistRank'
    ELSE 'Other'
  END AS activity_type,

  a.duration_ms,
  a.rank_value,
  a.event_flag,
  a.source_table,

  md5(concat(COALESCE(a.spotify_user_id,''), CAST(a.activity_timestamp AS STRING),
             COALESCE(a.track_id,''), a.source_table)) AS activity_hash,
  current_timestamp() AS ingested_at

FROM activity_staging a
LEFT JOIN gold_fact_user_activity f
  ON f.activity_hash =
       md5(concat(COALESCE(a.spotify_user_id,''), CAST(a.activity_timestamp AS STRING),
                  COALESCE(a.track_id,''), a.source_table))
WHERE f.activity_pk IS NULL;


StatementMeta(, fc39db64-3ddc-4a7a-a6b6-10f729ac80ea, 55, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 4 fields>

<Spark SQL result set with 1 rows and 4 fields>

<Spark SQL result set with 1 rows and 4 fields>

<Spark SQL result set with 1 rows and 4 fields>

<Spark SQL result set with 1 rows and 4 fields>

<Spark SQL result set with 0 rows and 0 fields>