In [1]:
# =========================================================
# SONIC – Fabric Ingestion Notebook (Phase 2: Bronze Layer)
# =========================================================
# This notebook:
# - Calls your Sonic backend on Cloud Run
# - Retrieves all onboarded Spotify users
# - Gets their refresh tokens
# - Exchanges refresh → access tokens (directly with Spotify)
# - Calls Spotify APIs (recently played, top tracks, top artists, etc.)
# - Writes RAW JSON into OneLake under /Files/bronze/...
#
# =========================================================
# ==== CELL 1 – Imports & basic setup ====

import json
import os
import time
from datetime import datetime, timezone
from typing import Dict, Any, List, Optional

import requests

# In Microsoft Fabric, this is the helper to interact with OneLake
from notebookutils import mssparkutils

StatementMeta(, 9229bdf7-a0d5-47fd-8898-69540d66960d, 3, Finished, Available, Finished)

In [11]:
# ==== CELL 2 – Configuration (Hard-coded Secrets) ====

# ---------------------------------------------------------------
# Hard-coded secrets for APP versions 1, 2, 3
# ---------------------------------------------------------------
# ⚠️ Replace the placeholder values with your real secrets.
# ⚠️ IMPORTANT:
# Secrets are hard-coded here TEMPORARILY for local/testing purposes only.
# In production, all secrets MUST be retrieved securely from Azure Key Vault.
# This approach was used only because the Azure Key Vault subscription
# was temporarily suspended at development time.
# NOTE(review): do not commit or share this notebook with real values filled
# in; anyone who can read the file can read the credentials.

# App versions to ingest, in the order the driver loop at the end of the
# notebook iterates over them. Each entry must be a key of SECRET_CONFIG.
APP_VERSIONS = ["3", "2", "1"]

# Per-app-version settings, keyed by app version string. Every entry carries
# the same four keys:
#   backend_base_url      - base URL prepended to backend paths
#   backend_api_key       - value sent in the 'x-api-key' header
#   spotify_client_id     - client id sent to the Spotify token endpoint
#   spotify_client_secret - client secret sent to the Spotify token endpoint
SECRET_CONFIG = {
    "1": {
        "backend_base_url": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "backend_api_key": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "spotify_client_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "spotify_client_secret": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    },
    "2": {
        "backend_base_url": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "backend_api_key": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "spotify_client_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "spotify_client_secret": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    },
    "3": {
        "backend_base_url": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "backend_api_key": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "spotify_client_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "spotify_client_secret": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    }
}

# ---------------------------------------------------------------
# App version loader (reads from SECRET_CONFIG instead of Key Vault)
# ---------------------------------------------------------------
def load_secrets_for_app(app_version: str):
    """
    Look up the settings for one app version in SECRET_CONFIG.

    Args:
        app_version: Key into SECRET_CONFIG (e.g. "1", "2", "3").

    Returns:
        The dictionary stored under that key (the same object, not a copy).

    Raises:
        RuntimeError: If SECRET_CONFIG has no entry for ``app_version``.
    """
    is_known_version = app_version in SECRET_CONFIG
    if is_known_version:
        return SECRET_CONFIG[app_version]

    raise RuntimeError(f"No secrets found for app version {app_version}")


# ---------------------------------------------------------------
# Other notebook configuration (unchanged)
# ---------------------------------------------------------------

# Base path under which all raw Spotify JSON is written (see build_bronze_path).
BRONZE_BASE_PATH = "Files/bronze/spotify"

# Capture the run timestamp once so every file written in this run shares it.
# datetime.utcnow() is deprecated since Python 3.12; take a timezone-aware UTC
# "now" and drop tzinfo so RUN_UTC stays naive and RUN_TS_STR keeps the same
# offset-free ISO format as before.
RUN_UTC = datetime.now(timezone.utc).replace(tzinfo=None)
RUN_DATE_STR = RUN_UTC.strftime("%Y-%m-%d")  # used as the per-run folder name
RUN_TS_STR = RUN_UTC.isoformat()             # stored in each file's metadata

print("Notebook run started at (UTC):", RUN_TS_STR)
print("Loaded secrets from hard-coded config.")


StatementMeta(, 9229bdf7-a0d5-47fd-8898-69540d66960d, 13, Finished, Available, Finished)

Notebook run started at (UTC): 2025-11-27T16:36:18.330653
Loaded secrets from hard-coded config.


In [3]:
# ===== CELL 2.1 – Runtime secret placeholders =====

# Module-level slots for the active app version's settings. They start empty
# and are overwritten by the driver loop before each ingestion run; the
# backend and Spotify helpers read them as globals.
BACKEND_BASE_URL = BACKEND_API_KEY = SPOTIFY_CLIENT_ID = SPOTIFY_CLIENT_SECRET = None


StatementMeta(, 9229bdf7-a0d5-47fd-8898-69540d66960d, 5, Finished, Available, Finished)

In [4]:
# ==== CELL 3 – Helper: Generic backend caller ====

def call_backend(
    path: str,
    method: str = "GET",
    params: Optional[Dict[str, Any]] = None,
    json_body: Optional[Dict[str, Any]] = None,
    timeout: int = 30
) -> Any:
    """
    Generic helper to call the Sonic backend (Cloud Run).

    The request goes to BACKEND_BASE_URL + path and carries the API key in a
    header named 'x-api-key'.

    Args:
        path: Path on the backend, e.g. "/admin/users". A missing leading
            slash is added.
        method: HTTP method name; upper-cased before sending.
        params: Optional query-string parameters.
        json_body: Optional JSON request body.
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON body, or the raw response text when the body is not
        valid JSON.

    Raises:
        RuntimeError: If BACKEND_BASE_URL has not been set, or if the backend
            answers with a non-success status code.
    """
    # Fail with a clear message instead of an AttributeError on None.rstrip()
    # when this is called before the per-app-version secrets were loaded.
    if not BACKEND_BASE_URL:
        raise RuntimeError(
            "BACKEND_BASE_URL is not set; load the secrets for an app version "
            "before calling the backend."
        )

    # Make sure base URL and path are joined by exactly one slash.
    if path and not path.startswith("/"):
        path = "/" + path
    url = BACKEND_BASE_URL.rstrip("/") + path

    headers = {
        "x-api-key": BACKEND_API_KEY,
        "Accept": "application/json",
    }

    response = requests.request(
        method=method.upper(),
        url=url,
        headers=headers,
        params=params,
        json=json_body,
        timeout=timeout,
    )

    if not response.ok:
        raise RuntimeError(
            f"Backend call failed: {method} {url} "
            f"status={response.status_code}, body={response.text}"
        )

    try:
        return response.json()
    except ValueError:
        # JSON decode errors are ValueError subclasses; anything else is a
        # genuine bug and should not be hidden behind a text fallback.
        return response.text


StatementMeta(, 9229bdf7-a0d5-47fd-8898-69540d66960d, 6, Finished, Available, Finished)

In [5]:
# ==== CELL 4 – Get onboarded users from backend (with display names) ====

def get_onboarded_users() -> List[Dict[str, Any]]:
    """
    Calls /admin/users and returns one entry per usable user:
    [
      {
        "spotify_user_id": "xxxxxxxxxxxx",
        "display_name": "xxxxx"
      }
    ]

    Entries that are not JSON objects or that lack a spotify_user_id are
    skipped. A missing, null or empty display_name falls back to the
    spotify_user_id, so downstream code always receives a usable name.

    Returns:
        List of {"spotify_user_id", "display_name"} dictionaries; empty when
        the backend response is not a JSON array.
    """
    data = call_backend("/admin/users", method="GET")

    # Iterating a dict or a string would "work" but silently yield no users,
    # and None would raise; make the unexpected shape visible instead.
    if not isinstance(data, list):
        print(
            f"Unexpected /admin/users response type {type(data).__name__}; "
            "expected a JSON array."
        )
        data = []

    users = []
    for item in data:
        if not isinstance(item, dict):
            continue

        spotify_user_id = item.get("spotify_user_id")
        if not spotify_user_id:
            continue

        # `or` (not a .get default) so an explicit null display_name also
        # falls back to the user id.
        display_name = item.get("display_name") or spotify_user_id
        users.append({
            "spotify_user_id": spotify_user_id,
            "display_name": display_name
        })

    print(f"Found {len(users)} onboarded users.")
    return users




def get_refresh_token_for_user(spotify_user_id: str) -> str:
    """
    Calls /internal/get-token/{spotify_user_id} on the backend.
    Assumes backend returns JSON like:
        {"spotify_user_id": "...", "refresh_token": "...", ...}

    Args:
        spotify_user_id: User whose refresh token is requested.

    Returns:
        The non-empty refresh token from the backend response.

    Raises:
        RuntimeError: If the response is not a JSON object or carries no
            non-empty "refresh_token".
    """
    data = call_backend(f"/internal/get-token/{spotify_user_id}", method="GET")

    if isinstance(data, dict):
        refresh_token = data.get("refresh_token")
        # A null/empty token is as useless as a missing one; reject it here
        # rather than letting the Spotify token exchange fail later.
        if refresh_token:
            return refresh_token
        received = f"JSON object with keys {sorted(map(str, data))}"
    else:
        received = f"value of type {type(data).__name__}"

    # Describe only the shape of the response: the body may hold other
    # credentials that must not end up in notebook output.
    raise RuntimeError(
        f"Could not find refresh_token in backend response for user {spotify_user_id} "
        f"(received {received})"
    )


StatementMeta(, 9229bdf7-a0d5-47fd-8898-69540d66960d, 7, Finished, Available, Finished)

In [6]:
# ==== CELL 5 – Spotify auth: refresh → access token ====

# Spotify Accounts service endpoint used for the refresh-token exchange.
SPOTIFY_TOKEN_URL = "https://accounts.spotify.com/api/token"


def get_spotify_access_token(refresh_token: str) -> str:
    """
    Trade a Spotify refresh token for a fresh access token.

    The client id and secret come from the SPOTIFY_CLIENT_ID and
    SPOTIFY_CLIENT_SECRET globals and are sent as form fields together with
    the refresh token.

    Args:
        refresh_token: Refresh token of the user to act on behalf of.

    Returns:
        The "access_token" value from Spotify's response.

    Raises:
        RuntimeError: If Spotify answers with a non-success status, or the
            response contains no access token.
    """
    response = requests.post(
        SPOTIFY_TOKEN_URL,
        data=dict(
            grant_type="refresh_token",
            refresh_token=refresh_token,
            client_id=SPOTIFY_CLIENT_ID,
            client_secret=SPOTIFY_CLIENT_SECRET,
        ),
        timeout=30,
    )

    if not response.ok:
        raise RuntimeError(
            f"Spotify token refresh failed: status={response.status_code}, body={response.text}"
        )

    token_response = response.json()
    access_token = token_response.get("access_token")
    if access_token:
        return access_token

    raise RuntimeError(f"No access_token in Spotify token response: {token_response}")


StatementMeta(, 9229bdf7-a0d5-47fd-8898-69540d66960d, 8, Finished, Available, Finished)

In [7]:
# ================================================================
# ========================= CELL 6 (SAFE+) ========================
#      Spotify API Helpers + RATE-LIMIT DETECTION + THROTTLING
# ================================================================

import time
import requests

# Globals for tracking rate limits
RATE_LIMIT_HITS = 0            # number of HTTP 429 responses seen so far
RATE_LIMIT_WAIT_SECONDS = 0.0  # total seconds slept because of 429 responses
GLOBAL_MIN_INTERVAL = 0.12  # ~8 requests/second
_last_call = 0.0               # time.monotonic() reading of the last throttle()

def throttle():
    """
    Block until at least GLOBAL_MIN_INTERVAL seconds have passed since the
    previous call, then record the current time as the new reference point.

    Uses the monotonic clock so that wall-clock adjustments cannot stretch
    the wait; the sleep is additionally capped at GLOBAL_MIN_INTERVAL so a
    stale or inconsistent `_last_call` value can never stall the notebook.
    """
    global _last_call
    elapsed = time.monotonic() - _last_call
    remaining = GLOBAL_MIN_INTERVAL - elapsed
    if remaining > 0:
        time.sleep(min(remaining, GLOBAL_MIN_INTERVAL))
    _last_call = time.monotonic()


# ---------------------------------------------------------------
# Enhanced GET with rate-limit detection
# ---------------------------------------------------------------
_SPOTIFY_API_BASE = "https://api.spotify.com/v1"


def _parse_retry_after(header_value, default=1):
    """Turn a Retry-After header value into a non-negative number of seconds."""
    try:
        return max(0, int(header_value))
    except (TypeError, ValueError):
        # Header missing or not a plain integer: use the default wait.
        return default


def _spotify_request(url, access_token, params=None, timeout=30, max_retries=5):
    """Send one throttled GET to `url`, retrying on HTTP 429 up to `max_retries` times."""
    global RATE_LIMIT_HITS, RATE_LIMIT_WAIT_SECONDS

    headers = {"Authorization": f"Bearer {access_token}"}
    retries = 0

    while True:
        # Throttle every attempt (including retries and follow-up pages),
        # not just the first request of a call.
        throttle()
        resp = requests.get(url, headers=headers, params=params, timeout=timeout)

        if resp.status_code == 429:
            RATE_LIMIT_HITS += 1
            if retries >= max_retries:
                raise RuntimeError(
                    f"Spotify rate limit still in effect after {max_retries} retries: {url}"
                )
            retries += 1
            retry_after = _parse_retry_after(resp.headers.get("Retry-After"))
            RATE_LIMIT_WAIT_SECONDS += retry_after
            print(f"‚ö† Rate Limit Hit #{RATE_LIMIT_HITS} ‚Äî Waiting {retry_after}s...")
            time.sleep(retry_after)
            continue

        if resp.status_code == 403:
            raise RuntimeError("‚ùå Spotify 403 ‚Äî Insufficient permission or restricted content.")

        if not resp.ok:
            raise RuntimeError(f"‚ùå Spotify GET failed: {resp.status_code}, body={resp.text}")

        return resp.json()


def spotify_get(url_path: str, access_token: str, params=None, timeout=30, max_retries=5):
    """
    GET a single Spotify Web API resource and return its decoded JSON body.

    Args:
        url_path: Path below https://api.spotify.com/v1, e.g. "/me".
        access_token: Bearer token sent in the Authorization header.
        params: Optional query-string parameters.
        timeout: Per-request timeout in seconds.
        max_retries: How many times a rate-limited (429) request is retried
            before giving up.

    Raises:
        RuntimeError: On HTTP 403, on any other non-success status, or when
            the request is still rate-limited after `max_retries` retries.
    """
    return _spotify_request(
        _SPOTIFY_API_BASE + url_path,
        access_token,
        params=params,
        timeout=timeout,
        max_retries=max_retries,
    )


# ---------------------------------------------------------------
# Paginated GET with rate-limit detection
# ---------------------------------------------------------------
def spotify_paginated_get(url_path: str, access_token: str, params=None, timeout=30, max_retries=5):
    """
    GET a paginated Spotify Web API resource, following the top-level "next"
    URL of each response until it is absent or empty.

    Args:
        url_path: Path below https://api.spotify.com/v1 of the first page.
        access_token: Bearer token sent in the Authorization header.
        params: Query-string parameters for the first page only; later pages
            use the "next" URL as returned.
        timeout: Per-request timeout in seconds.
        max_retries: 429 retry budget, applied to each page separately.

    Returns:
        List with the decoded JSON body of every page, in request order.

    Raises:
        RuntimeError: Same conditions as spotify_get, for any page.
    """
    url = _SPOTIFY_API_BASE + url_path
    pages = []

    while url:
        data = _spotify_request(
            url,
            access_token,
            params=params,
            timeout=timeout,
            max_retries=max_retries,
        )
        pages.append(data)

        url = data.get("next")
        # The "next" URL is used as-is; do not re-apply the first page's params.
        params = None

    return pages


# ---------------------------------------------------------------
# Endpoint wrappers (SONIC LITE)
# ---------------------------------------------------------------
def fetch_user_profile(access_token):
    """Return the current user's profile (single GET on /me)."""
    return spotify_get("/me", access_token)


def fetch_recently_played_all(access_token):
    """Return all pages of /me/player/recently-played, 50 items per page."""
    query = {"limit": 50}
    return spotify_paginated_get("/me/player/recently-played", access_token, params=query)


def fetch_top_tracks_all(access_token, time_range="medium_term"):
    """Return all pages of /me/top/tracks for the given time_range."""
    query = {"limit": 50, "time_range": time_range}
    return spotify_paginated_get("/me/top/tracks", access_token, params=query)


def fetch_top_artists_all(access_token, time_range="medium_term"):
    """Return all pages of /me/top/artists for the given time_range."""
    query = {"limit": 50, "time_range": time_range}
    return spotify_paginated_get("/me/top/artists", access_token, params=query)


def fetch_saved_tracks_all(access_token):
    """Return all pages of the user's saved tracks (/me/tracks)."""
    query = {"limit": 50}
    return spotify_paginated_get("/me/tracks", access_token, params=query)


def fetch_saved_albums_all(access_token):
    """Return all pages of the user's saved albums (/me/albums)."""
    query = {"limit": 50}
    return spotify_paginated_get("/me/albums", access_token, params=query)


def fetch_playlists_all(access_token):
    """Return all pages of the user's playlists (/me/playlists)."""
    query = {"limit": 50}
    return spotify_paginated_get("/me/playlists", access_token, params=query)


def fetch_playlist_tracks_all(access_token, pid):
    """Return all pages of tracks for playlist `pid`, 100 items per page."""
    endpoint = f"/playlists/{pid}/tracks"
    query = {"limit": 100}
    return spotify_paginated_get(endpoint, access_token, params=query)


def fetch_followed_artists_all(access_token):
    """
    Return the response of one GET on /me/following for artists (limit 50).

    NOTE(review): despite the `_all` suffix this issues a single request and
    does not follow any paging information in the response - confirm whether
    users with more than 50 followed artists need additional requests.
    """
    endpoint = "/me/following?type=artist&limit=50"
    return spotify_get(endpoint, access_token)


StatementMeta(, 9229bdf7-a0d5-47fd-8898-69540d66960d, 9, Finished, Available, Finished)

In [8]:
# ==== CELL 7 – OneLake write helper (bronze JSON files, with display_name) ====

def build_bronze_path(
    folder_name: str,
    category: str,
    run_date: str = RUN_DATE_STR,
    filename: str = "data.json"
) -> str:
    """
    Assemble the bronze-layer file path for one user folder, category and run date.

    Layout:
        {BRONZE_BASE_PATH}/{folder_name}/{category}/{run_date}/{filename}

    Args:
        folder_name: Per-user folder, e.g. "<display_name>_<spotify_user_id>".
        category: Data category sub-folder (e.g. "user_profile").
        run_date: Date sub-folder; defaults to RUN_DATE_STR as it was when
            this function was defined.
        filename: File name inside the date folder.

    Returns:
        The five components joined with "/".
    """
    segments = (BRONZE_BASE_PATH, folder_name, category, run_date, filename)
    return "{}/{}/{}/{}/{}".format(*segments)


def write_json_to_onelake(
    data: Any,
    spotify_user_id: str,
    display_name: str,
    category: str,
    filename: str = "data.json"
) -> str:
    """
    Writes raw JSON to OneLake under the bronze layer using a clean folder name
    that includes both display_name and spotify_user_id.

    The payload is wrapped in an envelope with an "ingestion_metadata" section
    (run timestamp, category, user id, display name, folder name) and written
    with overwrite enabled.

    Args:
        data: JSON-serialisable payload to store.
        spotify_user_id: User the payload belongs to.
        display_name: Human-readable user name; when missing or empty the
            spotify_user_id is used for the folder name instead.
        category: Data category; becomes a sub-folder of the user folder.
        filename: Name of the file to write.

    Returns:
        The path the file was written to.
    """
    # A user without a display name must not crash the write.
    label = str(display_name or spotify_user_id)

    # Create safe folder name: spaces and apostrophes are replaced to avoid
    # path issues, and path separators so a name cannot create nested folders.
    safe_display = label
    for unsafe_char in (" ", "'", "/", "\\"):
        safe_display = safe_display.replace(unsafe_char, "_")
    folder_name = f"{safe_display}_{spotify_user_id}"

    lakehouse_path = build_bronze_path(
        folder_name=folder_name,
        category=category,
        run_date=RUN_DATE_STR,
        filename=filename,
    )

    # Serialize JSON with metadata
    json_str = json.dumps(
        {
            "ingestion_metadata": {
                "run_utc": RUN_TS_STR,
                "category": category,
                "spotify_user_id": spotify_user_id,
                "display_name": display_name,
                "folder_name": folder_name,
            },
            "payload": data,
        },
        indent=2,
        ensure_ascii=False,
    )

    # Write to OneLake (third argument True = overwrite an existing file)
    mssparkutils.fs.put(lakehouse_path, json_str, True)

    print(f"‚úÖ Wrote {category} for {label} ({spotify_user_id}) ‚Üí {lakehouse_path}")
    return lakehouse_path


StatementMeta(, 9229bdf7-a0d5-47fd-8898-69540d66960d, 10, Finished, Available, Finished)

In [None]:
# ================================================================
# =====================   CELL 8 (SONIC LITE)   ===================
#            FAST MODE – NO AUDIO FEATURES, NO SLOW OPS
# ================================================================

def ingest_user(spotify_user_id: str, display_name: str):
    """
    Fetch one user's Spotify data and write each dataset to the bronze layer.

    Steps, in order: obtain an access token, then fetch and write the user
    profile, recently played, top tracks/artists for three time ranges, saved
    tracks and albums, playlists with the tracks of every playlist, and
    followed artists. Any exception from a step propagates to the caller and
    aborts the remaining steps for this user.

    Args:
        spotify_user_id: User to ingest.
        display_name: Name used in log output and in the output folder name.
    """
    print(f"\n=== Ingesting {display_name} ===")

    # 1. REFRESH TOKEN → ACCESS TOKEN
    refresh_token = get_refresh_token_for_user(spotify_user_id)
    access_token = get_spotify_access_token(refresh_token)

    # 2. USER PROFILE
    profile = fetch_user_profile(access_token)
    write_json_to_onelake(profile, spotify_user_id, display_name, "user_profile")

    # 3. RECENTLY PLAYED
    rp = fetch_recently_played_all(access_token)
    write_json_to_onelake(rp, spotify_user_id, display_name, "recently_played")

    # 4. TOP TRACKS + ARTISTS (3 time ranges)
    for tr in ["short_term", "medium_term", "long_term"]:
        tt = fetch_top_tracks_all(access_token, tr)
        ta = fetch_top_artists_all(access_token, tr)

        write_json_to_onelake(tt, spotify_user_id, display_name, f"top_tracks_{tr}")
        write_json_to_onelake(ta, spotify_user_id, display_name, f"top_artists_{tr}")

    # 5. SAVED TRACKS + ALBUMS
    st = fetch_saved_tracks_all(access_token)
    sa = fetch_saved_albums_all(access_token)

    write_json_to_onelake(st, spotify_user_id, display_name, "saved_tracks")
    write_json_to_onelake(sa, spotify_user_id, display_name, "saved_albums")

    # 6. PLAYLISTS + PLAYLIST TRACKS
    playlists_pages = fetch_playlists_all(access_token)
    write_json_to_onelake(playlists_pages, spotify_user_id, display_name, "playlists")

    # Collect playlist ids defensively: a page may carry "items": null and an
    # individual entry may be null or lack an id; those are skipped instead
    # of crashing the whole user.
    playlist_ids = [
        pl["id"]
        for page in playlists_pages
        for pl in (page.get("items") or [])
        if isinstance(pl, dict) and pl.get("id")
    ]

    for pid in playlist_ids:
        tracks = fetch_playlist_tracks_all(access_token, pid)
        write_json_to_onelake(tracks, spotify_user_id, display_name, f"playlist_{pid}_tracks")

    # 7. FOLLOWED ARTISTS
    fa = fetch_followed_artists_all(access_token)
    write_json_to_onelake(fa, spotify_user_id, display_name, "followed_artists")

    # DONE
    print(f"‚úî Finished ingestion for {display_name}")


# ================================================================
# Run ingestion for all onboarded users
# ================================================================
def run_full_ingestion():
    """
    Ingest every onboarded user of the currently configured app version.

    Users are processed one after another. A failure for one user is logged
    and recorded, and processing continues with the next user.

    Returns:
        Summary dictionary:
            {"succeeded": [spotify_user_id, ...],
             "failed": {spotify_user_id: error message, ...}}
        so callers can detect partial failures instead of relying on the
        printed output alone.
    """
    users = get_onboarded_users()

    succeeded = []
    failed = {}

    for user in users:
        spotify_user_id = user.get("spotify_user_id")
        # `or` so an explicit null/empty display_name also falls back to the id.
        display_name = user.get("display_name") or spotify_user_id

        try:
            ingest_user(spotify_user_id, display_name)
        except Exception as e:
            # Deliberately broad: one user's failure must not stop the others.
            failed[spotify_user_id] = str(e)
            print(f"‚ùå Error ingesting user {spotify_user_id}: {e}")
        else:
            succeeded.append(spotify_user_id)

    if failed:
        print(f"{len(failed)} of {len(users)} users failed: {sorted(map(str, failed))}")

    print("\nüéß SONIC ‚Äì Fabric ingestion run complete.")
    return {"succeeded": succeeded, "failed": failed}


# ================================================================
# Run all app versions sequentially
# ================================================================

# Process each configured app version in turn: announce it, point the
# module-level settings at that version's secrets, then ingest all its users.
for app_ver in APP_VERSIONS:
    print("\n===============================================")
    print(f"üî• Starting ingestion for APP VERSION {app_ver}")
    print("===============================================\n")

    # Look up this version's settings
    secrets = load_secrets_for_app(app_ver)

    # Rebind the globals that call_backend / get_spotify_access_token read
    BACKEND_BASE_URL, BACKEND_API_KEY, SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET = (
        secrets["backend_base_url"],
        secrets["backend_api_key"],
        secrets["spotify_client_id"],
        secrets["spotify_client_secret"],
    )

    print(f"‚úî Loaded secrets for app version {app_ver}")

    # Ingest every onboarded user of this app version
    run_full_ingestion()

    print(f"\nüéâ Finished APP VERSION {app_ver}")
    print("--------------------------------------------------")

print("\nüéß ALL APP VERSIONS COMPLETED SUCCESSFULLY.")

# Summary of the rate-limit counters accumulated by the Spotify helpers.
for report_line in (
    "\n===== RATE-LIMIT REPORT =====",
    f"Rate limits hit: {RATE_LIMIT_HITS}",
    f"Total wait time: {RATE_LIMIT_WAIT_SECONDS:.2f} seconds",
    "==============================",
):
    print(report_line)

