In [1]:
# =========================================================
# CELL 1: SETUP & AUTH (OFFICIAL + RECCOBEATS SIMPLE)
# =========================================================
import json
import time
import requests
import base64
import os
from datetime import datetime
from notebookutils import mssparkutils

# PATHS
# Lakehouse-relative roots of the Bronze (raw) and Silver (master) layers.
BRONZE_BASE_PATH = "Files/bronze/spotify"
SILVER_BASE_PATH = "Files/silver/spotify"
# Local date at the moment this cell ran, formatted YYYY-MM-DD.
RUN_DATE_STR = format(datetime.now(), "%Y-%m-%d")

# ⚠️ IMPORTANT:
# Secrets are hard-coded here TEMPORARILY for local/testing purposes only.
# In production, all secrets MUST be retrieved securely from Azure Key Vault.
# Hard-coding was used only because the Azure Key Vault subscription
# was temporarily suspended at development time.

# SPOTIFY AUTH
# Credentials are taken from the SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET
# environment variables when those are set, so real secrets never have to be
# saved inside the notebook. The literals are only a fallback for ad-hoc runs.
# (The previous try/except wrapped two literal assignments and could never
# reach its except branch, so the "could not load" path was dead code.)
SPOTIFY_CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID") or "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
SPOTIFY_CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET") or "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

if SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET:
    print("‚úÖ Loaded Spotify App Credentials.")
else:
    # Normalise to None so get_app_token() fails fast with a clear error.
    SPOTIFY_CLIENT_ID = None
    SPOTIFY_CLIENT_SECRET = None
    print("‚ö†Ô∏è Could not load Spotify credentials.")

def get_app_token():
    """Request an app-level Spotify access token (client-credentials grant).

    Returns:
        The ``access_token`` string from the token endpoint's JSON response.

    Raises:
        RuntimeError: if either credential is missing/empty, or if the token
            endpoint answers with a non-OK status (the message is the
            response body).
    """
    if not (SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET):
        raise RuntimeError("Missing Spotify credentials.")

    # HTTP Basic auth: base64("<client_id>:<client_secret>")
    credential_pair = "{}:{}".format(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)
    encoded_pair = base64.b64encode(credential_pair.encode()).decode("ascii")
    auth_headers = {"Authorization": "Basic " + encoded_pair}
    form_body = {"grant_type": "client_credentials"}

    response = requests.post(
        "https://accounts.spotify.com/api/token",
        headers=auth_headers,
        data=form_body,
        timeout=10
    )
    if response.ok:
        return response.json()["access_token"]
    raise RuntimeError(response.text)

# RECCOBEATS (NO AUTH REQUIRED)
# Root URL that every ReccoBeats endpoint path in this notebook is appended to.
RECCOBEATS_BASE_URL = "https://api.reccobeats.com"

def get_reccobeats_headers():
    """Return a fresh header dict for ReccoBeats calls (JSON Accept only)."""
    headers = {}
    headers["Accept"] = "application/json"
    return headers


StatementMeta(, 624ab789-1c7c-4a89-a2b6-0e287b7d0f4a, 3, Finished, Available, Finished)

‚úÖ Loaded Spotify App Credentials.


In [2]:
# =========================================================
# CELL 2: MASTER DATA MANAGER (READ/WRITE JSONL)
# =========================================================

def get_silver_path(category, user_folder):
    """Build the path of a user's Silver master file for ``category``.

    The result is ``/lakehouse/default/<SILVER_BASE_PATH>/<user_folder>/<category>.json``.
    """
    template = "/lakehouse/default/{base}/{user}/{name}.json"
    return template.format(base=SILVER_BASE_PATH, user=user_folder, name=category)

def load_processed_ids(category, user_folder):
    """Return the set of ``id`` values already stored in a master file.

    The master file is read as JSON Lines (one JSON object per line). Blank
    lines, lines that are not valid JSON, and lines whose JSON value is not an
    object are skipped individually, so one damaged record no longer hides
    every ID that follows it (which would cause those records to be fetched
    and appended again).

    Args:
        category: master file name without extension (e.g. "master_artists").
        user_folder: the user's folder name under the Silver base path.

    Returns:
        A set of the truthy ``id`` values found; empty when the file does not
        exist or cannot be opened.
    """
    path = get_silver_path(category, user_folder)
    ids = set()
    if not os.path.exists(path):
        return ids

    skipped = 0
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except ValueError:
                    # json.JSONDecodeError is a ValueError subclass.
                    skipped += 1
                    continue
                if not isinstance(obj, dict):
                    # e.g. a literal `null` line; it has no "id" to record.
                    skipped += 1
                    continue
                if obj.get("id"):
                    ids.add(obj["id"])
    except Exception as e:
        # File-level failure (I/O, decoding): keep whatever was collected.
        print(f"‚ö†Ô∏è Error reading {category}: {e}")

    if skipped:
        print(f"‚ö†Ô∏è Skipped {skipped} unreadable line(s) in {category}")
    return ids

def append_to_master(new_items, category, user_folder):
    """Append records to a user's master file, one JSON document per line.

    Args:
        new_items: records to serialise; a falsy value means nothing to do.
        category: master file name without extension.
        user_folder: the user's folder name under the Silver base path.

    Returns:
        The number of lines written (0 when ``new_items`` is falsy).
    """
    if new_items:
        target = get_silver_path(category, user_folder)
        # Create the user's folder on first write.
        os.makedirs(os.path.dirname(target), exist_ok=True)

        written = 0
        with open(target, "a", encoding="utf-8") as sink:
            for written, record in enumerate(new_items, start=1):
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                sink.write(f"{json.dumps(record, ensure_ascii=False)}\n")

        print(f"   üíæ Appended {written} ‚Üí {category}")
        return written

    print(f"   ‚ÑπÔ∏è No new {category} to append.")
    return 0


StatementMeta(, 624ab789-1c7c-4a89-a2b6-0e287b7d0f4a, 4, Finished, Available, Finished)

In [3]:
# =========================================================
# CELL 3: HARVEST IDS FROM BRONZE (ALL DATES)
# =========================================================
import os

def _flatten_payload_items(payload):
    """Return the list of item objects held by a Bronze ``payload`` value.

    Handles a list of page dicts (each with an "items" list), a single dict
    with "items", or a dict whose "artists" dict has "items". Anything else,
    including a null/non-list "items", yields an empty list.
    """
    if isinstance(payload, list):
        items = []
        for page in payload:
            if isinstance(page, dict) and isinstance(page.get("items"), list):
                items.extend(page["items"])
        return items

    if isinstance(payload, dict):
        if "items" in payload:
            found = payload.get("items")
        elif isinstance(payload.get("artists"), dict):
            found = payload["artists"].get("items")
        else:
            found = None
        return found if isinstance(found, list) else []

    return []


def _register_track(track, track_ids, artist_ids):
    """Add a track's ID and its artists' IDs to the given sets; ignore junk."""
    if not isinstance(track, dict) or not track.get("id"):
        return
    track_ids.add(track["id"])
    # "artists" may be absent or null; treat both as no artists.
    for artist in track.get("artists") or []:
        if isinstance(artist, dict) and artist.get("id"):
            artist_ids.add(artist["id"])


def harvest_ids_dynamic(folder_name, lakehouse_root="/lakehouse/default"):
    """Collect every track and artist ID found in a user's Bronze folder.

    Walks ``<lakehouse_root>/<BRONZE_BASE_PATH>/<folder_name>/<category>/<date>/data.json``
    for all categories and all dates, reads each file's "payload", and
    extracts IDs from four item shapes: saved-album entries (tracks embedded
    in the album), wrappers with a "track" key, bare track objects, and bare
    artist objects.

    Args:
        folder_name: the user's folder name under the Bronze base path.
        lakehouse_root: filesystem mount point of the lakehouse; the default
            is the path previously hard-coded here.

    Returns:
        A tuple ``(track_ids, artist_ids, top_artist_ids)`` of lists without
        duplicates (order unspecified). ``top_artist_ids`` holds artists found
        as bare artist objects in categories whose name contains
        "top_artists". All three are empty when the user folder is missing or
        cannot be scanned.
    """
    all_track_ids = set()
    all_artist_ids = set()
    top_artist_ids = set()

    base_path = f"{lakehouse_root}/{BRONZE_BASE_PATH}/{folder_name}"
    if not os.path.exists(base_path):
        print(f"‚ö†Ô∏è Missing folder: {base_path}")
        return [], [], []

    # List category folders
    try:
        categories = [c.name for c in os.scandir(base_path) if c.is_dir()]
    except Exception as e:
        print("‚ö†Ô∏è Unable to scan categories:", e)
        return [], [], []

    print(f"   üîé Scanning {len(categories)} category folders...")

    # Loop each category (saved_tracks, playlists, recently_played, etc.)
    for cat in categories:
        cat_path = f"{base_path}/{cat}"

        # List all date folders inside category
        try:
            dates = [d.name for d in os.scandir(cat_path) if d.is_dir()]
        except Exception as e:
            # Still best-effort, but no longer silent.
            print(f"‚ö†Ô∏è Unable to scan {cat_path}: {e}")
            continue

        for date in dates:
            file_path = f"{cat_path}/{date}/data.json"
            if not os.path.exists(file_path):
                continue

            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except Exception as e:
                print(f"‚ö†Ô∏è Skipping unreadable file {file_path}: {e}")
                continue

            # A top-level JSON value that is not an object has no "payload".
            if not isinstance(data, dict):
                continue

            for item in _flatten_payload_items(data.get("payload", [])):
                if not isinstance(item, dict):
                    continue

                # A) Saved album: tracks are embedded under album.tracks.items
                if "album" in item and "added_at" in item:
                    album = item.get("album")
                    tracks_src = album.get("tracks") if isinstance(album, dict) else None
                    album_tracks = tracks_src.get("items") if isinstance(tracks_src, dict) else None
                    if isinstance(album_tracks, list):
                        for t in album_tracks:
                            _register_track(t, all_track_ids, all_artist_ids)
                    continue

                # B) Track wrapper (playlist item, recently played)
                if "track" in item:
                    _register_track(item.get("track"), all_track_ids, all_artist_ids)
                    continue

                # C) Direct track object
                if item.get("type") == "track" and item.get("id"):
                    _register_track(item, all_track_ids, all_artist_ids)
                    continue

                # D) Direct artist object
                if item.get("type") == "artist" and item.get("id"):
                    all_artist_ids.add(item["id"])
                    if "top_artists" in cat:
                        top_artist_ids.add(item["id"])
                    continue

    # return unique lists
    return list(all_track_ids), list(all_artist_ids), list(top_artist_ids)


StatementMeta(, 624ab789-1c7c-4a89-a2b6-0e287b7d0f4a, 5, Finished, Available, Finished)

In [4]:
# =========================================================
# CELL 4 (UPDATED): FETCH RECCOBEATS METADATA WITH SPOTIFY ID
# =========================================================

def discover_users_from_onelake():
    """List the user folders that exist under the Bronze base path.

    Every immediate sub-directory of ``/lakehouse/default/<BRONZE_BASE_PATH>``
    is treated as one user.

    Returns:
        A list of ``{"folder_name": <name>, "display_name": <name>}`` dicts,
        or an empty list when the base folder is missing or cannot be scanned.
    """
    base_path = "/lakehouse/default/{}".format(BRONZE_BASE_PATH)

    if not os.path.exists(base_path):
        print(f"‚ö†Ô∏è User base folder not found: {base_path}")
        return []

    try:
        users = [
            {"folder_name": entry.name, "display_name": entry.name}
            for entry in os.scandir(base_path)
            if entry.is_dir()
        ]
    except Exception as e:
        print(f"‚ùå Error scanning OneLake users: {repr(e)}")
        return []

    print(f"üë• Found {len(users)} user folders in OneLake.")
    return users

def _spotify_id_from_href(item, requested_ids):
    """Return the Spotify ID embedded in ``item["href"]`` if it was requested.

    NOTE(review): this assumes ReccoBeats track objects expose the Spotify
    track URL in "href" (``.../track/<spotify_id>``) -- confirm against the
    API docs. The value is only trusted when it is one of the IDs sent in the
    current batch; otherwise None is returned and the caller falls back.
    """
    href = item.get("href")
    if not isinstance(href, str):
        return None
    candidate = href.split("?", 1)[0].rstrip("/").rsplit("/", 1)[-1]
    return candidate if candidate in requested_ids else None


def fetch_reccobeats_tracks(spotify_track_ids):
    """
    Calls: GET /v1/track?ids=<list> in batches of 40.

    Each returned record gets two extra keys:
      - spotify_id   (the Spotify ID the record corresponds to)
      - recco_id     (ReccoBeats-generated ID, copied from "id")

    The Spotify ID is taken from the record's own "href" when that resolves
    to one of the requested IDs. Only when it does not is the old positional
    guess (same index in the request batch) used. Mapping purely by position
    mislabels every record after the first track ReccoBeats does not know,
    because the response is then shorter than the request.

    Args:
        spotify_track_ids: list of Spotify track IDs to look up.

    Returns:
        List of track dicts as returned by the API, augmented as described.
        Batches that fail (non-200 or exception) are reported and skipped.
    """
    base_url = f"{RECCOBEATS_BASE_URL}/v1/track"
    headers = get_reccobeats_headers()

    results = []

    # Process batches of 40 (API max)
    for i in range(0, len(spotify_track_ids), 40):
        batch = spotify_track_ids[i:i+40]
        requested = set(batch)

        try:
            resp = requests.get(
                base_url,
                headers=headers,
                params={"ids": batch},
                timeout=10
            )

            if resp.status_code == 200:
                content = resp.json().get("content", [])
                for idx, item in enumerate(content):
                    if not isinstance(item, dict):
                        continue

                    sp_id = _spotify_id_from_href(item, requested)
                    if sp_id is None:
                        if idx >= len(batch):
                            # No trustworthy ID and no positional candidate:
                            # skip this record instead of losing the rest of
                            # the batch to an IndexError.
                            print(f"‚ö†Ô∏è Cannot map ReccoBeats record {item.get('id')} to a Spotify ID; skipped")
                            continue
                        # Positional fallback (previous behaviour).
                        sp_id = batch[idx]

                    # Inject spotify_id into record
                    item["spotify_id"] = sp_id
                    item["recco_id"] = item.get("id")

                    results.append(item)
            else:
                print(f"‚ö†Ô∏è Error {resp.status_code} on batch: {resp.text[:200]}")

        except Exception as e:
            print(f"‚ùå Exception fetching batch: {repr(e)}")

        # Small pause between batches to stay gentle on the API.
        time.sleep(0.15)

    return results

def fetch_artist_details(access_token, artist_ids):
    """Fetch full artist objects from Spotify in batches of 50 IDs.

    A rate-limited batch (HTTP 429) is now retried after the advertised
    ``Retry-After`` delay. Previously the ``continue`` moved on to the next
    batch, so every rate-limited batch was silently dropped. Entries in the
    response's "artists" array that are not objects (e.g. null) are
    filtered out so they are never written to the master file as
    un-parseable records.

    Args:
        access_token: bearer token for the Spotify Web API.
        artist_ids: list of Spotify artist IDs.

    Returns:
        List of artist dicts from all batches that succeeded. Batches that
        end in a non-200 status are reported and skipped.
    """
    url = "https://api.spotify.com/v1/artists"
    max_rate_limit_retries = 5  # per batch, so a persistent 429 cannot loop forever
    results = []

    print(f"üé® Fetching {len(artist_ids)} artists from Spotify API...")

    for i in range(0, len(artist_ids), 50):
        batch = artist_ids[i:i+50]

        retries = 0
        while True:
            resp = requests.get(
                url,
                params={"ids": ",".join(batch)},
                headers={"Authorization": f"Bearer {access_token}"},
                timeout=10
            )
            if resp.status_code != 429 or retries >= max_rate_limit_retries:
                break

            retries += 1
            try:
                wait = int(resp.headers.get("Retry-After", 5))
            except (TypeError, ValueError):
                # Header present but not an integer number of seconds.
                wait = 5
            print(f"‚è≥ Rate-limit hit. Waiting {wait}s...")
            time.sleep(wait)

        if resp.status_code == 200:
            artists = resp.json().get("artists") or []
            results.extend(a for a in artists if isinstance(a, dict))
        else:
            print(f"‚ö†Ô∏è Error {resp.status_code}: {resp.text[:200]}")

        time.sleep(0.1)

    return results

StatementMeta(, 624ab789-1c7c-4a89-a2b6-0e287b7d0f4a, 6, Finished, Available, Finished)

In [None]:
# =========================================================
# CELL 5 (UPDATED): INGEST TRACK METADATA + ARTISTS
# =========================================================

print("üîë Getting Spotify Token...")
app_token = get_app_token()

print("üìÇ Discovering users...")
users = discover_users_from_onelake()

TEST_LIMIT = None

for user in users:
    fname = user["folder_name"]
    uname = user["display_name"]

    print(f"\n=== Processing {uname} ===")

    # ----------------------------------------
    # 1) HARVEST BRONZE
    # ----------------------------------------
    raw_tracks, raw_artists, _ = harvest_ids_dynamic(fname)
    raw_tracks = list(set(raw_tracks))
    raw_artists = list(set(raw_artists))

    print(f"   üîç {len(raw_tracks)} Spotify tracks harvested.")
    print(f"   üé® {len(raw_artists)} Spotify artists harvested.")

    # ----------------------------------------
    # 2) LOAD EXISTING (using spotify_id, not recco_id)
    # ----------------------------------------
    path_tracks = get_silver_path("master_reccobeats_tracks", fname)

    existing_spotify_ids = set()
    if os.path.exists(path_tracks):
        with open(path_tracks, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    if obj.get("spotify_id"):
                        existing_spotify_ids.add(obj["spotify_id"])
                except:
                    pass

    # Compute delta (NEW tracks only)
    new_track_ids = [tid for tid in raw_tracks if tid not in existing_spotify_ids]

    print(f"   üöÄ {len(new_track_ids)} new tracks to fetch.")

    if TEST_LIMIT:
        new_track_ids = new_track_ids[:TEST_LIMIT]
        print(f"   ‚ö†Ô∏è TEST MODE: limiting to {len(new_track_ids)} tracks")

    # ----------------------------------------
    # 3) FETCH RECCOBEATS METADATA
    # ----------------------------------------
    if new_track_ids:
        track_records = fetch_reccobeats_tracks(new_track_ids)
        appended = append_to_master(track_records, "master_reccobeats_tracks", fname)
        print(f"   ‚úÖ Saved metadata for {appended} tracks")
    else:
        print("   ‚ÑπÔ∏è No new tracks to fetch.")

    # ----------------------------------------
    # 4) ARTIST INGESTION (unchanged)
    # ----------------------------------------
    existing_artists = load_processed_ids("master_artists", fname)
    new_artist_ids = [aid for aid in raw_artists if aid not in existing_artists]

    print(f"   üé® NEW artists: {len(new_artist_ids)}")

    if TEST_LIMIT:
        new_artist_ids = new_artist_ids[:TEST_LIMIT]

    if new_artist_ids:
        artist_data = fetch_artist_details(app_token, new_artist_ids)
        appended_art = append_to_master(artist_data, "master_artists", fname)
        print(f"   ‚úÖ Artist Details Saved: {appended_art}")
    else:
        print("   ‚ÑπÔ∏è No new artists to fetch.")


In [None]:
# =========================================================
# CELL 6: FETCH AUDIO FEATURES (STEP 4)
# =========================================================

def fetch_reccobeats_audio_features(rid):
    """Fetch the audio-features record for one ReccoBeats track ID.

    Calls ``GET <RECCOBEATS_BASE_URL>/v1/track/<rid>/audio-features``.

    Returns:
        The decoded JSON response with an "id" key guaranteed (``rid`` is
        filled in when the response has none), or None when the status is
        not 200 or the request/decoding raises.
    """
    endpoint = "{}/v1/track/{}/audio-features".format(RECCOBEATS_BASE_URL, rid)
    request_headers = get_reccobeats_headers()

    try:
        response = requests.get(endpoint, headers=request_headers, timeout=8)
        if response.status_code != 200:
            print(f"‚ö†Ô∏è {response.status_code} on feature fetch for {rid}")
            return None

        features = response.json()
        # Keep the API's own id when present; otherwise key the row by rid.
        features["id"] = features.get("id", rid)
        return features
    except Exception as err:
        print(f"‚ùå Error fetching features for {rid}: {repr(err)}")
        return None


# ========= CONFIG =========
TEST_FEATURE_LIMIT = None     # int caps the tracks processed per user; None = all
BATCH_SIZE = 100              # tracks fetched between two writes to the master file
MAX_BATCHES = 100             # upper bound on batches per user per run
REQUEST_PAUSE_SECONDS = 0.08  # pause after every feature request


print("\nüîç Starting batched audio-feature ingestion...\n")

for user in users:
    fname = user["folder_name"]
    uname = user["display_name"]

    print(f"\n=== Features for {uname} ===")

    # Load track metadata
    path = get_silver_path("master_reccobeats_tracks", fname)
    if not os.path.exists(path):
        print("   ‚ÑπÔ∏è No track metadata found.")
        continue

    # Collect all ReccoBeats track IDs
    recc_ids = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except ValueError:
                # Not valid JSON: skip just this line.
                continue
            if isinstance(obj, dict) and obj.get("id"):
                recc_ids.append(obj["id"])

    recc_ids = list(set(recc_ids))
    print(f"   üéØ {len(recc_ids)} ReccoBeats track IDs found")

    # Load existing feature rows
    existing_feats = load_processed_ids("master_reccobeats_features", fname)
    new_ids = [rid for rid in recc_ids if rid not in existing_feats]

    print(f"   üöÄ {len(new_ids)} tracks still missing audio features")

    if TEST_FEATURE_LIMIT is not None:
        new_ids = new_ids[:TEST_FEATURE_LIMIT]
        print(f"   ‚ö†Ô∏è TEST MODE: limiting to {len(new_ids)} tracks")

    # ================================
    # BATCHED INGESTION LOOP (LIMITED TO MAX_BATCHES BATCHES)
    # ================================
    total_batches = (len(new_ids) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
    batches_to_run = min(total_batches, MAX_BATCHES)

    if total_batches > batches_to_run:
        # Make the truncation visible; the remainder is picked up on a later
        # run because already-saved IDs are filtered out above.
        print(f"   ‚ÑπÔ∏è {total_batches - batches_to_run} batches deferred to the next run (MAX_BATCHES={MAX_BATCHES})")

    for b in range(batches_to_run):
        batch = new_ids[b * BATCH_SIZE : (b + 1) * BATCH_SIZE]
        print(f"   üì¶ Batch {b+1}/{batches_to_run} ‚Äî {len(batch)} tracks")

        batch_features = []

        # fetch features for each track in this batch
        for rid in batch:
            feat = fetch_reccobeats_audio_features(rid)
            if feat:
                batch_features.append(feat)
            # Pause after every request, including failed ones. The pause used
            # to be skipped on failure, so a run of errors (e.g. rate
            # limiting) was retried with no delay at all.
            time.sleep(REQUEST_PAUSE_SECONDS)

        # Persist per batch so an interrupted run keeps what it already fetched.
        appended = append_to_master(batch_features, "master_reccobeats_features", fname)
        print(f"     üíæ Saved {appended} feature records (batch {b+1})")

print("\nüéâ DONE ‚Äî Batched Audio-Features Pipeline Complete\n")
