In [0]:
import os

import logging
# Configure the root logger with an ERROR threshold and a stream handler.
logging.basicConfig(level=logging.ERROR, handlers=[logging.StreamHandler()])

# Notebook-specific logger. Its level is set explicitly to INFO: if it inherited
# the ERROR threshold from the root logger, the progress messages written with
# logger.info(...) in later cells would be silently dropped.
logger = logging.getLogger("mdp_prop")
logger.setLevel(logging.INFO)

# Echo the `spark` object as the cell output. It is not defined anywhere in this
# notebook -- presumably the SparkSession injected by the Databricks runtime (confirm).
spark


## Config

Prerequisite: the Databricks environment must be allowed to load data from the ADLS container. Follow this tutorial:

[Tutorial: Connect to Azure Data Lake Storage Gen2](https://learn.microsoft.com/en-gb/azure/databricks/connect/storage/tutorial-azure-storage)

In [0]:
# --- Secrets ---
# Databricks secret scope used to reach Azure Key Vault.
DBK_SECRET_SCOPE = "tichack2024kv"
# Key looked up inside that secret scope.
AKV_KEY_NAME = "analytical-databricks-key"

# --- Entra application (used as the OAuth client when connecting to storage) ---
ENTRA_APP_ID = "9156dfe1-254b-4047-9f1a-a8fd3e79787d"
# Tenant (directory) ID of the Entra app.
ENTRA_DIRECTORY_ID = "565f1c8e-754e-473e-8352-ac5b86a38c93"

# --- Storage ---
# Storage account name and the ADLS container inside it.
STORAGE_ACC = "agenticaiamlws"
ADLS_CONTAINER = "azureml-blobstore-03a975f6-17cd-4334-a581-d30d363b62ab"

In [0]:
# Fetch the secret that is later passed to Spark as the OAuth client secret.
service_credential = dbutils.secrets.get(
    key=AKV_KEY_NAME,
    scope=DBK_SECRET_SCOPE,
)

## Extract MPD Zip

[Mounting cloud object storage on Azure Databricks](https://learn.microsoft.com/en-gb/azure/databricks/dbfs/mounts)

In [0]:
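# The container is already mounted at /mnt/adls/, so there is no need to mount it again.
# The mount code below is kept (commented out) for reference only.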
# configs = {"fs.azure.account.auth.type": "OAuth",
#           "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
#           "fs.azure.account.oauth2.client.id": ENTRA_APP_ID,
#           "fs.azure.account.oauth2.client.secret": service_credential,
#           "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{ENTRA_DIRECTORY_ID}/oauth2/token"}

# dbutils.fs.mount(
#     source=f"abfss://{ADLS_CONTAINER}@{STORAGE_ACC}.dfs.core.windows.net",
#     mount_point="/mnt/adls",
#     extra_configs=configs
# )

In [0]:
# import zipfile
# import os

# Define Paths
# Root of the ADLS mount written as a "/dbfs/..." path. Later cells pass it to
# os.makedirs and also interpolate it into the Spark write/read paths.
adls_mnt_path = "/dbfs/mnt/adls"

# # Fake path
# zip_adls_file = "<fake/path>" # Challenge path
# extract_path = "<fake/path>"

# # Challenge
# zip_adls_file = "million_playlist_dataset/spotify_million_playlist_dataset_challenge.zip" # Challenge path
# extract_path = "/dbfs/mnt/adls/challenge_dataset/"

# # MLD
# zip_adls_file = "million_playlist_dataset/spotify_million_playlist_dataset.zip" # MPD path
# extract_path = "/dbfs/mnt/adls/mld_dataset/"

# # Create extract folder
# zip_path = os.path.join(adls_mnt_path, zip_adls_file)  # Databricks paths use /dbfs/
# os.makedirs(extract_path, exist_ok=True)

The extraction can take about 30 minutes. The extracted JSON totals ~33 GB across 1,000 files.

In [0]:
# # Extract ZIP file
# with zipfile.ZipFile(zip_path, 'r') as zf:
#     zf.extractall(extract_path)

## Flatten MPD JSONs

In [0]:
# Configure this Spark session for OAuth access to the storage account.
# Every setting is scoped to the account by suffixing its key with the account host.
account_host = f"{STORAGE_ACC}.dfs.core.windows.net"
oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": ENTRA_APP_ID,
    "fs.azure.account.oauth2.client.secret": service_credential,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{ENTRA_DIRECTORY_ID}/oauth2/token",
}
for setting_name, setting_value in oauth_settings.items():
    spark.conf.set(f"{setting_name}.{account_host}", setting_value)

# Smoke test: read a small CSV from the container to confirm the connection works.
titanic_uri = f"abfss://{ADLS_CONTAINER}@{account_host}/titanic.csv"
df_titanic = spark.read.csv(titanic_uri, header=True)
display(df_titanic)

In [0]:
# Million Playlist Dataset locations, relative to the container / mount root.
mpd_json_datastore = "mld_dataset/data/"  # folder holding the JSON files
mpd_dataset = "mdp_dataset"  # folder receiving the Parquet output
os.makedirs(os.path.join(adls_mnt_path, mpd_dataset), exist_ok=True)

# Enumerate the JSON folder through its abfss:// URI.
logger.info("List JSON files as ADLS path.")
mdp_adls_path = f"abfss://{ADLS_CONTAINER}@{STORAGE_ACC}.dfs.core.windows.net/{mpd_json_datastore}"
mdp_json_paths = [entry.path for entry in dbutils.fs.ls(mdp_adls_path)]

# Enumerate the same folder through the /mnt/adls mount.
logger.info("List JSON files as local path.")
mdp_json_local = "mld_dataset/data"
mdp_json_paths_local = [entry.path for entry in dbutils.fs.ls(f"/mnt/adls/{mdp_json_local}")]

In [0]:

# Peek at the most recent output file name. `fname` is only assigned by the
# JSON-to-Parquet loop in a later cell, so a bare reference here raises NameError
# on Restart & Run All. The guarded lookup shows the value when it exists and
# shows nothing otherwise.
globals().get("fname")

In [0]:
from tqdm.notebook import tqdm
import json
import re

# Convert MPD JSON files to Parquet, one output per input file.
#
# After the loop, `data`, `fname` and `df_playlists` hold the values from the last
# processed file; later cells in this notebook read them.
df_playlists = None  # stays None when no file yields any playlists

# NOTE(review): only the first two files are converted ([:2]) -- this looks like a
# debugging limit; widen the slice for a full run.
# The description is a fixed string: the original interpolated `fpath` into it
# before the loop had assigned `fpath`, which fails on a fresh kernel.
progress = tqdm(mdp_json_paths_local[:2], desc="Loading JSON files")
for fpath in progress:
    # Convert to the "/dbfs/..." form so Python's open() can read the file.
    # Anchored so that only a leading "dbfs:" scheme is rewritten.
    fpath = re.sub(r"^dbfs:", "/dbfs", fpath)
    progress.set_postfix_str(os.path.basename(fpath))

    # Explicit encoding so decoding does not depend on the platform default.
    with open(fpath, "r", encoding="utf-8") as fi:
        data = json.load(fi)

    playlists = data["playlists"] if "playlists" in data else None
    if not playlists:
        # Missing key or empty list: nothing to write. An empty list would also
        # make createDataFrame fail because no schema can be inferred.
        logger.error("No playlists in %s", fpath)
        continue

    df_playlists = spark.createDataFrame(playlists).coalesce(1)

    # "<name>.json" -> "<name>.parquet"
    fname = os.path.splitext(os.path.basename(fpath))[0] + ".parquet"

    # Write Parquet explicitly. A bare save() uses the session's default data
    # source, which on Databricks is typically Delta rather than Parquet, while
    # the output is named "*.parquet" and read back with spark.read.parquet.
    # NOTE(review): `adls_mnt_path` starts with "/dbfs"; Spark may resolve this
    # as a DBFS path (dbfs:/dbfs/mnt/...) rather than the mount itself. It is left
    # unchanged because the read-back cell uses the same path -- confirm and fix
    # both together.
    df_playlists.write.mode("overwrite").parquet(f"{adls_mnt_path}/{mpd_dataset}/{fname}")

# Show the last converted DataFrame, if any file produced one.
if df_playlists is not None:
    display(df_playlists)


In [0]:
# Read back the last Parquet output written by the conversion loop to verify it.
df_test = (
    spark.read
    .format("parquet")
    .load(f"{adls_mnt_path}/{mpd_dataset}/{fname}")
)
display(df_test)

In [0]:
# Inspect the 'info' entry of `data`, the JSON object loaded from the last file
# read by the conversion loop.
data['info']

In [0]:
# Inspect the first element of the 'playlists' list in the same `data` object.
data['playlists'][0]

In [0]:
# Load the first listed MPD JSON file from ADLS with Spark, in multi-line mode.
# Takes about 6 minutes.
df = spark.read.json(mdp_json_paths[0], multiLine=True)


```
+--------------------+--------------------+
|                info|           playlists|
+--------------------+--------------------+
|{2017-12-03 08:41...|[{false, NULL, 11...|
+--------------------+--------------------+
```

In [0]:
# Show only the `playlists` column of the loaded DataFrame.
display(df.select("playlists"))

An example playlist entry.

```
{
        "name": "musical",
        "collaborative": "false",
        "pid": 5,
        "modified_at": 1493424000,
        "num_albums": 7,
        "num_tracks": 12,
        "num_followers": 1,
        "num_edits": 2,
        "duration_ms": 2657366,
        "num_artists": 6,
        "tracks": [
            {
                "pos": 0,
                "artist_name": "Degiheugi",
                "track_uri": "spotify:track:7vqa3sDmtEaVJ2gcvxtRID",
                "artist_uri": "spotify:artist:3V2paBXEoZIAhfZRJmo2jL",
                "track_name": "Finalement",
                "album_uri": "spotify:album:2KrRMJ9z7Xjoz1Az4O6UML",
                "duration_ms": 166264,
                "album_name": "Dancing Chords and Fireflies"
            },
            {
                "pos": 1,
                "artist_name": "Degiheugi",
                "track_uri": "spotify:track:23EOmJivOZ88WJPUbIPjh6",
                "artist_uri": "spotify:artist:3V2paBXEoZIAhfZRJmo2jL",
                "track_name": "Betty",
                "album_uri": "spotify:album:3lUSlvjUoHNA8IkNTqURqd",
                "duration_ms": 235534,
                "album_name": "Endless Smile"
            },
            {
                "pos": 2,
                "artist_name": "Degiheugi",
                "track_uri": "spotify:track:1vaffTCJxkyqeJY7zF9a55",
                "artist_uri": "spotify:artist:3V2paBXEoZIAhfZRJmo2jL",
                "track_name": "Some Beat in My Head",
                "album_uri": "spotify:album:2KrRMJ9z7Xjoz1Az4O6UML",
                "duration_ms": 268050,
                "album_name": "Dancing Chords and Fireflies"
            },
            // 8 tracks omitted
            {
                "pos": 11,
                "artist_name": "Mo' Horizons",
                "track_uri": "spotify:track:7iwx00eBzeSSSy6xfESyWN",
                "artist_uri": "spotify:artist:3tuX54dqgS8LsGUvNzgrpP",
                "track_name": "Fever 99\u00b0",
                "album_uri": "spotify:album:2Fg1t2tyOSGWkVYHlFfXVf",
                "duration_ms": 364320,
                "album_name": "Come Touch The Sun"
            }
        ],

    }
```