In [0]:
# Echo the notebook-provided `spark` object as the cell output; later cells use it
# via `spark.conf` and `spark.read`, so this doubles as a check that it is defined.
spark


## Config

Prerequisite: the Databricks environment must be allowed to read from the ADLS container. Follow this tutorial:

[Tutorial: Connect to Azure Data Lake Storage Gen2](https://learn.microsoft.com/en-gb/azure/databricks/connect/storage/tutorial-azure-storage)

In [0]:
# --- Storage: where the datasets live ---
ADLS_CONTAINER = "azureml-blobstore-03a975f6-17cd-4334-a581-d30d363b62ab"  # ADLS container name
STORAGE_ACC = "agenticaiamlws"  # Storage account name

# --- Identity: Entra application used for OAuth ---
ENTRA_DIRECTORY_ID = "565f1c8e-754e-473e-8352-ac5b86a38c93"  # Tenant (directory) ID of the Entra app
ENTRA_APP_ID = "9156dfe1-254b-4047-9f1a-a8fd3e79787d"  # Application (client) ID

# --- Secrets: where the app's credential is looked up ---
AKV_KEY_NAME = "analytical-databricks-key"  # Key name inside the Azure Key Vault-backed scope
DBK_SECRET_SCOPE = "tichack2024kv"  # Databricks secret scope to access Azure Key Vault

In [0]:
# Read the credential from the Databricks secret scope; it is used below as the
# OAuth client secret for the Entra application.
service_credential = dbutils.secrets.get(
    scope=DBK_SECRET_SCOPE,
    key=AKV_KEY_NAME,
)

## Mount ADLS Container and Unzip

[Mounting cloud object storage on Azure Databricks](https://learn.microsoft.com/en-gb/azure/databricks/dbfs/mounts)

In [0]:
# OAuth client-credentials settings passed to the mount as extra Hadoop configs.
configs = {"fs.azure.account.auth.type": "OAuth",
          "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
          "fs.azure.account.oauth2.client.id": ENTRA_APP_ID,
          "fs.azure.account.oauth2.client.secret": service_credential,
          "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{ENTRA_DIRECTORY_ID}/oauth2/token"}

adls_mount_point = "/mnt/adls"

# `dbutils.fs.mount` raises when the mount point is already in use, which breaks
# re-running this cell (or the whole notebook). Mount only when it is not there yet.
if any(mount.mountPoint == adls_mount_point for mount in dbutils.fs.mounts()):
    print(f"{adls_mount_point} is already mounted; skipping mount.")
else:
    dbutils.fs.mount(
        source=f"abfss://{ADLS_CONTAINER}@{STORAGE_ACC}.dfs.core.windows.net",
        mount_point=adls_mount_point,
        extra_configs=configs
    )

In [0]:
import zipfile
import os

# --- Paths ---
# Local-file view (/dbfs/ prefix) of the ADLS mount.
adls_mnt_path = "/dbfs/mnt/adls"

# Placeholders: replace both values with one of the presets below before running.
zip_adls_file, extract_path = "<fake/path>", "<fake/path>"

# Preset: challenge dataset
#   zip_adls_file = "million_playlist_dataset/spotify_million_playlist_dataset_challenge.zip"
#   extract_path = "/dbfs/mnt/adls/challenge_dataset/"
#
# Preset: Million Playlist Dataset (MPD)
#   zip_adls_file = "million_playlist_dataset/spotify_million_playlist_dataset.zip"
#   extract_path = "/dbfs/mnt/adls/mld_dataset/"

# Make sure the extraction target exists, then resolve the archive location under the mount.
os.makedirs(extract_path, exist_ok=True)
zip_path = os.path.join(adls_mnt_path, zip_adls_file)

The extraction can take about 30 minutes. The extracted JSON totals ~33 GB across 1,000 files.

In [0]:
# Extract ZIP file.
# This step is slow, so make it resumable: a member is skipped when a file with the
# expected uncompressed size already exists at its target path. Anything missing or
# size-mismatched (e.g. cut short by an interrupted run) is extracted again.
with zipfile.ZipFile(zip_path, 'r') as zf:
    pending = []
    for info in zf.infolist():
        target = os.path.join(extract_path, info.filename)
        already_extracted = os.path.isfile(target) and os.path.getsize(target) == info.file_size
        if not already_extracted:
            pending.append(info.filename)
    # An empty `members` list extracts nothing; `None` would mean "everything".
    zf.extractall(extract_path, members=pending)

## Spark Load JSONs

In [0]:
# Configure the Spark session for OAuth access to the storage account.
# Every setting is scoped to the account by suffixing its key with the account's DFS host.
account_host = f"{STORAGE_ACC}.dfs.core.windows.net"
oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": ENTRA_APP_ID,
    "fs.azure.account.oauth2.client.secret": service_credential,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{ENTRA_DIRECTORY_ID}/oauth2/token",
}
for setting_name, setting_value in oauth_settings.items():
    spark.conf.set(f"{setting_name}.{account_host}", setting_value)

In [0]:
# Smoke test: read a small CSV from the container directly over abfss.
titanic_uri = f"abfss://{ADLS_CONTAINER}@{STORAGE_ACC}.dfs.core.windows.net/titanic.csv"
df_titanic = spark.read.csv(titanic_uri, header=True)
display(df_titanic)

In [0]:
# Location of the extracted Million Playlist Dataset JSON files inside the container.
mpd_dataset = "mld_dataset/data/"
container_root = f"abfss://{ADLS_CONTAINER}@{STORAGE_ACC}.dfs.core.windows.net"
mdp_adls_path = f"{container_root}/{mpd_dataset}"

In [0]:
# Load the MPD JSON files into a DataFrame (takes about 6 minutes).
# The reader is configured with the multiline option before the path is read.
multiline_reader = spark.read.option("multiline", "true")
df = multiline_reader.json(mdp_adls_path)
display(df)