In [9]:
import zipfile
import os

# Source archive and destination folder, both addressed through the
# default Lakehouse mount point of the Fabric notebook.
zip_path = "/lakehouse/default/Files/bronze/external/archive.zip"
extract_to = "/lakehouse/default/Files/bronze/external/"

print("üì¶ Unzipping:", zip_path)

# Unpack every member of the archive into the destination folder.
# Using the ZipFile object as a context manager guarantees the file
# handle is closed even if extraction raises.
archive = zipfile.ZipFile(zip_path, mode="r")
with archive:
    archive.extractall(path=extract_to)

print("‚úÖ Extraction complete!")
# List the destination folder so the extracted file names show up in the
# cell output (the listing also includes the archive itself, because it
# lives in the same folder).
print("üìÅ Extracted files:", os.listdir(extract_to))


StatementMeta(, 2655d44c-8bfc-4834-a476-0879d1405fd0, 11, Finished, Available, Finished)

üì¶ Unzipping: /lakehouse/default/Files/bronze/external/archive.zip
‚úÖ Extraction complete!
üìÅ Extracted files: ['tracks_features.csv', 'archive.zip']


In [10]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import DoubleType, IntegerType, StringType

# ====================================================
# 1. Config: Path to your downloaded Kaggle CSV
# ====================================================
# NOTE: Update this path to wherever the file is actually saved in Bronze.
source_path = "Files/bronze/external/tracks_features.csv"

# ====================================================
# 2. Read CSV (with Header)
# ====================================================
# NOTE(review): Spark's CSV reader escapes quotes with a backslash by
# default. If this file escapes embedded quotes by doubling them (""),
# rows containing such values can be split incorrectly and then silently
# removed by the dropna() in step 4. Consider adding
# .option("escape", '"') after confirming the file's quoting convention.
try:
    df_raw = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(source_path)
    )
except Exception as e:
    # The failure is not necessarily a missing file (it could be a
    # permissions or path-resolution problem), so report the real error
    # instead of guessing. Re-raising stops the cell without depending on
    # a platform-specific helper: the original handler called
    # dbutils.notebook.exit, which is a Databricks utility and is
    # presumably not defined in this Fabric runtime (TODO confirm).
    print(
        f"‚ùå Could not read the CSV at: {source_path}. "
        f"Make sure the file was uploaded there. Underlying error: {e}"
    )
    raise

# ====================================================
# 3. Select & Cast for ML
#    We strictly cast metrics to Double for VectorAssembler compatibility
# ====================================================
# Column order here defines the column order of the saved table.
# A cast that cannot parse a value yields NULL rather than raising.
df_clean = df_raw.select(
    # --- Identity ---
    col("id").alias("spotify_id"),
    col("name").alias("track_name"),
    col("album").alias("album_name"),
    col("album_id"),
    col("artists"),
    col("artist_ids"),

    # --- The Features (Critical for ML) ---
    col("danceability").cast(DoubleType()),
    col("energy").cast(DoubleType()),
    col("key").cast(IntegerType()),
    col("loudness").cast(DoubleType()),
    col("mode").cast(IntegerType()),
    col("speechiness").cast(DoubleType()),
    col("acousticness").cast(DoubleType()),
    col("instrumentalness").cast(DoubleType()),
    col("liveness").cast(DoubleType()),
    col("valence").cast(DoubleType()),
    col("tempo").cast(DoubleType()),

    # --- Metadata ---
    col("duration_ms").cast(IntegerType()),
    col("time_signature").cast(IntegerType()),
    col("year").cast(IntegerType()),
    col("release_date")
)

# ====================================================
# 4. Clean Up (Drop Rows with Null IDs or Features)
#    ML models hate Nulls. We purge them here.
# ====================================================
# NOTE(review): only the ID and three of the feature columns are checked;
# NULLs in the remaining features (e.g. tempo, loudness) are kept.
# Confirm whether that is intended before training on this table.
df_final = df_clean.dropna(subset=["spotify_id", "danceability", "energy", "valence"])

# ====================================================
# 5. Save as Delta
# ====================================================
table_name = "silver_ml_training_set"
df_final.write.format("delta").mode("overwrite").saveAsTable(table_name)

# ====================================================
# 6. OPTIMIZE for Machine Learning
#    We Z-ORDER by Features so looking up "Sad Songs" (low valence) is instant.
# ====================================================
print("‚ö° Optimizing layout for ML queries...")
spark.sql(f"OPTIMIZE {table_name} ZORDER BY (valence, energy, tempo)")

# Count the persisted table rather than df_final: df_final is lazy, so
# df_final.count() would re-read the CSV and redo inference, casting and
# filtering, and it would not reflect what was actually written.
silver_table = spark.table(table_name)
print(f"‚úÖ {table_name} is ready. Row Count: {silver_table.count()}")
display(silver_table.limit(5))

StatementMeta(, 2655d44c-8bfc-4834-a476-0879d1405fd0, 12, Finished, Available, Finished)

‚ö° Optimizing layout for ML queries...
‚úÖ silver_ml_training_set is ready. Row Count: 1194506


SynapseWidget(Synapse.DataFrame, 5e449ccc-8ff2-4da0-905d-1a5fe35adf39)