In [0]:
# %sql
# CREATE VOLUME IF NOT EXISTS workspace.default.ml_cache_vol3;

In [0]:
import os

# Point both temp-directory environment variables at the same volume path
# (the same path is later passed to mlflow.spark.log_model as dfs_tmpdir).
_ml_cache_volume = "/Volumes/workspace/default/ml_cache_vol3"
for _env_key in ("SPARKML_TEMP_DFS_PATH", "MLFLOW_DFS_TMP"):
    os.environ[_env_key] = _ml_cache_volume

In [0]:
# Schema-qualified name of the Spotify silver table.
silver_table_path = "default.spotify_silver"

In [0]:
import mlflow
import mlflow.spark
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import Window
from pyspark.sql.functions import col, count as sql_count, lit, row_number, xxhash64
from pyspark.sql.types import StructType, StructField, StringType, DoubleType


In [0]:
# Numeric audio-feature columns used as model inputs.
feature_cols = [
    "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo"
]
# Column holding the class the model predicts.
label_col = "track_genre"

# Read from the table named by `silver_table_path` (defined in the config
# cell) instead of repeating the literal, so the source table is changed in
# exactly one place. Rows with a null label or null feature are dropped.
df = (
    spark.read.table(silver_table_path)
    .select(label_col, *feature_cols)
    .dropna()
)
display(feature_cols)

Databricks visualization. Run in Databricks to view.

In [0]:
# SELECT GENRES AND CAP ROWS PER GENRE
NUM_GENRES = 5
MAX_ROWS_PER_GENRE = 500   # per-genre cap applied below
SAMPLE_SEED = 42           # changes which rows are kept, reproducibly

top_genres_df = (
    df.groupBy("track_genre")
    .agg(sql_count("*").alias("count"))
    # Secondary sort on the genre name: when several genres have the same
    # row count, the chosen top-N would otherwise vary from run to run.
    .orderBy(col("count").desc(), col("track_genre").asc())
    .limit(NUM_GENRES)
)

top_genres = [row.track_genre for row in top_genres_df.collect()]
print(f"Training on top {NUM_GENRES} genres: {top_genres}")

df = df.filter(col("track_genre").isin(top_genres))

# Keep at most MAX_ROWS_PER_GENRE rows per genre. The rows are ordered by a
# seeded hash of the row contents rather than by "danceability": ordering by
# a feature keeps only the lowest-valued tracks of each genre and biases the
# training data. A content hash is a pure function of the row, so the same
# rows are selected every time the DataFrame is recomputed (e.g. once for the
# train split and once for the test split).
sample_order = xxhash64(lit(SAMPLE_SEED), *df.columns)
window = Window.partitionBy("track_genre").orderBy(sample_order)
df = (
    df.withColumn("row_num", row_number().over(window))
    .filter(col("row_num") <= MAX_ROWS_PER_GENRE)
    .drop("row_num")
)

print(f"Total rows after sampling: {df.count()}")

In [0]:
# Row count per genre in the sampled DataFrame.
genre_counts = (
    df
    .groupBy("track_genre")
    .count()
)

display(genre_counts)

Databricks visualization. Run in Databricks to view.

In [0]:
# TRAIN/TEST SPLIT
# 80/20 random split with a fixed seed.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)

n_train = train_data.count()
n_test = test_data.count()
print(f"Training rows: {n_train}")
print(f"Testing rows: {n_test}")

In [0]:
# Define Stages
# The label indexer uses handleInvalid="error" (the default). With "keep" the
# indexer reserves an extra "__unknown" index; the label column metadata then
# advertises one more class than actually exists and the classifier is fitted
# with a phantom extra class.
indexer = StringIndexer(inputCol="track_genre", outputCol="label", handleInvalid="error")
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
scaler = StandardScaler(inputCol="features_raw", outputCol="features")

# Define Model (Logistic Regression)
# NOTE(review): maxIter=3 is lower than every value in the tuning grid below;
# the direct fit may stop before converging — confirm this is intended.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=3,
    regParam=0.2,
    elasticNetParam=0.0
)

# Create Pipeline FIRST so the CrossValidator can wrap the whole pipeline
pipeline_lr = Pipeline(stages=[indexer, assembler, scaler, lr])

# Setup Hyperparameter Tuning (CrossValidator)
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [5, 10])
    .addGrid(lr.regParam, [0.1, 0.2])
    .addGrid(lr.elasticNetParam, [0.0, 0.5])
    .build()
)

# Explicit label/prediction columns and a fixed seed make the fold assignment
# and the scoring reproducible between runs.
crossval = CrossValidator(
    estimator=pipeline_lr,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1"
    ),
    numFolds=3,
    seed=42,
)

In [0]:
# TRAIN MODEL
with mlflow.start_run(run_name="Spotify Genre - Logistic Regression"):

    # Cross-validation is intentionally not run here because of the 100 mb
    # limitation. To tune instead, fit `crossval` on train_data and read the
    # regParam / elasticNetParam from the last stage of its `bestModel`.
    print("Running Direct Fit (Optimization for 100MB limit)...")
    model_lr = pipeline_lr.fit(train_data)

    # The fitted LogisticRegression model is the last pipeline stage.
    lr_stage = model_lr.stages[-1]
    best_reg_param = lr_stage.getRegParam()
    best_elastic_param = lr_stage.getElasticNetParam()

    predictions_lr = model_lr.transform(test_data)

    # One evaluator, re-pointed at each metric in turn.
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    scores = {
        metric: evaluator.setMetricName(metric).evaluate(predictions_lr)
        for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
    }
    accuracy_lr = scores["accuracy"]
    f1_lr = scores["f1"]
    precision_lr = scores["weightedPrecision"]
    recall_lr = scores["weightedRecall"]

    separator = "-" * 50
    print(separator)
    print("LOGISTIC REGRESSION RESULTS:")
    print(f"Accuracy:  {accuracy_lr:.4f}")
    print(f"F1 Score:  {f1_lr:.4f}")
    print(separator)

    # Record test metrics on the active MLflow run.
    for metric_key, metric_value in (
        ("accuracy", accuracy_lr),
        ("f1_score", f1_lr),
        ("precision", precision_lr),
        ("recall", recall_lr),
    ):
        mlflow.log_metric(metric_key, metric_value)

    # Record the hyperparameters the fitted model actually used.
    for param_key, param_value in (
        ("model_type", "LogisticRegression"),
        ("maxIter", lr_stage.getMaxIter()),
        ("regParam", best_reg_param),
        ("elasticNetParam", best_elastic_param),
    ):
        mlflow.log_param(param_key, param_value)

    mlflow.spark.log_model(
        model_lr,
        "genre_classifier_lr",
        dfs_tmpdir="/Volumes/workspace/default/ml_cache_vol3"
    )

# Alias used by the evaluation cells that follow.
best_model_lr = model_lr

In [0]:
print("Confusion Matrix")

# The fitted StringIndexer is the first pipeline stage; the position of a
# genre in its `labels` list is the numeric label index.
label_converter = best_model_lr.stages[0]
labels_list = label_converter.labels

label_mapping = [(float(position), name) for position, name in enumerate(labels_list)]
label_mapping_df = spark.createDataFrame(label_mapping, ["label_id", "genre_name"])

# Count of test rows for each (actual index, predicted index) pair.
confusion_matrix_df = (
    predictions_lr
    .groupBy("label", "prediction")
    .count()
    .orderBy("label", "prediction")
)

# Two renamed views of the same lookup: one keyed for the actual label,
# one keyed for the predicted label.
actual_lookup = (
    label_mapping_df
    .withColumnRenamed("label_id", "label")
    .withColumnRenamed("genre_name", "actual_genre")
)
predicted_lookup = (
    label_mapping_df
    .withColumnRenamed("label_id", "prediction")
    .withColumnRenamed("genre_name", "predicted_genre")
)

confusion_with_names = (
    confusion_matrix_df
    .join(actual_lookup, "label")
    .join(predicted_lookup, "prediction")
    .select("actual_genre", "predicted_genre", "count")
    .orderBy("actual_genre", "predicted_genre")
)

display(confusion_with_names)

In [0]:
# Raw confusion counts keyed by numeric label / prediction index (no genre names).
display(confusion_matrix_df)

In [0]:
pdf = confusion_matrix_df.toPandas()

genre_names = labels_list

# Build the full set of class indices: every known genre plus any index that
# actually occurs in the data. A plain pivot only contains rows/columns that
# were observed, so a genre that was never predicted would be missing and the
# matrix would not be square.
observed_ids = set(pdf["label"]).union(pdf["prediction"])
class_ids = sorted(
    {float(i) for i in range(len(genre_names))} | {float(i) for i in observed_ids}
)

matrix = (
    pdf.pivot(index='label', columns='prediction', values='count')
    .reindex(index=class_ids, columns=class_ids)
    .fillna(0)
    .astype(int)
)

# Map indices to genre names; an index outside the known labels is shown
# explicitly instead of raising IndexError.
axis_names = [
    genre_names[int(i)] if int(i) < len(genre_names) else f"unknown ({int(i)})"
    for i in class_ids
]
matrix.index = axis_names
matrix.columns = axis_names


plt.figure(figsize=(10, 8))
sns.set(font_scale=1.2)
sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', linewidths=0.5, linecolor='gray')

plt.title("Confusion Matrix: Predicted vs Actual Genre")
plt.ylabel("Actual Genre")
plt.xlabel("Predicted Genre")
plt.tight_layout()
plt.show()

In [0]:
print("\nPer-Genre Performance:")

# Collect the (label, prediction) counts once. This replaces two Spark
# count() jobs per genre with a single aggregation; the result has at most
# (number of classes)^2 rows, so collecting it to the driver is cheap.
pair_counts = {
    (int(row["label"]), int(row["prediction"])): row["count"]
    for row in predictions_lr.groupBy("label", "prediction").count().collect()
    if row["label"] is not None and row["prediction"] is not None
}

for idx, genre_label in enumerate(labels_list):
    # Rows whose actual label is this genre, and the subset predicted correctly.
    total = sum(n for (actual, _), n in pair_counts.items() if actual == idx)
    correct = pair_counts.get((idx, idx), 0)
    if total > 0:
        genre_acc = correct / total
        print(f"{genre_label}: {genre_acc:.4f} ({correct}/{total})")

In [0]:
# MODEL SERVING
# Simulate unlabeled input: a seeded 1% sample of df with the genre removed.
sampled_songs = df.sample(fraction=0.01, seed=123)
new_songs_df = sampled_songs.drop("track_genre")

# The in-memory fitted pipeline is used directly (nothing is loaded from MLflow).
loaded_model = model_lr
predictions_df = loaded_model.transform(new_songs_df)

# Lookup from numeric prediction index to genre name, taken from the fitted
# StringIndexer (first pipeline stage).
label_converter = best_model_lr.stages[0]
labels_list = label_converter.labels

label_mapping = [(float(position), name) for position, name in enumerate(labels_list)]
label_mapping_df = spark.createDataFrame(label_mapping, ["prediction", "predicted_genre"])

# Left join so every prediction row is kept even without a matching name.
final_output = predictions_df.join(label_mapping_df, on="prediction", how="left")

preview = final_output.select(
    "danceability", "energy", "tempo", "valence",
    "prediction", "predicted_genre"
).limit(20)
display(preview)

# Persist all prediction columns, replacing any previous table contents.
final_output.write.mode("overwrite").saveAsTable("default.spotify_gold_predictions")

In [0]:
# Number of features shown per genre in the ranked table below.
TOP_FEATURES_PER_GENRE = 5

# Coefficient matrix of the fitted LogisticRegression (last pipeline stage),
# indexed as [class index][feature index].
lr_stage = model_lr.stages[-1]
coefficients = lr_stage.coefficientMatrix.toArray()

# Genre name for each class index, from the fitted StringIndexer.
label_converter = model_lr.stages[0]
labels_list = label_converter.labels

# One record per (genre, feature); importance is the absolute coefficient,
# so the sign (direction of the effect) is deliberately discarded.
feature_importance_data = [
    {
        'genre': genre,
        'feature': feature,
        'coefficient': float(abs(coefficients[class_idx][feat_idx]))
    }
    for class_idx, genre in enumerate(labels_list)
    for feat_idx, feature in enumerate(feature_cols)
]

# StructType & co. come from the notebook's import cell.
schema = StructType([
    StructField("genre", StringType(), True),
    StructField("feature", StringType(), True),
    StructField("coefficient", DoubleType(), True)
])

importance_df = spark.createDataFrame(feature_importance_data, schema=schema)

# Rank features inside each genre so the table really is "top N per genre";
# sorting the whole frame and taking the first rows would rank globally.
# The feature name breaks ties so the ranking is stable.
importance_rank = Window.partitionBy("genre").orderBy(
    col("coefficient").desc(), col("feature")
)
top_features_df = (
    importance_df
    .withColumn("rank", row_number().over(importance_rank))
    .filter(col("rank") <= TOP_FEATURES_PER_GENRE)
    .orderBy("genre", "rank")
)

print(f"Top {TOP_FEATURES_PER_GENRE} most important features per genre:")
display(top_features_df)

In [0]:

pdf_imp = importance_df.toPandas()

genres = pdf_imp['genre'].unique()

# Size the subplot grid from the number of genres instead of a fixed 2x3,
# which raises as soon as there are more than six genres. With five genres
# this is still a 2x3 grid on an 18x10 figure.
n_cols = 3
n_rows = max(1, -(-len(genres) // n_cols))  # ceiling division

plt.figure(figsize=(18, 5 * n_rows))

for i, genre in enumerate(genres):
    plt.subplot(n_rows, n_cols, i+1)

    # Largest absolute coefficient first, so the top bar is the strongest feature.
    genre_data = pdf_imp[pdf_imp['genre'] == genre].sort_values(by="coefficient", ascending=False)

    sns.barplot(x="coefficient", y="feature", data=genre_data, palette="viridis")

    plt.title(f"Key Features: {genre.upper()}", fontsize=14, fontweight='bold')
    plt.xlabel("Importance Impact")
    plt.ylabel("")

plt.tight_layout()
plt.show()

In [0]:
# Final summary of the test-set metrics computed in the training cell.
print(f"Logistic Regression:  Accuracy={accuracy_lr:.4f}, F1={f1_lr:.4f}")