In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean

In [2]:
spark = SparkSession.builder \
    .appName("MusicPersonalization") \
    .getOrCreate()

In [3]:
music_spark_df = spark.read.csv("dataset_music.csv", header=True, inferSchema=True)

In [4]:
numeric_columns = ['popularity', 'duration_ms', 'danceability', 'energy', 'valence', 'tempo']
for col_name in numeric_columns:
    mean_value = music_spark_df.select(mean(col(col_name))).collect()[0][0]
    music_spark_df = music_spark_df.fillna({col_name: mean_value})

In [5]:
music_spark_df = music_spark_df.na.drop(subset=["track_id", "track_name"])
music_spark_df.show(5)

+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
|_c0|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|
+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
|  0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|     230666|   FALSE|       0.676| 0.461|1.0|  -6.746| 0.0|      0.143|      0.0322|         1.01E-6|   0.358|  0.715| 87.917|           4

In [9]:
from pyspark.sql.functions import hour, count, avg, col

In [11]:


# Simulate or use existing user playback data
# Example: Adding timestamps for when songs are played
listening_data = music_spark_df.withColumn("timestamp", col("duration_ms") * 0.001)  # Simulated timestamp



In [12]:
listening_data = listening_data.withColumn("hour_of_day", (col("timestamp") % 24).cast("integer"))


In [17]:
user_preferences = listening_data.groupBy("artists").agg(
    count("track_id").alias("total_plays"),
    avg("danceability").alias("avg_danceability"),
    avg("energy").alias("avg_energy"),
    avg("valence").alias("avg_valence"),
    count("track_genre").alias("genre_count")
)
user_preferences.show(10)


+--------------------+-----------+------------------+-------------------+-------------------+-----------+
|             artists|total_plays|  avg_danceability|         avg_energy|        avg_valence|genre_count|
+--------------------+-----------+------------------+-------------------+-------------------+-----------+
|Boyce Avenue;Mega...|          1|             0.619|               0.28|              0.292|          1|
|    Ramshackle Glory|          3|0.5416666666666666|              0.437| 0.7400000000000001|          3|
|       Brendan James|          1|             0.636|              0.735|              0.537|          1|
|      The Black Keys|         16|         0.5650625|          0.6983125| 0.5730000000000001|         16|
|    Jane's Addiction|          2|             0.218|              0.932|              0.315|          2|
|  Mon Laferte;Juanes|          1|             0.565|              0.667|               0.78|          1|
|        Yann Tiersen|          9| 0.446888888