In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col, when

# Initialize Spark Session
spark = SparkSession.builder.appName("MLBasedRecommendation").getOrCreate()

# Load the dataset into a PySpark DataFrame
file_path = './tracks_features.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Attributes for recommendation
attributes = ["danceability", "energy", "liveness", "valence", "tempo"]

# Replace null values in all columns with 0
#df = df.select([when(col(c).isNull(), 0).otherwise(col(c)).alias(c) for c in df.columns])

# Replace nulls or non-numeric values and cast to double
for attr in attributes:
    df = df.withColumn(attr, when(col(attr).isNull(), 0.0).otherwise(col(attr).cast("double")))

df = df.fillna(0)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/07 14:54:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [2]:
spark.sparkContext.setLogLevel("DEBUG")

In [2]:
from pyspark.sql.functions import col, sum

# Check for nulls in each column
null_counts = df.select([(sum(col(c).isNull().cast("int")).alias(c)) for c in df.columns])

# Show the count of nulls per column
null_counts.show(truncate=False)

[Stage 2:=====>                                                    (1 + 9) / 10]

+---+----+-----+--------+-------+----------+------------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+-----------+--------------+----+------------+
|id |name|album|album_id|artists|artist_ids|track_number|disc_number|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|duration_ms|time_signature|year|release_date|
+---+----+-----+--------+-------+----------+------------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+-----------+--------------+----+------------+
|0  |0   |1    |0       |0      |1         |0           |0          |0       |0           |0     |0  |0       |0   |0          |0           |0               |0       |0      |0    |0          |0             |0   |0           |
+---+----+-----+--------+-------+----------+------------+-----------+--------+------------+-

                                                                                

24/12/07 14:54:47 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [3]:

# Verify the schema to ensure all attributes are of type double
df.printSchema()

# Combine attributes into a single feature vector
vector_assembler = VectorAssembler(inputCols=attributes, outputCol="features")
df = vector_assembler.transform(df)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- album: string (nullable = true)
 |-- album_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- artist_ids: string (nullable = true)
 |-- track_number: string (nullable = true)
 |-- disc_number: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: double (nullable = false)
 |-- energy: double (nullable = false)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: string (nullable = true)
 |-- liveness: double (nullable = false)
 |-- valence: double (nullable = false)
 |-- tempo: double (nullable = false)
 |-- duration_ms: string (nullable = true)
 |-- time_signature: string (nullable = true)
 |-- year: string (nullable = true)
 |-- release_date: string (nullable = true)



In [4]:


# Train a KMeans model to cluster similar songs
kmeans = KMeans(k=10, seed=1, featuresCol="features", predictionCol="cluster")
model = kmeans.fit(df)

# Predict clusters for all songs
df = model.transform(df)

24/12/07 14:54:55 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/12/07 14:54:55 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

In [5]:

from pyspark.ml.linalg import Vectors

# Example user preference
sample_song = {
    "danceability": 0.5,
    "energy": 0.6,
    "liveness": 0.3,
    "valence": 0.5,
    "tempo": 120.0
}

# Convert user preferences into a feature vector
sample_song_vector = Vectors.dense([sample_song[attr] for attr in attributes])

# Assign the sample song to a cluster
sample_cluster = model.predict(sample_song_vector)

# Filter songs in the same cluster as the sample song
recommended_songs = df.filter(col("cluster") == sample_cluster).select(
    "name", "artists", "danceability", "energy", "liveness", "valence", "tempo"
).limit(10)

# Show recommendations
recommended_songs.show(truncate=False)

+-------------------------+----------------------------+------------------+------+-------------------+-------------------+------------------+
|name                     |artists                     |danceability      |energy|liveness           |valence            |tempo             |
+-------------------------+----------------------------+------------------+------+-------------------+-------------------+------------------+
|Testify                  |['Rage Against The Machine']|0.47              |0.978 |0.35600000000000004|0.503              |117.906           |
|Sleep Now In the Fire    |['Rage Against The Machine']|0.426             |0.929 |0.0789             |0.539              |127.059           |
|Know Your Enemy          |['Rage Against The Machine']|0.574             |0.765 |0.136              |0.613              |117.63600000000001|
|Man on a Mission         |['Daryl Hall & John Oates'] |0.787             |0.903 |0.10099999999999999|0.9620000000000001 |119.946           |
|Life'

24/12/07 13:30:51 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 143982 ms exceeds timeout 120000 ms
24/12/07 13:30:51 WARN SparkContext: Killing executors is not supported by current scheduler.
24/12/07 13:30:53 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at o

In [None]:
# Define the 5 given songs by their IDs
given_song_ids = ["1EuAkAQH9gGd9WKntqSP9S", "76STwPRqjmouEhJc6FI5pX"]

# Filter the dataset for the given songs
given_songs = df.filter(col("id").isin(given_song_ids))

# Get the clusters of the given songs
given_songs_clusters = given_songs.select("id", "cluster").collect()

# Prepare a dictionary to store recommendations for each given song
recommendations = {}

# Find similar songs for each given song
for row in given_songs_clusters:
    song_id = row["id"]
    cluster_id = row["cluster"]
    
    # Get other songs in the same cluster, excluding the given song itself
    similar_songs = df.filter((col("cluster") == cluster_id) & (col("id") != song_id)) \
                      .select("name", "artists", "danceability", "energy", "liveness", "valence", "tempo") \
                      .limit(1)  # Take one recommendation per song
    
    recommendations[song_id] = similar_songs.collect()

# Display recommendations
for song_id, rec in recommendations.items():
    print(f"\nRecommendations for Song ID {song_id}:")
    for row in rec:
        print(f"  Name: {row['name']}, Artists: {row['artists']}, Danceability: {row['danceability']}, "
              f"Energy: {row['energy']}, Liveness: {row['liveness']}, Valence: {row['valence']}, Tempo: {row['tempo']}")

                                                                                


Recommendations for Song ID 1EuAkAQH9gGd9WKntqSP9S:
  Name: Southland Of The Heart, Artists: ['Bruce Cockburn'], Danceability: 0.562, Energy: 0.303, Liveness: 0.0837, Valence: 0.242, Tempo: 70.465

Recommendations for Song ID 76STwPRqjmouEhJc6FI5pX:
  Name: Voice of the Voiceless, Artists: ['Rage Against The Machine'], Danceability: 0.441, Energy: 0.882, Liveness: 0.15, Valence: 0.418, Tempo: 83.37100000000002


24/12/07 17:11:39 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 136696 ms exceeds timeout 120000 ms
24/12/07 17:11:39 WARN SparkContext: Killing executors is not supported by current scheduler.
24/12/07 17:11:40 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

In [12]:
# Check for infinite values in attributes
from pyspark.sql.functions import col, expr

for attr in attributes:
    count_infinite = df.filter((col(attr) == float('inf')) | (col(attr) == float('-inf'))).count()
    print(f"Column {attr}: {count_infinite} infinite values")

Column danceability: 0 infinite values
Column energy: 0 infinite values
Column liveness: 0 infinite values
Column valence: 0 infinite values
Column tempo: 0 infinite values


In [2]:
# Select a small sample for testing
test_df = df.limit(5)

# Apply VectorAssembler
vector_assembler = VectorAssembler(inputCols=attributes, outputCol="features")
test_df = vector_assembler.transform(test_df)

# Show results
test_df.select("features").show(truncate=False)

+-------------------------------------------------------+
|features                                               |
+-------------------------------------------------------+
|[0.47,0.978,0.35600000000000004,0.503,117.906]         |
|[0.599,0.9570000000000001,0.155,0.489,103.68]          |
|[0.315,0.97,0.122,0.37,149.749]                        |
|[0.44,0.9670000000000001,0.121,0.574,96.75200000000001]|
|[0.426,0.929,0.0789,0.539,127.059]                     |
+-------------------------------------------------------+



24/12/07 12:49:21 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [3]:
# Check the data types programmatically
for attr in attributes:
    print(f"{attr} data type: {df.schema[attr].dataType}")

danceability data type: DoubleType()
energy data type: DoubleType()
liveness data type: DoubleType()
valence data type: DoubleType()
tempo data type: DoubleType()


In [6]:
from pyspark.sql.functions import isnan, col

# Check for null or NaN values in the selected columns
df.select([col(c).isNull().alias(f"{c}_isNull") for c in attributes]).show()
df.select([isnan(c).alias(f"{c}_isNan") for c in attributes]).show()

+-------------------+-------------+---------------+--------------+------------+
|danceability_isNull|energy_isNull|liveness_isNull|valence_isNull|tempo_isNull|
+-------------------+-------------+---------------+--------------+------------+
|              false|        false|          false|         false|       false|
|              false|        false|          false|         false|       false|
|              false|        false|          false|         false|       false|
|              false|        false|          false|         false|       false|
|              false|        false|          false|         false|       false|
|              false|        false|          false|         false|       false|
|              false|        false|          false|         false|       false|
|              false|        false|          false|         false|       false|
|              false|        false|          false|         false|       false|
|              false|        false|     

In [7]:
df.select(attributes).show(truncate=False)

+-------------------+------------------+-------------------+-------------------+------------------+
|danceability       |energy            |liveness           |valence            |tempo             |
+-------------------+------------------+-------------------+-------------------+------------------+
|0.47               |0.978             |0.35600000000000004|0.503              |117.906           |
|0.599              |0.9570000000000001|0.155              |0.489              |103.68            |
|0.315              |0.97              |0.122              |0.37               |149.749           |
|0.44               |0.9670000000000001|0.121              |0.574              |96.75200000000001 |
|0.426              |0.929             |0.0789             |0.539              |127.059           |
|0.298              |0.848             |0.201              |0.19399999999999998|148.282           |
|0.41700000000000004|0.976             |0.107              |0.483              |90.395            |


In [None]:

# Example user preference
sample_song = {
    "danceability": 0.5,
    "energy": 0.6,
    "liveness": 0.3,
    "valence": 0.5,
    "tempo": 120.0
}

# Convert user preferences into a feature vector
sample_song_vector = Vectors.dense([sample_song[attr] for attr in attributes])

# Assign the sample song to a cluster
def predict_cluster(features):
    return model.predict(features)

predict_cluster_udf = udf(predict_cluster)
sample_cluster = predict_cluster_udf(lit(sample_song_vector))

# Filter songs in the same cluster as the sample song
recommended_songs = df.filter(col("cluster") == sample_cluster).select("name", "artists", "danceability", "energy", "liveness", "valence", "tempo").limit(10)

# Show recommendations
recommended_songs.show(truncate=False)

24/12/07 12:33:09 WARN DAGScheduler: Broadcasting large task binary with size 60.8 MiB
24/12/07 12:33:09 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/12/07 12:33:12 WARN DAGScheduler: Broadcasting large task binary with size 60.8 MiB
[Stage 9:>                                                        (0 + 10) / 10]
Exception: java.lang.OutOfMemoryError thrown from the UncaughtExceptionHandler in thread "refresh progress"

Exception: java.lang.OutOfMemoryError thrown from the UncaughtExceptionHandler in thread "Spark Context Cleaner"

Exception: java.lang.OutOfMemoryError thrown from the UncaughtExceptionHandler in thread "RemoteBlock-temp-file-clean-thread"
24/12/07 12:33:52 ERROR Executor: Exception in task 2.0 in stage 9.0 (TID 36)
java.lang.OutOfMemoryError: Java heap space
24/12

ConnectionRefusedError: [Errno 61] Connection refused

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/karthik540/Documents/College/Sem_3/BigData/Project/Spotify Recommendation/.venv/lib/python3.12/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/karthik540/Documents/College/Sem_3/BigData/Project/Spotify Recommendation/.venv/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/karthik540/Documents/College/Sem_3/BigData/Project/Spotify Recommendation/.venv/lib/python3.12/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while send

In [None]:
# Function to get recommendations for a given song
def get_recommendations(song_id, num_recommendations=5):
    # Validate if the song exists in the data
    song_data = data.filter(data.songId == song_id).select("trackId", "name", "artists", "album").distinct()

    if song_data.count() == 0:
        print(f"Song ID {song_id} not found in the dataset.")
        return

    # Fetch details of the input song
    song_info = song_data.collect()[0]
    track_id = song_info["trackId"]
    track_name = song_info["name"]
    artist_name = song_info["artists"]
    album_name = song_info["album"]

    print(f"Recommendations based on the song: {track_name} by {artist_name} from the album {album_name}")

    # Get recommendations for all items and filter for the input song
    recommendations = model.recommendForAllItems(num_recommendations)
    song_recommendations = recommendations.filter(col("trackId") == track_id)

    # Extract recommended track IDs
    recommended_track_ids = (
        song_recommendations
        .select(explode(col("recommendations")).alias("recommendation"))
        .select(col("recommendation.trackId"))
        .rdd.flatMap(lambda x: x)
        .collect()
    )

    # Fetch recommended track details
    recommended_tracks = data.filter(col("trackId").isin(recommended_track_ids)).select("name", "artists", "album")

    print("Recommended tracks:")
    recommended_tracks.show(truncate=False)